Merge tag 'kbuild-v5.13-2' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild
author: Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 May 2021 17:00:11 +0000 (10:00 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 May 2021 17:00:11 +0000 (10:00 -0700)
Pull more Kbuild updates from Masahiro Yamada:

 - Convert sh and sparc to use generic shell scripts to generate the
   syscall headers

 - refactor .gitignore files

 - Update kernel/config_data.gz only when the content of the .config
   is really changed, which avoids the unneeded re-link of vmlinux

 - move "remove stale files" workarounds to scripts/remove-stale-files

 - suppress unused-but-set-variable warnings by default for Clang
   as well

 - fix locale setting LANG=C to LC_ALL=C

 - improve 'make distclean'

 - always keep intermediate objects from scripts/link-vmlinux.sh

 - move IF_ENABLED out of <linux/kconfig.h> to make it self-contained

 - misc cleanups

* tag 'kbuild-v5.13-2' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild: (25 commits)
  linux/kconfig.h: replace IF_ENABLED() with PTR_IF() in <linux/kernel.h>
  kbuild: Don't remove link-vmlinux temporary files on exit/signal
  kbuild: remove the unneeded comments for external module builds
  kbuild: make distclean remove tag files in sub-directories
  kbuild: make distclean work against $(objtree) instead of $(srctree)
  kbuild: refactor modname-multi by using suffix-search
  kbuild: refactor fdtoverlay rule
  kbuild: parameterize the .o part of suffix-search
  arch: use cross_compiling to check whether it is a cross build or not
  kbuild: remove ARCH=sh64 support from top Makefile
  .gitignore: prefix local generated files with a slash
  kbuild: replace LANG=C with LC_ALL=C
  Makefile: Move -Wno-unused-but-set-variable out of GCC only block
  kbuild: add a script to remove stale generated files
  kbuild: update config_data.gz only when the content of .config is changed
  .gitignore: ignore only top-level modules.builtin
  .gitignore: move tags and TAGS close to other tag files
  kernel/.gitgnore: remove stale timeconst.h and hz.bc
  usr/include: refactor .gitignore
  genksyms: fix stale comment
  ...

2273 files changed:
CREDITS
Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe [new file with mode: 0644]
Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa [new file with mode: 0644]
Documentation/ABI/testing/sysfs-class-net-qmi
Documentation/ABI/testing/sysfs-class-rtrs-client
Documentation/ABI/testing/sysfs-devices-system-cpu
Documentation/ABI/testing/sysfs-driver-input-exc3000
Documentation/ABI/testing/sysfs-fs-f2fs
Documentation/ABI/testing/sysfs-kernel-mm-cma [new file with mode: 0644]
Documentation/admin-guide/devices.txt
Documentation/admin-guide/gpio/gpio-mockup.rst
Documentation/admin-guide/kernel-parameters.txt
Documentation/admin-guide/mm/memory-hotplug.rst
Documentation/admin-guide/mm/userfaultfd.rst
Documentation/admin-guide/reporting-issues.rst
Documentation/arm64/booting.rst
Documentation/arm64/elf_hwcaps.rst
Documentation/arm64/tagged-address-abi.rst
Documentation/core-api/dma-api.rst
Documentation/core-api/irq/irq-domain.rst
Documentation/core-api/symbol-namespaces.rst
Documentation/dev-tools/gdb-kernel-debugging.rst
Documentation/devicetree/bindings/arm/ete.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/arm/trbe.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml
Documentation/devicetree/bindings/display/renesas,du.yaml
Documentation/devicetree/bindings/dma/qcom,gpi.yaml
Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/gpio/gpio-74x164.txt [deleted file]
Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt [deleted file]
Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml
Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt
Documentation/devicetree/bindings/input/atmel,maxtouch.yaml
Documentation/devicetree/bindings/input/iqs626a.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/mms114.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml
Documentation/devicetree/bindings/iommu/arm,smmu.yaml
Documentation/devicetree/bindings/iommu/sprd,iommu.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/leds/leds-rt4505.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/media/renesas,vin.yaml
Documentation/devicetree/bindings/mtd/tango-nand.txt [deleted file]
Documentation/devicetree/bindings/net/renesas,etheravb.yaml
Documentation/devicetree/bindings/pci/hisilicon-pcie.txt [deleted file]
Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/pci/rcar-pci-host.yaml
Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/pci/tango-pcie.txt [deleted file]
Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml
Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml
Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt
Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt
Documentation/devicetree/bindings/pwm/pwm-rockchip.txt [deleted file]
Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/remoteproc/imx-rproc.txt [deleted file]
Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt
Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt
Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.txt
Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml
Documentation/devicetree/bindings/riscv/microchip.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/serial/8250.yaml
Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt [deleted file]
Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml
Documentation/devicetree/bindings/thermal/thermal-sensor.yaml
Documentation/devicetree/bindings/vendor-prefixes.yaml
Documentation/driver-api/gpio/consumer.rst
Documentation/driver-api/gpio/drivers-on-gpio.rst
Documentation/driver-api/pwm.rst
Documentation/driver-api/thermal/sysfs-api.rst
Documentation/driver-api/vfio.rst
Documentation/filesystems/f2fs.rst
Documentation/firmware-guide/acpi/gpio-properties.rst
Documentation/input/devices/rotary-encoder.rst
Documentation/input/joydev/joystick-api.rst
Documentation/input/joydev/joystick.rst
Documentation/process/changes.rst
Documentation/riscv/index.rst
Documentation/riscv/vm-layout.rst [new file with mode: 0644]
Documentation/security/index.rst
Documentation/security/landlock.rst [new file with mode: 0644]
Documentation/trace/coresight/coresight-trbe.rst [new file with mode: 0644]
Documentation/translations/it_IT/process/changes.rst
Documentation/translations/zh_CN/index.rst
Documentation/userspace-api/index.rst
Documentation/userspace-api/landlock.rst [new file with mode: 0644]
Documentation/virt/kvm/amd-memory-encryption.rst
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/arm/index.rst
Documentation/virt/kvm/arm/ptp_kvm.rst [new file with mode: 0644]
Documentation/virt/kvm/devices/arm-vgic-its.rst
Documentation/virt/kvm/devices/arm-vgic-v3.rst
Documentation/virt/kvm/locking.rst
Documentation/virt/kvm/s390-diag.rst
Documentation/x86/x86_64/5level-paging.rst
MAINTAINERS
Makefile
arch/Kconfig
arch/alpha/include/asm/io.h
arch/alpha/kernel/pc873xx.c
arch/alpha/kernel/syscalls/syscall.tbl
arch/alpha/lib/csum_partial_copy.c
arch/arc/Kconfig
arch/arm/Kconfig
arch/arm/boot/compressed/Makefile
arch/arm/boot/dts/rk3036.dtsi
arch/arm/boot/dts/rk3288.dtsi
arch/arm/configs/dove_defconfig
arch/arm/configs/footbridge_defconfig
arch/arm/configs/magician_defconfig
arch/arm/configs/moxart_defconfig
arch/arm/configs/mps2_defconfig
arch/arm/configs/mvebu_v5_defconfig
arch/arm/configs/xcep_defconfig
arch/arm/include/asm/bug.h
arch/arm/include/asm/hypervisor.h
arch/arm/include/asm/io.h
arch/arm/include/asm/kexec.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/set_memory.h
arch/arm/include/uapi/asm/Kbuild
arch/arm/include/uapi/asm/unistd.h
arch/arm/kernel/asm-offsets.c
arch/arm/kernel/entry-common.S
arch/arm/kernel/hw_breakpoint.c
arch/arm/kernel/machine_kexec.c
arch/arm/kernel/process.c
arch/arm/kernel/smccc-call.S
arch/arm/kernel/suspend.c
arch/arm/kernel/traps.c
arch/arm/mach-footbridge/Kconfig
arch/arm/mach-footbridge/Makefile
arch/arm/mach-footbridge/personal-pci.c [deleted file]
arch/arm/mach-footbridge/personal.c [deleted file]
arch/arm/mach-iop32x/n2100.c
arch/arm/mm/cache-v7.S
arch/arm/mm/dump.c
arch/arm/mm/init.c
arch/arm/mm/proc-v7.S
arch/arm/mm/ptdump_debugfs.c
arch/arm/probes/kprobes/test-arm.c
arch/arm/probes/kprobes/test-core.h
arch/arm/tools/Makefile
arch/arm/tools/syscall.tbl
arch/arm/tools/syscallhdr.sh [deleted file]
arch/arm/tools/syscalltbl.sh [deleted file]
arch/arm/xen/mm.c
arch/arm64/Kconfig
arch/arm64/boot/dts/rockchip/rk3368.dtsi
arch/arm64/boot/dts/rockchip/rk3399.dtsi
arch/arm64/include/asm/assembler.h
arch/arm64/include/asm/barrier.h
arch/arm64/include/asm/daifflags.h
arch/arm64/include/asm/el2_setup.h
arch/arm64/include/asm/fpsimd.h
arch/arm64/include/asm/fpsimdmacros.h
arch/arm64/include/asm/hyp_image.h
arch/arm64/include/asm/hypervisor.h
arch/arm64/include/asm/kernel-pgtable.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/kvm_pgtable.h
arch/arm64/include/asm/memory.h
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/sections.h
arch/arm64/include/asm/sparsemem.h
arch/arm64/include/asm/sysreg.h
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/arm64/kernel/alternative.c
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/cpu-reset.S
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/cpuidle.c
arch/arm64/kernel/entry-common.c
arch/arm64/kernel/entry.S
arch/arm64/kernel/hyp-stub.S
arch/arm64/kernel/image-vars.h
arch/arm64/kernel/process.c
arch/arm64/kernel/stacktrace.c
arch/arm64/kernel/vdso/vdso.lds.S
arch/arm64/kernel/vdso32/Makefile
arch/arm64/kernel/vmlinux.lds.S
arch/arm64/kvm/arm.c
arch/arm64/kvm/debug.c
arch/arm64/kvm/fpsimd.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/fpsimd.S
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/hyp/include/nvhe/early_alloc.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/gfp.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/memory.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/mm.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/spinlock.h [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/Makefile
arch/arm64/kvm/hyp/nvhe/cache.S [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/debug-sr.c
arch/arm64/kvm/hyp/nvhe/early_alloc.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
arch/arm64/kvm/hyp/nvhe/host.S
arch/arm64/kvm/hyp/nvhe/hyp-init.S
arch/arm64/kvm/hyp/nvhe/hyp-main.c
arch/arm64/kvm/hyp/nvhe/hyp-smp.c
arch/arm64/kvm/hyp/nvhe/hyp.lds.S
arch/arm64/kvm/hyp/nvhe/mem_protect.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/mm.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/page_alloc.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/psci-relay.c
arch/arm64/kvm/hyp/nvhe/setup.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/stub.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/switch.c
arch/arm64/kvm/hyp/nvhe/tlb.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/hyp/reserved_mem.c [new file with mode: 0644]
arch/arm64/kvm/hyp/vhe/switch.c
arch/arm64/kvm/hypercalls.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/perf.c
arch/arm64/kvm/pmu-emul.c
arch/arm64/kvm/pmu.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/trace_arm.h
arch/arm64/kvm/va_layout.c
arch/arm64/kvm/vgic/vgic-init.c
arch/arm64/kvm/vgic/vgic-its.c
arch/arm64/kvm/vgic/vgic-kvm-device.c
arch/arm64/kvm/vgic/vgic-mmio-v3.c
arch/arm64/kvm/vgic/vgic-mmio.c
arch/arm64/kvm/vgic/vgic-v3.c
arch/arm64/kvm/vgic/vgic-v4.c
arch/arm64/kvm/vgic/vgic.h
arch/arm64/lib/clear_page.S
arch/arm64/lib/copy_page.S
arch/arm64/mm/hugetlbpage.c
arch/arm64/mm/init.c
arch/arm64/mm/mmu.c
arch/arm64/mm/ptdump.c
arch/csky/include/asm/Kbuild
arch/csky/include/asm/asid.h
arch/csky/include/asm/barrier.h
arch/csky/include/asm/segment.h
arch/csky/include/asm/uaccess.h
arch/csky/include/asm/vdso.h
arch/csky/kernel/entry.S
arch/csky/lib/usercopy.c
arch/csky/mm/fault.c
arch/csky/mm/syscache.c
arch/h8300/include/asm/bitops.h
arch/hexagon/Makefile
arch/hexagon/configs/comet_defconfig
arch/hexagon/include/asm/futex.h
arch/hexagon/include/asm/io.h
arch/hexagon/include/asm/timex.h
arch/hexagon/kernel/hexagon_ksyms.c
arch/hexagon/kernel/ptrace.c
arch/hexagon/lib/Makefile
arch/hexagon/lib/divsi3.S [new file with mode: 0644]
arch/hexagon/lib/memcpy_likely_aligned.S [new file with mode: 0644]
arch/hexagon/lib/modsi3.S [new file with mode: 0644]
arch/hexagon/lib/udivsi3.S [new file with mode: 0644]
arch/hexagon/lib/umodsi3.S [new file with mode: 0644]
arch/ia64/Kconfig
arch/ia64/include/asm/io.h
arch/ia64/include/asm/uaccess.h
arch/ia64/kernel/syscalls/syscall.tbl
arch/ia64/mm/hugetlbpage.c
arch/m68k/atari/time.c
arch/m68k/coldfire/intc-simr.c
arch/m68k/configs/amcore_defconfig
arch/m68k/include/asm/bitops.h
arch/m68k/include/asm/io_mm.h
arch/m68k/kernel/syscalls/syscall.tbl
arch/microblaze/include/asm/ftrace.h
arch/microblaze/kernel/syscalls/syscall.tbl
arch/mips/Kconfig
arch/mips/include/asm/io.h
arch/mips/include/asm/kvm_host.h
arch/mips/kernel/syscalls/syscall_n32.tbl
arch/mips/kernel/syscalls/syscall_n64.tbl
arch/mips/kernel/syscalls/syscall_o32.tbl
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c
arch/mips/kvm/vz.c
arch/mips/mm/hugetlbpage.c
arch/nds32/include/asm/uaccess.h
arch/nds32/kernel/ftrace.c
arch/nios2/include/asm/uaccess.h
arch/openrisc/configs/or1ksim_defconfig
arch/parisc/Kconfig
arch/parisc/include/asm/Kbuild
arch/parisc/include/asm/io.h
arch/parisc/include/asm/pdc_chassis.h
arch/parisc/kernel/setup.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/syscalls/Makefile
arch/parisc/kernel/syscalls/syscall.tbl
arch/parisc/kernel/syscalls/syscallhdr.sh [deleted file]
arch/parisc/kernel/syscalls/syscalltbl.sh [deleted file]
arch/parisc/mm/hugetlbpage.c
arch/powerpc/Kconfig
arch/powerpc/include/asm/fsl_pamu_stash.h
arch/powerpc/include/asm/ftrace.h
arch/powerpc/include/asm/io.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/pci-bridge.h
arch/powerpc/include/asm/pci.h
arch/powerpc/kernel/module.c
arch/powerpc/kernel/syscalls/syscall.tbl
arch/powerpc/kexec/file_load_64.c
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/kvm/trace_booke.h
arch/powerpc/lib/Makefile
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/platforms/Kconfig.cputype
arch/powerpc/platforms/powernv/Makefile
arch/powerpc/platforms/powernv/memtrace.c
arch/powerpc/platforms/powernv/npu-dma.c [deleted file]
arch/powerpc/platforms/powernv/opal-call.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/pci.h
arch/powerpc/platforms/pseries/pci.c
arch/powerpc/platforms/pseries/svm.c
arch/riscv/Kconfig
arch/riscv/Kconfig.erratas [new file with mode: 0644]
arch/riscv/Kconfig.socs
arch/riscv/Makefile
arch/riscv/boot/Makefile
arch/riscv/boot/dts/Makefile
arch/riscv/boot/dts/microchip/Makefile [new file with mode: 0644]
arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts [new file with mode: 0644]
arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi [new file with mode: 0644]
arch/riscv/boot/dts/sifive/fu740-c000.dtsi
arch/riscv/boot/loader.lds.S
arch/riscv/configs/defconfig
arch/riscv/errata/Makefile [new file with mode: 0644]
arch/riscv/errata/alternative.c [new file with mode: 0644]
arch/riscv/errata/sifive/Makefile [new file with mode: 0644]
arch/riscv/errata/sifive/errata.c [new file with mode: 0644]
arch/riscv/errata/sifive/errata_cip_453.S [new file with mode: 0644]
arch/riscv/include/asm/alternative-macros.h [new file with mode: 0644]
arch/riscv/include/asm/alternative.h [new file with mode: 0644]
arch/riscv/include/asm/asm.h
arch/riscv/include/asm/csr.h
arch/riscv/include/asm/elf.h
arch/riscv/include/asm/errata_list.h [new file with mode: 0644]
arch/riscv/include/asm/ftrace.h
arch/riscv/include/asm/kexec.h [new file with mode: 0644]
arch/riscv/include/asm/page.h
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/sbi.h
arch/riscv/include/asm/sections.h
arch/riscv/include/asm/set_memory.h
arch/riscv/include/asm/smp.h
arch/riscv/include/asm/string.h
arch/riscv/include/asm/syscall.h
arch/riscv/include/asm/tlbflush.h
arch/riscv/include/asm/uaccess.h
arch/riscv/include/asm/vendorid_list.h [new file with mode: 0644]
arch/riscv/kernel/Makefile
arch/riscv/kernel/crash_dump.c [new file with mode: 0644]
arch/riscv/kernel/crash_save_regs.S [new file with mode: 0644]
arch/riscv/kernel/entry.S
arch/riscv/kernel/head.S
arch/riscv/kernel/head.h
arch/riscv/kernel/kexec_relocate.S [new file with mode: 0644]
arch/riscv/kernel/machine_kexec.c [new file with mode: 0644]
arch/riscv/kernel/mcount.S
arch/riscv/kernel/module.c
arch/riscv/kernel/probes/kprobes.c
arch/riscv/kernel/sbi.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/smp.c
arch/riscv/kernel/smpboot.c
arch/riscv/kernel/syscall_table.c
arch/riscv/kernel/time.c
arch/riscv/kernel/traps.c
arch/riscv/kernel/vdso.c
arch/riscv/kernel/vdso/Makefile
arch/riscv/kernel/vmlinux-xip.lds.S [new file with mode: 0644]
arch/riscv/kernel/vmlinux.lds.S
arch/riscv/mm/fault.c
arch/riscv/mm/init.c
arch/riscv/mm/kasan_init.c
arch/riscv/mm/physaddr.c
arch/riscv/mm/ptdump.c
arch/riscv/net/bpf_jit_comp64.c
arch/riscv/net/bpf_jit_core.c
arch/s390/Kconfig
arch/s390/configs/debug_defconfig
arch/s390/configs/defconfig
arch/s390/include/asm/cpu_mcf.h
arch/s390/include/asm/entry-common.h
arch/s390/include/asm/io.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/pci.h
arch/s390/include/asm/smp.h
arch/s390/kernel/perf_cpum_cf.c
arch/s390/kernel/perf_cpum_cf_common.c
arch/s390/kernel/perf_cpum_cf_diag.c
arch/s390/kernel/setup.c
arch/s390/kernel/smp.c
arch/s390/kernel/syscall.c
arch/s390/kernel/syscalls/syscall.tbl
arch/s390/kernel/traps.c
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/vsie.c
arch/s390/mm/hugetlbpage.c
arch/s390/pci/pci.c
arch/s390/pci/pci_event.c
arch/sh/Kconfig
arch/sh/configs/edosk7705_defconfig
arch/sh/configs/se7206_defconfig
arch/sh/configs/sh2007_defconfig
arch/sh/configs/sh7724_generic_defconfig
arch/sh/configs/sh7770_generic_defconfig
arch/sh/configs/sh7785lcr_32bit_defconfig
arch/sh/include/asm/bitops.h
arch/sh/include/asm/io.h
arch/sh/kernel/ftrace.c
arch/sh/kernel/perf_event.c
arch/sh/kernel/syscalls/syscall.tbl
arch/sh/mm/Kconfig
arch/sh/mm/hugetlbpage.c
arch/sparc/configs/sparc64_defconfig
arch/sparc/include/asm/ftrace.h
arch/sparc/include/asm/io_64.h
arch/sparc/kernel/syscalls/syscall.tbl
arch/sparc/mm/hugetlbpage.c
arch/um/Kconfig
arch/um/Kconfig.debug
arch/um/drivers/cow.h
arch/um/drivers/hostaudio_kern.c
arch/um/drivers/vector_kern.c
arch/um/include/asm/pgtable.h
arch/um/include/uapi/asm/Kbuild [new file with mode: 0644]
arch/um/kernel/Makefile
arch/um/kernel/dyn.lds.S
arch/um/kernel/gmon_syms.c [deleted file]
arch/um/kernel/mem.c
arch/um/kernel/uml.lds.S
arch/x86/Kconfig
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/events/amd/iommu.c
arch/x86/events/amd/iommu.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/mem_encrypt.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/kvm.c
arch/x86/kernel/process.c
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_audit.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/reverse_cpuid.h [new file with mode: 0644]
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/vmenter.S
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/sgx.c [new file with mode: 0644]
arch/x86/kvm/vmx/sgx.h [new file with mode: 0644]
arch/x86/kvm/vmx/vmcs12.c
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/vmx/vmx_ops.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/mm/mem_encrypt.c
arch/x86/mm/mem_encrypt_identity.c
arch/x86/mm/pat/set_memory.c
arch/x86/pci/amd_bus.c
arch/x86/um/Makefile
arch/x86/um/asm/elf.h
arch/x86/um/shared/sysdep/stub_32.h
arch/x86/xen/pci-swiotlb-xen.c
arch/xtensa/configs/xip_kc705_defconfig
arch/xtensa/kernel/syscalls/syscall.tbl
block/bio.c
block/blk-settings.c
certs/Kconfig
certs/Makefile
certs/system_certificates.S
certs/system_keyring.c
drivers/Makefile
drivers/acpi/acpi_memhotplug.c
drivers/acpi/arm64/gtdt.c
drivers/acpi/arm64/iort.c
drivers/acpi/custom_method.c
drivers/acpi/internal.h
drivers/acpi/irq.c
drivers/acpi/pci_mcfg.c
drivers/acpi/power.c
drivers/acpi/scan.c
drivers/acpi/sleep.h
drivers/ata/ahci_brcm.c
drivers/atm/firestream.c
drivers/auxdisplay/panel.c
drivers/base/firmware_loader/main.c
drivers/base/memory.c
drivers/block/brd.c
drivers/block/loop.c
drivers/block/rnbd/rnbd-clt.c
drivers/block/rnbd/rnbd-clt.h
drivers/block/rnbd/rnbd-srv.c
drivers/char/Kconfig
drivers/char/mem.c
drivers/clk/sifive/Kconfig
drivers/clk/sifive/fu740-prci.c
drivers/clk/sifive/fu740-prci.h
drivers/clk/sifive/sifive-prci.c
drivers/clk/sifive/sifive-prci.h
drivers/clocksource/arm_arch_timer.c
drivers/crypto/ccp/sev-dev.c
drivers/crypto/ccp/sev-dev.h
drivers/dma/Kconfig
drivers/dma/at_xdmac.c
drivers/dma/dw-edma/dw-edma-core.c
drivers/dma/dw-edma/dw-edma-core.h
drivers/dma/dw-edma/dw-edma-pcie.c
drivers/dma/dw-edma/dw-edma-v0-core.c
drivers/dma/dw-edma/dw-edma-v0-core.h
drivers/dma/dw-edma/dw-edma-v0-debugfs.c
drivers/dma/dw-edma/dw-edma-v0-debugfs.h
drivers/dma/dw-edma/dw-edma-v0-regs.h
drivers/dma/idxd/Makefile
drivers/dma/idxd/cdev.c
drivers/dma/idxd/device.c
drivers/dma/idxd/dma.c
drivers/dma/idxd/idxd.h
drivers/dma/idxd/init.c
drivers/dma/idxd/irq.c
drivers/dma/idxd/perfmon.c [new file with mode: 0644]
drivers/dma/idxd/perfmon.h [new file with mode: 0644]
drivers/dma/idxd/registers.h
drivers/dma/idxd/submit.c
drivers/dma/idxd/sysfs.c
drivers/dma/k3dma.c
drivers/dma/qcom/gpi.c
drivers/dma/qcom/hidma.c
drivers/dma/xilinx/xilinx_dma.c
drivers/firmware/psci/psci.c
drivers/firmware/smccc/Makefile
drivers/firmware/smccc/kvm_guest.c [new file with mode: 0644]
drivers/firmware/smccc/smccc.c
drivers/gpio/Kconfig
drivers/gpio/Makefile
drivers/gpio/gpio-104-dio-48e.c
drivers/gpio/gpio-aggregator.c
drivers/gpio/gpio-ich.c
drivers/gpio/gpio-it87.c
drivers/gpio/gpio-mockup.c
drivers/gpio/gpio-mpc8xxx.c
drivers/gpio/gpio-mxs.c
drivers/gpio/gpio-omap.c
drivers/gpio/gpio-realtek-otto.c [new file with mode: 0644]
drivers/gpio/gpio-sch.c
drivers/gpio/gpiolib-acpi.c
drivers/gpio/gpiolib-acpi.h
drivers/gpio/gpiolib-of.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
drivers/gpu/drm/i915/gem/i915_gem_internal.c
drivers/gpu/drm/msm/adreno/adreno_gpu.c
drivers/gpu/drm/nouveau/nouveau_ttm.c
drivers/gpu/drm/qxl/qxl_drv.c
drivers/hwspinlock/Kconfig
drivers/hwspinlock/Makefile
drivers/hwspinlock/sirf_hwspinlock.c [deleted file]
drivers/hwtracing/coresight/Kconfig
drivers/hwtracing/coresight/Makefile
drivers/hwtracing/coresight/coresight-core.c
drivers/hwtracing/coresight/coresight-etm-perf.c
drivers/hwtracing/coresight/coresight-etm4x-core.c
drivers/hwtracing/coresight/coresight-etm4x-sysfs.c
drivers/hwtracing/coresight/coresight-etm4x.h
drivers/hwtracing/coresight/coresight-platform.c
drivers/hwtracing/coresight/coresight-priv.h
drivers/hwtracing/coresight/coresight-trbe.c [new file with mode: 0644]
drivers/hwtracing/coresight/coresight-trbe.h [new file with mode: 0644]
drivers/i3c/master.c
drivers/i3c/master/svc-i3c-master.c
drivers/infiniband/core/cache.c
drivers/infiniband/core/cm.c
drivers/infiniband/core/cm_msgs.h
drivers/infiniband/core/cma.c
drivers/infiniband/core/cma_configfs.c
drivers/infiniband/core/cma_priv.h
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/counters.c
drivers/infiniband/core/device.c
drivers/infiniband/core/iwpm_msg.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/mad_rmpp.c
drivers/infiniband/core/multicast.c
drivers/infiniband/core/nldev.c
drivers/infiniband/core/opa_smi.h
drivers/infiniband/core/rdma_core.c
drivers/infiniband/core/restrack.c
drivers/infiniband/core/roce_gid_mgmt.c
drivers/infiniband/core/rw.c
drivers/infiniband/core/sa.h
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/security.c
drivers/infiniband/core/smi.c
drivers/infiniband/core/smi.h
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucma.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_dmabuf.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_ioctl.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/bnxt_re/Kconfig
drivers/infiniband/hw/bnxt_re/bnxt_re.h
drivers/infiniband/hw/bnxt_re/hw_counters.c
drivers/infiniband/hw/bnxt_re/hw_counters.h
drivers/infiniband/hw/bnxt_re/ib_verbs.c
drivers/infiniband/hw/bnxt_re/ib_verbs.h
drivers/infiniband/hw/bnxt_re/main.c
drivers/infiniband/hw/bnxt_re/qplib_fp.c
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
drivers/infiniband/hw/bnxt_re/qplib_res.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/cxgb4/resource.c
drivers/infiniband/hw/cxgb4/t4.h
drivers/infiniband/hw/efa/efa.h
drivers/infiniband/hw/efa/efa_main.c
drivers/infiniband/hw/efa/efa_verbs.c
drivers/infiniband/hw/hfi1/affinity.c
drivers/infiniband/hw/hfi1/chip.c
drivers/infiniband/hw/hfi1/chip.h
drivers/infiniband/hw/hfi1/driver.c
drivers/infiniband/hw/hfi1/exp_rcv.c
drivers/infiniband/hw/hfi1/firmware.c
drivers/infiniband/hw/hfi1/hfi.h
drivers/infiniband/hw/hfi1/init.c
drivers/infiniband/hw/hfi1/iowait.h
drivers/infiniband/hw/hfi1/ipoib.h
drivers/infiniband/hw/hfi1/ipoib_main.c
drivers/infiniband/hw/hfi1/ipoib_tx.c
drivers/infiniband/hw/hfi1/mad.c
drivers/infiniband/hw/hfi1/mad.h
drivers/infiniband/hw/hfi1/mmu_rb.c
drivers/infiniband/hw/hfi1/msix.c
drivers/infiniband/hw/hfi1/netdev.h
drivers/infiniband/hw/hfi1/netdev_rx.c
drivers/infiniband/hw/hfi1/sdma.c
drivers/infiniband/hw/hfi1/sdma.h
drivers/infiniband/hw/hfi1/sysfs.c
drivers/infiniband/hw/hfi1/trace_tx.h
drivers/infiniband/hw/hfi1/user_sdma.c
drivers/infiniband/hw/hfi1/user_sdma.h
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hfi1/verbs.h
drivers/infiniband/hw/hfi1/verbs_txreq.h
drivers/infiniband/hw/hfi1/vnic.h
drivers/infiniband/hw/hfi1/vnic_main.c
drivers/infiniband/hw/hns/hns_roce_alloc.c
drivers/infiniband/hw/hns/hns_roce_cmd.c
drivers/infiniband/hw/hns/hns_roce_common.h
drivers/infiniband/hw/hns/hns_roce_cq.c
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_hw_v1.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.h
drivers/infiniband/hw/hns/hns_roce_main.c
drivers/infiniband/hw/hns/hns_roce_pd.c
drivers/infiniband/hw/hns/hns_roce_qp.c
drivers/infiniband/hw/hns/hns_roce_srq.c
drivers/infiniband/hw/i40iw/i40iw.h
drivers/infiniband/hw/i40iw/i40iw_cm.c
drivers/infiniband/hw/i40iw/i40iw_hmc.c
drivers/infiniband/hw/i40iw/i40iw_main.c
drivers/infiniband/hw/i40iw/i40iw_osdep.h
drivers/infiniband/hw/i40iw/i40iw_pble.c
drivers/infiniband/hw/i40iw/i40iw_puda.c
drivers/infiniband/hw/i40iw/i40iw_utils.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
drivers/infiniband/hw/mlx4/alias_GUID.c
drivers/infiniband/hw/mlx4/mad.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx5/Makefile
drivers/infiniband/hw/mlx5/cmd.c
drivers/infiniband/hw/mlx5/cmd.h
drivers/infiniband/hw/mlx5/cong.c
drivers/infiniband/hw/mlx5/counters.c
drivers/infiniband/hw/mlx5/counters.h
drivers/infiniband/hw/mlx5/devx.c
drivers/infiniband/hw/mlx5/dm.c [new file with mode: 0644]
drivers/infiniband/hw/mlx5/dm.h [new file with mode: 0644]
drivers/infiniband/hw/mlx5/fs.c
drivers/infiniband/hw/mlx5/ib_rep.c
drivers/infiniband/hw/mlx5/ib_rep.h
drivers/infiniband/hw/mlx5/ib_virt.c
drivers/infiniband/hw/mlx5/mad.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/std_types.c
drivers/infiniband/hw/mthca/mthca_av.c
drivers/infiniband/hw/mthca/mthca_dev.h
drivers/infiniband/hw/mthca/mthca_mad.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/mthca/mthca_qp.c
drivers/infiniband/hw/ocrdma/ocrdma_ah.c
drivers/infiniband/hw/ocrdma/ocrdma_ah.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
drivers/infiniband/hw/qedr/main.c
drivers/infiniband/hw/qedr/qedr_iw_cm.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qedr/verbs.h
drivers/infiniband/hw/qib/qib.h
drivers/infiniband/hw/qib/qib_common.h
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/infiniband/hw/qib/qib_fs.c
drivers/infiniband/hw/qib/qib_iba6120.c
drivers/infiniband/hw/qib/qib_iba7220.c
drivers/infiniband/hw/qib/qib_iba7322.c
drivers/infiniband/hw/qib/qib_init.c
drivers/infiniband/hw/qib/qib_mad.c
drivers/infiniband/hw/qib/qib_qp.c
drivers/infiniband/hw/qib/qib_sd7220.c
drivers/infiniband/hw/qib/qib_sysfs.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/hw/usnic/usnic_ib_main.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.h
drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
drivers/infiniband/sw/rdmavt/mad.c
drivers/infiniband/sw/rdmavt/mad.h
drivers/infiniband/sw/rdmavt/vt.c
drivers/infiniband/sw/rdmavt/vt.h
drivers/infiniband/sw/rxe/rxe_av.c
drivers/infiniband/sw/rxe/rxe_comp.c
drivers/infiniband/sw/rxe/rxe_hw_counters.c
drivers/infiniband/sw/rxe/rxe_hw_counters.h
drivers/infiniband/sw/rxe/rxe_loc.h
drivers/infiniband/sw/rxe/rxe_mr.c
drivers/infiniband/sw/rxe/rxe_pool.c
drivers/infiniband/sw/rxe/rxe_req.c
drivers/infiniband/sw/rxe/rxe_resp.c
drivers/infiniband/sw/rxe/rxe_verbs.c
drivers/infiniband/sw/rxe/rxe_verbs.h
drivers/infiniband/sw/siw/iwarp.h
drivers/infiniband/sw/siw/siw_cm.c
drivers/infiniband/sw/siw/siw_mem.c
drivers/infiniband/sw/siw/siw_mem.h
drivers/infiniband/sw/siw/siw_verbs.c
drivers/infiniband/sw/siw/siw_verbs.h
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c
drivers/infiniband/ulp/rtrs/rtrs-clt.c
drivers/infiniband/ulp/rtrs/rtrs-clt.h
drivers/infiniband/ulp/rtrs/rtrs-pri.h
drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
drivers/infiniband/ulp/rtrs/rtrs-srv.c
drivers/infiniband/ulp/rtrs/rtrs.c
drivers/infiniband/ulp/rtrs/rtrs.h
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/input/Makefile
drivers/input/joystick/xpad.c
drivers/input/keyboard/gpio_keys.c
drivers/input/keyboard/imx_keypad.c
drivers/input/keyboard/tca6416-keypad.c
drivers/input/keyboard/tegra-kbc.c
drivers/input/misc/Kconfig
drivers/input/misc/Makefile
drivers/input/misc/ims-pcu.c
drivers/input/misc/iqs626a.c [new file with mode: 0644]
drivers/input/misc/max8997_haptic.c
drivers/input/mouse/elan_i2c.h
drivers/input/mouse/elan_i2c_core.c
drivers/input/serio/apbps2.c
drivers/input/touchscreen.c [new file with mode: 0644]
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/Makefile
drivers/input/touchscreen/ar1021_i2c.c
drivers/input/touchscreen/atmel_mxt_ts.c
drivers/input/touchscreen/bu21029_ts.c
drivers/input/touchscreen/cyttsp_core.c
drivers/input/touchscreen/cyttsp_core.h
drivers/input/touchscreen/elants_i2c.c
drivers/input/touchscreen/exc3000.c
drivers/input/touchscreen/hycon-hy46xx.c [new file with mode: 0644]
drivers/input/touchscreen/ili210x.c
drivers/input/touchscreen/ilitek_ts_i2c.c [new file with mode: 0644]
drivers/input/touchscreen/iqs5xx.c
drivers/input/touchscreen/lpc32xx_ts.c
drivers/input/touchscreen/melfas_mip4.c
drivers/input/touchscreen/mms114.c
drivers/input/touchscreen/msg2638.c [new file with mode: 0644]
drivers/input/touchscreen/of_touchscreen.c [deleted file]
drivers/input/touchscreen/silead.c
drivers/input/touchscreen/stmfts.c
drivers/input/touchscreen/tsc2007.h
drivers/input/touchscreen/tsc2007_core.c
drivers/input/touchscreen/wacom_i2c.c
drivers/input/touchscreen/wm831x-ts.c
drivers/input/touchscreen/zinitix.c
drivers/iommu/Kconfig
drivers/iommu/Makefile
drivers/iommu/amd/amd_iommu.h
drivers/iommu/amd/amd_iommu_types.h
drivers/iommu/amd/init.c
drivers/iommu/amd/iommu.c
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
drivers/iommu/arm/arm-smmu/arm-smmu.c
drivers/iommu/arm/arm-smmu/arm-smmu.h
drivers/iommu/arm/arm-smmu/qcom_iommu.c
drivers/iommu/dma-iommu.c
drivers/iommu/exynos-iommu.c
drivers/iommu/fsl_pamu.c
drivers/iommu/fsl_pamu.h
drivers/iommu/fsl_pamu_domain.c
drivers/iommu/fsl_pamu_domain.h
drivers/iommu/intel/dmar.c
drivers/iommu/intel/iommu.c
drivers/iommu/intel/irq_remapping.c
drivers/iommu/intel/pasid.c
drivers/iommu/intel/pasid.h
drivers/iommu/intel/svm.c
drivers/iommu/io-pgfault.c [new file with mode: 0644]
drivers/iommu/iommu-sva-lib.h
drivers/iommu/iommu.c
drivers/iommu/iova.c
drivers/iommu/ipmmu-vmsa.c
drivers/iommu/msm_iommu.c
drivers/iommu/mtk_iommu.c
drivers/iommu/mtk_iommu_v1.c
drivers/iommu/of_iommu.c
drivers/iommu/omap-iommu.c
drivers/iommu/rockchip-iommu.c
drivers/iommu/s390-iommu.c
drivers/iommu/sprd-iommu.c [new file with mode: 0644]
drivers/iommu/sun50i-iommu.c
drivers/iommu/tegra-gart.c
drivers/iommu/tegra-smmu.c
drivers/iommu/virtio-iommu.c
drivers/irqchip/irq-gic-v3-its.c
drivers/isdn/capi/kcapi_proc.c
drivers/leds/Kconfig
drivers/leds/Makefile
drivers/leds/blink/Kconfig
drivers/leds/blink/Makefile
drivers/leds/blink/leds-lgm-sso.c
drivers/leds/flash/Kconfig
drivers/leds/flash/Makefile
drivers/leds/flash/leds-rt4505.c [new file with mode: 0644]
drivers/leds/leds-lm3642.c
drivers/leds/leds-pca9532.c
drivers/leds/trigger/ledtrig-pattern.c
drivers/md/bcache/super.c
drivers/md/dm-cache-target.c
drivers/md/dm-clone-metadata.c
drivers/md/dm-ebs-target.c
drivers/md/dm-integrity.c
drivers/md/dm-ioctl.c
drivers/md/dm-raid.c
drivers/md/dm-rq.c
drivers/md/dm-snap-persistent.c
drivers/md/dm-snap.c
drivers/md/dm-table.c
drivers/md/dm-thin.c
drivers/md/dm-verity-target.c
drivers/md/dm-writecache.c
drivers/md/dm.c
drivers/md/persistent-data/dm-btree-internal.h
drivers/md/persistent-data/dm-btree-spine.c
drivers/md/persistent-data/dm-space-map-common.c
drivers/md/persistent-data/dm-space-map-common.h
drivers/md/persistent-data/dm-space-map-disk.c
drivers/media/usb/pwc/pwc-uncompress.c
drivers/media/usb/uvc/uvc_video.c
drivers/media/usb/uvc/uvcvideo.h
drivers/misc/uacce/uacce.c
drivers/mtd/ubi/build.c
drivers/mtd/ubi/ubi.h
drivers/net/can/m_can/m_can.c
drivers/net/can/spi/mcp251x.c
drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
drivers/net/dsa/microchip/ksz8795_spi.c
drivers/net/dsa/microchip/ksz8863_smi.c
drivers/net/ethernet/adaptec/starfire.c
drivers/net/ethernet/amd/atarilance.c
drivers/net/ethernet/amd/pcnet32.c
drivers/net/ethernet/atheros/alx/main.c
drivers/net/ethernet/atheros/atl1c/atl1c_main.c
drivers/net/ethernet/broadcom/bnx2.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/bnxt/bnxt.h
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
drivers/net/ethernet/broadcom/tg3.c
drivers/net/ethernet/brocade/bna/bnad.c
drivers/net/ethernet/cadence/macb_main.c
drivers/net/ethernet/chelsio/cxgb4/sge.c
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
drivers/net/ethernet/cisco/enic/enic_main.c
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40e/i40e_client.c
drivers/net/ethernet/intel/i40e/i40e_common.c
drivers/net/ethernet/intel/i40e/i40e_ethtool.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_type.h
drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c
drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ethernet/sfc/efx.c
drivers/net/ethernet/sfc/falcon/efx.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
drivers/net/ethernet/stmicro/stmmac/hwif.h
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ipa/gsi.c
drivers/net/ipa/gsi_reg.h
drivers/net/phy/marvell.c
drivers/net/virtio_net.c
drivers/net/wan/hdlc_fr.c
drivers/net/wireless/intel/iwlwifi/mvm/tt.c
drivers/net/wireless/intersil/hostap/hostap_proc.c
drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
drivers/net/wireless/intersil/orinoco/orinoco_pci.c
drivers/net/wireless/intersil/orinoco/orinoco_plx.c
drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
drivers/nvdimm/btt.c
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/ioctl.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/loop.c
drivers/of/overlay.c
drivers/parport/parport_ip32.c
drivers/pci/ats.c
drivers/pci/controller/Kconfig
drivers/pci/controller/Makefile
drivers/pci/controller/cadence/pci-j721e.c
drivers/pci/controller/dwc/Kconfig
drivers/pci/controller/dwc/Makefile
drivers/pci/controller/dwc/pci-keystone.c
drivers/pci/controller/dwc/pci-layerscape-ep.c
drivers/pci/controller/dwc/pcie-designware-ep.c
drivers/pci/controller/dwc/pcie-designware-host.c
drivers/pci/controller/dwc/pcie-designware.c
drivers/pci/controller/dwc/pcie-designware.h
drivers/pci/controller/dwc/pcie-fu740.c [new file with mode: 0644]
drivers/pci/controller/dwc/pcie-intel-gw.c
drivers/pci/controller/dwc/pcie-tegra194.c
drivers/pci/controller/mobiveil/Kconfig
drivers/pci/controller/pci-host-common.c
drivers/pci/controller/pci-hyperv.c
drivers/pci/controller/pci-tegra.c
drivers/pci/controller/pci-thunder-ecam.c
drivers/pci/controller/pci-thunder-pem.c
drivers/pci/controller/pci-xgene.c
drivers/pci/controller/pcie-altera-msi.c
drivers/pci/controller/pcie-brcmstb.c
drivers/pci/controller/pcie-iproc-msi.c
drivers/pci/controller/pcie-mediatek-gen3.c [new file with mode: 0644]
drivers/pci/controller/pcie-mediatek.c
drivers/pci/controller/pcie-microchip-host.c
drivers/pci/controller/pcie-rcar-host.c
drivers/pci/controller/pcie-xilinx-nwl.c
drivers/pci/controller/pcie-xilinx.c
drivers/pci/controller/vmd.c
drivers/pci/endpoint/functions/pci-epf-ntb.c
drivers/pci/endpoint/functions/pci-epf-test.c
drivers/pci/endpoint/pci-epc-core.c
drivers/pci/endpoint/pci-epf-core.c
drivers/pci/hotplug/acpi_pcihp.c
drivers/pci/hotplug/acpiphp.h
drivers/pci/hotplug/acpiphp_glue.c
drivers/pci/hotplug/cpqphp_nvram.c
drivers/pci/hotplug/s390_pci_hpc.c
drivers/pci/hotplug/shpchp_hpc.c
drivers/pci/msi.c
drivers/pci/of.c
drivers/pci/pci-acpi.c
drivers/pci/pci-label.c
drivers/pci/pci-sysfs.c
drivers/pci/pci.c
drivers/pci/pci.h
drivers/pci/pcie/aer.c
drivers/pci/pcie/pme.c
drivers/pci/pcie/rcec.c
drivers/pci/probe.c
drivers/pci/quirks.c
drivers/pci/remove.c
drivers/pci/vpd.c
drivers/pci/xen-pcifront.c
drivers/pcmcia/cistpl.c
drivers/pcmcia/ds.c
drivers/pcmcia/pcmcia_cis.c
drivers/pcmcia/pcmcia_resource.c
drivers/pcmcia/rsrc_nonstatic.c
drivers/perf/arm_pmu.c
drivers/platform/chrome/cros_ec_lpc_mec.c
drivers/platform/chrome/cros_ec_typec.c
drivers/platform/chrome/cros_usbpd_notify.c
drivers/platform/chrome/wilco_ec/telemetry.c
drivers/platform/x86/dell/dell_rbu.c
drivers/ptp/Kconfig
drivers/ptp/Makefile
drivers/ptp/ptp_kvm.c [deleted file]
drivers/ptp/ptp_kvm_arm.c [new file with mode: 0644]
drivers/ptp/ptp_kvm_common.c [new file with mode: 0644]
drivers/ptp/ptp_kvm_x86.c [new file with mode: 0644]
drivers/pwm/Kconfig
drivers/pwm/Makefile
drivers/pwm/core.c
drivers/pwm/pwm-ab8500.c
drivers/pwm/pwm-atmel-hlcdc.c
drivers/pwm/pwm-atmel-tcb.c
drivers/pwm/pwm-atmel.c
drivers/pwm/pwm-bcm-iproc.c
drivers/pwm/pwm-bcm-kona.c
drivers/pwm/pwm-bcm2835.c
drivers/pwm/pwm-berlin.c
drivers/pwm/pwm-brcmstb.c
drivers/pwm/pwm-clps711x.c
drivers/pwm/pwm-crc.c
drivers/pwm/pwm-cros-ec.c
drivers/pwm/pwm-dwc.c
drivers/pwm/pwm-ep93xx.c
drivers/pwm/pwm-fsl-ftm.c
drivers/pwm/pwm-hibvt.c
drivers/pwm/pwm-img.c
drivers/pwm/pwm-imx-tpm.c
drivers/pwm/pwm-imx1.c
drivers/pwm/pwm-imx27.c
drivers/pwm/pwm-intel-lgm.c
drivers/pwm/pwm-iqs620a.c
drivers/pwm/pwm-jz4740.c
drivers/pwm/pwm-keembay.c
drivers/pwm/pwm-lp3943.c
drivers/pwm/pwm-lpc18xx-sct.c
drivers/pwm/pwm-lpc32xx.c
drivers/pwm/pwm-lpss.c
drivers/pwm/pwm-mediatek.c
drivers/pwm/pwm-meson.c
drivers/pwm/pwm-mtk-disp.c
drivers/pwm/pwm-mxs.c
drivers/pwm/pwm-omap-dmtimer.c
drivers/pwm/pwm-pca9685.c
drivers/pwm/pwm-pxa.c
drivers/pwm/pwm-rcar.c
drivers/pwm/pwm-renesas-tpu.c
drivers/pwm/pwm-rockchip.c
drivers/pwm/pwm-samsung.c
drivers/pwm/pwm-sifive.c
drivers/pwm/pwm-sl28cpld.c
drivers/pwm/pwm-spear.c
drivers/pwm/pwm-sprd.c
drivers/pwm/pwm-sti.c
drivers/pwm/pwm-stm32-lp.c
drivers/pwm/pwm-stm32.c
drivers/pwm/pwm-stmpe.c
drivers/pwm/pwm-sun4i.c
drivers/pwm/pwm-tegra.c
drivers/pwm/pwm-tiecap.c
drivers/pwm/pwm-tiehrpwm.c
drivers/pwm/pwm-twl-led.c
drivers/pwm/pwm-twl.c
drivers/pwm/pwm-visconti.c [new file with mode: 0644]
drivers/pwm/pwm-vt8500.c
drivers/remoteproc/Kconfig
drivers/remoteproc/imx_rproc.c
drivers/remoteproc/ingenic_rproc.c
drivers/remoteproc/keystone_remoteproc.c
drivers/remoteproc/mtk_scp.c
drivers/remoteproc/omap_remoteproc.c
drivers/remoteproc/pru_rproc.c
drivers/remoteproc/qcom_q6v5_adsp.c
drivers/remoteproc/qcom_q6v5_mss.c
drivers/remoteproc/qcom_q6v5_pas.c
drivers/remoteproc/qcom_q6v5_wcss.c
drivers/remoteproc/qcom_wcnss.c
drivers/remoteproc/remoteproc_cdev.c
drivers/remoteproc/remoteproc_core.c
drivers/remoteproc/remoteproc_coredump.c
drivers/remoteproc/remoteproc_debugfs.c
drivers/remoteproc/remoteproc_elf_loader.c
drivers/remoteproc/remoteproc_internal.h
drivers/remoteproc/remoteproc_sysfs.c
drivers/remoteproc/st_slim_rproc.c
drivers/remoteproc/stm32_rproc.c
drivers/remoteproc/ti_k3_dsp_remoteproc.c
drivers/remoteproc/ti_k3_r5_remoteproc.c
drivers/remoteproc/wkup_m3_rproc.c
drivers/reset/Kconfig
drivers/rpmsg/qcom_glink_native.c
drivers/rpmsg/qcom_smd.c
drivers/rpmsg/rpmsg_char.c
drivers/rpmsg/virtio_rpmsg_bus.c
drivers/rtc/Kconfig
drivers/rtc/interface.c
drivers/rtc/rtc-ab-eoz9.c
drivers/rtc/rtc-ds1307.c
drivers/rtc/rtc-ds1511.c
drivers/rtc/rtc-fsl-ftm-alarm.c
drivers/rtc/rtc-imx-sc.c
drivers/rtc/rtc-imxdi.c
drivers/rtc/rtc-m48t59.c
drivers/rtc/rtc-mxc.c
drivers/rtc/rtc-omap.c
drivers/rtc/rtc-pcf85063.c
drivers/rtc/rtc-pcf8523.c
drivers/rtc/rtc-pm8xxx.c
drivers/rtc/rtc-rv3028.c
drivers/rtc/rtc-rx6110.c
drivers/rtc/rtc-s5m.c
drivers/rtc/rtc-spear.c
drivers/rtc/rtc-tps65910.c
drivers/rtc/sysfs.c
drivers/s390/block/dasd_eckd.h
drivers/s390/cio/device.c
drivers/scsi/53c700.c
drivers/scsi/53c700.h
drivers/scsi/ch.c
drivers/scsi/cxlflash/main.c
drivers/scsi/esas2r/esas2r_main.c
drivers/scsi/ips.c
drivers/scsi/ips.h
drivers/scsi/lasi700.c
drivers/scsi/megaraid/mbox_defs.h
drivers/scsi/megaraid/mega_common.h
drivers/scsi/megaraid/megaraid_mbox.c
drivers/scsi/megaraid/megaraid_mbox.h
drivers/scsi/qla1280.c
drivers/scsi/scsicam.c
drivers/scsi/sni_53c710.c
drivers/soc/fsl/qbman/qman_portal.c
drivers/thermal/amlogic_thermal.c
drivers/thermal/broadcom/bcm2835_thermal.c
drivers/thermal/cpufreq_cooling.c
drivers/thermal/cpuidle_cooling.c
drivers/thermal/devfreq_cooling.c
drivers/thermal/gov_fair_share.c
drivers/thermal/gov_power_allocator.c
drivers/thermal/hisi_thermal.c
drivers/thermal/intel/Kconfig
drivers/thermal/intel/Makefile
drivers/thermal/intel/intel_tcc_cooling.c [new file with mode: 0644]
drivers/thermal/mtk_thermal.c
drivers/thermal/qcom/qcom-spmi-temp-alarm.c
drivers/thermal/qcom/tsens-8960.c
drivers/thermal/qcom/tsens-v0_1.c
drivers/thermal/qcom/tsens-v1.c
drivers/thermal/qcom/tsens.c
drivers/thermal/qcom/tsens.h
drivers/thermal/rcar_gen3_thermal.c
drivers/thermal/sun8i_thermal.c
drivers/thermal/tegra/soctherm.c
drivers/thermal/thermal_core.c
drivers/thermal/thermal_core.h
drivers/thermal/thermal_helpers.c
drivers/thermal/thermal_mmio.c
drivers/thermal/thermal_of.c
drivers/thermal/ti-soc-thermal/ti-bandgap.c
drivers/vdpa/Kconfig
drivers/vdpa/Makefile
drivers/vdpa/ifcvf/ifcvf_base.c
drivers/vdpa/ifcvf/ifcvf_base.h
drivers/vdpa/ifcvf/ifcvf_main.c
drivers/vdpa/mlx5/net/mlx5_vnet.c
drivers/vdpa/vdpa.c
drivers/vdpa/vdpa_sim/Makefile
drivers/vdpa/vdpa_sim/vdpa_sim.c
drivers/vdpa/vdpa_sim/vdpa_sim.h
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c [new file with mode: 0644]
drivers/vdpa/virtio_pci/Makefile [new file with mode: 0644]
drivers/vdpa/virtio_pci/vp_vdpa.c [new file with mode: 0644]
drivers/vfio/vfio_iommu_type1.c
drivers/vhost/vdpa.c
drivers/vhost/vringh.c
drivers/video/fbdev/matrox/matroxfb_base.c
drivers/video/fbdev/vga16fb.c
drivers/virt/nitro_enclaves/ne_misc_dev.c
drivers/virtio/virtio_balloon.c
drivers/virtio/virtio_pci_modern.c
drivers/virtio/virtio_pci_modern_dev.c
drivers/xen/swiotlb-xen.c
fs/9p/v9fs.c
fs/9p/vfs_file.c
fs/Kconfig
fs/Kconfig.binfmt
fs/afs/dir.c
fs/afs/dir_silly.c
fs/afs/fs_operation.c
fs/afs/inode.c
fs/afs/internal.h
fs/afs/write.c
fs/autofs/autofs_i.h
fs/autofs/expire.c
fs/autofs/waitq.c
fs/binfmt_flat.c
fs/block_dev.c
fs/btrfs/compression.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/reflink.c
fs/btrfs/zlib.c
fs/btrfs/zstd.c
fs/buffer.c
fs/ceph/Kconfig
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/cache.h
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/io.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/metric.c
fs/ceph/metric.h
fs/ceph/snap.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/cifs/cifs_fs_sb.h
fs/cifs/cifs_ioctl.h
fs/cifs/cifsfs.c
fs/cifs/cifsglob.h
fs/cifs/cifsproto.h
fs/cifs/connect.c
fs/cifs/dir.c
fs/cifs/file.c
fs/cifs/fs_context.c
fs/cifs/inode.c
fs/cifs/ioctl.c
fs/cifs/link.c
fs/cifs/misc.c
fs/cifs/xattr.c
fs/configfs/configfs_internal.h
fs/configfs/dir.c
fs/configfs/file.c
fs/configfs/inode.c
fs/configfs/item.c
fs/configfs/mount.c
fs/configfs/symlink.c
fs/d_path.c
fs/dax.c
fs/dcache.c
fs/ecryptfs/crypto.c
fs/ecryptfs/debug.c
fs/ecryptfs/dentry.c
fs/ecryptfs/ecryptfs_kernel.h
fs/ecryptfs/file.c
fs/ecryptfs/inode.c
fs/ecryptfs/keystore.c
fs/ecryptfs/kthread.c
fs/ecryptfs/main.c
fs/ecryptfs/messaging.c
fs/ecryptfs/miscdev.c
fs/ecryptfs/mmap.c
fs/ecryptfs/read_write.c
fs/ecryptfs/super.c
fs/eventpoll.c
fs/ext2/namei.c
fs/ext4/namei.c
fs/f2fs/Kconfig
fs/f2fs/acl.c
fs/f2fs/checkpoint.c
fs/f2fs/compress.c
fs/f2fs/compress.h [deleted file]
fs/f2fs/data.c
fs/f2fs/debug.c
fs/f2fs/dir.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/gc.h
fs/f2fs/inline.c
fs/f2fs/inode.c
fs/f2fs/namei.c
fs/f2fs/node.c
fs/f2fs/node.h
fs/f2fs/recovery.c
fs/f2fs/segment.c
fs/f2fs/segment.h
fs/f2fs/super.c
fs/f2fs/sysfs.c
fs/f2fs/verity.c
fs/f2fs/xattr.c
fs/fat/fatent.c
fs/file.c
fs/fuse/inode.c
fs/gfs2/glock.c
fs/hostfs/hostfs_kern.c
fs/hpfs/hpfs.h
fs/hugetlbfs/inode.c
fs/inode.c
fs/io_uring.c
fs/iomap/buffered-io.c
fs/isofs/rock.c
fs/jffs2/file.c
fs/jffs2/scan.c
fs/jffs2/summary.h
fs/locks.c
fs/nfs/callback_proc.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/export.c
fs/nfs/file.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/fs_context.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/io.c
fs/nfs/mount_clnt.c
fs/nfs/nfs3acl.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs42proc.c
fs/nfs/nfs42xattr.c
fs/nfs/nfs4file.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4renewd.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/nfstrace.c
fs/nfs/nfstrace.h
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/proc.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfsd/Kconfig
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfsctl.c
fs/nfsd/nfssvc.c
fs/nfsd/state.h
fs/nfsd/xdr4.h
fs/nilfs2/cpfile.c
fs/nilfs2/ioctl.c
fs/nilfs2/namei.c
fs/nilfs2/segment.c
fs/nilfs2/the_nilfs.c
fs/ocfs2/acl.c
fs/ocfs2/acl.h
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/blockcheck.c
fs/ocfs2/blockcheck.h
fs/ocfs2/buffer_head_io.c
fs/ocfs2/buffer_head_io.h
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/heartbeat.h
fs/ocfs2/cluster/masklog.c
fs/ocfs2/cluster/masklog.h
fs/ocfs2/cluster/netdebug.c
fs/ocfs2/cluster/nodemanager.c
fs/ocfs2/cluster/nodemanager.h
fs/ocfs2/cluster/ocfs2_heartbeat.h
fs/ocfs2/cluster/ocfs2_nodemanager.h
fs/ocfs2/cluster/quorum.c
fs/ocfs2/cluster/quorum.h
fs/ocfs2/cluster/sys.c
fs/ocfs2/cluster/sys.h
fs/ocfs2/cluster/tcp.c
fs/ocfs2/cluster/tcp.h
fs/ocfs2/cluster/tcp_internal.h
fs/ocfs2/dcache.c
fs/ocfs2/dcache.h
fs/ocfs2/dir.c
fs/ocfs2/dir.h
fs/ocfs2/dlm/dlmapi.h
fs/ocfs2/dlm/dlmast.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmconvert.c
fs/ocfs2/dlm/dlmconvert.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdebug.h
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmdomain.h
fs/ocfs2/dlm/dlmlock.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlm/dlmthread.c
fs/ocfs2/dlm/dlmunlock.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/dlmfs/userdlm.c
fs/ocfs2/dlmfs/userdlm.h
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/export.c
fs/ocfs2/export.h
fs/ocfs2/extent_map.c
fs/ocfs2/extent_map.h
fs/ocfs2/file.c
fs/ocfs2/file.h
fs/ocfs2/filecheck.c
fs/ocfs2/filecheck.h
fs/ocfs2/heartbeat.c
fs/ocfs2/heartbeat.h
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/localalloc.c
fs/ocfs2/localalloc.h
fs/ocfs2/locks.c
fs/ocfs2/locks.h
fs/ocfs2/mmap.c
fs/ocfs2/move_extents.c
fs/ocfs2/move_extents.h
fs/ocfs2/namei.c
fs/ocfs2/namei.h
fs/ocfs2/ocfs1_fs_compat.h
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_ioctl.h
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/ocfs2_lockingver.h
fs/ocfs2/refcounttree.c
fs/ocfs2/refcounttree.h
fs/ocfs2/reservations.c
fs/ocfs2/reservations.h
fs/ocfs2/resize.c
fs/ocfs2/resize.h
fs/ocfs2/slot_map.c
fs/ocfs2/slot_map.h
fs/ocfs2/stack_o2cb.c
fs/ocfs2/stack_user.c
fs/ocfs2/stackglue.c
fs/ocfs2/stackglue.h
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/ocfs2/super.h
fs/ocfs2/symlink.c
fs/ocfs2/symlink.h
fs/ocfs2/sysfile.c
fs/ocfs2/sysfile.h
fs/ocfs2/uptodate.c
fs/ocfs2/uptodate.h
fs/ocfs2/xattr.c
fs/ocfs2/xattr.h
fs/orangefs/file.c
fs/orangefs/inode.c
fs/orangefs/orangefs-mod.c
fs/proc/generic.c
fs/proc/inode.c
fs/proc/proc_sysctl.c
fs/proc/task_mmu.c
fs/reiserfs/procfs.c
fs/super.c
fs/tracefs/inode.c
fs/ubifs/replay.c
fs/ubifs/sb.c
fs/ubifs/super.c
fs/udf/namei.c
fs/ufs/super.c
fs/userfaultfd.c
fs/xfs/libxfs/xfs_ag_resv.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc_btree.c
fs/xfs/libxfs/xfs_log_format.h
fs/xfs/libxfs/xfs_rmap_btree.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/scrub/agheader.c
fs/xfs/scrub/fscounters.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inode_item_recover.c
fs/xfs/xfs_log.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_reflink.c
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
include/asm-generic/bitops/find.h
include/asm-generic/bitops/le.h
include/asm-generic/bitsperlong.h
include/asm-generic/io.h
include/dt-bindings/clock/sifive-fu740-prci.h
include/dt-bindings/input/atmel-maxtouch.h [new file with mode: 0644]
include/keys/system_keyring.h
include/kvm/arm_pmu.h
include/kvm/arm_vgic.h
include/linux/align.h [new file with mode: 0644]
include/linux/amd-iommu.h
include/linux/arm-smccc.h
include/linux/async.h
include/linux/bio.h
include/linux/bitmap.h
include/linux/bitops.h
include/linux/blkdev.h
include/linux/bpf_verifier.h
include/linux/buffer_head.h
include/linux/bug.h
include/linux/clocksource.h
include/linux/clocksource_ids.h [new file with mode: 0644]
include/linux/cma.h
include/linux/compaction.h
include/linux/compat.h
include/linux/configfs.h
include/linux/coresight.h
include/linux/cpuhotplug.h
include/linux/crc8.h
include/linux/cred.h
include/linux/dcache.h
include/linux/delayacct.h
include/linux/device-mapper.h
include/linux/dma-iommu.h
include/linux/dma-map-ops.h
include/linux/dma-mapping.h
include/linux/f2fs_fs.h
include/linux/file.h
include/linux/fs.h
include/linux/ftrace.h
include/linux/genl_magic_func.h
include/linux/genl_magic_struct.h
include/linux/gfp.h
include/linux/gpio/driver.h
include/linux/highmem.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/init_task.h
include/linux/initrd.h
include/linux/intel-iommu.h
include/linux/intel-svm.h
include/linux/io-pgtable.h
include/linux/iomap.h
include/linux/iommu.h
include/linux/iova.h
include/linux/irqdomain.h
include/linux/kernel.h
include/linux/kvm_host.h
include/linux/lsm_hook_defs.h
include/linux/lsm_hooks.h
include/linux/memcontrol.h
include/linux/memory.h
include/linux/memory_hotplug.h
include/linux/memremap.h
include/linux/migrate.h
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/msi.h
include/linux/netfilter_arp/arp_tables.h
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/pagemap.h
include/linux/pci-ecam.h
include/linux/pci.h
include/linux/perf_event.h
include/linux/pgtable.h
include/linux/platform_data/cros_ec_commands.h
include/linux/proc_fs.h
include/linux/profile.h
include/linux/psp-sev.h
include/linux/ptp_kvm.h [new file with mode: 0644]
include/linux/pwm.h
include/linux/remoteproc.h
include/linux/reset.h
include/linux/ring_buffer.h
include/linux/rpmsg.h
include/linux/sched.h
include/linux/sched/mm.h
include/linux/security.h
include/linux/seq_buf.h
include/linux/shrinker.h
include/linux/smp.h
include/linux/sunrpc/xprt.h
include/linux/swap.h
include/linux/swiotlb.h
include/linux/syscalls.h
include/linux/thermal.h
include/linux/timekeeping.h
include/linux/trace_events.h
include/linux/tracepoint.h
include/linux/userfaultfd_k.h
include/linux/vdpa.h
include/linux/virtio_pci_modern.h
include/linux/vm_event_item.h
include/linux/vmalloc.h
include/linux/vringh.h
include/net/sctp/command.h
include/rdma/ib_cache.h
include/rdma/ib_mad.h
include/rdma/ib_sa.h
include/rdma/ib_verbs.h
include/rdma/iw_cm.h
include/rdma/rdma_cm.h
include/rdma/rdma_counter.h
include/rdma/rdma_vt.h
include/rdma/restrack.h
include/rdma/rw.h
include/rdma/uverbs_ioctl.h
include/rdma/uverbs_named_ioctl.h
include/trace/events/cma.h
include/trace/events/intel_iommu.h
include/trace/events/io_uring.h
include/trace/events/kvm.h
include/trace/events/migrate.h
include/trace/events/mmflags.h
include/trace/events/rcu.h
include/trace/events/rpcrdma.h
include/trace/events/sched.h
include/trace/events/sunrpc.h
include/trace/events/timer.h
include/uapi/asm-generic/unistd.h
include/uapi/linux/dm-ioctl.h
include/uapi/linux/if_bonding.h
include/uapi/linux/iommu.h
include/uapi/linux/kexec.h
include/uapi/linux/kvm.h
include/uapi/linux/landlock.h [new file with mode: 0644]
include/uapi/linux/mempolicy.h
include/uapi/linux/netfilter/xt_SECMARK.h
include/uapi/linux/nfs4.h
include/uapi/linux/perf_event.h
include/uapi/linux/rpmsg.h
include/uapi/linux/seg6_local.h
include/uapi/linux/thermal.h
include/uapi/linux/userfaultfd.h
include/uapi/linux/vfio.h
include/uapi/rdma/hns-abi.h
include/uapi/rdma/mlx5_user_ioctl_cmds.h
include/uapi/rdma/mlx5_user_ioctl_verbs.h
include/uapi/rdma/rdma_netlink.h
include/xen/interface/elfnote.h
include/xen/interface/hvm/hvm_vcpu.h
include/xen/interface/io/xenbus.h
include/xen/swiotlb-xen.h
init/Kconfig
init/initramfs.c
init/main.c
ipc/sem.c
kernel/async.c
kernel/bpf/verifier.c
kernel/configs/android-base.config
kernel/cred.c
kernel/dma/direct.c
kernel/dma/direct.h
kernel/dma/map_benchmark.c
kernel/dma/mapping.c
kernel/dma/swiotlb.c
kernel/events/core.c
kernel/exit.c
kernel/fork.c
kernel/gcov/Kconfig
kernel/gcov/base.c
kernel/gcov/clang.c
kernel/gcov/fs.c
kernel/gcov/gcc_4_7.c
kernel/gcov/gcov.h
kernel/irq/irqdomain.c
kernel/kexec_core.c
kernel/kexec_file.c
kernel/kmod.c
kernel/resource.c
kernel/seccomp.c
kernel/sys.c
kernel/sys_ni.c
kernel/sysctl.c
kernel/time/clocksource.c
kernel/time/timekeeping.c
kernel/trace/fgraph.c
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/synth_event_gen_test.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_clock.c
kernel/trace/trace_entries.h
kernel/trace/trace_event_perf.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_events_hist.c
kernel/trace/trace_events_synth.c
kernel/trace/trace_events_trigger.c
kernel/trace/trace_functions.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_hwlat.c
kernel/trace/trace_kprobe.c
kernel/trace/trace_output.c
kernel/trace/trace_printk.c
kernel/trace/trace_probe.c
kernel/trace/trace_probe.h
kernel/trace/trace_probe_tmpl.h
kernel/trace/trace_selftest.c
kernel/trace/trace_seq.c
kernel/umh.c
kernel/up.c
kernel/user_namespace.c
lib/Kconfig.kfence
lib/bch.c
lib/bitmap.c
lib/bug.c
lib/cmdline.c
lib/crc8.c
lib/decompress_unlzma.c
lib/dynamic_debug.c
lib/find_bit.c
lib/genalloc.c
lib/iov_iter.c
lib/list_sort.c
lib/nlattr.c
lib/parser.c
lib/percpu_counter.c
lib/stackdepot.c
mm/Kconfig
mm/Makefile
mm/balloon_compaction.c
mm/cma.c
mm/cma.h
mm/cma_debug.c
mm/cma_sysfs.c [new file with mode: 0644]
mm/compaction.c
mm/filemap.c
mm/frontswap.c
mm/gup.c
mm/gup_test.c
mm/gup_test.h
mm/highmem.c
mm/huge_memory.c
mm/hugetlb.c
mm/hugetlb_cgroup.c
mm/internal.h
mm/kasan/kasan.h
mm/kasan/quarantine.c
mm/kasan/shadow.c
mm/kfence/core.c
mm/kfence/report.c
mm/khugepaged.c
mm/ksm.c
mm/list_lru.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_owner.c
mm/page_vma_mapped.c
mm/percpu-internal.h
mm/percpu.c
mm/pgalloc-track.h
mm/process_vm_access.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_slots.c
mm/swap_state.c
mm/swapfile.c
mm/truncate.c
mm/userfaultfd.c
mm/util.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c
mm/z3fold.c
mm/zpool.c
mm/zsmalloc.c
mm/zswap.c
net/bridge/br_netlink.c
net/ceph/auth.c
net/ceph/auth_x.c
net/ceph/decode.c
net/ethtool/netlink.c
net/hsr/hsr_forward.c
net/ipv4/netfilter/arp_tables.c
net/ipv4/netfilter/arptable_filter.c
net/ipv4/tcp.c
net/ipv4/tcp_cong.c
net/ipv6/seg6.c
net/ipv6/seg6_local.c
net/mptcp/subflow.c
net/netfilter/nf_conntrack_ftp.c
net/netfilter/nf_conntrack_h323_main.c
net/netfilter/nf_conntrack_irc.c
net/netfilter/nf_conntrack_pptp.c
net/netfilter/nf_conntrack_proto_tcp.c
net/netfilter/nf_conntrack_sane.c
net/netfilter/nf_tables_api.c
net/netfilter/nfnetlink.c
net/netfilter/nfnetlink_osf.c
net/netfilter/nft_set_hash.c
net/netfilter/xt_SECMARK.c
net/nfc/llcp_sock.c
net/openvswitch/actions.c
net/rds/ib_cm.c
net/rds/rdma_transport.c
net/sched/sch_frag.c
net/sctp/sm_make_chunk.c
net/sctp/sm_sideeffect.c
net/sctp/sm_statefuns.c
net/sctp/socket.c
net/smc/af_smc.c
net/sunrpc/clnt.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/svc.c
net/sunrpc/svcsock.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
net/vmw_vsock/vmci_transport.c
net/xdp/xsk_queue.h
samples/Kconfig
samples/Makefile
samples/configfs/configfs_sample.c
samples/kprobes/kprobe_example.c
samples/landlock/.gitignore [new file with mode: 0644]
samples/landlock/Makefile [new file with mode: 0644]
samples/landlock/sandboxer.c [new file with mode: 0644]
samples/vfio-mdev/mbochs.c
samples/vfio-mdev/mdpy.c
scripts/checkpatch.pl
scripts/gdb/linux/cpus.py
scripts/gdb/linux/symbols.py
scripts/kernel-doc
scripts/package/buildtar
scripts/recordmcount.pl
scripts/spelling.txt
scripts/ver_linux
security/Kconfig
security/Makefile
security/apparmor/mount.c
security/integrity/digsig.c
security/integrity/iint.c
security/integrity/ima/ima_main.c
security/integrity/ima/ima_policy.c
security/integrity/ima/ima_template.c
security/landlock/Kconfig [new file with mode: 0644]
security/landlock/Makefile [new file with mode: 0644]
security/landlock/common.h [new file with mode: 0644]
security/landlock/cred.c [new file with mode: 0644]
security/landlock/cred.h [new file with mode: 0644]
security/landlock/fs.c [new file with mode: 0644]
security/landlock/fs.h [new file with mode: 0644]
security/landlock/limits.h [new file with mode: 0644]
security/landlock/object.c [new file with mode: 0644]
security/landlock/object.h [new file with mode: 0644]
security/landlock/ptrace.c [new file with mode: 0644]
security/landlock/ptrace.h [new file with mode: 0644]
security/landlock/ruleset.c [new file with mode: 0644]
security/landlock/ruleset.h [new file with mode: 0644]
security/landlock/setup.c [new file with mode: 0644]
security/landlock/setup.h [new file with mode: 0644]
security/landlock/syscalls.c [new file with mode: 0644]
security/safesetid/lsm.c
security/security.c
security/selinux/hooks.c
security/selinux/include/objsec.h
security/selinux/ss/services.c
security/smack/smack.h
security/smack/smack_lsm.c
sound/pci/hda/hda_generic.c
sound/pci/hda/patch_realtek.c
sound/usb/mixer_maps.c
tools/build/Makefile.feature
tools/build/feature/Makefile
tools/build/feature/test-libtraceevent.c [new file with mode: 0644]
tools/gpio/gpio-utils.c
tools/include/asm-generic/bitops/find.h
tools/include/asm-generic/bitsperlong.h
tools/include/asm-generic/hugetlb_encode.h
tools/include/linux/bitmap.h
tools/include/linux/math64.h [new file with mode: 0644]
tools/include/linux/types.h
tools/include/uapi/linux/perf_event.h
tools/lib/bitmap.c
tools/lib/bpf/ringbuf.c
tools/lib/find_bit.c
tools/lib/perf/Documentation/libperf.txt
tools/lib/perf/evsel.c
tools/lib/perf/include/internal/evsel.h
tools/lib/perf/include/internal/mmap.h
tools/lib/perf/include/internal/tests.h
tools/lib/perf/include/internal/xyarray.h
tools/lib/perf/include/perf/bpf_perf.h [new file with mode: 0644]
tools/lib/perf/include/perf/event.h
tools/lib/perf/include/perf/evsel.h
tools/lib/perf/libperf.map
tools/lib/perf/mmap.c
tools/lib/perf/tests/Makefile
tools/lib/perf/tests/test-evsel.c
tools/perf/.gitignore
tools/perf/Documentation/intel-hybrid.txt [new file with mode: 0644]
tools/perf/Documentation/perf-annotate.txt
tools/perf/Documentation/perf-buildid-cache.txt
tools/perf/Documentation/perf-config.txt
tools/perf/Documentation/perf-data.txt
tools/perf/Documentation/perf-iostat.txt [new file with mode: 0644]
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/perf-top.txt
tools/perf/Documentation/perf.txt
tools/perf/Documentation/topdown.txt
tools/perf/Makefile
tools/perf/Makefile.config
tools/perf/Makefile.perf
tools/perf/arch/arm/util/cs-etm.c
tools/perf/arch/arm64/util/Build
tools/perf/arch/arm64/util/kvm-stat.c
tools/perf/arch/arm64/util/machine.c
tools/perf/arch/arm64/util/perf_regs.c
tools/perf/arch/arm64/util/pmu.c [new file with mode: 0644]
tools/perf/arch/arm64/util/unwind-libunwind.c
tools/perf/arch/mips/Makefile [new file with mode: 0644]
tools/perf/arch/mips/entry/syscalls/mksyscalltbl [new file with mode: 0644]
tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl [new file with mode: 0644]
tools/perf/arch/mips/include/dwarf-regs-table.h [new file with mode: 0644]
tools/perf/arch/mips/include/perf_regs.h [new file with mode: 0644]
tools/perf/arch/mips/util/Build [new file with mode: 0644]
tools/perf/arch/mips/util/dwarf-regs.c [new file with mode: 0644]
tools/perf/arch/mips/util/perf_regs.c [new file with mode: 0644]
tools/perf/arch/mips/util/unwind-libunwind.c [new file with mode: 0644]
tools/perf/arch/powerpc/util/Build
tools/perf/arch/powerpc/util/event.c [new file with mode: 0644]
tools/perf/arch/powerpc/util/evsel.c [new file with mode: 0644]
tools/perf/arch/powerpc/util/kvm-stat.c
tools/perf/arch/powerpc/util/utils_header.h
tools/perf/arch/x86/tests/bp-modify.c
tools/perf/arch/x86/util/Build
tools/perf/arch/x86/util/iostat.c [new file with mode: 0644]
tools/perf/arch/x86/util/perf_regs.c
tools/perf/bench/epoll-wait.c
tools/perf/bench/inject-buildid.c
tools/perf/bench/numa.c
tools/perf/builtin-annotate.c
tools/perf/builtin-daemon.c
tools/perf/builtin-data.c
tools/perf/builtin-diff.c
tools/perf/builtin-lock.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-top.c
tools/perf/check-headers.sh
tools/perf/command-list.txt
tools/perf/examples/bpf/augmented_raw_syscalls.c
tools/perf/jvmti/jvmti_agent.c
tools/perf/perf-iostat.sh [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json [new file with mode: 0644]
tools/perf/pmu-events/arch/arm64/mapfile.csv
tools/perf/pmu-events/arch/powerpc/mapfile.csv
tools/perf/pmu-events/arch/powerpc/power10/cache.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/floating_point.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/frontend.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/locks.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/marked.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/memory.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/others.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/pipeline.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/pmc.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power10/translation.json [new file with mode: 0644]
tools/perf/pmu-events/arch/powerpc/power8/metrics.json
tools/perf/pmu-events/arch/powerpc/power9/metrics.json
tools/perf/pmu-events/arch/x86/amdzen1/cache.json
tools/perf/pmu-events/arch/x86/amdzen1/core.json
tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
tools/perf/pmu-events/arch/x86/amdzen1/memory.json
tools/perf/pmu-events/arch/x86/amdzen1/other.json
tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
tools/perf/pmu-events/arch/x86/amdzen2/branch.json
tools/perf/pmu-events/arch/x86/amdzen2/cache.json
tools/perf/pmu-events/arch/x86/amdzen2/core.json
tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
tools/perf/pmu-events/arch/x86/amdzen2/memory.json
tools/perf/pmu-events/arch/x86/amdzen2/other.json
tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
tools/perf/pmu-events/arch/x86/amdzen3/branch.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/cache.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/core.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/memory.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/other.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/amdzen3/recommended.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/mapfile.csv
tools/perf/pmu-events/jevents.c
tools/perf/scripts/python/netdev-times.py
tools/perf/tests/attr.c
tools/perf/tests/bp_signal.c
tools/perf/tests/code-reading.c
tools/perf/tests/demangle-ocaml-test.c
tools/perf/tests/evsel-roundtrip-name.c
tools/perf/tests/hists_cumulate.c
tools/perf/tests/hists_filter.c
tools/perf/tests/make
tools/perf/tests/parse-events.c
tools/perf/tests/parse-metric.c
tools/perf/tests/perf-time-to-tsc.c
tools/perf/tests/pmu-events.c
tools/perf/tests/shell/buildid.sh
tools/perf/tests/shell/daemon.sh
tools/perf/tests/shell/stat+csv_summary.sh [new file with mode: 0755]
tools/perf/tests/shell/stat+shadow_stat.sh
tools/perf/tests/shell/stat_bpf_counters.sh [new file with mode: 0755]
tools/perf/tests/switch-tracking.c
tools/perf/tests/topology.c
tools/perf/trace/beauty/fsconfig.sh
tools/perf/trace/beauty/include/linux/socket.h
tools/perf/ui/browsers/annotate.c
tools/perf/ui/browsers/hists.c
tools/perf/ui/stdio/hist.c
tools/perf/util/Build
tools/perf/util/annotate.c
tools/perf/util/annotate.h
tools/perf/util/bpf-loader.c
tools/perf/util/bpf_counter.c
tools/perf/util/bpf_counter.h
tools/perf/util/bpf_skel/bperf.h [new file with mode: 0644]
tools/perf/util/bpf_skel/bperf_follower.bpf.c [new file with mode: 0644]
tools/perf/util/bpf_skel/bperf_leader.bpf.c [new file with mode: 0644]
tools/perf/util/bpf_skel/bperf_u.h [new file with mode: 0644]
tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
tools/perf/util/call-path.h
tools/perf/util/callchain.c
tools/perf/util/config.c
tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
tools/perf/util/cs-etm.c
tools/perf/util/cs-etm.h
tools/perf/util/data-convert-bt.c
tools/perf/util/data-convert-bt.h [deleted file]
tools/perf/util/data-convert-json.c [new file with mode: 0644]
tools/perf/util/data-convert.h
tools/perf/util/demangle-java.c
tools/perf/util/demangle-ocaml.c
tools/perf/util/dso.h
tools/perf/util/dwarf-aux.c
tools/perf/util/dwarf-aux.h
tools/perf/util/dwarf-regs.c
tools/perf/util/event.h
tools/perf/util/events_stats.h
tools/perf/util/evlist-hybrid.c [new file with mode: 0644]
tools/perf/util/evlist-hybrid.h [new file with mode: 0644]
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/expr.h
tools/perf/util/header.c
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/intel-pt.c
tools/perf/util/iostat.c [new file with mode: 0644]
tools/perf/util/iostat.h [new file with mode: 0644]
tools/perf/util/jitdump.c
tools/perf/util/levenshtein.c
tools/perf/util/libunwind/arm64.c
tools/perf/util/libunwind/x86_32.c
tools/perf/util/llvm-utils.c
tools/perf/util/machine.c
tools/perf/util/map.h
tools/perf/util/mem-events.h
tools/perf/util/metricgroup.c
tools/perf/util/metricgroup.h
tools/perf/util/parse-events-hybrid.c [new file with mode: 0644]
tools/perf/util/parse-events-hybrid.h [new file with mode: 0644]
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.l
tools/perf/util/parse-events.y
tools/perf/util/pmu-hybrid.c [new file with mode: 0644]
tools/perf/util/pmu-hybrid.h [new file with mode: 0644]
tools/perf/util/pmu.c
tools/perf/util/pmu.h
tools/perf/util/probe-event.c
tools/perf/util/probe-finder.c
tools/perf/util/python-ext-sources
tools/perf/util/python.c
tools/perf/util/s390-cpumsf.c
tools/perf/util/s390-sample-raw.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/session.h
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/stat-display.c
tools/perf/util/stat-shadow.c
tools/perf/util/stat.c
tools/perf/util/stat.h
tools/perf/util/strbuf.h
tools/perf/util/strfilter.h
tools/perf/util/symbol-elf.c
tools/perf/util/symbol_fprintf.c
tools/perf/util/synthetic-events.c
tools/perf/util/syscalltbl.c
tools/perf/util/target.h
tools/perf/util/thread-stack.h
tools/perf/util/tsc.c
tools/perf/util/tsc.h
tools/perf/util/units.c
tools/perf/util/units.h
tools/perf/util/unwind-libunwind-local.c
tools/power/x86/turbostat/turbostat.8
tools/power/x86/turbostat/turbostat.c
tools/scripts/Makefile.include
tools/testing/ktest/examples/vmware.conf [new file with mode: 0644]
tools/testing/ktest/ktest.pl
tools/testing/selftests/Makefile
tools/testing/selftests/bpf/prog_tests/snprintf.c
tools/testing/selftests/bpf/progs/test_snprintf.c
tools/testing/selftests/dma/dma_map_benchmark.c
tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/aarch64/vgic_init.c [new file with mode: 0644]
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/test_util.h
tools/testing/selftests/kvm/kvm_page_table_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/assert.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/sparsebit.c
tools/testing/selftests/kvm/lib/test_util.c
tools/testing/selftests/kvm/set_memory_region_test.c
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
tools/testing/selftests/landlock/.gitignore [new file with mode: 0644]
tools/testing/selftests/landlock/Makefile [new file with mode: 0644]
tools/testing/selftests/landlock/base_test.c [new file with mode: 0644]
tools/testing/selftests/landlock/common.h [new file with mode: 0644]
tools/testing/selftests/landlock/config [new file with mode: 0644]
tools/testing/selftests/landlock/fs_test.c [new file with mode: 0644]
tools/testing/selftests/landlock/ptrace_test.c [new file with mode: 0644]
tools/testing/selftests/landlock/true.c [new file with mode: 0644]
tools/testing/selftests/mincore/mincore_selftest.c
tools/testing/selftests/powerpc/mm/tlbie_test.c
tools/testing/selftests/proc/Makefile
tools/testing/selftests/proc/proc-subset-pid.c [new file with mode: 0644]
tools/testing/selftests/proc/read.c
tools/testing/selftests/vm/.gitignore
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/gup_test.c
tools/testing/selftests/vm/split_huge_page_test.c [new file with mode: 0644]
tools/testing/selftests/vm/userfaultfd.c
tools/tracing/latency/latency-collector.c
tools/usb/hcd-tests.sh
virt/kvm/coalesced_mmio.c
virt/kvm/kvm_main.c

diff --git a/CREDITS b/CREDITS
index b06760f..7ef7b13 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -1874,6 +1874,11 @@ S: Krosenska' 543
 S: 181 00 Praha 8
 S: Czech Republic
 
+N: Murali Karicheri
+E: m-karicheri2@ti.com
+D: Keystone NetCP driver
+D: Keystone PCIe host controller driver
+
 N: Jan "Yenya" Kasprzak
 E: kas@fi.muni.cz
 D: Author of the COSA/SRP sync serial board driver.
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe b/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe
new file mode 100644 (file)
index 0000000..ad3bbc6
--- /dev/null
@@ -0,0 +1,14 @@
+What:          /sys/bus/coresight/devices/trbe<cpu>/align
+Date:          March 2021
+KernelVersion: 5.13
+Contact:       Anshuman Khandual <anshuman.khandual@arm.com>
+Description:   (Read) Shows the TRBE write pointer alignment. This value
+               is fetched from the TRBIDR register.
+
+What:          /sys/bus/coresight/devices/trbe<cpu>/flag
+Date:          March 2021
+KernelVersion: 5.13
+Contact:       Anshuman Khandual <anshuman.khandual@arm.com>
+Description:   (Read) Shows whether TRBE updates to memory also perform
+               access and dirty flag updates. This value is fetched from
+               the TRBIDR register.
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa
new file mode 100644 (file)
index 0000000..3c7d132
--- /dev/null
@@ -0,0 +1,30 @@
+What:          /sys/bus/event_source/devices/dsa*/format
+Date:          April 2021
+KernelVersion:  5.13
+Contact:       Tom Zanussi <tom.zanussi@linux.intel.com>
+Description:   Read-only.  Attribute group to describe the magic bits
+               that go into perf_event_attr.config or
+               perf_event_attr.config1 for the IDXD DSA pmu.  (See also
+               ABI/testing/sysfs-bus-event_source-devices-format).
+
+               Each attribute in this group defines a bit range in
+               perf_event_attr.config or perf_event_attr.config1.
+               All supported attributes are listed below (See the
+               IDXD DSA Spec for possible attribute values)::
+
+                   event_category = "config:0-3"    - event category
+                   event          = "config:4-31"   - event ID
+
+                   filter_wq      = "config1:0-31"  - workqueue filter
+                   filter_tc      = "config1:32-39" - traffic class filter
+                   filter_pgsz    = "config1:40-43" - page size filter
+                   filter_sz      = "config1:44-51" - transfer size filter
+                   filter_eng     = "config1:52-59" - engine filter
+
+What:          /sys/bus/event_source/devices/dsa*/cpumask
+Date:          April 2021
+KernelVersion:  5.13
+Contact:       Tom Zanussi <tom.zanussi@linux.intel.com>
+Description:    Read-only.  This file always returns the cpu to which the
+                IDXD DSA pmu is bound for access to all dsa pmu
+               performance monitoring events.
index ed79f58..47e6b97 100644 (file)
@@ -58,3 +58,19 @@ Description:
 
                Indicates the mux id associated to the qmimux network interface
                during its creation.
+
+What:          /sys/class/net/<iface>/qmi/pass_through
+Date:          January 2021
+KernelVersion: 5.12
+Contact:       Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
+Description:
+               Boolean.  Default: 'N'
+
+               Set this to 'Y' to enable 'pass-through' mode, allowing packets
+               in MAP format to be passed on to the stack.
+
+               Normally the rmnet driver (CONFIG_RMNET) is then used to process
+               and demultiplex these packets.
+
+               'Pass-through' mode can be enabled when the device is in
+               'raw-ip' mode only.
index 0f7165a..49a4157 100644 (file)
@@ -34,6 +34,9 @@ Description:  Multipath policy specifies which path should be selected on each IO
                min-inflight (1):
                    select path with minimum inflights.
 
+               min-latency (2):
+                   select path with minimum latency.
+
 What:          /sys/class/rtrs-client/<session-name>/paths/
 Date:          Feb 2020
 KernelVersion: 5.7
@@ -95,6 +98,15 @@ KernelVersion:       5.7
 Contact:       Jack Wang <jinpu.wang@cloud.ionos.com> Danil Kipnis <danil.kipnis@cloud.ionos.com>
 Description:   RO, Contains the destination address of the path
 
+What:          /sys/class/rtrs-client/<session-name>/paths/<src@dst>/cur_latency
+Date:          Feb 2020
+KernelVersion: 5.7
+Contact:       Jack Wang <jinpu.wang@cloud.ionos.com> Danil Kipnis <danil.kipnis@cloud.ionos.com>
+Description:   RO, Contains the latency time calculated by the heart-beat messages.
+               Whenever the client sends heart-beat message, it checks the time gap
+               between sending the heart-beat message and receiving the ACK.
+               This value may change regularly.
+
 What:          /sys/class/rtrs-client/<session-name>/paths/<src@dst>/stats/reset_all
 Date:          Feb 2020
 KernelVersion: 5.7
index 0eee30b..fe13baa 100644 (file)
@@ -285,7 +285,7 @@ Description:        Disable L3 cache indices
 
                All AMD processors with L3 caches provide this functionality.
                For details, see BKDGs at
-               http://developer.amd.com/documentation/guides/Pages/default.aspx
+                https://www.amd.com/en/support/tech-docs?keyword=bios+kernel
 
 
 What:          /sys/devices/system/cpu/cpufreq/boost
index cd7c578..704434b 100644 (file)
@@ -15,3 +15,12 @@ Description:    Reports the model identification provided by the touchscreen, fo
                Access: Read
 
                Valid values: Represented as string
+
+What:          /sys/bus/i2c/devices/xxx/type
+Date:          Jan 2021
+Contact:       linux-input@vger.kernel.org
+Description:   Reports the type identification provided by the touchscreen, for example "PCAP82H80 Series"
+
+               Access: Read
+
+               Valid values: Represented as string
index cbeac1b..4849b8e 100644 (file)
@@ -276,7 +276,7 @@ Date                April 2019
 Contact:       "Daniel Rosenberg" <drosen@google.com>
 Description:   If checkpoint=disable, it displays the number of blocks that
                are unusable.
-               If checkpoint=enable it displays the enumber of blocks that
+               If checkpoint=enable it displays the number of blocks that
                would be unusable if checkpoint=disable were to be set.
 
 What:          /sys/fs/f2fs/<disk>/encoding
@@ -409,3 +409,32 @@ Description:       Give a way to change checkpoint merge daemon's io priority.
                I/O priority "3". We can select the class between "rt" and "be",
                and set the I/O priority within valid range of it. "," delimiter
                is necessary in between I/O class and priority number.
+
+What:          /sys/fs/f2fs/<disk>/ovp_segments
+Date:          March 2021
+Contact:       "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:   Shows the number of overprovision segments.
+
+What:          /sys/fs/f2fs/<disk>/compr_written_block
+Date:          March 2021
+Contact:       "Daeho Jeong" <daehojeong@google.com>
+Description:   Show the block count written after compression since mount. Note
+               that when the compressed blocks are deleted, this count doesn't
+               decrease. If you write "0" here, you can initialize
+               compr_written_block and compr_saved_block to "0".
+
+What:          /sys/fs/f2fs/<disk>/compr_saved_block
+Date:          March 2021
+Contact:       "Daeho Jeong" <daehojeong@google.com>
+Description:   Show the saved block count with compression since mount. Note
+               that when the compressed blocks are deleted, this count doesn't
+               decrease. If you write "0" here, you can initialize
+               compr_written_block and compr_saved_block to "0".
+
+What:          /sys/fs/f2fs/<disk>/compr_new_inode
+Date:          March 2021
+Contact:       "Daeho Jeong" <daehojeong@google.com>
+Description:   Show the count of inode newly enabled for compression since mount.
+               Note that when the compression is disabled for the files, this count
+               doesn't decrease. If you write "0" here, you can initialize
+               compr_new_inode to "0".
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
new file mode 100644 (file)
index 0000000..02b2bb6
--- /dev/null
@@ -0,0 +1,25 @@
+What:          /sys/kernel/mm/cma/
+Date:          Feb 2021
+Contact:       Minchan Kim <minchan@kernel.org>
+Description:
+               /sys/kernel/mm/cma/ contains a subdirectory for each CMA
+               heap name (also sometimes called CMA areas).
+
+               Each CMA heap subdirectory (that is, each
+               /sys/kernel/mm/cma/<cma-heap-name> directory) contains the
+               following items:
+
+                       alloc_pages_success
+                       alloc_pages_fail
+
+What:          /sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_success
+Date:          Feb 2021
+Contact:       Minchan Kim <minchan@kernel.org>
+Description:
+               the number of pages the CMA API succeeded in allocating
+
+What:          /sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
+Date:          Feb 2021
+Contact:       Minchan Kim <minchan@kernel.org>
+Description:
+               the number of pages CMA API failed to allocate
index ef41f77..9c2be82 100644 (file)
@@ -4,7 +4,7 @@
 
    1 char      Memory devices
                  1 = /dev/mem          Physical memory access
-                 2 = /dev/kmem         Kernel virtual memory access
+                 2 = /dev/kmem         OBSOLETE - replaced by /proc/kcore
                  3 = /dev/null         Null device
                  4 = /dev/port         I/O port access
                  5 = /dev/zero         Null byte source
index 9fa1618..493071d 100644 (file)
@@ -17,17 +17,18 @@ module.
     gpio_mockup_ranges
 
         This parameter takes an argument in the form of an array of integer
-        pairs. Each pair defines the base GPIO number (if any) and the number
-        of lines exposed by the chip. If the base GPIO is -1, the gpiolib
-        will assign it automatically.
+        pairs. Each pair defines the base GPIO number (non-negative integer)
+        and the first number after the last of this chip. If the base GPIO
+        is -1, the gpiolib will assign it automatically, while the following
+        parameter is the number of lines exposed by the chip.
 
-        Example: gpio_mockup_ranges=-1,8,-1,16,405,4
+        Example: gpio_mockup_ranges=-1,8,-1,16,405,409
 
         The line above creates three chips. The first one will expose 8 lines,
         the second 16 and the third 4. The base GPIO for the third chip is set
         to 405 while for two first chips it will be assigned automatically.
 
-    gpio_named_lines
+    gpio_mockup_named_lines
 
         This parameter doesn't take any arguments. It lets the driver know that
         GPIO lines exposed by it should be named.
index 1c0a3cf..cb89dbd 100644 (file)
                        Don't use this when you are not running on the
                        android emulator
 
+       gpio-mockup.gpio_mockup_ranges
+                       [HW] Sets the ranges of gpiochip for this device.
+                       Format: <start1>,<end1>,<start2>,<end2>...
+       gpio-mockup.gpio_mockup_named_lines
+                       [HW] Let the driver know GPIO lines should be named.
+
        gpt             [EFI] Forces disk with valid GPT signature but
                        invalid Protective MBR to be treated as GPT. If the
                        primary GPT is corrupted, it enables the backup/alternate
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
 
-       gpio-mockup.gpio_mockup_ranges
-                       [HW] Sets the ranges of gpiochip of for this device.
-                       Format: <start1>,<end1>,<start2>,<end2>...
-
        hardlockup_all_cpu_backtrace=
                        [KNL] Should the hard-lockup detector generate
                        backtraces on all cpus.
                        initcall functions.  Useful for debugging built-in
                        modules and initcalls.
 
+       initramfs_async= [KNL]
+                       Format: <bool>
+                       Default: 1
+                       This parameter controls whether the initramfs
+                       image is unpacked asynchronously, concurrently
+                       with devices being probed and
+                       initialized. This should normally just work,
+                       but as a debugging aid, one can get the
+                       historical behaviour of the initramfs
+                       unpacking being completed before device_ and
+                       late_ initcalls.
+
        initrd=         [BOOT] Specify the location of the initial ramdisk
 
        initrdmem=      [KNL] Specify a physical address and size from which to
                        bypassed by not enabling DMAR with this option. In
                        this case, gfx device will use physical address for
                        DMA.
-               forcedac [X86-64]
-                       With this option iommu will not optimize to look
-                       for io virtual address below 32-bit forcing dual
-                       address cycle on pci bus for cards supporting greater
-                       than 32-bit addressing. The default is to look
-                       for translation below 32-bit and if not available
-                       then look in the higher range.
                strict [Default Off]
                        With this option on every unmap_single operation will
                        result in a hardware IOTLB flush operation as opposed
                nobypass        [PPC/POWERNV]
                        Disable IOMMU bypass, using IOMMU for PCI devices.
 
+       iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices.
+                       Format: { "0" | "1" }
+                       0 - Try to allocate a 32-bit DMA address first, before
+                         falling back to the full range if needed.
+                       1 - Allocate directly from the full usable range,
+                         forcing Dual Address Cycle for PCI cards supporting
+                         greater than 32-bit addressing.
+
        iommu.strict=   [ARM64] Configure TLB invalidation behaviour
                        Format: { "0" | "1" }
                        0 - Lazy mode.
                        seconds.  Use this parameter to check at some
                        other rate.  0 disables periodic checking.
 
-       memtest=        [KNL,X86,ARM,PPC] Enable memtest
+       memory_hotplug.memmap_on_memory
+                       [KNL,X86,ARM] Boolean flag to enable this feature.
+                       Format: {on | off (default)}
+                       When enabled, runtime hotplugged memory will
+                       allocate its internal metadata (struct pages)
+                       from the hotadded memory, which allows
+                       hot-adding a lot of memory without requiring
+                       additional memory to do so.
+                       This feature is disabled by default because it
+                       has some implication on large (e.g. GB)
+                       allocations in some configurations (e.g. small
+                       memory blocks).
+                       The state of the flag can be read in
+                       /sys/module/memory_hotplug/parameters/memmap_on_memory.
+                       Note that even when enabled, there are a few cases where
+                       the feature is not effective.
+
+       memtest=        [KNL,X86,ARM,PPC,RISCV] Enable memtest
                        Format: <integer>
                        default : 0 <disable>
                        Specifies the number of memtest passes to be
 
        nohugeiomap     [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
+       nohugevmalloc   [PPC] Disable kernel huge vmalloc mappings.
+
        nosmt           [KNL,S390] Disable symmetric multithreading (SMT).
                        Equivalent to smt=1.
 
index 5307f90..05d51d2 100644 (file)
@@ -357,6 +357,15 @@ creates ZONE_MOVABLE as following.
    Unfortunately, there is no information to show which memory block belongs
    to ZONE_MOVABLE. This is TBD.
 
+.. note::
+   Techniques that rely on long-term pinnings of memory (especially, RDMA and
+   vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
+   hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that
+   memory can still get hot removed - be aware that pinning can fail even if
+   there is plenty of free memory in ZONE_MOVABLE. In addition, using
+   ZONE_MOVABLE might make page pinning more expensive, because pages have to be
+   migrated off that zone first.
+
 .. _memory_hotplug_how_to_offline_memory:
 
 How to offline memory
index 65eefa6..3aa38e8 100644 (file)
@@ -63,36 +63,36 @@ the generic ioctl available.
 
 The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl
 defines what memory types are supported by the ``userfaultfd`` and what
-events, except page fault notifications, may be generated.
-
-If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs
-virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in
-``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be
-set if the kernel supports registering ``userfaultfd`` ranges on shared
-memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``,
-``MAP_SHARED``, ``memfd_create``, etc).
-
-The userland application that wants to use ``userfaultfd`` with hugetlbfs
-or shared memory need to set the corresponding flag in
-``uffdio_api.features`` to enable those features.
-
-If the userland desires to receive notifications for events other than
-page faults, it has to verify that ``uffdio_api.features`` has appropriate
-``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more
-detail below in `Non-cooperative userfaultfd`_ section.
-
-Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should
-be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to
-register a memory range in the ``userfaultfd`` by setting the
+events, except page fault notifications, may be generated:
+
+- The ``UFFD_FEATURE_EVENT_*`` flags indicate that various events
+  other than page faults are supported. These events are described in more
+  detail below in the `Non-cooperative userfaultfd`_ section.
+
+- ``UFFD_FEATURE_MISSING_HUGETLBFS`` and ``UFFD_FEATURE_MISSING_SHMEM``
+  indicate that the kernel supports ``UFFDIO_REGISTER_MODE_MISSING``
+  registrations for hugetlbfs and shared memory (covering all shmem APIs,
+  i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, ``MAP_SHARED``, ``memfd_create``,
+  etc) virtual memory areas, respectively.
+
+- ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports
+  ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory
+  areas.
+
+The userland application should set the feature flags it intends to use
+when invoking the ``UFFDIO_API`` ioctl, to request that those features be
+enabled if supported.
+
+Once the ``userfaultfd`` API has been enabled the ``UFFDIO_REGISTER``
+ioctl should be invoked (if present in the returned ``uffdio_api.ioctls``
+bitmask) to register a memory range in the ``userfaultfd`` by setting the
 uffdio_register structure accordingly. The ``uffdio_register.mode``
 bitmask will specify to the kernel which kind of faults to track for
-the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing
-pages). The ``UFFDIO_REGISTER`` ioctl will return the
+the range. The ``UFFDIO_REGISTER`` ioctl will return the
 ``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve
 userfaults on the range registered. Not all ioctls will necessarily be
-supported for all memory types depending on the underlying virtual
-memory backend (anonymous memory vs tmpfs vs real filebacked
-mappings).
+supported for all memory types (e.g. anonymous memory vs. shmem vs.
+hugetlbfs), or all types of intercepted faults.
 
 Userland can use the ``uffdio_register.ioctls`` to manage the virtual
 address space in the background (to add or potentially also remove
@@ -100,21 +100,46 @@ memory from the ``userfaultfd`` registered range). This means a userfault
 could be triggering just before userland maps in the background the
 user-faulted page.
 
-The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That
-atomically copies a page into the userfault registered range and wakes
-up the blocked userfaults
-(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set).
-Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in
-guaranteeing that nothing can see an half copied page since it'll
-keep userfaulting until the copy has finished.
+Resolving Userfaults
+--------------------
+
+There are three basic ways to resolve userfaults:
+
+- ``UFFDIO_COPY`` atomically copies some existing page contents from
+  userspace.
+
+- ``UFFDIO_ZEROPAGE`` atomically zeros the new page.
+
+- ``UFFDIO_CONTINUE`` maps an existing, previously-populated page.
+
+These operations are atomic in the sense that they guarantee nothing can
+see a half-populated page, since readers will keep userfaulting until the
+operation has finished.
+
+By default, these wake up userfaults blocked on the range in question.
+They support a ``UFFDIO_*_MODE_DONTWAKE`` ``mode`` flag, which indicates
+that waking will be done separately at some later time.
+
+Which ioctl to choose depends on the kind of page fault, and what we'd
+like to do to resolve it:
+
+- For ``UFFDIO_REGISTER_MODE_MISSING`` faults, the fault needs to be
+  resolved by either providing a new page (``UFFDIO_COPY``), or mapping
+  the zero page (``UFFDIO_ZEROPAGE``). By default, the kernel would map
+  the zero page for a missing fault. With userfaultfd, userspace can
+  decide what content to provide before the faulting thread continues.
+
+- For ``UFFDIO_REGISTER_MODE_MINOR`` faults, there is an existing page (in
+  the page cache). Userspace has the option of modifying the page's
+  contents before resolving the fault. Once the contents are correct
+  (modified or not), userspace asks the kernel to map the page and let the
+  faulting thread continue with ``UFFDIO_CONTINUE``.
 
 Notes:
 
-- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then
-  you must provide some kind of page in your thread after reading from
-  the uffd.  You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``.
-  The normal behavior of the OS automatically providing a zero page on
-  an anonymous mmaping is not in place.
+- You can tell which kind of fault occurred by examining
+  ``pagefault.flags`` within the ``uffd_msg``, checking for the
+  ``UFFD_PAGEFAULT_FLAG_*`` flags.
 
 - None of the page-delivering ioctls default to the range that you
   registered with.  You must fill in all fields for the appropriate
@@ -122,9 +147,9 @@ Notes:
 
 - You get the address of the access that triggered the missing page
   event out of a struct uffd_msg that you read in the thread from the
-  uffd.  You can supply as many pages as you want with ``UFFDIO_COPY`` or
-  ``UFFDIO_ZEROPAGE``.  Keep in mind that unless you used DONTWAKE then
-  the first of any of those IOCTLs wakes up the faulting thread.
+  uffd.  You can supply as many pages as you want with these IOCTLs.
+  Keep in mind that unless you used DONTWAKE then the first of any of
+  those IOCTLs wakes up the faulting thread.
 
 - Be sure to test for all errors including
   (``pollfd[0].revents & POLLERR``).  This can happen, e.g. when ranges
index 48b4d0e..18d8e25 100644 (file)
@@ -24,7 +24,8 @@ longterm series? One still supported? Then search the `LKML
 you don't find any, install `the latest release from that series
 <https://kernel.org/>`_. If it still shows the issue, report it to the stable
 mailing list (stable@vger.kernel.org) and CC the regressions list
-(regressions@lists.linux.dev).
+(regressions@lists.linux.dev); ideally also CC the maintainer and the mailing
+list for the subsystem in question.
 
 In all other cases try your best guess which kernel part might be causing the
 issue. Check the :ref:`MAINTAINERS <maintainers>` file for how its developers
@@ -48,8 +49,9 @@ before the issue occurs.
 If you are facing multiple issues with the Linux kernel at once, report each
 separately. While writing your report, include all information relevant to the
 issue, like the kernel and the distro used. In case of a regression, CC the
-regressions mailing list (regressions@lists.linux.dev) to your report; also try
-to include the commit-id of the change causing it, which a bisection can find.
+regressions mailing list (regressions@lists.linux.dev) to your report. Also try
+to pin-point the culprit with a bisection; if you succeed, include its
+commit-id and CC everyone in the signed-off-by chain.
 
 Once the report is out, answer any questions that come up and help where you
 can. That includes keeping the ball rolling by occasionally retesting with newer
@@ -198,10 +200,11 @@ report them:
 
  * Send a short problem report to the Linux stable mailing list
    (stable@vger.kernel.org) and CC the Linux regressions mailing list
-   (regressions@lists.linux.dev). Roughly describe the issue and ideally
-   explain how to reproduce it. Mention the first version that shows the
-   problem and the last version that's working fine. Then wait for further
-   instructions.
+   (regressions@lists.linux.dev); if you suspect the cause in a particular
+   subsystem, CC its maintainer and its mailing list. Roughly describe the
+   issue and ideally explain how to reproduce it. Mention the first version
+   that shows the problem and the last version that's working fine. Then
+   wait for further instructions.
 
 The reference section below explains each of these steps in more detail.
 
@@ -768,7 +771,9 @@ regular internet search engine and add something like
 the results to the archives at that URL.
 
 It's also wise to check the internet, LKML and maybe bugzilla.kernel.org again
-at this point.
+at this point. If your report needs to be filed in a bug tracker, you may want
+to check the mailing list archives for the subsystem as well, as someone might
+have reported it only there.
 
 For details how to search and what to do if you find matching reports see
 "Search for existing reports, first run" above.
@@ -1249,9 +1254,10 @@ and the oldest where the issue occurs (say 5.8-rc1).
 
 When sending the report by mail, CC the Linux regressions mailing list
 (regressions@lists.linux.dev). In case the report needs to be filed to some web
-tracker, proceed to do so; once filed, forward the report by mail to the
-regressions list. Make sure to inline the forwarded report, hence do not attach
-it. Also add a short note at the top where you mention the URL to the ticket.
+tracker, proceed to do so. Once filed, forward the report by mail to the
+regressions list; CC the maintainer and the mailing list for the subsystem in
+question. Make sure to inline the forwarded report, hence do not attach it.
+Also add a short note at the top where you mention the URL to the ticket.
 
 When mailing or forwarding the report, in case of a successful bisection add the
 author of the culprit to the recipients; also CC everyone in the signed-off-by
@@ -1536,17 +1542,20 @@ Report the regression
 
     *Send a short problem report to the Linux stable mailing list
     (stable@vger.kernel.org) and CC the Linux regressions mailing list
-    (regressions@lists.linux.dev). Roughly describe the issue and ideally
-    explain how to reproduce it.  Mention the first version that shows the
-    problem and the last version that's working fine. Then wait for further
-    instructions.*
+    (regressions@lists.linux.dev); if you suspect the cause in a particular
+    subsystem, CC its maintainer and its mailing list. Roughly describe the
+    issue and ideally explain how to reproduce it. Mention the first version
+    that shows the problem and the last version that's working fine. Then
+    wait for further instructions.*
 
 When reporting a regression that happens within a stable or longterm kernel
 line (say when updating from 5.10.4 to 5.10.5) a brief report is enough for
-the start to get the issue reported quickly. Hence a rough description is all
-it takes.
+the start to get the issue reported quickly. Hence a rough description to the
+stable and regressions mailing lists is all it takes; but in case you suspect
+the cause in a particular subsystem, CC its maintainers and its mailing list
+as well, because that will speed things up.
 
-But note, it helps developers a great deal if you can specify the exact version
+And note, it helps developers a great deal if you can specify the exact version
 that introduced the problem. Hence if possible within a reasonable time frame,
 try to find that version using vanilla kernels. Let's assume something broke when
 your distributor released an update from Linux kernel 5.10.5 to 5.10.8. Then as
@@ -1563,7 +1572,9 @@ pinpoint the exact change that causes the issue (which then can easily get
 reverted to fix the issue quickly). Hence consider to do a proper bisection
 right away if time permits. See the section 'Special care for regressions' and
 the document 'Documentation/admin-guide/bug-bisect.rst' for details how to
-perform one.
+perform one. In case of a successful bisection add the author of the culprit to
+the recipients; also CC everyone in the signed-off-by chain, which you find at
+the end of its commit message.
 
 
 Reference for "Reporting issues only occurring in older kernel version lines"
index 4fcc00a..18b8cc1 100644 (file)
@@ -277,9 +277,40 @@ Before jumping into the kernel, the following conditions must be met:
 
     - SCR_EL3.FGTEn (bit 27) must be initialised to 0b1.
 
+  For CPUs with Advanced SIMD and floating point support:
+
+  - If EL3 is present:
+
+    - CPTR_EL3.TFP (bit 10) must be initialised to 0b0.
+
+  - If EL2 is present and the kernel is entered at EL1:
+
+    - CPTR_EL2.TFP (bit 10) must be initialised to 0b0.
+
+  For CPUs with the Scalable Vector Extension (FEAT_SVE) present:
+
+  - If EL3 is present:
+
+    - CPTR_EL3.EZ (bit 8) must be initialised to 0b1.
+
+    - ZCR_EL3.LEN must be initialised to the same value for all CPUs the
+      kernel is executed on.
+
+  - If the kernel is entered at EL1 and EL2 is present:
+
+    - CPTR_EL2.TZ (bit 8) must be initialised to 0b0.
+
+    - CPTR_EL2.ZEN (bits 17:16) must be initialised to 0b11.
+
+    - ZCR_EL2.LEN must be initialised to the same value for all CPUs the
+      kernel will execute on.
+
 The requirements described above for CPU mode, caches, MMUs, architected
 timers, coherency and system registers apply to all CPUs.  All CPUs must
-enter the kernel in the same exception level.
+enter the kernel in the same exception level.  Where the values documented
+disable traps it is permissible for these traps to be enabled so long as
+those traps are handled transparently by higher exception levels as though
+the values documented were set.
 
 The boot loader is expected to enter the kernel on each CPU in the
 following manner:
index 8782166..ec1a5a6 100644 (file)
@@ -74,7 +74,7 @@ HWCAP_ASIMD
 
 HWCAP_EVTSTRM
     The generic timer is configured to generate events at a frequency of
-    approximately 100KHz.
+    approximately 10KHz.
 
 HWCAP_AES
     Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001.
index cbc4d45..459e6b6 100644 (file)
@@ -113,6 +113,12 @@ ABI relaxation:
 
 - ``shmat()`` and ``shmdt()``.
 
+- ``brk()`` (since kernel v5.6).
+
+- ``mmap()`` (since kernel v5.6).
+
+- ``mremap()``, the ``new_address`` argument (since kernel v5.6).
+
 Any attempt to use non-zero tagged pointers may result in an error code
 being returned, a (fatal) signal being raised, or other modes of
 failure.
index e6d23f1..00a1d4f 100644 (file)
@@ -563,6 +563,16 @@ Free a region of memory previously allocated using dma_alloc_pages().
 dev, size, dma_handle and dir must all be the same as those passed into
 dma_alloc_pages().  page must be the pointer returned by dma_alloc_pages().
 
+::
+
+       int
+       dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+                      size_t size, struct page *page)
+
+Map an allocation returned from dma_alloc_pages() into a user address space.
+dev and size must be the same as those passed into dma_alloc_pages().
+page must be the pointer returned by dma_alloc_pages().
+
 ::
 
        void *
@@ -584,6 +594,84 @@ dev, size, dma_handle and dir must all be the same as those passed into
 dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
 dma_alloc_noncoherent().
 
+::
+
+       struct sg_table *
+       dma_alloc_noncontiguous(struct device *dev, size_t size,
+                               enum dma_data_direction dir, gfp_t gfp,
+                               unsigned long attrs);
+
+This routine allocates <size> bytes of non-coherent and possibly non-contiguous
+memory.  It returns a pointer to struct sg_table that describes the allocated
+and DMA mapped memory, or NULL if the allocation failed. The resulting memory
+can be used for everything that struct pages mapped into a scatterlist are
+suitable for.
+
+The returned sg_table is guaranteed to have 1 single DMA mapped segment as
+indicated by sgt->nents, but it might have multiple CPU side segments as
+indicated by sgt->orig_nents.
+
+The dir parameter specifies whether data is read and/or written by the device,
+see dma_map_single() for details.
+
+The gfp parameter allows the caller to specify the ``GFP_`` flags (see
+kmalloc()) for the allocation, but rejects flags used to specify a memory
+zone such as GFP_DMA or GFP_HIGHMEM.
+
+The attrs argument must be either 0 or DMA_ATTR_ALLOC_SINGLE_PAGES.
+
+Before giving the memory to the device, dma_sync_sgtable_for_device() needs
+to be called, and before reading memory written by the device,
+dma_sync_sgtable_for_cpu(), just like for streaming DMA mappings that are
+reused.
+
+::
+
+       void
+       dma_free_noncontiguous(struct device *dev, size_t size,
+                              struct sg_table *sgt,
+                              enum dma_data_direction dir)
+
+Free memory previously allocated using dma_alloc_noncontiguous().  dev, size,
+and dir must all be the same as those passed into dma_alloc_noncontiguous().
+sgt must be the pointer returned by dma_alloc_noncontiguous().
+
+::
+
+       void *
+       dma_vmap_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt)
+
+Return a contiguous kernel mapping for an allocation returned from
+dma_alloc_noncontiguous().  dev and size must be the same as those passed into
+dma_alloc_noncontiguous().  sgt must be the pointer returned by
+dma_alloc_noncontiguous().
+
+Once a non-contiguous allocation is mapped using this function, the
+flush_kernel_vmap_range() and invalidate_kernel_vmap_range() APIs must be used
+to manage the coherency between the kernel mapping, the device and user space
+mappings (if any).
+
+::
+
+       void
+       dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+
+Unmap a kernel mapping returned by dma_vmap_noncontiguous().  dev must be the
+same as the one passed into dma_alloc_noncontiguous().  vaddr must be the pointer
+returned by dma_vmap_noncontiguous().
+
+
+::
+
+       int
+       dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+                              size_t size, struct sg_table *sgt)
+
+Map an allocation returned from dma_alloc_noncontiguous() into a user address
+space.  dev and size must be the same as those passed into
+dma_alloc_noncontiguous().  sgt must be the pointer returned by
+dma_alloc_noncontiguous().
+
 ::
 
        int
index a77c24c..8214e21 100644 (file)
@@ -42,10 +42,10 @@ irq_domain usage
 ================
 
 An interrupt controller driver creates and registers an irq_domain by
-calling one of the irq_domain_add_*() functions (each mapping method
-has a different allocator function, more on that later).  The function
-will return a pointer to the irq_domain on success.  The caller must
-provide the allocator function with an irq_domain_ops structure.
+calling one of the irq_domain_add_*() or irq_domain_create_*() functions
+(each mapping method has a different allocator function, more on that later).
+The function will return a pointer to the irq_domain on success. The caller
+must provide the allocator function with an irq_domain_ops structure.
 
 In most cases, the irq_domain will begin empty without any mappings
 between hwirq and IRQ numbers.  Mappings are added to the irq_domain
@@ -147,6 +147,7 @@ Legacy
        irq_domain_add_simple()
        irq_domain_add_legacy()
        irq_domain_add_legacy_isa()
+       irq_domain_create_simple()
        irq_domain_create_legacy()
 
 The Legacy mapping is a special case for drivers that already have a
@@ -169,13 +170,13 @@ supported.  For example, ISA controllers would use the legacy map for
 mapping Linux IRQs 0-15 so that existing ISA drivers get the correct IRQ
 numbers.
 
-Most users of legacy mappings should use irq_domain_add_simple() which
-will use a legacy domain only if an IRQ range is supplied by the
-system and will otherwise use a linear domain mapping. The semantics
-of this call are such that if an IRQ range is specified then
+Most users of legacy mappings should use irq_domain_add_simple() or
+irq_domain_create_simple() which will use a legacy domain only if an IRQ range
+is supplied by the system and will otherwise use a linear domain mapping.
+The semantics of this call are such that if an IRQ range is specified then
 descriptors will be allocated on-the-fly for it, and if no range is
-specified it will fall through to irq_domain_add_linear() which means
-*no* irq descriptors will be allocated.
+specified it will fall through to irq_domain_add_linear() or
+irq_domain_create_linear() which means *no* irq descriptors will be allocated.
 
 A typical use case for simple domains is where an irqchip provider
 is supporting both dynamic and static IRQ assignments.
@@ -186,6 +187,7 @@ that the driver using the simple domain call irq_create_mapping()
 before any irq_find_mapping() since the latter will actually work
 for the static IRQ assignment case.
 
+irq_domain_add_simple() and irq_domain_create_simple() as well as
 irq_domain_add_legacy() and irq_domain_create_legacy() are functionally
 equivalent, except for the first argument is different - the former
 accepts an Open Firmware specific 'struct device_node', while the latter
index 9b76337..5ad9e0a 100644 (file)
@@ -43,14 +43,14 @@ exporting of kernel symbols to the kernel symbol table, variants of these are
 available to export symbols into a certain namespace: EXPORT_SYMBOL_NS() and
 EXPORT_SYMBOL_NS_GPL(). They take one additional argument: the namespace.
 Please note that due to macro expansion that argument needs to be a
-preprocessor symbol. E.g. to export the symbol `usb_stor_suspend` into the
-namespace `USB_STORAGE`, use::
+preprocessor symbol. E.g. to export the symbol ``usb_stor_suspend`` into the
+namespace ``USB_STORAGE``, use::
 
        EXPORT_SYMBOL_NS(usb_stor_suspend, USB_STORAGE);
 
-The corresponding ksymtab entry struct `kernel_symbol` will have the member
-`namespace` set accordingly. A symbol that is exported without a namespace will
-refer to `NULL`. There is no default namespace if none is defined. `modpost`
+The corresponding ksymtab entry struct ``kernel_symbol`` will have the member
+``namespace`` set accordingly. A symbol that is exported without a namespace will
+refer to ``NULL``. There is no default namespace if none is defined. ``modpost``
 and kernel/module.c make use of the namespace at build time or module load time,
 respectively.
 
@@ -64,7 +64,7 @@ and EXPORT_SYMBOL_GPL() macro expansions that do not specify a namespace.
 
 There are multiple ways of specifying this define and it depends on the
 subsystem and the maintainer's preference, which one to use. The first option
-is to define the default namespace in the `Makefile` of the subsystem. E.g. to
+is to define the default namespace in the ``Makefile`` of the subsystem. E.g. to
 export all symbols defined in usb-common into the namespace USB_COMMON, add a
 line like this to drivers/usb/common/Makefile::
 
@@ -96,7 +96,7 @@ using a statement like::
 
        MODULE_IMPORT_NS(USB_STORAGE);
 
-This will create a `modinfo` tag in the module for each imported namespace.
+This will create a ``modinfo`` tag in the module for each imported namespace.
 This has the side effect, that the imported namespaces of a module can be
 inspected with modinfo::
 
@@ -113,7 +113,7 @@ metadata definitions like MODULE_AUTHOR() or MODULE_LICENSE(). Refer to section
 4. Loading Modules that use namespaced Symbols
 ==============================================
 
-At module loading time (e.g. `insmod`), the kernel will check each symbol
+At module loading time (e.g. ``insmod``), the kernel will check each symbol
 referenced from the module for its availability and whether the namespace it
 might be exported to has been imported by the module. The default behaviour of
 the kernel is to reject loading modules that don't specify sufficient imports.
@@ -138,19 +138,19 @@ missing imports. Fixing missing imports can be done with::
 A typical scenario for module authors would be::
 
        - write code that depends on a symbol from a not imported namespace
-       - `make`
+       - ``make``
        - notice the warning of modpost telling about a missing import
-       - run `make nsdeps` to add the import to the correct code location
+       - run ``make nsdeps`` to add the import to the correct code location
 
 For subsystem maintainers introducing a namespace, the steps are very similar.
-Again, `make nsdeps` will eventually add the missing namespace imports for
+Again, ``make nsdeps`` will eventually add the missing namespace imports for
 in-tree modules::
 
        - move or add symbols to a namespace (e.g. with EXPORT_SYMBOL_NS())
-       - `make` (preferably with an allmodconfig to cover all in-kernel
+       - ``make`` (preferably with an allmodconfig to cover all in-kernel
          modules)
        - notice the warning of modpost telling about a missing import
-       - run `make nsdeps` to add the import to the correct code location
+       - run ``make nsdeps`` to add the import to the correct code location
 
 You can also run nsdeps for external module builds. A typical usage is::
 
index 4756f6b..8e0f1fe 100644 (file)
@@ -114,7 +114,7 @@ Examples of using the Linux-provided gdb helpers
     [     0.000000] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved
     ....
 
-- Examine fields of the current task struct::
+- Examine fields of the current task struct (supported by x86 and arm64 only)::
 
     (gdb) p $lx_current().pid
     $1 = 4998
diff --git a/Documentation/devicetree/bindings/arm/ete.yaml b/Documentation/devicetree/bindings/arm/ete.yaml
new file mode 100644 (file)
index 0000000..7f9b2d1
--- /dev/null
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/ete.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Embedded Trace Extensions
+
+maintainers:
+  - Suzuki K Poulose <suzuki.poulose@arm.com>
+  - Mathieu Poirier <mathieu.poirier@linaro.org>
+
+description: |
+  Arm Embedded Trace Extension (ETE) is a per CPU trace component that
+  allows tracing the CPU execution. It overlaps with the CoreSight ETMv4
+  architecture and has extended support for future architecture changes.
+  The trace generated by the ETE could be stored via legacy CoreSight
+  components (e.g., TMC-ETR) or other means (e.g., using a per CPU buffer
+  Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
+  legacy CoreSight components, a node must be listed per instance, along
+  with any optional connection graph as per the coresight bindings.
+  See bindings/arm/coresight.txt.
+
+properties:
+  $nodename:
+    pattern: "^ete([0-9a-f]+)$"
+  compatible:
+    items:
+      - const: arm,embedded-trace-extension
+
+  cpu:
+    description: |
+      Handle to the cpu this ETE is bound to.
+    $ref: /schemas/types.yaml#/definitions/phandle
+
+  out-ports:
+    description: |
+      Output connections from the ETE to legacy CoreSight trace bus.
+    $ref: /schemas/graph.yaml#/properties/ports
+    properties:
+      port:
+        description: Output connection from the ETE to legacy CoreSight Trace bus.
+        $ref: /schemas/graph.yaml#/properties/port
+
+required:
+  - compatible
+  - cpu
+
+additionalProperties: false
+
+examples:
+
+# An ETE node without legacy CoreSight connections
+  - |
+    ete0 {
+      compatible = "arm,embedded-trace-extension";
+      cpu = <&cpu_0>;
+    };
+# An ETE node with legacy CoreSight connections
+  - |
+   ete1 {
+      compatible = "arm,embedded-trace-extension";
+      cpu = <&cpu_1>;
+
+      out-ports {        /* legacy coresight connection */
+         port {
+             ete1_out_port: endpoint {
+                remote-endpoint = <&funnel_in_port0>;
+             };
+         };
+      };
+   };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/trbe.yaml b/Documentation/devicetree/bindings/arm/trbe.yaml
new file mode 100644 (file)
index 0000000..4402d7b
--- /dev/null
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/trbe.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Trace Buffer Extensions
+
+maintainers:
+  - Anshuman Khandual <anshuman.khandual@arm.com>
+
+description: |
+  Arm Trace Buffer Extension (TRBE) is a per CPU component
+  for storing trace generated on the CPU to memory. It is
+  accessed via CPU system registers. The software can verify
+  if it is permitted to use the component by checking the
+  TRBIDR register.
+
+properties:
+  $nodename:
+    const: "trbe"
+  compatible:
+    items:
+      - const: arm,trace-buffer-extension
+
+  interrupts:
+    description: |
+       Exactly 1 PPI must be listed. For heterogeneous systems where
+       TRBE is only supported on a subset of the CPUs, please consult
+       the arm,gic-v3 binding for details on describing a PPI partition.
+    maxItems: 1
+
+required:
+  - compatible
+  - interrupts
+
+additionalProperties: false
+
+examples:
+
+  - |
+   #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+   trbe {
+     compatible = "arm,trace-buffer-extension";
+     interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
+   };
+...
index 552a99c..121596f 100644 (file)
@@ -51,6 +51,9 @@ properties:
   resets: true
   reset-names: true
 
+  power-domains:
+    maxItems: 1
+
   ports:
     $ref: /schemas/graph.yaml#/properties/port
     description: |
index 2e66840..e302147 100644 (file)
@@ -20,6 +20,7 @@ properties:
   compatible:
     enum:
       - qcom,sdm845-gpi-dma
+      - qcom,sm8150-gpi-dma
 
   reg:
     maxItems: 1
diff --git a/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml b/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml
new file mode 100644 (file)
index 0000000..5fe19fa
--- /dev/null
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/fairchild,74hc595.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Generic 8-bit shift register
+
+maintainers:
+  - Maxime Ripard <mripard@kernel.org>
+
+properties:
+  compatible:
+    enum:
+      - fairchild,74hc595
+      - nxp,74lvc594
+
+  reg:
+    maxItems: 1
+
+  gpio-controller: true
+
+  '#gpio-cells':
+    description:
+      The second cell is only used to specify the GPIO polarity.
+    const: 2
+
+  registers-number:
+    description: Number of daisy-chained shift registers
+
+  enable-gpios:
+    description: GPIO connected to the OE (Output Enable) pin.
+    maxItems: 1
+
+  spi-max-frequency: true
+
+patternProperties:
+  "^(hog-[0-9]+|.+-hog(-[0-9]+)?)$":
+    type: object
+
+    properties:
+      gpio-hog: true
+      gpios: true
+      output-high: true
+      output-low: true
+      line-name: true
+
+    required:
+      - gpio-hog
+      - gpios
+
+    additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - gpio-controller
+  - '#gpio-cells'
+  - registers-number
+
+additionalProperties: false
+
+examples:
+  - |
+    spi {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            gpio5: gpio5@0 {
+                    compatible = "fairchild,74hc595";
+                    reg = <0>;
+                    gpio-controller;
+                    #gpio-cells = <2>;
+                    registers-number = <4>;
+                    spi-max-frequency = <100000>;
+            };
+    };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-74x164.txt b/Documentation/devicetree/bindings/gpio/gpio-74x164.txt
deleted file mode 100644 (file)
index 2a97553..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-* Generic 8-bits shift register GPIO driver
-
-Required properties:
-- compatible: Should contain one of the following:
-    "fairchild,74hc595"
-    "nxp,74lvc594"
-- reg : chip select number
-- gpio-controller : Marks the device node as a gpio controller.
-- #gpio-cells : Should be two.  The first cell is the pin number and
-  the second cell is used to specify the gpio polarity:
-      0 = active high
-      1 = active low
-- registers-number: Number of daisy-chained shift registers
-
-Optional properties:
-- enable-gpios: GPIO connected to the OE (Output Enable) pin.
-
-Example:
-
-gpio5: gpio5@0 {
-       compatible = "fairchild,74hc595";
-       reg = <0>;
-       gpio-controller;
-       #gpio-cells = <2>;
-       registers-number = <4>;
-       spi-max-frequency = <100000>;
-};
diff --git a/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml b/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml
new file mode 100644 (file)
index 0000000..100f20c
--- /dev/null
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/realtek,otto-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Realtek Otto GPIO controller
+
+maintainers:
+  - Sander Vanheule <sander@svanheule.net>
+  - Bert Vermeulen <bert@biot.com>
+
+description: |
+  Realtek's GPIO controller on their MIPS switch SoCs (Otto platform) consists
+  of two banks of 32 GPIOs. These GPIOs can generate edge-triggered interrupts.
+  Each bank's interrupts are cascaded into one interrupt line on the parent
+  interrupt controller, if provided.
+  This binding allows defining a single bank in the devicetree. The interrupt
+  controller is not supported on the fallback compatible name, which only
+  allows for GPIO port use.
+
+properties:
+  $nodename:
+    pattern: "^gpio@[0-9a-f]+$"
+
+  compatible:
+    items:
+      - enum:
+          - realtek,rtl8380-gpio
+          - realtek,rtl8390-gpio
+      - const: realtek,otto-gpio
+
+  reg:
+    maxItems: 1
+
+  "#gpio-cells":
+    const: 2
+
+  gpio-controller: true
+
+  ngpios:
+    minimum: 1
+    maximum: 32
+
+  interrupt-controller: true
+
+  "#interrupt-cells":
+    const: 2
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - "#gpio-cells"
+  - gpio-controller
+
+additionalProperties: false
+
+dependencies:
+  interrupt-controller: [ interrupts ]
+
+examples:
+  - |
+      gpio@3500 {
+        compatible = "realtek,rtl8380-gpio", "realtek,otto-gpio";
+        reg = <0x3500 0x1c>;
+        gpio-controller;
+        #gpio-cells = <2>;
+        ngpios = <24>;
+        interrupt-controller;
+        #interrupt-cells = <2>;
+        interrupt-parent = <&rtlintc>;
+        interrupts = <23>;
+      };
+
+...
diff --git a/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml b/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml
new file mode 100644 (file)
index 0000000..d993e00
--- /dev/null
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/rockchip,gpio-bank.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip GPIO bank
+
+maintainers:
+  - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+  compatible:
+    enum:
+      - rockchip,gpio-bank
+      - rockchip,rk3188-gpio-bank0
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  gpio-controller: true
+
+  "#gpio-cells":
+    const: 2
+
+  interrupt-controller: true
+
+  "#interrupt-cells":
+    const: 2
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - gpio-controller
+  - "#gpio-cells"
+  - interrupt-controller
+  - "#interrupt-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    pinctrl: pinctrl {
+      #address-cells = <1>;
+      #size-cells = <1>;
+      ranges;
+
+      gpio0: gpio@2000a000 {
+        compatible = "rockchip,rk3188-gpio-bank0";
+        reg = <0x2000a000 0x100>;
+        interrupts = <GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>;
+        clocks = <&clk_gates8 9>;
+
+        gpio-controller;
+        #gpio-cells = <2>;
+
+        interrupt-controller;
+        #interrupt-cells = <2>;
+      };
+
+      gpio1: gpio@2003c000 {
+        compatible = "rockchip,gpio-bank";
+        reg = <0x2003c000 0x100>;
+        interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
+        clocks = <&clk_gates8 10>;
+
+        gpio-controller;
+        #gpio-cells = <2>;
+
+        interrupt-controller;
+        #interrupt-cells = <2>;
+      };
+    };
diff --git a/Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt b/Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt
deleted file mode 100644 (file)
index 9bb1240..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-SIRF Hardware spinlock device Binding
------------------------------------------------
-
-Required properties :
-- compatible : shall contain only one of the following:
-       "sirf,hwspinlock"
-
-- reg : the register address of hwspinlock
-
-- #hwlock-cells : hwlock users only use the hwlock id to represent a specific
-       hwlock, so the number of cells should be <1> here.
-
-Please look at the generic hwlock binding for usage information for consumers,
-"Documentation/devicetree/bindings/hwlock/hwlock.txt"
-
-Example of hwlock provider:
-       hwlock {
-               compatible = "sirf,hwspinlock";
-               reg = <0x13240000 0x00010000>;
-               #hwlock-cells = <1>;
-       };
-
-Example of hwlock users:
-       node {
-               ...
-               hwlocks = <&hwlock 2>;
-               ...
-       };
index adb5165..62f3ca6 100644 (file)
@@ -49,7 +49,7 @@ additionalProperties: true
 examples:
   - |
     i3c-master@a0000000 {
-        compatible = "silvaco,i3c-master";
+        compatible = "silvaco,i3c-master-v1";
         clocks = <&zynqmp_clk 71>, <&fclk>, <&sclk>;
         clock-names = "pclk", "fast_clk", "slow_clk";
         interrupt-parent = <&gic>;
index 84f1a1b..be31cf0 100644 (file)
@@ -1,7 +1,7 @@
 Hisilicon RoCE DT description
 
 Hisilicon RoCE engine is a part of network subsystem.
-It works depending on other part of network wubsytem, such as, gmac and
+It works depending on other part of network subsystem, such as gmac and
 dsa fabric.
 
 Additional properties are described here:
index 8c6418f..3ec579d 100644 (file)
@@ -39,6 +39,13 @@ properties:
       (active low). The line must be flagged with
       GPIO_ACTIVE_LOW.
 
+  wake-gpios:
+    maxItems: 1
+    description:
+      Optional GPIO specifier for the touchscreen's wake pin
+      (active low). The line must be flagged with
+      GPIO_ACTIVE_LOW.
+
   linux,gpio-keymap:
     $ref: /schemas/types.yaml#/definitions/uint32-array
     description: |
@@ -53,6 +60,29 @@ properties:
       or experiment to determine which bit corresponds to which input. Use
       KEY_RESERVED for unused padding values.
 
+  atmel,wakeup-method:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: |
+      The WAKE line is an active-low input that is used to wake up the touch
+      controller from deep-sleep mode before communication with the controller
+      can be started. This optional feature is used to minimize current
+      consumption when the controller is in deep-sleep mode. This feature is
+      relevant only to some controller families, such as the mXT1386
+      controller.
+
+      The WAKE pin can be connected in one of the following ways:
+       1) left permanently low
+       2) connected to the I2C-compatible SCL pin
+       3) connected to a GPIO pin on the host
+    enum:
+      - 0 # ATMEL_MXT_WAKEUP_NONE
+      - 1 # ATMEL_MXT_WAKEUP_I2C_SCL
+      - 2 # ATMEL_MXT_WAKEUP_GPIO
+    default: 0
+
+  wakeup-source:
+    type: boolean
+
 required:
   - compatible
   - reg
@@ -63,6 +93,7 @@ additionalProperties: false
 examples:
   - |
     #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/input/atmel-maxtouch.h>
     #include <dt-bindings/gpio/gpio.h>
     i2c {
       #address-cells = <1>;
@@ -75,6 +106,7 @@ examples:
         reset-gpios = <&gpio 27 GPIO_ACTIVE_LOW>;
         vdda-supply = <&ab8500_ldo_aux2_reg>;
         vdd-supply = <&ab8500_ldo_aux5_reg>;
+        atmel,wakeup-method = <ATMEL_MXT_WAKEUP_I2C_SCL>;
       };
     };
 
diff --git a/Documentation/devicetree/bindings/input/iqs626a.yaml b/Documentation/devicetree/bindings/input/iqs626a.yaml
new file mode 100644 (file)
index 0000000..0cb736c
--- /dev/null
@@ -0,0 +1,843 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/iqs626a.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS626A Capacitive Touch Controller
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS626A is a 14-channel capacitive touch controller that features
+  additional Hall-effect and inductive sensing capabilities.
+
+  Link to datasheet: https://www.azoteq.com/
+
+allOf:
+  - $ref: touchscreen/touchscreen.yaml#
+
+properties:
+  compatible:
+    const: azoteq,iqs626a
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+  azoteq,suspend-mode:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3]
+    default: 0
+    description: |
+      Specifies the power mode during suspend as follows:
+      0: Automatic (same as normal runtime, i.e. suspend/resume disabled)
+      1: Low power (all sensing at a reduced reporting rate)
+      2: Ultra-low power (ULP channel proximity sensing)
+      3: Halt (no sensing)
+
+  azoteq,clk-div:
+    type: boolean
+    description: Divides the device's core clock by a factor of 4.
+
+  azoteq,ulp-enable:
+    type: boolean
+    description:
+      Permits the device to automatically enter ultra-low-power mode from low-
+      power mode.
+
+  azoteq,ulp-update:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3, 4, 5, 6, 7]
+    default: 3
+    description: |
+      Specifies the rate at which the trackpad, generic and Hall channels are
+      updated during ultra-low-power mode as follows:
+      0: 8
+      1: 13
+      2: 28
+      3: 54
+      4: 89
+      5: 135
+      6: 190
+      7: 256
+
+  azoteq,ati-band-disable:
+    type: boolean
+    description: Disables the ATI band check.
+
+  azoteq,ati-lp-only:
+    type: boolean
+    description: Limits automatic ATI to low-power mode.
+
+  azoteq,gpio3-select:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3, 4, 5, 6, 7]
+    default: 1
+    description: |
+      Selects the channel or group of channels for which the GPIO3 pin
+      represents touch state as follows:
+      0: None
+      1: ULP channel
+      2: Trackpad
+      3: Trackpad
+      4: Generic channel 0
+      5: Generic channel 1
+      6: Generic channel 2
+      7: Hall channel
+
+  azoteq,reseed-select:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    enum: [0, 1, 2, 3]
+    default: 0
+    description: |
+      Specifies the event(s) that prompt the device to reseed (i.e. reset the
+      long-term average of) an associated channel as follows:
+      0: None
+      1: Proximity
+      2: Proximity or touch
+      3: Proximity, touch or deep touch
+
+  azoteq,thresh-extend:
+    type: boolean
+    description: Multiplies all touch and deep-touch thresholds by 4.
+
+  azoteq,tracking-enable:
+    type: boolean
+    description:
+      Enables all associated channels to track their respective reference
+      channels.
+
+  azoteq,reseed-offset:
+    type: boolean
+    description:
+      Applies an 8-count offset to all long-term averages upon either ATI or
+      reseed events.
+
+  azoteq,rate-np-ms:
+    minimum: 0
+    maximum: 255
+    default: 150
+    description: Specifies the report rate (in ms) during normal-power mode.
+
+  azoteq,rate-lp-ms:
+    minimum: 0
+    maximum: 255
+    default: 150
+    description: Specifies the report rate (in ms) during low-power mode.
+
+  azoteq,rate-ulp-ms:
+    multipleOf: 16
+    minimum: 0
+    maximum: 4080
+    default: 0
+    description: Specifies the report rate (in ms) during ultra-low-power mode.
+
+  azoteq,timeout-pwr-ms:
+    multipleOf: 512
+    minimum: 0
+    maximum: 130560
+    default: 2560
+    description:
+      Specifies the length of time (in ms) to wait for an event before moving
+      from normal-power mode to low-power mode, or (if 'azoteq,ulp-enable' is
+      present) from low-power mode to ultra-low-power mode.
+
+  azoteq,timeout-lta-ms:
+    multipleOf: 512
+    minimum: 0
+    maximum: 130560
+    default: 40960
+    description:
+      Specifies the length of time (in ms) to wait before resetting the long-
+      term average of all channels. Specify the maximum timeout to disable it
+      altogether.
+
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+
+patternProperties:
+  "^ulp-0|generic-[0-2]|hall$":
+    type: object
+    description:
+      Represents a single sensing channel. A channel is active if defined and
+      inactive otherwise.
+
+    properties:
+      azoteq,ati-exclude:
+        type: boolean
+        description:
+          Prevents the channel from participating in an ATI event that is
+          manually triggered during initialization.
+
+      azoteq,reseed-disable:
+        type: boolean
+        description:
+          Prevents the channel from being reseeded if the long-term average
+          timeout (defined in 'azoteq,timeout-lta-ms') expires.
+
+      azoteq,meas-cap-decrease:
+        type: boolean
+        description:
+          Decreases the internal measurement capacitance from 60 pF to 15 pF.
+
+      azoteq,rx-inactive:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2]
+        default: 0
+        description: |
+          Specifies how inactive CRX pins are to be terminated as follows:
+          0: VSS
+          1: Floating
+          2: VREG (generic channels only)
+
+      azoteq,linearize:
+        type: boolean
+        description:
+          Enables linearization of the channel's counts (generic and Hall
+          channels) or inverts the polarity of the channel's proximity or
+          touch states (ULP channel).
+
+      azoteq,dual-direction:
+        type: boolean
+        description:
+          Specifies that the channel's long-term average is to freeze in the
+          presence of either increasing or decreasing counts, thereby permit-
+          ting events to be reported in either direction.
+
+      azoteq,filt-disable:
+        type: boolean
+        description: Disables raw count filtering for the channel.
+
+      azoteq,ati-mode:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        description: |
+          Specifies the channel's ATI mode as follows:
+          0: Disabled
+          1: Semi-partial
+          2: Partial
+          3: Full
+
+          The default value is a function of the channel and the device's reset
+          user interface (RUI); reference the datasheet for further information
+          about the available RUI options.
+
+      azoteq,ati-base:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [75, 100, 150, 200]
+        description:
+          Specifies the channel's ATI base. The default value is a function
+          of the channel and the device's RUI.
+
+      azoteq,ati-target:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        multipleOf: 32
+        minimum: 0
+        maximum: 2016
+        description:
+          Specifies the channel's ATI target. The default value is a function
+          of the channel and the device's RUI.
+
+      azoteq,cct-increase:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 16
+        default: 0
+        description:
+          Specifies the degree to which the channel's charge cycle time is to
+          be increased, with 0 representing no increase. The maximum value is
+          limited to 4 in the case of the ULP channel, and the property is un-
+          available entirely in the case of the Hall channel.
+
+      azoteq,proj-bias:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the bias current applied during projected-capacitance
+          sensing as follows:
+          0: 2.5 uA
+          1: 5 uA
+          2: 10 uA
+          3: 20 uA
+
+          This property is unavailable in the case of the Hall channel.
+
+      azoteq,sense-freq:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        description: |
+          Specifies the channel's sensing frequency as follows (parenthesized
+          numbers represent the frequency if 'azoteq,clk-div' is present):
+          0: 4 MHz (1 MHz)
+          1: 2 MHz (500 kHz)
+          2: 1 MHz (250 kHz)
+          3: 500 kHz (125 kHz)
+
+          This property is unavailable in the case of the Hall channel. The
+          default value is a function of the channel and the device's RUI.
+
+      azoteq,ati-band-tighten:
+        type: boolean
+        description:
+          Tightens the ATI band from 1/8 to 1/16 of the desired target (ULP and
+          generic channels only).
+
+      azoteq,proj-enable:
+        type: boolean
+        description: Enables projected-capacitance sensing (ULP channel only).
+
+      azoteq,filt-str-np-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during normal-power mode (ULP
+          and generic channels only).
+
+      azoteq,filt-str-lp-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during low-power mode (ULP and
+          generic channels only).
+
+      azoteq,filt-str-np-lta:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the long-term average filter strength during normal-power
+          mode (ULP and generic channels only).
+
+      azoteq,filt-str-lp-lta:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the long-term average filter strength during low-power mode
+          (ULP and generic channels only).
+
+      azoteq,rx-enable:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 8
+        items:
+          minimum: 0
+          maximum: 7
+        description:
+          Specifies the CRX pin(s) associated with the channel.
+
+          This property is unavailable in the case of the Hall channel. The
+          default value is a function of the channel and the device's RUI.
+
+      azoteq,tx-enable:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 8
+        items:
+          minimum: 0
+          maximum: 7
+        description:
+          Specifies the TX pin(s) associated with the channel.
+
+          This property is unavailable in the case of the Hall channel. The
+          default value is a function of the channel and the device's RUI.
+
+      azoteq,local-cap-size:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3, 4]
+        default: 0
+        description: |
+          Specifies the capacitance to be added to the channel as follows:
+          0: 0 pF
+          1: 0.5 pF
+          2: 1.0 pF
+          3: 1.5 pF
+          4: 2.0 pF
+
+          This property is unavailable in the case of the ULP or Hall channels.
+
+      azoteq,sense-mode:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 8, 9, 12, 14, 15]
+        description: |
+          Specifies the channel's sensing mode as follows:
+          0:  Self capacitance
+          1:  Projected capacitance
+          8:  Self inductance
+          9:  Mutual inductance
+          12: External
+          14: Hall effect
+          15: Temperature
+
+          This property is unavailable in the case of the ULP or Hall channels.
+          The default value is a function of the channel and the device's RUI.
+
+      azoteq,tx-freq:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the inductive sensing excitation frequency as follows
+          (parenthesized numbers represent the frequency if 'azoteq,clk-div'
+          is present):
+          0: 16 MHz (4 MHz)
+          1: 8 MHz (2 MHz)
+          2: 4 MHz (1 MHz)
+          3: 2 MHz (500 kHz)
+
+          This property is unavailable in the case of the ULP or Hall channels.
+
+      azoteq,invert-enable:
+        type: boolean
+        description:
+          Inverts the polarity of the states reported for proximity, touch and
+          deep-touch events relative to their respective thresholds (generic
+          channels only).
+
+      azoteq,comp-disable:
+        type: boolean
+        description:
+          Disables compensation for the channel (generic channels only).
+
+      azoteq,static-enable:
+        type: boolean
+        description:
+          Enables the static front-end for the channel (generic channels only).
+
+      azoteq,assoc-select:
+        $ref: /schemas/types.yaml#/definitions/string-array
+        minItems: 1
+        maxItems: 6
+        items:
+          enum:
+            - ulp-0
+            - trackpad-3x2
+            - trackpad-3x3
+            - generic-0
+            - generic-1
+            - generic-2
+            - hall
+        description:
+          Specifies the associated channels for which the channel serves as a
+          reference channel. By default, no channels are selected. This prop-
+          erty is only available for the generic channels.
+
+      azoteq,assoc-weight:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 255
+        default: 0
+        description:
+          Specifies the channel's impact weight if it acts as an associated
+          channel (0 = 0% impact, 255 = 200% impact). This property is only
+          available for the generic channels.
+
+    patternProperties:
+      "^event-(prox|touch|deep)(-alt)?$":
+        type: object
+        description:
+          Represents a proximity, touch or deep-touch event reported by the
+          channel in response to a decrease in counts. Node names suffixed with
+          '-alt' instead correspond to an increase in counts.
+
+          By default, the long-term average tracks an increase in counts such
+          that only events corresponding to a decrease in counts are reported
+          (refer to the datasheet for more information).
+
+          Specify 'azoteq,dual-direction' to freeze the long-term average when
+          the counts increase or decrease such that events of either direction
+          can be reported. Alternatively, specify 'azoteq,invert-enable' to in-
+          vert the polarity of the states reported by the channel.
+
+          Complementary events (e.g. event-touch and event-touch-alt) can both
+          be present and specify different key or switch codes, but not differ-
+          ent thresholds or hysteresis (if applicable).
+
+          Proximity events are unavailable in the case of the Hall channel, and
+          deep-touch events are only available for the generic channels. Unless
+          otherwise specified, default values are a function of the channel and
+          the device's RUI.
+
+        properties:
+          azoteq,thresh:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            minimum: 0
+            maximum: 255
+            description: Specifies the threshold for the event.
+
+          azoteq,hyst:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            minimum: 0
+            maximum: 15
+            description:
+              Specifies the hysteresis for the event (touch and deep-touch
+              events only).
+
+          linux,code:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            description: Numeric key or switch code associated with the event.
+
+          linux,input-type:
+            $ref: /schemas/types.yaml#/definitions/uint32
+            enum: [1, 5]
+            description:
+              Specifies whether the event is to be interpreted as a key (1) or
+              a switch (5). By default, Hall-channel events are interpreted as
+              switches and all others are interpreted as keys.
+
+        dependencies:
+          linux,input-type: ["linux,code"]
+
+        additionalProperties: false
+
+    dependencies:
+      azoteq,assoc-weight: ["azoteq,assoc-select"]
+
+    additionalProperties: false
+
+  "^trackpad-3x[2-3]$":
+    type: object
+    description:
+      Represents all channels associated with the trackpad. The channels are
+      collectively active if the trackpad is defined and inactive otherwise.
+
+    properties:
+      azoteq,ati-exclude:
+        type: boolean
+        description:
+          Prevents the trackpad channels from participating in an ATI event
+          that is manually triggered during initialization.
+
+      azoteq,reseed-disable:
+        type: boolean
+        description:
+          Prevents the trackpad channels from being reseeded if the long-term
+          average timeout (defined in 'azoteq,timeout-lta-ms') expires.
+
+      azoteq,meas-cap-decrease:
+        type: boolean
+        description:
+          Decreases the internal measurement capacitance from 60 pF to 15 pF.
+
+      azoteq,rx-inactive:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1]
+        default: 0
+        description: |
+          Specifies how inactive CRX pins are to be terminated as follows:
+          0: VSS
+          1: Floating
+
+      azoteq,linearize:
+        type: boolean
+        description: Inverts the polarity of the trackpad's touch state.
+
+      azoteq,dual-direction:
+        type: boolean
+        description:
+          Specifies that the trackpad's long-term averages are to freeze in
+          the presence of either increasing or decreasing counts, thereby
+          permitting events to be reported in either direction.
+
+      azoteq,filt-disable:
+        type: boolean
+        description: Disables raw count filtering for the trackpad channels.
+
+      azoteq,ati-mode:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the trackpad's ATI mode as follows:
+          0: Disabled
+          1: Semi-partial
+          2: Partial
+          3: Full
+
+      azoteq,ati-base:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 6
+        maxItems: 9
+        items:
+          minimum: 45
+          maximum: 300
+        default: [45, 45, 45, 45, 45, 45, 45, 45, 45]
+        description: Specifies each individual trackpad channel's ATI base.
+
+      azoteq,ati-target:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        multipleOf: 32
+        minimum: 0
+        maximum: 2016
+        default: 0
+        description: Specifies the trackpad's ATI target.
+
+      azoteq,cct-increase:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 4
+        default: 0
+        description:
+          Specifies the degree to which the trackpad's charge cycle time is to
+          be increased, with 0 representing no increase.
+
+      azoteq,proj-bias:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the bias current applied during projected-capacitance
+          sensing as follows:
+          0: 2.5 uA
+          1: 5 uA
+          2: 10 uA
+          3: 20 uA
+
+      azoteq,sense-freq:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: |
+          Specifies the trackpad's sensing frequency as follows (parenthesized
+          numbers represent the frequency if 'azoteq,clk-div' is present):
+          0: 4 MHz (1 MHz)
+          1: 2 MHz (500 kHz)
+          2: 1 MHz (250 kHz)
+          3: 500 kHz (125 kHz)
+
+      azoteq,ati-band-tighten:
+        type: boolean
+        description:
+          Tightens the ATI band from 1/8 to 1/16 of the desired target.
+
+      azoteq,thresh:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 6
+        maxItems: 9
+        items:
+          minimum: 0
+          maximum: 255
+        default: [0, 0, 0, 0, 0, 0, 0, 0, 0]
+        description:
+          Specifies each individual trackpad channel's touch threshold.
+
+      azoteq,hyst:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 15
+        default: 0
+        description: Specifies the trackpad's touch hysteresis.
+
+      azoteq,lta-update:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3, 4, 5, 6, 7]
+        default: 0
+        description: |
+          Specifies the update rate of the trackpad's long-term average during
+          ultra-low-power mode as follows:
+          0: 2
+          1: 4
+          2: 8
+          3: 16
+          4: 32
+          5: 64
+          6: 128
+          7: 255
+
+      azoteq,filt-str-trackpad:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description: Specifies the trackpad coordinate filter strength.
+
+      azoteq,filt-str-np-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during normal-power mode.
+
+      azoteq,filt-str-lp-cnt:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        enum: [0, 1, 2, 3]
+        default: 0
+        description:
+          Specifies the raw count filter strength during low-power mode.
+
+      linux,keycodes:
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 6
+        description: |
+          Specifies the numeric keycodes associated with each available gesture
+          in the following order (enter 0 for unused gestures):
+          0: Positive flick or swipe in X direction
+          1: Negative flick or swipe in X direction
+          2: Positive flick or swipe in Y direction
+          3: Negative flick or swipe in Y direction
+          4: Tap
+          5: Hold
+
+      azoteq,gesture-swipe:
+        type: boolean
+        description:
+          Directs the device to interpret axial gestures as a swipe (finger
+          remains on trackpad) instead of a flick (finger leaves trackpad).
+
+      azoteq,timeout-tap-ms:
+        multipleOf: 16
+        minimum: 0
+        maximum: 4080
+        default: 0
+        description:
+          Specifies the length of time (in ms) within which a trackpad touch
+          must be released in order to be interpreted as a tap.
+
+      azoteq,timeout-swipe-ms:
+        multipleOf: 16
+        minimum: 0
+        maximum: 4080
+        default: 0
+        description:
+          Specifies the length of time (in ms) within which an axial gesture
+          must be completed in order to be interpreted as a flick or swipe.
+
+      azoteq,thresh-swipe:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        minimum: 0
+        maximum: 255
+        default: 0
+        description:
+          Specifies the number of points across which an axial gesture must
+          travel in order to be interpreted as a flick or swipe.
+
+    dependencies:
+      azoteq,gesture-swipe: ["linux,keycodes"]
+      azoteq,timeout-tap-ms: ["linux,keycodes"]
+      azoteq,timeout-swipe-ms: ["linux,keycodes"]
+      azoteq,thresh-swipe: ["linux,keycodes"]
+
+    additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - "#address-cells"
+  - "#size-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/input/input.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            iqs626a@44 {
+                    #address-cells = <1>;
+                    #size-cells = <0>;
+
+                    compatible = "azoteq,iqs626a";
+                    reg = <0x44>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+                    azoteq,rate-np-ms = <16>;
+                    azoteq,rate-lp-ms = <160>;
+
+                    azoteq,timeout-pwr-ms = <2560>;
+                    azoteq,timeout-lta-ms = <32768>;
+
+                    ulp-0 {
+                            azoteq,meas-cap-decrease;
+
+                            azoteq,ati-base = <75>;
+                            azoteq,ati-target = <1024>;
+
+                            azoteq,rx-enable = <2>, <3>, <4>,
+                                               <5>, <6>, <7>;
+
+                            event-prox {
+                                    linux,code = <KEY_POWER>;
+                            };
+                    };
+
+                    trackpad-3x3 {
+                            azoteq,filt-str-np-cnt = <1>;
+                            azoteq,filt-str-lp-cnt = <1>;
+
+                            azoteq,hyst = <4>;
+                            azoteq,thresh = <35>, <40>, <40>,
+                                            <38>, <33>, <38>,
+                                            <35>, <35>, <35>;
+
+                            azoteq,ati-mode = <3>;
+                            azoteq,ati-base = <195>, <195>, <195>,
+                                              <195>, <195>, <195>,
+                                              <195>, <195>, <195>;
+                            azoteq,ati-target = <512>;
+
+                            azoteq,proj-bias = <1>;
+                            azoteq,sense-freq = <2>;
+
+                            linux,keycodes = <KEY_VOLUMEUP>,
+                                             <KEY_VOLUMEDOWN>,
+                                             <KEY_NEXTSONG>,
+                                             <KEY_PREVIOUSSONG>,
+                                             <KEY_PLAYPAUSE>,
+                                             <KEY_STOPCD>;
+
+                            azoteq,gesture-swipe;
+                            azoteq,timeout-swipe-ms = <800>;
+                            azoteq,timeout-tap-ms = <400>;
+                            azoteq,thresh-swipe = <40>;
+                    };
+
+                    /*
+                     * Preserve the default register settings for
+                     * the temperature-tracking channel leveraged
+                     * by reset user interface (RUI) 1.
+                     *
+                     * Scalar properties (e.g. ATI mode) are left
+                     * untouched by simply omitting them; boolean
+                     * properties must be specified explicitly as
+                     * needed.
+                     */
+                    generic-2 {
+                            azoteq,reseed-disable;
+                            azoteq,meas-cap-decrease;
+                            azoteq,dual-direction;
+                            azoteq,comp-disable;
+                            azoteq,static-enable;
+                    };
+
+                    hall {
+                            azoteq,reseed-disable;
+                            azoteq,meas-cap-decrease;
+
+                            event-touch {
+                                    linux,code = <SW_LID>;
+                            };
+                    };
+            };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml b/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml
new file mode 100644 (file)
index 0000000..b5f3772
--- /dev/null
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/azoteq,iqs5xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS550/572/525 Trackpad/Touchscreen Controller
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS550, IQS572 and IQS525 trackpad and touchscreen controllers
+  employ projected-capacitance sensing and can track up to five independent
+  contacts.
+
+  Link to datasheet: https://www.azoteq.com/
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - azoteq,iqs550
+      - azoteq,iqs572
+      - azoteq,iqs525
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  wakeup-source: true
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            touchscreen@74 {
+                    compatible = "azoteq,iqs550";
+                    reg = <0x74>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+                    reset-gpios = <&gpio 22 (GPIO_ACTIVE_LOW |
+                                             GPIO_PUSH_PULL)>;
+
+                    touchscreen-size-x = <800>;
+                    touchscreen-size-y = <480>;
+            };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml b/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml
new file mode 100644 (file)
index 0000000..942562f
--- /dev/null
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/hycon,hy46xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Hycon HY46XX series touchscreen controller bindings
+
+description: |
+  There are 6 variants of the chip for various touch panel sizes and cover lens material
+   Glass: 0.3mm--4.0mm
+    PET/PMMA: 0.2mm--2.0mm
+    HY4613(B)-N048  < 6"
+    HY4614(B)-N068  7" .. 10.1"
+    HY4621-NS32  < 5"
+    HY4623-NS48  5.1" .. 7"
+   Glass: 0.3mm--8.0mm
+    PET/PMMA: 0.2mm--4.0mm
+    HY4633(B)-N048  < 6"
+    HY4635(B)-N048  < 7" .. 10.1"
+
+maintainers:
+  - Giulio Benetti <giulio.benetti@benettiengineering.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - hycon,hy4613
+      - hycon,hy4614
+      - hycon,hy4621
+      - hycon,hy4623
+      - hycon,hy4633
+      - hycon,hy4635
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  vcc-supply: true
+
+  hycon,threshold:
+    description: Allows setting the sensitivity in the range from 0 to 255.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 255
+
+  hycon,glove-enable:
+    type: boolean
+    description: Allows enabling glove setting.
+
+  hycon,report-speed-hz:
+    description: Allows setting the report speed in Hertz.
+    minimum: 1
+    maximum: 255
+
+  hycon,noise-filter-enable:
+    type: boolean
+    description: Allows enabling power noise filter.
+
+  hycon,filter-data:
+    description: Allows setting how many samples to discard before reporting
+                 a touch, in the range from 0 to 5.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 5
+
+  hycon,gain:
+    description: Allows setting the sensitivity distance in the range from 0 to 5.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 5
+
+  hycon,edge-offset:
+    description: Allows setting the edge compensation in the range from 0 to 16.
+    $ref: /schemas/types.yaml#/definitions/uint32
+    minimum: 0
+    maximum: 16
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-fuzz-x: true
+  touchscreen-fuzz-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+  interrupt-controller: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      touchscreen@1c {
+        compatible = "hycon,hy4633";
+        reg = <0x1c>;
+        interrupt-parent = <&gpio2>;
+        interrupts = <5 IRQ_TYPE_EDGE_FALLING>;
+        reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>;
+      };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml b/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml
new file mode 100644 (file)
index 0000000..a190e7b
--- /dev/null
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/ilitek_ts_i2c.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ilitek I2C Touchscreen Controller
+
+maintainers:
+  - Dmitry Torokhov <dmitry.torokhov@gmail.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - ilitek,ili2130
+      - ilitek,ili2131
+      - ilitek,ili2132
+      - ilitek,ili2316
+      - ilitek,ili2322
+      - ilitek,ili2323
+      - ilitek,ili2326
+      - ilitek,ili2520
+      - ilitek,ili2521
+
+  reg:
+    const: 0x41
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  wakeup-source:
+    type: boolean
+    description: touchscreen can be used as a wakeup source.
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - reset-gpios
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/gpio/gpio.h>
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        touchscreen@41 {
+            compatible = "ilitek,ili2520";
+            reg = <0x41>;
+
+            interrupt-parent = <&gpio1>;
+            interrupts = <7 IRQ_TYPE_LEVEL_LOW>;
+            reset-gpios = <&gpio1 8 GPIO_ACTIVE_LOW>;
+            touchscreen-inverted-y;
+            wakeup-source;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt b/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt
deleted file mode 100644 (file)
index efa0820..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-Azoteq IQS550/572/525 Trackpad/Touchscreen Controller
-
-Required properties:
-
-- compatible                   : Must be equal to one of the following:
-                                 "azoteq,iqs550"
-                                 "azoteq,iqs572"
-                                 "azoteq,iqs525"
-
-- reg                          : I2C slave address for the device.
-
-- interrupts                   : GPIO to which the device's active-high RDY
-                                 output is connected (see [0]).
-
-- reset-gpios                  : GPIO to which the device's active-low NRST
-                                 input is connected (see [1]).
-
-Optional properties:
-
-- touchscreen-min-x            : See [2].
-
-- touchscreen-min-y            : See [2].
-
-- touchscreen-size-x           : See [2]. If this property is omitted, the
-                                 maximum x-coordinate is specified by the
-                                 device's "X Resolution" register.
-
-- touchscreen-size-y           : See [2]. If this property is omitted, the
-                                 maximum y-coordinate is specified by the
-                                 device's "Y Resolution" register.
-
-- touchscreen-max-pressure     : See [2]. Pressure is expressed as the sum of
-                                 the deltas across all channels impacted by a
-                                 touch event. A channel's delta is calculated
-                                 as its count value minus a reference, where
-                                 the count value is inversely proportional to
-                                 the channel's capacitance.
-
-- touchscreen-fuzz-x           : See [2].
-
-- touchscreen-fuzz-y           : See [2].
-
-- touchscreen-fuzz-pressure    : See [2].
-
-- touchscreen-inverted-x       : See [2]. Inversion is applied relative to that
-                                 which may already be specified by the device's
-                                 FLIP_X and FLIP_Y register fields.
-
-- touchscreen-inverted-y       : See [2]. Inversion is applied relative to that
-                                 which may already be specified by the device's
-                                 FLIP_X and FLIP_Y register fields.
-
-- touchscreen-swapped-x-y      : See [2]. Swapping is applied relative to that
-                                 which may already be specified by the device's
-                                 SWITCH_XY_AXIS register field.
-
-[0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-[1]: Documentation/devicetree/bindings/gpio/gpio.txt
-[2]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt
-
-Example:
-
-       &i2c1 {
-               /* ... */
-
-               touchscreen@74 {
-                       compatible = "azoteq,iqs550";
-                       reg = <0x74>;
-                       interrupt-parent = <&gpio>;
-                       interrupts = <17 4>;
-                       reset-gpios = <&gpio 27 1>;
-
-                       touchscreen-size-x = <640>;
-                       touchscreen-size-y = <480>;
-
-                       touchscreen-max-pressure = <16000>;
-               };
-
-               /* ... */
-       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml b/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml
new file mode 100644 (file)
index 0000000..6236688
--- /dev/null
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/melfas,mms114.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Melfas MMS114 family touchscreen controller bindings
+
+maintainers:
+  - Linus Walleij <linus.walleij@linaro.org>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  $nodename:
+    pattern: "^touchscreen(@.*)?$"
+
+  compatible:
+    items:
+      - enum:
+          - melfas,mms114
+          - melfas,mms134s
+          - melfas,mms136
+          - melfas,mms152
+          - melfas,mms345l
+
+  reg:
+    description: I2C address
+
+  clock-frequency:
+    description: I2C client clock frequency, defined for host
+    minimum: 100000
+    maximum: 400000
+
+  interrupts:
+    maxItems: 1
+
+  avdd-supply:
+    description: Analog power supply regulator on AVDD pin
+
+  vdd-supply:
+    description: Digital power supply regulator on VDD pin
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-fuzz-x: true
+  touchscreen-fuzz-y: true
+  touchscreen-fuzz-pressure: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+  touchscreen-max-pressure: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - touchscreen-size-x
+  - touchscreen-size-y
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      touchscreen@48 {
+        compatible = "melfas,mms114";
+        reg = <0x48>;
+        interrupt-parent = <&gpio>;
+        interrupts = <39 IRQ_TYPE_EDGE_FALLING>;
+        avdd-supply = <&ldo1_reg>;
+        vdd-supply = <&ldo2_reg>;
+        touchscreen-size-x = <720>;
+        touchscreen-size-y = <1280>;
+        touchscreen-fuzz-x = <10>;
+        touchscreen-fuzz-y = <10>;
+        touchscreen-fuzz-pressure = <10>;
+        touchscreen-inverted-x;
+        touchscreen-inverted-y;
+      };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/mms114.txt b/Documentation/devicetree/bindings/input/touchscreen/mms114.txt
deleted file mode 100644 (file)
index 707234c..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-* MELFAS MMS114/MMS152/MMS345L touchscreen controller
-
-Required properties:
-- compatible: should be one of:
-       - "melfas,mms114"
-       - "melfas,mms152"
-       - "melfas,mms345l"
-- reg: I2C address of the chip
-- interrupts: interrupt to which the chip is connected
-- touchscreen-size-x: See [1]
-- touchscreen-size-y: See [1]
-
-Optional properties:
-- touchscreen-fuzz-x: See [1]
-- touchscreen-fuzz-y: See [1]
-- touchscreen-fuzz-pressure: See [1]
-- touchscreen-inverted-x: See [1]
-- touchscreen-inverted-y: See [1]
-- touchscreen-swapped-x-y: See [1]
-
-[1]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt
-
-Example:
-
-       i2c@00000000 {
-               /* ... */
-
-               touchscreen@48 {
-                       compatible = "melfas,mms114";
-                       reg = <0x48>;
-                       interrupts = <39 0>;
-                       touchscreen-size-x = <720>;
-                       touchscreen-size-y = <1280>;
-                       touchscreen-fuzz-x = <10>;
-                       touchscreen-fuzz-y = <10>;
-                       touchscreen-fuzz-pressure = <10>;
-                       touchscreen-inverted-x;
-                       touchscreen-inverted-y;
-               };
-
-               /* ... */
-       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml b/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml
new file mode 100644 (file)
index 0000000..3a42c23
--- /dev/null
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/mstar,msg2638.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MStar msg2638 touchscreen controller Bindings
+
+maintainers:
+  - Vincent Knecht <vincent.knecht@mailoo.org>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    const: mstar,msg2638
+
+  reg:
+    const: 0x26
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  vdd-supply:
+    description: Power supply regulator for the chip
+
+  vddio-supply:
+    description: Power supply regulator for the I2C bus
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - reset-gpios
+  - touchscreen-size-x
+  - touchscreen-size-y
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+    i2c {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      touchscreen@26 {
+        compatible = "mstar,msg2638";
+        reg = <0x26>;
+        interrupt-parent = <&msmgpio>;
+        interrupts = <13 IRQ_TYPE_EDGE_FALLING>;
+        reset-gpios = <&msmgpio 100 GPIO_ACTIVE_LOW>;
+        pinctrl-names = "default";
+        pinctrl-0 = <&ts_int_reset_default>;
+        vdd-supply = <&pm8916_l17>;
+        vddio-supply = <&pm8916_l5>;
+        touchscreen-size-x = <2048>;
+        touchscreen-size-y = <2048>;
+      };
+    };
+
+...
index df5d8d1..160ff4b 100644 (file)
@@ -22,6 +22,9 @@ properties:
   reg:
     maxItems: 1
 
+  interrupts:
+    maxItems: 1
+
   interrupt-controller: true
 
 required:
@@ -29,6 +32,7 @@ required:
   - compatible
   - reg
   - interrupt-controller
+  - interrupts
 
 additionalProperties: false
 
index 6ba161d..9d27aa5 100644 (file)
@@ -34,6 +34,7 @@ properties:
         items:
           - enum:
               - qcom,sc7180-smmu-500
+              - qcom,sc7280-smmu-500
               - qcom,sc8180x-smmu-500
               - qcom,sdm845-smmu-500
               - qcom,sm8150-smmu-500
diff --git a/Documentation/devicetree/bindings/iommu/sprd,iommu.yaml b/Documentation/devicetree/bindings/iommu/sprd,iommu.yaml
new file mode 100644 (file)
index 0000000..7003e12
--- /dev/null
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2020 Unisoc Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iommu/sprd,iommu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc IOMMU and Multi-media MMU
+
+maintainers:
+  - Chunyan Zhang <zhang.lyra@gmail.com>
+
+properties:
+  compatible:
+    enum:
+      - sprd,iommu-v1
+
+  "#iommu-cells":
+    const: 0
+    description:
+      Unisoc IOMMUs are all single-master IOMMU devices, therefore no
+      additional information needs to be associated with the master device.
+      Please refer to the generic bindings document for more details,
+      Documentation/devicetree/bindings/iommu/iommu.txt
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    description:
+      Reference to a gate clock phandle. Optional, since access to only
+      some of the IOMMUs is controlled by a gate clock.
+
+required:
+  - compatible
+  - reg
+  - "#iommu-cells"
+
+additionalProperties: false
+
+examples:
+  - |
+    iommu_disp: iommu@63000800 {
+      compatible = "sprd,iommu-v1";
+      reg = <0x63000800 0x80>;
+      #iommu-cells = <0>;
+    };
+
+  - |
+    iommu_jpg: iommu@62300300 {
+      compatible = "sprd,iommu-v1";
+      reg = <0x62300300 0x80>;
+      #iommu-cells = <0>;
+      clocks = <&mm_gate 1>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/leds/leds-rt4505.yaml b/Documentation/devicetree/bindings/leds/leds-rt4505.yaml
new file mode 100644 (file)
index 0000000..5b0c74a
--- /dev/null
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/leds/leds-rt4505.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Richtek RT4505 Single Channel LED Driver
+
+maintainers:
+  - ChiYuan Huang <cy_huang@richtek.com>
+
+description: |
+  The RT4505 is a flash LED driver that can support up to 375mA and 1.5A for
+  torch and flash mode, respectively.
+
+  The data sheet can be found at:
+    https://www.richtek.com/assets/product_file/RT4505/DS4505-02.pdf
+
+properties:
+  compatible:
+    const: richtek,rt4505
+
+  reg:
+    description: I2C slave address of the controller.
+    maxItems: 1
+
+  led:
+    type: object
+    $ref: common.yaml#
+
+required:
+  - compatible
+  - reg
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/leds/common.h>
+
+    i2c0 {
+      #address-cells = <1>;
+      #size-cells = <0>;
+
+      led-controller@63 {
+        compatible = "richtek,rt4505";
+        reg = <0x63>;
+
+        rt4505_flash: led {
+          function = LED_FUNCTION_FLASH;
+          color = <LED_COLOR_ID_WHITE>;
+          led-max-microamp = <375000>;
+          flash-max-microamp = <1500000>;
+          flash-max-timeout-us = <800000>;
+        };
+      };
+    };
index fe7c4cb..dd1a5ce 100644 (file)
@@ -193,23 +193,35 @@ required:
   - interrupts
   - clocks
   - power-domains
-  - resets
-
-if:
-  properties:
-    compatible:
-      contains:
-        enum:
-          - renesas,vin-r8a7778
-          - renesas,vin-r8a7779
-          - renesas,rcar-gen2-vin
-then:
-  required:
-    - port
-else:
-  required:
-    - renesas,id
-    - ports
+
+allOf:
+  - if:
+      not:
+        properties:
+          compatible:
+            contains:
+              enum:
+                - renesas,vin-r8a7778
+                - renesas,vin-r8a7779
+    then:
+      required:
+        - resets
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - renesas,vin-r8a7778
+              - renesas,vin-r8a7779
+              - renesas,rcar-gen2-vin
+    then:
+      required:
+        - port
+    else:
+      required:
+        - renesas,id
+        - ports
 
 additionalProperties: false
 
diff --git a/Documentation/devicetree/bindings/mtd/tango-nand.txt b/Documentation/devicetree/bindings/mtd/tango-nand.txt
deleted file mode 100644 (file)
index 91c8420..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-Sigma Designs Tango4 NAND Flash Controller (NFC)
-
-Required properties:
-
-- compatible: "sigma,smp8758-nand"
-- reg: address/size of nfc_reg, nfc_mem, and pbus_reg
-- dmas: reference to the DMA channel used by the controller
-- dma-names: "rxtx"
-- clocks: reference to the system clock
-- #address-cells: <1>
-- #size-cells: <0>
-
-Children nodes represent the available NAND chips.
-See Documentation/devicetree/bindings/mtd/nand-controller.yaml for generic bindings.
-
-Example:
-
-       nandc: nand-controller@2c000 {
-               compatible = "sigma,smp8758-nand";
-               reg = <0x2c000 0x30>, <0x2d000 0x800>, <0x20000 0x1000>;
-               dmas = <&dma0 3>;
-               dma-names = "rxtx";
-               clocks = <&clkgen SYS_CLK>;
-               #address-cells = <1>;
-               #size-cells = <0>;
-
-               nand@0 {
-                       reg = <0>; /* CS0 */
-                       nand-ecc-strength = <14>;
-                       nand-ecc-step-size = <1024>;
-               };
-
-               nand@1 {
-                       reg = <1>; /* CS1 */
-                       nand-ecc-strength = <14>;
-                       nand-ecc-step-size = <1024>;
-               };
-       };
index fe72a55..005868f 100644 (file)
@@ -51,12 +51,12 @@ properties:
 
   clocks:
     minItems: 1
-    maxItems: 2
     items:
       - description: AVB functional clock
       - description: Optional TXC reference clock
 
   clock-names:
+    minItems: 1
     items:
       - const: fck
       - const: refclk
diff --git a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
deleted file mode 100644 (file)
index d6796ef..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-HiSilicon Hip05 and Hip06 PCIe host bridge DT description
-
-HiSilicon PCIe host controller is based on the Synopsys DesignWare PCI core.
-It shares common functions with the PCIe DesignWare core driver and inherits
-common properties defined in
-Documentation/devicetree/bindings/pci/designware-pcie.txt.
-
-Additional properties are described here:
-
-Required properties
-- compatible: Should contain "hisilicon,hip05-pcie" or "hisilicon,hip06-pcie".
-- reg: Should contain rc_dbi, config registers location and length.
-- reg-names: Must include the following entries:
-  "rc_dbi": controller configuration registers;
-  "config": PCIe configuration space registers.
-- msi-parent: Should be its_pcie which is an ITS receiving MSI interrupts.
-- port-id: Should be 0, 1, 2 or 3.
-
-Optional properties:
-- status: Either "ok" or "disabled".
-- dma-coherent: Present if DMA operations are coherent.
-
-Hip05 Example (note that Hip06 is the same except compatible):
-       pcie@b0080000 {
-               compatible = "hisilicon,hip05-pcie", "snps,dw-pcie";
-               reg = <0 0xb0080000 0 0x10000>, <0x220 0x00000000 0 0x2000>;
-               reg-names = "rc_dbi", "config";
-               bus-range = <0  15>;
-               msi-parent = <&its_pcie>;
-               #address-cells = <3>;
-               #size-cells = <2>;
-               device_type = "pci";
-               dma-coherent;
-               ranges = <0x82000000 0 0x00000000 0x220 0x00000000 0 0x10000000>;
-               num-lanes = <8>;
-               port-id = <1>;
-               #interrupt-cells = <1>;
-               interrupt-map-mask = <0xf800 0 0 7>;
-               interrupt-map = <0x0 0 0 1 &mbigen_pcie 1 10
-                                0x0 0 0 2 &mbigen_pcie 2 11
-                                0x0 0 0 3 &mbigen_pcie 3 12
-                                0x0 0 0 4 &mbigen_pcie 4 13>;
-       };
diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml
new file mode 100644 (file)
index 0000000..e7b1f98
--- /dev/null
@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/mediatek-pcie-gen3.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Gen3 PCIe controller on MediaTek SoCs
+
+maintainers:
+  - Jianjun Wang <jianjun.wang@mediatek.com>
+
+description: |+
+  PCIe Gen3 MAC controller for MediaTek SoCs. It supports Gen3 speed
+  and is compatible with Gen2 and Gen1 speeds.
+
+  This PCIe controller supports up to 256 MSI vectors, the MSI hardware
+  block diagram is as follows:
+
+                    +-----+
+                    | GIC |
+                    +-----+
+                       ^
+                       |
+                   port->irq
+                       |
+               +-+-+-+-+-+-+-+-+
+               |0|1|2|3|4|5|6|7| (PCIe intc)
+               +-+-+-+-+-+-+-+-+
+                ^ ^           ^
+                | |    ...    |
+        +-------+ +------+    +-----------+
+        |                |                |
+  +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
+  |0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31| (MSI sets)
+  +-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
+   ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
+   | |      |  |    | |      |  |    | |      |  |  (MSI vectors)
+   | |      |  |    | |      |  |    | |      |  |
+
+    (MSI SET0)       (MSI SET1)  ...   (MSI SET7)
+
+  With 256 MSI vectors supported, the MSI vectors are composed of 8 sets,
+  each set has its own address for MSI message, and supports 32 MSI vectors
+  to generate interrupt.
+
+allOf:
+  - $ref: /schemas/pci/pci-bus.yaml#
+
+properties:
+  compatible:
+    const: mediatek,mt8192-pcie
+
+  reg:
+    maxItems: 1
+
+  reg-names:
+    items:
+      - const: pcie-mac
+
+  interrupts:
+    maxItems: 1
+
+  ranges:
+    minItems: 1
+    maxItems: 8
+
+  resets:
+    minItems: 1
+    maxItems: 2
+
+  reset-names:
+    minItems: 1
+    maxItems: 2
+    items:
+      - const: phy
+      - const: mac
+
+  clocks:
+    maxItems: 6
+
+  clock-names:
+    items:
+      - const: pl_250m
+      - const: tl_26m
+      - const: tl_96m
+      - const: tl_32k
+      - const: peri_26m
+      - const: top_133m
+
+  assigned-clocks:
+    maxItems: 1
+
+  assigned-clock-parents:
+    maxItems: 1
+
+  phys:
+    maxItems: 1
+
+  '#interrupt-cells':
+    const: 1
+
+  interrupt-controller:
+    description: Interrupt controller node for handling legacy PCI interrupts.
+    type: object
+    properties:
+      '#address-cells':
+        const: 0
+      '#interrupt-cells':
+        const: 1
+      interrupt-controller: true
+
+    required:
+      - '#address-cells'
+      - '#interrupt-cells'
+      - interrupt-controller
+
+    additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - interrupts
+  - ranges
+  - clocks
+  - '#interrupt-cells'
+  - interrupt-controller
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    bus {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        pcie: pcie@11230000 {
+            compatible = "mediatek,mt8192-pcie";
+            device_type = "pci";
+            #address-cells = <3>;
+            #size-cells = <2>;
+            reg = <0x00 0x11230000 0x00 0x4000>;
+            reg-names = "pcie-mac";
+            interrupts = <GIC_SPI 251 IRQ_TYPE_LEVEL_HIGH 0>;
+            bus-range = <0x00 0xff>;
+            ranges = <0x82000000 0x00 0x12000000 0x00
+                      0x12000000 0x00 0x1000000>;
+            clocks = <&infracfg 44>,
+                     <&infracfg 40>,
+                     <&infracfg 43>,
+                     <&infracfg 97>,
+                     <&infracfg 99>,
+                     <&infracfg 111>;
+            clock-names = "pl_250m", "tl_26m", "tl_96m",
+                          "tl_32k", "peri_26m", "top_133m";
+            assigned-clocks = <&topckgen 50>;
+            assigned-clock-parents = <&topckgen 91>;
+
+            phys = <&pciephy>;
+            phy-names = "pcie-phy";
+
+            resets = <&infracfg_rst 2>,
+                     <&infracfg_rst 3>;
+            reset-names = "phy", "mac";
+
+            #interrupt-cells = <1>;
+            interrupt-map-mask = <0 0 0 0x7>;
+            interrupt-map = <0 0 0 1 &pcie_intc 0>,
+                            <0 0 0 2 &pcie_intc 1>,
+                            <0 0 0 3 &pcie_intc 2>,
+                            <0 0 0 4 &pcie_intc 3>;
+            pcie_intc: interrupt-controller {
+                      #address-cells = <0>;
+                      #interrupt-cells = <1>;
+                      interrupt-controller;
+            };
+        };
+    };
index 4a2bcc0..8fdfbc7 100644 (file)
@@ -17,6 +17,7 @@ allOf:
 properties:
   compatible:
     oneOf:
+      - const: renesas,pcie-r8a7779       # R-Car H1
       - items:
           - enum:
               - renesas,pcie-r8a7742      # RZ/G1H
@@ -74,7 +75,16 @@ required:
   - clocks
   - clock-names
   - power-domains
-  - resets
+
+if:
+  not:
+    properties:
+      compatible:
+        contains:
+          const: renesas,pcie-r8a7779
+then:
+  required:
+    - resets
 
 unevaluatedProperties: false
 
diff --git a/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml b/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml
new file mode 100644 (file)
index 0000000..b03cbb9
--- /dev/null
@@ -0,0 +1,113 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/sifive,fu740-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SiFive FU740 PCIe host controller
+
+description: |+
+  SiFive FU740 PCIe host controller is based on the Synopsys DesignWare
+  PCI core. It shares common features with the PCIe DesignWare core and
+  inherits common properties defined in
+  Documentation/devicetree/bindings/pci/designware-pcie.txt.
+
+maintainers:
+  - Paul Walmsley <paul.walmsley@sifive.com>
+  - Greentime Hu <greentime.hu@sifive.com>
+
+allOf:
+  - $ref: /schemas/pci/pci-bus.yaml#
+
+properties:
+  compatible:
+    const: sifive,fu740-pcie
+
+  reg:
+    maxItems: 3
+
+  reg-names:
+    items:
+      - const: dbi
+      - const: config
+      - const: mgmt
+
+  num-lanes:
+    const: 8
+
+  msi-parent: true
+
+  interrupt-names:
+    items:
+      - const: msi
+      - const: inta
+      - const: intb
+      - const: intc
+      - const: intd
+
+  resets:
+    description: A phandle to the PCIe power up reset line.
+    maxItems: 1
+
+  pwren-gpios:
+    description: Should specify the GPIO for controlling the PCI bus device power on.
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+required:
+  - dma-coherent
+  - num-lanes
+  - interrupts
+  - interrupt-names
+  - interrupt-parent
+  - interrupt-map-mask
+  - interrupt-map
+  - clock-names
+  - clocks
+  - resets
+  - pwren-gpios
+  - reset-gpios
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    bus {
+        #address-cells = <2>;
+        #size-cells = <2>;
+        #include <dt-bindings/clock/sifive-fu740-prci.h>
+
+        pcie@e00000000 {
+            compatible = "sifive,fu740-pcie";
+            #address-cells = <3>;
+            #size-cells = <2>;
+            #interrupt-cells = <1>;
+            reg = <0xe 0x00000000 0x0 0x80000000>,
+                  <0xd 0xf0000000 0x0 0x10000000>,
+                  <0x0 0x100d0000 0x0 0x1000>;
+            reg-names = "dbi", "config", "mgmt";
+            device_type = "pci";
+            dma-coherent;
+            bus-range = <0x0 0xff>;
+            ranges = <0x81000000  0x0 0x60080000  0x0 0x60080000 0x0 0x10000>,      /* I/O */
+                     <0x82000000  0x0 0x60090000  0x0 0x60090000 0x0 0xff70000>,    /* mem */
+                     <0x82000000  0x0 0x70000000  0x0 0x70000000 0x0 0x1000000>,    /* mem */
+                     <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>;  /* mem prefetchable */
+            num-lanes = <0x8>;
+            interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>;
+            interrupt-names = "msi", "inta", "intb", "intc", "intd";
+            interrupt-parent = <&plic0>;
+            interrupt-map-mask = <0x0 0x0 0x0 0x7>;
+            interrupt-map = <0x0 0x0 0x0 0x1 &plic0 57>,
+                            <0x0 0x0 0x0 0x2 &plic0 58>,
+                            <0x0 0x0 0x0 0x3 &plic0 59>,
+                            <0x0 0x0 0x0 0x4 &plic0 60>;
+            clock-names = "pcie_aux";
+            clocks = <&prci PRCI_CLK_PCIE_AUX>;
+            resets = <&prci 4>;
+            pwren-gpios = <&gpio 5 0>;
+            reset-gpios = <&gpio 8 0>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/pci/tango-pcie.txt b/Documentation/devicetree/bindings/pci/tango-pcie.txt
deleted file mode 100644 (file)
index 2446838..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-Sigma Designs Tango PCIe controller
-
-Required properties:
-
-- compatible: "sigma,smp8759-pcie"
-- reg: address/size of PCI configuration space, address/size of register area
-- bus-range: defined by size of PCI configuration space
-- device_type: "pci"
-- #size-cells: <2>
-- #address-cells: <3>
-- msi-controller
-- ranges: translation from system to bus addresses
-- interrupts: spec for misc interrupts, spec for MSI
-
-Example:
-
-       pcie@2e000 {
-               compatible = "sigma,smp8759-pcie";
-               reg = <0x50000000 0x400000>, <0x2e000 0x100>;
-               bus-range = <0 3>;
-               device_type = "pci";
-               #size-cells = <2>;
-               #address-cells = <3>;
-               msi-controller;
-               ranges = <0x02000000 0x0 0x00400000  0x50400000  0x0 0x3c00000>;
-               interrupts =
-                       <54 IRQ_TYPE_LEVEL_HIGH>, /* misc interrupts */
-                       <55 IRQ_TYPE_LEVEL_HIGH>; /* MSI */
-       };
index d06f0c4..aed437d 100644 (file)
@@ -16,12 +16,14 @@ allOf:
 properties:
   compatible:
     oneOf:
-      - description: PCIe EP controller in J7200
+      - const: ti,j721e-pcie-ep
+      - description: PCIe EP controller in AM64
         items:
-          - const: ti,j7200-pcie-ep
+          - const: ti,am64-pcie-ep
           - const: ti,j721e-pcie-ep
-      - description: PCIe EP controller in J721E
+      - description: PCIe EP controller in J7200
         items:
+          - const: ti,j7200-pcie-ep
           - const: ti,j721e-pcie-ep
 
   reg:
@@ -66,7 +68,6 @@ required:
   - power-domains
   - clocks
   - clock-names
-  - dma-coherent
   - max-functions
   - phys
   - phy-names
index 0880a61..cc90020 100644 (file)
@@ -16,12 +16,14 @@ allOf:
 properties:
   compatible:
     oneOf:
-      - description: PCIe controller in J7200
+      - const: ti,j721e-pcie-host
+      - description: PCIe controller in AM64
         items:
-          - const: ti,j7200-pcie-host
+          - const: ti,am64-pcie-host
           - const: ti,j721e-pcie-host
-      - description: PCIe controller in J721E
+      - description: PCIe controller in J7200
         items:
+          - const: ti,j7200-pcie-host
           - const: ti,j721e-pcie-host
 
   reg:
@@ -46,12 +48,17 @@ properties:
     maxItems: 1
 
   clocks:
-    maxItems: 1
-    description: clock-specifier to represent input to the PCIe
+    minItems: 1
+    maxItems: 2
+    description: |+
+      clock-specifier to represent input to the PCIe for 1 item.
+      2nd item if present represents reference clock to the connector.
 
   clock-names:
+    minItems: 1
     items:
       - const: fck
+      - const: pcie_refclk
 
   vendor-id:
     const: 0x104c
@@ -62,6 +69,8 @@ properties:
           - const: 0xb00d
       - items:
           - const: 0xb00f
+      - items:
+          - const: 0xb010
 
   msi-map: true
 
@@ -78,7 +87,6 @@ required:
   - vendor-id
   - device-id
   - msi-map
-  - dma-coherent
   - dma-ranges
   - ranges
   - reset-gpios
index 01bf7fd..2d677e9 100644 (file)
@@ -33,6 +33,8 @@ Required properties:
        - #address-cells: specifies the number of cells needed to encode an
                address. The value must be 0.
 
+Optional properties:
+- dma-coherent: present if DMA operations are coherent
 
 Example:
 ++++++++
index 91fab61..84c4111 100644 (file)
@@ -51,23 +51,7 @@ Deprecated properties for iomux controller:
         Use rockchip,grf and rockchip,pmu described above instead.
 
 Required properties for gpio sub nodes:
-  - compatible: "rockchip,gpio-bank"
-  - reg: register of the gpio bank (different than the iomux registerset)
-  - interrupts: base interrupt of the gpio bank in the interrupt controller
-  - clocks: clock that drives this bank
-  - gpio-controller: identifies the node as a gpio controller and pin bank.
-  - #gpio-cells: number of cells in GPIO specifier. Since the generic GPIO
-    binding is used, the amount of cells must be specified as 2. See generic
-    GPIO binding documentation for description of particular cells.
-  - interrupt-controller: identifies the controller node as interrupt-parent.
-  - #interrupt-cells: the value of this property should be 2 and the interrupt
-    cells should use the standard two-cell scheme described in
-    bindings/interrupt-controller/interrupts.txt
-
-Deprecated properties for gpio sub nodes:
-  - compatible: "rockchip,rk3188-gpio-bank0"
-  - reg: second element: separate pull register for rk3188 bank0, use
-        rockchip,pmu described above instead
+See rockchip,gpio-bank.yaml
 
 Required properties for pin configuration node:
   - rockchip,pins: 3 integers array, represents a group of pins mux and config
@@ -128,43 +112,3 @@ uart2: serial@20064000 {
        pinctrl-names = "default";
        pinctrl-0 = <&uart2_xfer>;
 };
-
-Example for rk3188:
-
-       pinctrl@20008000 {
-               compatible = "rockchip,rk3188-pinctrl";
-               rockchip,grf = <&grf>;
-               rockchip,pmu = <&pmu>;
-               #address-cells = <1>;
-               #size-cells = <1>;
-               ranges;
-
-               gpio0: gpio0@2000a000 {
-                       compatible = "rockchip,rk3188-gpio-bank0";
-                       reg = <0x2000a000 0x100>;
-                       interrupts = <GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>;
-                       clocks = <&clk_gates8 9>;
-
-                       gpio-controller;
-                       #gpio-cells = <2>;
-
-                       interrupt-controller;
-                       #interrupt-cells = <2>;
-               };
-
-               gpio1: gpio1@2003c000 {
-                       compatible = "rockchip,gpio-bank";
-                       reg = <0x2003c000 0x100>;
-                       interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
-                       clocks = <&clk_gates8 10>;
-
-                       gpio-controller;
-                       #gpio-cells = <2>;
-
-                       interrupt-controller;
-                       #interrupt-cells = <2>;
-               };
-
-               ...
-
-       };
diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
deleted file mode 100644 (file)
index f70956d..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-Rockchip PWM controller
-
-Required properties:
- - compatible: should be "rockchip,<name>-pwm"
-   "rockchip,rk2928-pwm": found on RK29XX,RK3066 and RK3188 SoCs
-   "rockchip,rk3288-pwm": found on RK3288 SOC
-   "rockchip,rv1108-pwm", "rockchip,rk3288-pwm": found on RV1108 SoC
-   "rockchip,vop-pwm": found integrated in VOP on RK3288 SoC
- - reg: physical base address and length of the controller's registers
- - clocks: See ../clock/clock-bindings.txt
-   - For older hardware (rk2928, rk3066, rk3188, rk3228, rk3288, rk3399):
-     - There is one clock that's used both to derive the functional clock
-       for the device and as the bus clock.
-   - For newer hardware (rk3328 and future socs): specified by name
-     - "pwm": This is used to derive the functional clock.
-     - "pclk": This is the APB bus clock.
- - #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.yaml in this directory
-   for a description of the cell format.
-
-Example:
-
-       pwm0: pwm@20030000 {
-               compatible = "rockchip,rk2928-pwm";
-               reg = <0x20030000 0x10>;
-               clocks = <&cru PCLK_PWM01>;
-               #pwm-cells = <2>;
-       };
diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml b/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml
new file mode 100644 (file)
index 0000000..5596bee
--- /dev/null
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/pwm-rockchip.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip PWM controller
+
+maintainers:
+  - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+  compatible:
+    oneOf:
+      - const: rockchip,rk2928-pwm
+      - const: rockchip,rk3288-pwm
+      - const: rockchip,rk3328-pwm
+      - const: rockchip,vop-pwm
+      - items:
+          - const: rockchip,rk3036-pwm
+          - const: rockchip,rk2928-pwm
+      - items:
+          - enum:
+              - rockchip,rk3368-pwm
+              - rockchip,rk3399-pwm
+              - rockchip,rv1108-pwm
+          - const: rockchip,rk3288-pwm
+      - items:
+          - enum:
+              - rockchip,px30-pwm
+              - rockchip,rk3308-pwm
+          - const: rockchip,rk3328-pwm
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    minItems: 1
+    maxItems: 2
+
+  clock-names:
+    maxItems: 2
+
+  "#pwm-cells":
+    enum: [2, 3]
+    description:
+      Must be 2 (rk2928) or 3 (rk3288 and later).
+      See pwm.yaml for a description of the cell format.
+
+required:
+  - compatible
+  - reg
+  - "#pwm-cells"
+
+if:
+  properties:
+    compatible:
+      contains:
+        enum:
+          - rockchip,rk3328-pwm
+          - rockchip,rv1108-pwm
+
+then:
+  properties:
+    clocks:
+      items:
+        - description: Used to derive the functional clock for the device.
+        - description: Used as the APB bus clock.
+
+    clock-names:
+      items:
+        - const: pwm
+        - const: pclk
+
+  required:
+    - clocks
+    - clock-names
+
+else:
+  properties:
+    clocks:
+      maxItems: 1
+      description:
+        Used both to derive the functional clock
+        for the device and as the bus clock.
+
+  required:
+    - clocks
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/rk3188-cru-common.h>
+    pwm0: pwm@20030000 {
+      compatible = "rockchip,rk2928-pwm";
+      reg = <0x20030000 0x10>;
+      clocks = <&cru PCLK_PWM01>;
+      #pwm-cells = <2>;
+    };
diff --git a/Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml b/Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml
new file mode 100644 (file)
index 0000000..d350f5e
--- /dev/null
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/toshiba,pwm-visconti.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Toshiba Visconti PWM Controller
+
+maintainers:
+  - Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
+
+properties:
+  compatible:
+    items:
+      - const: toshiba,visconti-pwm
+
+  reg:
+    maxItems: 1
+
+  '#pwm-cells':
+    const: 2
+
+required:
+  - compatible
+  - reg
+  - '#pwm-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        pwm: pwm@241c0000 {
+            compatible = "toshiba,visconti-pwm";
+            reg = <0 0x241c0000 0 0x1000>;
+            pinctrl-names = "default";
+            pinctrl-0 = <&pwm_mux>;
+            #pwm-cells = <2>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml
new file mode 100644 (file)
index 0000000..208a628
--- /dev/null
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/remoteproc/fsl,imx-rproc.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: NXP i.MX Co-Processor Bindings
+
+description:
+  This binding provides support for ARM Cortex M4 Co-processor found on some NXP iMX SoCs.
+
+maintainers:
+  - Peng Fan <peng.fan@nxp.com>
+
+properties:
+  compatible:
+    enum:
+      - fsl,imx8mq-cm4
+      - fsl,imx8mm-cm4
+      - fsl,imx7d-cm4
+      - fsl,imx6sx-cm4
+
+  clocks:
+    maxItems: 1
+
+  syscon:
+    $ref: /schemas/types.yaml#/definitions/phandle
+    description:
+      Phandle to syscon block which provides access to System Reset Controller
+
+  mbox-names:
+    items:
+      - const: tx
+      - const: rx
+      - const: rxdb
+
+  mboxes:
+    description:
+      This property is required only if the rpmsg/virtio functionality is used.
+      List of <&phandle type channel> - 1 channel for TX, 1 channel for RX, 1 channel for RXDB.
+      (see mailbox/fsl,mu.yaml)
+    minItems: 1
+    maxItems: 3
+
+  memory-region:
+    description:
+      If present, a phandle for a reserved memory area that is used for vdev buffer,
+      resource table, vring region and others used by remote processor.
+    minItems: 1
+    maxItems: 32
+
+required:
+  - compatible
+  - clocks
+  - syscon
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/imx7d-clock.h>
+    m4_reserved_sysmem1: cm4@80000000 {
+      reg = <0x80000000 0x80000>;
+    };
+
+    m4_reserved_sysmem2: cm4@81000000 {
+      reg = <0x81000000 0x80000>;
+    };
+
+    imx7d-cm4 {
+      compatible       = "fsl,imx7d-cm4";
+      memory-region    = <&m4_reserved_sysmem1>, <&m4_reserved_sysmem2>;
+      syscon           = <&src>;
+      clocks           = <&clks IMX7D_ARM_M4_ROOT_CLK>;
+    };
+
+  - |
+    #include <dt-bindings/clock/imx8mm-clock.h>
+
+    imx8mm-cm4 {
+      compatible = "fsl,imx8mm-cm4";
+      clocks = <&clk IMX8MM_CLK_M4_DIV>;
+      mbox-names = "tx", "rx", "rxdb";
+      mboxes = <&mu 0 1
+                &mu 1 1
+                &mu 3 1>;
+      memory-region = <&vdev0buffer>, <&vdev0vring0>, <&vdev0vring1>, <&rsc_table>;
+      syscon = <&src>;
+    };
+...
diff --git a/Documentation/devicetree/bindings/remoteproc/imx-rproc.txt b/Documentation/devicetree/bindings/remoteproc/imx-rproc.txt
deleted file mode 100644 (file)
index fbcefd9..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-NXP iMX6SX/iMX7D Co-Processor Bindings
-----------------------------------------
-
-This binding provides support for ARM Cortex M4 Co-processor found on some
-NXP iMX SoCs.
-
-Required properties:
-- compatible           Should be one of:
-                               "fsl,imx7d-cm4"
-                               "fsl,imx6sx-cm4"
-- clocks               Clock for co-processor (See: ../clock/clock-bindings.txt)
-- syscon               Phandle to syscon block which provide access to
-                       System Reset Controller
-
-Optional properties:
-- memory-region                list of phandels to the reserved memory regions.
-                       (See: ../reserved-memory/reserved-memory.txt)
-
-Example:
-       m4_reserved_sysmem1: cm4@80000000 {
-               reg = <0x80000000 0x80000>;
-       };
-
-       m4_reserved_sysmem2: cm4@81000000 {
-               reg = <0x81000000 0x80000>;
-       };
-
-       imx7d-cm4 {
-               compatible      = "fsl,imx7d-cm4";
-               memory-region   = <&m4_reserved_sysmem1>, <&m4_reserved_sysmem2>;
-               syscon          = <&src>;
-               clocks          = <&clks IMX7D_ARM_M4_ROOT_CLK>;
-       };
index 1c330a8..229f908 100644 (file)
@@ -18,6 +18,7 @@ on the Qualcomm ADSP Hexagon core.
                    "qcom,sc7180-mpss-pas"
                    "qcom,sdm845-adsp-pas"
                    "qcom,sdm845-cdsp-pas"
+                    "qcom,sdx55-mpss-pas"
                    "qcom,sm8150-adsp-pas"
                    "qcom,sm8150-cdsp-pas"
                    "qcom,sm8150-mpss-pas"
@@ -61,6 +62,7 @@ on the Qualcomm ADSP Hexagon core.
                    must be "wdog", "fatal", "ready", "handover", "stop-ack"
        qcom,qcs404-wcss-pas:
        qcom,sc7180-mpss-pas:
+        qcom,sdx55-mpss-pas:
        qcom,sm8150-mpss-pas:
        qcom,sm8350-mpss-pas:
                    must be "wdog", "fatal", "ready", "handover", "stop-ack",
@@ -128,6 +130,8 @@ on the Qualcomm ADSP Hexagon core.
        qcom,sm8150-mpss-pas:
        qcom,sm8350-mpss-pas:
                    must be "cx", "load_state", "mss"
+        qcom,sdx55-mpss-pas:
+                    must be "cx", "mss"
        qcom,sm8250-adsp-pas:
        qcom,sm8350-adsp-pas:
        qcom,sm8150-slpi-pas:
index 7ccd553..69c49c7 100644 (file)
@@ -9,6 +9,7 @@ on the Qualcomm Hexagon core.
        Definition: must be one of:
                    "qcom,q6v5-pil",
                    "qcom,ipq8074-wcss-pil"
+                   "qcom,qcs404-wcss-pil"
                    "qcom,msm8916-mss-pil",
                    "qcom,msm8974-mss-pil"
                    "qcom,msm8996-mss-pil"
@@ -39,6 +40,7 @@ on the Qualcomm Hexagon core.
                    string:
        qcom,q6v5-pil:
        qcom,ipq8074-wcss-pil:
+       qcom,qcs404-wcss-pil:
        qcom,msm8916-mss-pil:
        qcom,msm8974-mss-pil:
                    must be "wdog", "fatal", "ready", "handover", "stop-ack"
@@ -67,6 +69,11 @@ on the Qualcomm Hexagon core.
        Definition: The clocks needed depend on the compatible string:
        qcom,ipq8074-wcss-pil:
                    no clock names required
+       qcom,qcs404-wcss-pil:
+                   must be "xo", "gcc_abhs_cbcr", "gcc_abhs_cbcr",
+                   "gcc_axim_cbcr", "lcc_ahbfabric_cbc", "tcsr_lcc_cbc",
+                   "lcc_abhs_cbc", "lcc_tcm_slave_cbc", "lcc_abhm_cbc",
+                   "lcc_axim_cbc", "lcc_bcr_sleep"
        qcom,q6v5-pil:
        qcom,msm8916-mss-pil:
        qcom,msm8974-mss-pil:
@@ -132,6 +139,14 @@ For the compatible string below the following supplies are required:
        Definition: reference to the regulators to be held on behalf of the
                    booting of the Hexagon core
 
+For the compatible string below the following supplies are required:
+  "qcom,qcs404-wcss-pil"
+- cx-supply:
+       Usage: required
+       Value type: <phandle>
+       Definition: reference to the regulators to be held on behalf of the
+                   booting of the Hexagon core
+
 For the compatible string below the following supplies are required:
   "qcom,msm8996-mss-pil"
 - pll-supply:
index da09c0d..a83080b 100644 (file)
@@ -34,6 +34,12 @@ on the Qualcomm WCNSS core.
        Definition: should be "wdog", "fatal", optionally followed by "ready",
                    "handover", "stop-ack"
 
+- firmware-name:
+       Usage: optional
+       Value type: <string>
+       Definition: must list the relative firmware image path for the
+                   WCNSS core. Defaults to "wcnss.mdt".
+
 - vddmx-supply: (deprecated for qcom,pronto-v1/2-pil)
 - vddcx-supply: (deprecated for qcom,pronto-v1/2-pil)
 - vddpx-supply:
index a1171df..64afdcf 100644 (file)
@@ -65,16 +65,23 @@ properties:
           Unidirectional channel:
             - from local to remote, where ACK from the remote means that it is
               ready for shutdown
+      - description: |
+          A channel (d) used by the local proc to notify the remote proc that it
+          has to stop interprocessor communication.
+          Unidirectional channel:
+            - from local to remote, where ACK from the remote means that communication
+              has been stopped on the remote side.
     minItems: 1
-    maxItems: 3
+    maxItems: 4
 
   mbox-names:
     items:
       - const: vq0
       - const: vq1
       - const: shutdown
+      - const: detach
     minItems: 1
-    maxItems: 3
+    maxItems: 4
 
   memory-region:
     description:
diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml
new file mode 100644 (file)
index 0000000..3f981e8
--- /dev/null
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/riscv/microchip.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip PolarFire SoC-based boards device tree bindings
+
+maintainers:
+  - Cyril Jean <Cyril.Jean@microchip.com>
+  - Lewis Hanly <lewis.hanly@microchip.com>
+
+description:
+  Microchip PolarFire SoC-based boards
+
+properties:
+  $nodename:
+    const: '/'
+  compatible:
+    items:
+      - enum:
+          - microchip,mpfs-icicle-kit
+      - const: microchip,mpfs
+
+additionalProperties: true
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml
new file mode 100644 (file)
index 0000000..4fba6db
--- /dev/null
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/qcom-pm8xxx-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm PM8xxx PMIC RTC device
+
+maintainers:
+  - Satya Priya <skakit@codeaurora.org>
+
+properties:
+  compatible:
+    enum:
+      - qcom,pm8058-rtc
+      - qcom,pm8921-rtc
+      - qcom,pm8941-rtc
+      - qcom,pm8018-rtc
+      - qcom,pmk8350-rtc
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  allow-set-time:
+    $ref: /schemas/types.yaml#/definitions/flag
+    description:
+      Indicates that the setting of RTC time is allowed by the host CPU.
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/spmi/spmi.h>
+    spmi_bus: spmi@c440000 {
+      reg = <0x0c440000 0x1100>;
+      #address-cells = <2>;
+      #size-cells = <0>;
+      pmicintc: pmic@0 {
+        reg = <0x0 SPMI_USID>;
+        compatible = "qcom,pm8921";
+        interrupts = <104 8>;
+        #interrupt-cells = <2>;
+        interrupt-controller;
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        pm8921_rtc: rtc@11d {
+          compatible = "qcom,pm8921-rtc";
+          reg = <0x11d>;
+          interrupts = <0x27 0>;
+        };
+      };
+    };
+...
index f0506a9..41f57c4 100644 (file)
@@ -99,11 +99,6 @@ properties:
               - mediatek,mt7622-btif
               - mediatek,mt7623-btif
           - const: mediatek,mtk-btif
-      - items:
-          - enum:
-              - mediatek,mt7622-btif
-              - mediatek,mt7623-btif
-          - const: mediatek,mtk-btif
       - items:
           - const: mrvl,mmp-uart
           - const: intel,xscale-uart
diff --git a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt
deleted file mode 100644 (file)
index 68e0471..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-* Broadcom Northstar Thermal
-
-This binding describes thermal sensor that is part of Northstar's DMU (Device
-Management Unit).
-
-Required properties:
-- compatible : Must be "brcm,ns-thermal"
-- reg : iomem address range of PVTMON registers
-- #thermal-sensor-cells : Should be <0>
-
-Example:
-
-thermal: thermal@1800c2c0 {
-       compatible = "brcm,ns-thermal";
-       reg = <0x1800c2c0 0x10>;
-       #thermal-sensor-cells = <0>;
-};
-
-thermal-zones {
-       cpu_thermal: cpu-thermal {
-               polling-delay-passive = <0>;
-               polling-delay = <1000>;
-               coefficients = <(-556) 418000>;
-               thermal-sensors = <&thermal>;
-
-               trips {
-                       cpu-crit {
-                               temperature     = <125000>;
-                               hysteresis      = <0>;
-                               type            = "critical";
-                       };
-               };
-
-               cooling-maps {
-               };
-       };
-};
diff --git a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml
new file mode 100644 (file)
index 0000000..fdeb333
--- /dev/null
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/thermal/brcm,ns-thermal.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom Northstar Thermal
+
+maintainers:
+  - Rafał Miłecki <rafal@milecki.pl>
+
+description:
+  Thermal sensor that is part of Northstar's DMU (Device Management Unit).
+
+allOf:
+  - $ref: thermal-sensor.yaml#
+
+properties:
+  compatible:
+    const: brcm,ns-thermal
+
+  reg:
+    description: PVTMON registers range
+    maxItems: 1
+
+  "#thermal-sensor-cells":
+    const: 0
+
+unevaluatedProperties: false
+
+required:
+  - reg
+
+examples:
+  - |
+    thermal: thermal@1800c2c0 {
+        compatible = "brcm,ns-thermal";
+        reg = <0x1800c2c0 0x10>;
+        #thermal-sensor-cells = <0>;
+    };
+
+    thermal-zones {
+        cpu-thermal {
+            polling-delay-passive = <0>;
+            polling-delay = <1000>;
+            coefficients = <(-556) 418000>;
+            thermal-sensors = <&thermal>;
+
+            trips {
+                cpu-crit {
+                    temperature = <125000>;
+                    hysteresis = <0>;
+                    type = "critical";
+                };
+            };
+
+            cooling-maps {
+            };
+        };
+    };
index 95462e0..0242fd9 100644 (file)
@@ -19,9 +19,15 @@ description: |
 properties:
   compatible:
     oneOf:
+      - description: msm8960 TSENS based
+        items:
+          - enum:
+              - qcom,ipq8064-tsens
+
       - description: v0.1 of TSENS
         items:
           - enum:
+              - qcom,mdm9607-tsens
               - qcom,msm8916-tsens
               - qcom,msm8939-tsens
               - qcom,msm8974-tsens
@@ -43,6 +49,7 @@ properties:
               - qcom,sdm845-tsens
               - qcom,sm8150-tsens
               - qcom,sm8250-tsens
+              - qcom,sm8350-tsens
           - const: qcom,tsens-v2
 
   reg:
@@ -73,7 +80,9 @@ properties:
     maxItems: 2
     items:
       - const: calib
-      - const: calib_sel
+      - enum:
+          - calib_backup
+          - calib_sel
 
   "#qcom,sensors":
     description:
@@ -88,12 +97,21 @@ properties:
       Number of cells required to uniquely identify the thermal sensors. Since
       we have multiple sensors this is set to 1
 
+required:
+  - compatible
+  - interrupts
+  - interrupt-names
+  - "#thermal-sensor-cells"
+  - "#qcom,sensors"
+
 allOf:
   - if:
       properties:
         compatible:
           contains:
             enum:
+              - qcom,ipq8064-tsens
+              - qcom,mdm9607-tsens
               - qcom,msm8916-tsens
               - qcom,msm8974-tsens
               - qcom,msm8976-tsens
@@ -114,17 +132,42 @@ allOf:
         interrupt-names:
           minItems: 2
 
-required:
-  - compatible
-  - reg
-  - "#qcom,sensors"
-  - interrupts
-  - interrupt-names
-  - "#thermal-sensor-cells"
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,tsens-v0_1
+              - qcom,tsens-v1
+              - qcom,tsens-v2
+
+    then:
+      required:
+        - reg
 
 additionalProperties: false
 
 examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    // Example msm8960 based SoC (ipq8064):
+    gcc: clock-controller {
+
+           /* ... */
+
+           tsens: thermal-sensor {
+                compatible = "qcom,ipq8064-tsens";
+
+                 nvmem-cells = <&tsens_calib>, <&tsens_calib_backup>;
+                 nvmem-cell-names = "calib", "calib_backup";
+                 interrupts = <GIC_SPI 178 IRQ_TYPE_LEVEL_HIGH>;
+                 interrupt-names = "uplow";
+
+                 #qcom,sensors = <11>;
+                 #thermal-sensor-cells = <1>;
+          };
+    };
+
   - |
     #include <dt-bindings/interrupt-controller/arm-gic.h>
     // Example 1 (legacy: for pre v1 IP):
index b33a76e..f963204 100644 (file)
@@ -28,14 +28,7 @@ properties:
       - renesas,r8a77980-thermal # R-Car V3H
       - renesas,r8a779a0-thermal # R-Car V3U
 
-  reg:
-    minItems: 2
-    maxItems: 4
-    items:
-      - description: TSC1 registers
-      - description: TSC2 registers
-      - description: TSC3 registers
-      - description: TSC4 registers
+  reg: true
 
   interrupts:
     items:
@@ -71,8 +64,25 @@ if:
           enum:
             - renesas,r8a779a0-thermal
 then:
+  properties:
+    reg:
+      minItems: 2
+      maxItems: 3
+      items:
+        - description: TSC1 registers
+        - description: TSC2 registers
+        - description: TSC3 registers
   required:
     - interrupts
+else:
+  properties:
+    reg:
+      items:
+        - description: TSC0 registers
+        - description: TSC1 registers
+        - description: TSC2 registers
+        - description: TSC3 registers
+        - description: TSC4 registers
 
 additionalProperties: false
 
@@ -111,3 +121,20 @@ examples:
                     };
             };
     };
+  - |
+    #include <dt-bindings/clock/r8a779a0-cpg-mssr.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/power/r8a779a0-sysc.h>
+
+    tsc_r8a779a0: thermal@e6190000 {
+            compatible = "renesas,r8a779a0-thermal";
+            reg = <0xe6190000 0x200>,
+                  <0xe6198000 0x200>,
+                  <0xe61a0000 0x200>,
+                  <0xe61a8000 0x200>,
+                  <0xe61b0000 0x200>;
+            clocks = <&cpg CPG_MOD 919>;
+            power-domains = <&sysc R8A779A0_PD_ALWAYS_ON>;
+            resets = <&cpg 919>;
+            #thermal-sensor-cells = <1>;
+    };
index 9f74792..4bd345c 100644 (file)
@@ -36,6 +36,9 @@ properties:
       containing several internal sensors.
     enum: [0, 1]
 
+required:
+  - "#thermal-sensor-cells"
+
 additionalProperties: true
 
 examples:
index 99a72a4..b868cef 100644 (file)
@@ -495,6 +495,8 @@ patternProperties:
     description: Shenzhen Hugsun Technology Co. Ltd.
   "^hwacom,.*":
     description: HwaCom Systems Inc.
+  "^hycon,.*":
+    description: Hycon Technology Corp.
   "^hydis,.*":
     description: Hydis Technologies
   "^hyundai,.*":
index 22271c3..3366a99 100644 (file)
@@ -12,7 +12,7 @@ Guidelines for GPIOs consumers
 
 Drivers that can't work without standard GPIO calls should have Kconfig entries
 that depend on GPIOLIB or select GPIOLIB. The functions that allow a driver to
-obtain and use GPIOs are available by including the following file:
+obtain and use GPIOs are available by including the following file::
 
        #include <linux/gpio/consumer.h>
 
index 41ec3cc..af632d7 100644 (file)
@@ -96,6 +96,12 @@ hardware descriptions such as device tree or ACPI:
   way to pass the charging parameters from hardware descriptions such as the
   device tree.
 
+- gpio-mux: drivers/mux/gpio.c is used for controlling a multiplexer using
+  n GPIO lines such that you can mux in 2^n different devices by activating
+  different GPIO lines. Often the GPIOs are on a SoC and the devices are
+  some SoC-external entities, such as different components on a PCB that
+  can be selectively enabled.
+
 Apart from this there are special GPIO drivers in subsystems like MMC/SD to
 read card detect and write protect GPIO lines, and in the TTY serial subsystem
 to emulate MCTRL (modem control) signals CTS/RTS by using two GPIO lines. The
index ab62f1b..a7ca4f5 100644 (file)
@@ -55,7 +55,11 @@ several parameter at once. For example, if you see pwm_config() and
 pwm_{enable,disable}() calls in the same function, this probably means you
 should switch to pwm_apply_state().
 
-The PWM user API also allows one to query the PWM state with pwm_get_state().
+The PWM user API also allows one to query the PWM state that was passed to the
+last invocation of pwm_apply_state() using pwm_get_state(). Note this is
+different to what the driver has actually implemented if the request cannot be
+satisfied exactly with the hardware in use. There is currently no way for
+consumers to get the actually implemented settings.
 
 In addition to the PWM state, the PWM API also exposes PWM arguments, which
 are the reference PWM config one should use on this PWM.
index 29fdd81..4b638c1 100644 (file)
@@ -730,17 +730,7 @@ This function returns the thermal_instance corresponding to a given
 {thermal_zone, cooling_device, trip_point} combination. Returns NULL
 if such an instance does not exist.
 
-4.3. thermal_notify_framework
------------------------------
-
-This function handles the trip events from sensor drivers. It starts
-throttling the cooling devices according to the policy configured.
-For CRITICAL and HOT trip points, this notifies the respective drivers,
-and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
-The throttling policy is based on the configured platform data; if no
-platform data is provided, this uses the step_wise throttling policy.
-
-4.4. thermal_cdev_update
+4.3. thermal_cdev_update
 ------------------------
 
 This function serves as an arbitrator to set the state of a cooling
index decc68c..606eed8 100644 (file)
@@ -2,7 +2,7 @@
 VFIO - "Virtual Function I/O" [1]_
 ==================================
 
-Many modern system now provide DMA and interrupt remapping facilities
+Many modern systems now provide DMA and interrupt remapping facilities
 to help ensure I/O devices behave within the boundaries they've been
 allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d,
 POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC
index 35ed01a..992bf91 100644 (file)
@@ -110,6 +110,12 @@ background_gc=%s    Turn on/off cleaning operations, namely garbage
                         on synchronous garbage collection running in background.
                         Default value for this option is on. So garbage
                         collection is on by default.
+gc_merge                When background_gc is on, this option can be enabled to
+                        let the background GC thread handle foreground GC requests.
+                        This can eliminate the sluggishness caused by slow foreground
+                        GC operation when GC is triggered from a process with limited
+                        I/O and CPU resources.
+nogc_merge              Disable GC merge feature.
 disable_roll_forward    Disable the roll-forward recovery routine
 norecovery              Disable the roll-forward recovery routine, mounted read-
                         only (i.e., -o ro,disable_roll_forward)
@@ -813,6 +819,14 @@ Compression implementation
   * chattr +c file
   * chattr +c dir; touch dir/file
   * mount w/ -o compress_extension=ext; touch file.ext
+  * mount w/ -o compress_extension=*; touch any_file
+
+- At this point, compression feature doesn't expose compressed space to user
+  directly in order to guarantee potential data updates later to the space.
+  Instead, the main goal is to reduce data writes to flash disk as much as
+  possible, resulting in extending disk life time as well as relaxing IO
+  congestion. Alternatively, we've added ioctl interface to reclaim compressed
+  space and show it to user after putting the immutable bit.
 
 Compress metadata layout::
 
index 4e264c1..df4b711 100644 (file)
@@ -99,6 +99,12 @@ native::
       }
   }
 
+Note that historically ACPI has no means of describing the GPIO polarity, and
+thus the SPISerialBus() resource defines it on a per-chip basis. In order
+to avoid a chain of negations, the GPIO polarity is considered to be
+Active High. Even for the cases when _DSD() is involved (see the example
+above) the GPIO CS polarity must be defined Active High to avoid ambiguity.
+
 Other supported properties
 ==========================
 
index 810ae02..5865748 100644 (file)
@@ -107,13 +107,17 @@ example below:
                },
        };
 
-       static const struct property_entry rotary_encoder_properties[] __initconst = {
+       static const struct property_entry rotary_encoder_properties[] = {
                PROPERTY_ENTRY_U32("rotary-encoder,steps-per-period", 24),
                PROPERTY_ENTRY_U32("linux,axis",                      ABS_X),
                PROPERTY_ENTRY_U32("rotary-encoder,relative_axis",    0),
                { },
        };
 
+       static const struct software_node rotary_encoder_node = {
+               .properties = rotary_encoder_properties,
+       };
+
        static struct platform_device rotary_encoder_device = {
                .name           = "rotary-encoder",
                .id             = 0,
@@ -122,7 +126,7 @@ example below:
        ...
 
        gpiod_add_lookup_table(&rotary_encoder_gpios);
-       device_add_properties(&rotary_encoder_device, rotary_encoder_properties);
+       device_add_software_node(&rotary_encoder_device.dev, &rotary_encoder_node);
        platform_device_register(&rotary_encoder_device);
 
        ...
index 95803e2..af5934c 100644 (file)
@@ -71,7 +71,7 @@ The possible values of ``type`` are::
        #define JS_EVENT_INIT           0x80    /* initial state of device */
 
 As mentioned above, the driver will issue synthetic JS_EVENT_INIT ORed
-events on open. That is, if it's issuing a INIT BUTTON event, the
+events on open. That is, if it's issuing an INIT BUTTON event, the
 current type value will be::
 
        int type = JS_EVENT_BUTTON | JS_EVENT_INIT;     /* 0x81 */
@@ -100,8 +100,8 @@ is, you have both an axis 0 and a button 0). Generally,
         =============== =======
 
 Hats vary from one joystick type to another. Some can be moved in 8
-directions, some only in 4, The driver, however, always reports a hat as two
-independent axis, even if the hardware doesn't allow independent movement.
+directions, some only in 4. The driver, however, always reports a hat as two
+independent axes, even if the hardware doesn't allow independent movement.
 
 
 js_event.value
@@ -188,10 +188,10 @@ One reason for emptying the queue is that if it gets full you'll start
 missing events since the queue is finite, and older events will get
 overwritten.
 
-The other reason is that you want to know all what happened, and not
+The other reason is that you want to know all that happened, and not
 delay the processing till later.
 
-Why can get the queue full? Because you don't empty the queue as
+Why can the queue get full? Because you don't empty the queue as
 mentioned, or because too much time elapses from one read to another
 and too many events to store in the queue get generated. Note that
 high system load may contribute to space those reads even more.
@@ -277,7 +277,7 @@ to be in the stable part of the API, and therefore may change without
 warning in following releases of the driver.
 
 Both JSIOCSCORR and JSIOCGCORR expect &js_corr to be able to hold
-information for all axis. That is, struct js_corr corr[MAX_AXIS];
+information for all axes. That is, struct js_corr corr[MAX_AXIS];
 
 struct js_corr is defined as::
 
@@ -328,7 +328,7 @@ To test the state of the buttons,
        second_button_state = js.buttons & 2;
 
 The axis values do not have a defined range in the original 0.x driver,
-except for that the values are non-negative. The 1.2.8+ drivers use a
+except that the values are non-negative. The 1.2.8+ drivers use a
 fixed range for reporting the values, 1 being the minimum, 128 the
 center, and 255 maximum value.
 
index 9746fd7..f615906 100644 (file)
@@ -133,15 +133,15 @@ And add a line to your rc script executing that file::
 This way, after the next reboot your joystick will remain calibrated. You
 can also add the ``jscal -p`` line to your shutdown script.
 
-Hspecific driver information
-==============================
+Hardware-specific driver information
+====================================
 
 In this section each of the separate hardware specific drivers is described.
 
 Analog joysticks
 ----------------
 
-The analog.c uses the standard analog inputs of the gameport, and thus
+The analog.c driver uses the standard analog inputs of the gameport, and thus
 supports all standard joysticks and gamepads. It uses a very advanced
 routine for this, allowing for data precision that can't be found on any
 other system.
@@ -266,7 +266,7 @@ to:
 * Logitech WingMan Extreme Digital 3D
 
 ADI devices are autodetected, and the driver supports up to two (any
-combination of) devices on a single gameport, using an Y-cable or chained
+combination of) devices on a single gameport, using a Y-cable or chained
 together.
 
 Logitech WingMan Joystick, Logitech WingMan Attack, Logitech WingMan
@@ -288,7 +288,7 @@ supports:
 * Gravis Xterminator DualControl
 
 All these devices are autodetected, and you can even use any combination
-of up to two of these pads either chained together or using an Y-cable on a
+of up to two of these pads either chained together or using a Y-cable on a
 single gameport.
 
 GrIP MultiPort isn't supported yet. Gravis Stinger is a serial device and is
@@ -311,7 +311,7 @@ allow connecting analog joysticks to them, you'll need to load the analog
 driver as well to handle the attached joysticks.
 
 The trackball should work with USB mousedev module as a normal mouse. See
-the USB documentation for how to setup an USB mouse.
+the USB documentation for how to set up a USB mouse.
 
 ThrustMaster DirectConnect (BSP)
 --------------------------------
@@ -332,7 +332,7 @@ If you have one of these, contact me.
 
 TMDC devices are autodetected, and thus no parameters to the module
 are needed. Up to two TMDC devices can be connected to one gameport, using
-an Y-cable.
+a Y-cable.
 
 Creative Labs Blaster
 ---------------------
@@ -342,7 +342,7 @@ the:
 
 * Creative Blaster GamePad Cobra
 
-Up to two of these can be used on a single gameport, using an Y-cable.
+Up to two of these can be used on a single gameport, using a Y-cable.
 
 Genius Digital joysticks
 ------------------------
@@ -381,7 +381,7 @@ card, 16 in case you have two in your system.
 Trident 4DWave / Aureal Vortex
 ------------------------------
 
-Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipsets
+Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipset
 provide an "Enhanced Game Port" mode where the soundcard handles polling the
 joystick.  This mode is supported by the pcigame.c module. Once loaded the
 analog driver can use the enhanced features of these gameports..
@@ -454,7 +454,7 @@ Devices currently supported by spaceball.c are:
 * SpaceTec SpaceBall 4000 FLX
 
 In addition to having the spaceorb/spaceball and serport modules in the
-kernel, you also need to attach a serial port to it. to do that, run the
+kernel, you also need to attach a serial port to it. To do that, run the
 inputattach program::
 
        inputattach --spaceorb /dev/tts/x &
@@ -466,7 +466,7 @@ or::
 where /dev/tts/x is the serial port which the device is connected to. After
 doing this, the device will be reported and will start working.
 
-There is one caveat with the SpaceOrb. The button #6, the on the bottom
+There is one caveat with the SpaceOrb. The button #6, the one on the bottom
 side of the orb, although reported as an ordinary button, causes internal
 recentering of the spaceorb, moving the zero point to the position in which
 the ball is at the moment of pressing the button. So, think first before
@@ -500,7 +500,7 @@ joy-magellan module. It currently supports only the:
 * Magellan 3D
 * Space Mouse
 
-models, the additional buttons on the 'Plus' versions are not supported yet.
+models; the additional buttons on the 'Plus' versions are not supported yet.
 
 To use it, you need to attach the serial port to the driver using the::
 
@@ -575,7 +575,7 @@ FAQ
 :A: The device files don't exist. Create them (see section 2.2).
 
 :Q: Is it possible to connect my old Atari/Commodore/Amiga/console joystick
-    or pad that uses a 9-pin D-type cannon connector to the serial port of my
+    or pad that uses a 9-pin D-type Cannon connector to the serial port of my
     PC?
 :A: Yes, it is possible, but it'll burn your serial port or the pad. It
     won't work, of course.
index dac1771..d3a8557 100644 (file)
@@ -48,7 +48,6 @@ quota-tools            3.09             quota -V
 PPP                    2.4.0            pppd --version
 nfs-utils              1.0.5            showmount --version
 procps                 3.2.0            ps --version
-oprofile               0.9              oprofiled --version
 udev                   081              udevd --version
 grub                   0.93             grub --version || grub-install --version
 mcelog                 0.6              mcelog --version
index 6e6e394..ea915c1 100644 (file)
@@ -6,6 +6,7 @@ RISC-V architecture
     :maxdepth: 1
 
     boot-image-header
+    vm-layout
     pmu
     patch-acceptance
 
diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst
new file mode 100644 (file)
index 0000000..329d320
--- /dev/null
@@ -0,0 +1,63 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Virtual Memory Layout on RISC-V Linux
+=====================================
+
+:Author: Alexandre Ghiti <alex@ghiti.fr>
+:Date: 12 February 2021
+
+This document describes the virtual memory layout used by the RISC-V Linux
+Kernel.
+
+RISC-V Linux Kernel 32bit
+=========================
+
+RISC-V Linux Kernel SV32
+------------------------
+
+TODO
+
+RISC-V Linux Kernel 64bit
+=========================
+
+The RISC-V privileged architecture document states that the 64bit addresses
+"must have bits 63–48 all equal to bit 47, or else a page-fault exception will
+occur.": that splits the virtual address space into 2 halves separated by a very
+big hole, the lower half is where the userspace resides, the upper half is where
+the RISC-V Linux Kernel resides.
+
+RISC-V Linux Kernel SV39
+------------------------
+
+::
+
+  ========================================================================================================================
+      Start addr    |   Offset   |     End addr     |  Size   | VM area description
+  ========================================================================================================================
+                    |            |                  |         |
+   0000000000000000 |    0       | 0000003fffffffff |  256 GB | user-space virtual memory, different per mm
+  __________________|____________|__________________|_________|___________________________________________________________
+                    |            |                  |         |
+   0000004000000000 | +256    GB | ffffffbfffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+                    |            |                  |         |     virtual memory addresses up to the -256 GB
+                    |            |                  |         |     starting offset of kernel mappings.
+  __________________|____________|__________________|_________|___________________________________________________________
+                                                              |
+                                                              | Kernel-space virtual memory, shared between all processes:
+  ____________________________________________________________|___________________________________________________________
+                    |            |                  |         |
+   ffffffc000000000 | -256    GB | ffffffc7ffffffff |   32 GB | kasan
+   ffffffcefee00000 | -196    GB | ffffffcefeffffff |    2 MB | fixmap
+   ffffffceff000000 | -196    GB | ffffffceffffffff |   16 MB | PCI io
+   ffffffcf00000000 | -196    GB | ffffffcfffffffff |    4 GB | vmemmap
+   ffffffd000000000 | -192    GB | ffffffdfffffffff |   64 GB | vmalloc/ioremap space
+   ffffffe000000000 | -128    GB | fffffffeffffffff |  124 GB | direct mapping of all physical memory
+  __________________|____________|__________________|_________|____________________________________________________________
+                                                              |
+                                                              |
+  ____________________________________________________________|____________________________________________________________
+                    |            |                  |         |
+   ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | modules
+   ffffffff80000000 |   -2    GB | ffffffffffffffff |    2 GB | kernel, BPF
+  __________________|____________|__________________|_________|____________________________________________________________
index 8129405..16335de 100644 (file)
@@ -16,3 +16,4 @@ Security Documentation
    siphash
    tpm/index
    digsig
+   landlock
diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
new file mode 100644 (file)
index 0000000..2e84925
--- /dev/null
@@ -0,0 +1,85 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+.. Copyright © 2019-2020 ANSSI
+
+==================================
+Landlock LSM: kernel documentation
+==================================
+
+:Author: Mickaël Salaün
+:Date: March 2021
+
+Landlock's goal is to create scoped access-control (i.e. sandboxing).  To
+harden a whole system, this feature should be available to any process,
+including unprivileged ones.  Because such a process may be compromised or
+backdoored (i.e. untrusted), Landlock's features must be safe to use from the
+kernel and other processes' point of view.  Landlock's interface must therefore
+expose a minimal attack surface.
+
+Landlock is designed to be usable by unprivileged processes while following the
+system security policy enforced by other access control mechanisms (e.g. DAC,
+LSM).  Indeed, a Landlock rule shall not interfere with other access-controls
+enforced on the system, only add more restrictions.
+
+Any user can enforce Landlock rulesets on their processes.  They are merged and
+evaluated according to the inherited ones in a way that ensures that only more
+constraints can be added.
+
+User space documentation can be found here: :doc:`/userspace-api/landlock`.
+
+Guiding principles for safe access controls
+===========================================
+
+* A Landlock rule shall be focused on access control on kernel objects instead
+  of syscall filtering (i.e. syscall arguments), which is the purpose of
+  seccomp-bpf.
+* To avoid multiple kinds of side-channel attacks (e.g. leak of security
+  policies, CPU-based attacks), Landlock rules shall not be able to
+  programmatically communicate with user space.
+* Kernel access check shall not slow down access request from unsandboxed
+  processes.
+* Computation related to Landlock operations (e.g. enforcing a ruleset) shall
+  only impact the processes requesting them.
+
+Tests
+=====
+
+Userspace tests for backward compatibility, ptrace restrictions and filesystem
+support can be found here: `tools/testing/selftests/landlock/`_.
+
+Kernel structures
+=================
+
+Object
+------
+
+.. kernel-doc:: security/landlock/object.h
+    :identifiers:
+
+Filesystem
+----------
+
+.. kernel-doc:: security/landlock/fs.h
+    :identifiers:
+
+Ruleset and domain
+------------------
+
+A domain is a read-only ruleset tied to a set of subjects (i.e. tasks'
+credentials).  Each time a ruleset is enforced on a task, the current domain is
+duplicated and the ruleset is imported as a new layer of rules in the new
+domain.  Indeed, once in a domain, each rule is tied to a layer level.  To
+grant access to an object, at least one rule of each layer must allow the
+requested action on the object.  A task can then only transit to a new domain
+that is the intersection of the constraints from the current domain and those
+of a ruleset provided by the task.
+
+The definition of a subject is implicit for a task sandboxing itself, which
+makes the reasoning much easier and helps avoid pitfalls.
+
+.. kernel-doc:: security/landlock/ruleset.h
+    :identifiers:
+
+.. Links
+.. _tools/testing/selftests/landlock/:
+   https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/landlock/
diff --git a/Documentation/trace/coresight/coresight-trbe.rst b/Documentation/trace/coresight/coresight-trbe.rst
new file mode 100644 (file)
index 0000000..b9928ef
--- /dev/null
@@ -0,0 +1,38 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Trace Buffer Extension (TRBE).
+==============================
+
+    :Author:   Anshuman Khandual <anshuman.khandual@arm.com>
+    :Date:     November 2020
+
+Hardware Description
+--------------------
+
+Trace Buffer Extension (TRBE) is a per-CPU hardware unit which captures, in
+system memory, CPU traces generated from a corresponding per-CPU tracing unit.
+It gets plugged in as a coresight sink device because the corresponding trace
+generators (ETE) are plugged in as source devices.
+
+The TRBE is not compliant to CoreSight architecture specifications, but is
+driven via the CoreSight driver framework to support the ETE (which is
+CoreSight compliant) integration.
+
+Sysfs files and directories
+---------------------------
+
+The TRBE devices appear on the existing coresight bus alongside the other
+coresight devices::
+
+       >$ ls /sys/bus/coresight/devices
+       trbe0  trbe1  trbe2 trbe3
+
+The ``trbe<N>`` named TRBEs are associated with a CPU::
+
+       >$ ls /sys/bus/coresight/devices/trbe0/
+        align flag
+
+*Key file items are:-*
+   * ``align``: TRBE write pointer alignment
+   * ``flag``: TRBE updates memory with access and dirty flags
index cc883f8..87d0818 100644 (file)
@@ -51,7 +51,6 @@ quota-tools            3.09               quota -V
 PPP                    2.4.0              pppd --version
 nfs-utils              1.0.5              showmount --version
 procps                 3.2.0              ps --version
-oprofile               0.9                oprofiled --version
 udev                   081                udevd --version
 grub                   0.93               grub --version || grub-install --version
 mcelog                 0.6                mcelog --version
index ee6b20c..d56d6b7 100644 (file)
+.. SPDX-License-Identifier: GPL-2.0
+
 .. raw:: latex
 
        \renewcommand\thesection*
        \renewcommand\thesubsection*
 
+.. _linux_doc_zh:
+
 中文翻译
 ========
 
-这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名开发
-人员做出贡献。 与任何大型社区一样,知道如何完成任务将使得更改合并的过程变得更
-加容易。
 
-翻译计划:
-内核中文文档欢迎任何翻译投稿,特别是关于内核用户和管理员指南部分。
+.. note::
+
+   **翻译计划:**
+   内核中文文档欢迎任何翻译投稿,特别是关于内核用户和管理员指南部分。
+
+许可证文档
+----------
+
+下面的文档介绍了Linux内核源代码的许可证(GPLv2)、如何在源代码树中正确标记
+单个文件的许可证、以及指向完整许可证文本的链接。
+
+* Documentation/translations/zh_CN/process/license-rules.rst
+
+用户文档
+--------
+
+下面的手册是为内核用户编写的——即那些试图让它在给定系统上以最佳方式工作的
+用户。
 
 .. toctree::
    :maxdepth: 2
 
    admin-guide/index
+
+TODOList:
+
+* kbuild/index
+
+固件相关文档
+------------
+
+下列文档描述了内核需要的平台固件相关信息。
+
+TODOList:
+
+* firmware-guide/index
+* devicetree/index
+
+应用程序开发人员文档
+--------------------
+
+用户空间API手册涵盖了描述应用程序开发人员可见内核接口方面的文档。
+
+TODOList:
+
+* userspace-api/index
+
+内核开发简介
+------------
+
+这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名
+开发人员做出贡献。与任何大型社区一样,知道如何完成任务将使得更改合并的过程
+变得更加容易。
+
+.. toctree::
+   :maxdepth: 2
+
    process/index
    dev-tools/index
    doc-guide/index
    kernel-hacking/index
-   filesystems/index
-   arm64/index
-   sound/index
+
+TODOList:
+
+* trace/index
+* maintainer/index
+* fault-injection/index
+* livepatch/index
+* rust/index
+
+内核API文档
+-----------
+
+以下手册从内核开发人员的角度详细介绍了特定的内核子系统是如何工作的。这里的
+大部分信息都是直接从内核源代码获取的,并根据需要添加补充材料(或者至少是在
+我们设法添加的时候——可能不是所有的都是有需要的)。
+
+.. toctree::
+   :maxdepth: 2
+
+   core-api/index
    cpu-freq/index
-   mips/index
    iio/index
+   sound/index
+   filesystems/index
+
+TODOList:
+
+* driver-api/index
+* locking/index
+* accounting/index
+* block/index
+* cdrom/index
+* ide/index
+* fb/index
+* fpga/index
+* hid/index
+* i2c/index
+* isdn/index
+* infiniband/index
+* leds/index
+* netlabel/index
+* networking/index
+* pcmcia/index
+* power/index
+* target/index
+* timers/index
+* spi/index
+* w1/index
+* watchdog/index
+* virt/index
+* input/index
+* hwmon/index
+* gpu/index
+* security/index
+* crypto/index
+* vm/index
+* bpf/index
+* usb/index
+* PCI/index
+* scsi/index
+* misc-devices/index
+* scheduler/index
+* mhi/index
+
+体系结构无关文档
+----------------
+
+TODOList:
+
+* asm-annotations
+
+特定体系结构文档
+----------------
+
+.. toctree::
+   :maxdepth: 2
+
+   mips/index
+   arm64/index
    riscv/index
-   core-api/index
    openrisc/index
 
+TODOList:
+
+* arm/index
+* ia64/index
+* m68k/index
+* nios2/index
+* parisc/index
+* powerpc/index
+* s390/index
+* sh/index
+* sparc/index
+* x86/index
+* xtensa/index
+
+其他文档
+--------
+
+有几份未排序的文档似乎不适合放在文档的其他部分,或者可能需要进行一些调整和/或
+转换为reStructuredText格式,也有可能太旧。
+
+TODOList:
+
+* staging/index
+* watch_queue
+
 目录和表格
 ----------
 
index 1e2438b..0b5eefe 100644 (file)
@@ -18,6 +18,7 @@ place where this information is gathered.
 
    no_new_privs
    seccomp_filter
+   landlock
    unshare
    spec_ctrl
    accelerators/ocxl
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
new file mode 100644 (file)
index 0000000..62c9361
--- /dev/null
@@ -0,0 +1,311 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+.. Copyright © 2019-2020 ANSSI
+.. Copyright © 2021 Microsoft Corporation
+
+=====================================
+Landlock: unprivileged access control
+=====================================
+
+:Author: Mickaël Salaün
+:Date: March 2021
+
+The goal of Landlock is to make it possible to restrict ambient rights (e.g.
+global filesystem access) for a set of processes.  Because Landlock is a
+stackable LSM, it makes it possible to create safe security sandboxes as new
+security layers in addition to the existing system-wide access-controls.  This
+kind of sandbox is expected to help mitigate the security impact of bugs or
+unexpected/malicious behaviors in user space applications.  Landlock empowers
+any process, including unprivileged ones, to securely restrict itself.
+
+Landlock rules
+==============
+
+A Landlock rule describes an action on an object.  An object is currently a
+file hierarchy, and the related filesystem actions are defined with `access
+rights`_.  A set of rules is aggregated in a ruleset, which can then restrict
+the thread enforcing it, and its future children.
+
+Defining and enforcing a security policy
+----------------------------------------
+
+We first need to create the ruleset that will contain our rules.  For this
+example, the ruleset will contain rules that only allow read actions, but write
+actions will be denied.  The ruleset then needs to handle both of these kinds
+of actions.
+
+.. code-block:: c
+
+    int ruleset_fd;
+    struct landlock_ruleset_attr ruleset_attr = {
+        .handled_access_fs =
+            LANDLOCK_ACCESS_FS_EXECUTE |
+            LANDLOCK_ACCESS_FS_WRITE_FILE |
+            LANDLOCK_ACCESS_FS_READ_FILE |
+            LANDLOCK_ACCESS_FS_READ_DIR |
+            LANDLOCK_ACCESS_FS_REMOVE_DIR |
+            LANDLOCK_ACCESS_FS_REMOVE_FILE |
+            LANDLOCK_ACCESS_FS_MAKE_CHAR |
+            LANDLOCK_ACCESS_FS_MAKE_DIR |
+            LANDLOCK_ACCESS_FS_MAKE_REG |
+            LANDLOCK_ACCESS_FS_MAKE_SOCK |
+            LANDLOCK_ACCESS_FS_MAKE_FIFO |
+            LANDLOCK_ACCESS_FS_MAKE_BLOCK |
+            LANDLOCK_ACCESS_FS_MAKE_SYM,
+    };
+
+    ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+    if (ruleset_fd < 0) {
+        perror("Failed to create a ruleset");
+        return 1;
+    }
+
+We can now add a new rule to this ruleset thanks to the returned file
+descriptor referring to this ruleset.  The rule will only allow reading the
+file hierarchy ``/usr``.  Without another rule, write actions would then be
+denied by the ruleset.  To add ``/usr`` to the ruleset, we open it with the
+``O_PATH`` flag and fill the &struct landlock_path_beneath_attr with this file
+descriptor.
+
+.. code-block:: c
+
+    int err;
+    struct landlock_path_beneath_attr path_beneath = {
+        .allowed_access =
+            LANDLOCK_ACCESS_FS_EXECUTE |
+            LANDLOCK_ACCESS_FS_READ_FILE |
+            LANDLOCK_ACCESS_FS_READ_DIR,
+    };
+
+    path_beneath.parent_fd = open("/usr", O_PATH | O_CLOEXEC);
+    if (path_beneath.parent_fd < 0) {
+        perror("Failed to open file");
+        close(ruleset_fd);
+        return 1;
+    }
+    err = landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                            &path_beneath, 0);
+    close(path_beneath.parent_fd);
+    if (err) {
+        perror("Failed to update ruleset");
+        close(ruleset_fd);
+        return 1;
+    }
+
+We now have a ruleset with one rule allowing read access to ``/usr`` while
+denying all other handled accesses for the filesystem.  The next step is to
+restrict the current thread from gaining more privileges (e.g. thanks to a SUID
+binary).
+
+.. code-block:: c
+
+    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+        perror("Failed to restrict privileges");
+        close(ruleset_fd);
+        return 1;
+    }
+
+The current thread is now ready to sandbox itself with the ruleset.
+
+.. code-block:: c
+
+    if (landlock_restrict_self(ruleset_fd, 0)) {
+        perror("Failed to enforce ruleset");
+        close(ruleset_fd);
+        return 1;
+    }
+    close(ruleset_fd);
+
+If the `landlock_restrict_self` system call succeeds, the current thread is now
+restricted and this policy will be enforced on all its subsequently created
+children as well.  Once a thread is landlocked, there is no way to remove its
+security policy; only adding more restrictions is allowed.  These threads are
+now in a new Landlock domain, merge of their parent one (if any) with the new
+ruleset.
+
+Full working code can be found in `samples/landlock/sandboxer.c`_.
+
+Layers of file path access rights
+---------------------------------
+
+Each time a thread enforces a ruleset on itself, it updates its Landlock domain
+with a new layer of policy.  Indeed, this complementary policy is stacked with
+the potentially other rulesets already restricting this thread.  A sandboxed
+thread can then safely add more constraints to itself with a new enforced
+ruleset.
+
+One policy layer grants access to a file path if at least one of its rules
+encountered on the path grants the access.  A sandboxed thread can only access
+a file path if all its enforced policy layers grant the access as well as all
+the other system access controls (e.g. filesystem DAC, other LSM policies,
+etc.).
+
+Bind mounts and OverlayFS
+-------------------------
+
+Landlock makes it possible to restrict access to file hierarchies, which means
+that these access rights can be propagated with bind mounts (cf.
+:doc:`/filesystems/sharedsubtree`) but not with :doc:`/filesystems/overlayfs`.
+
+A bind mount mirrors a source file hierarchy to a destination.  The destination
+hierarchy is then composed of the exact same files, on which Landlock rules can
+be tied, either via the source or the destination path.  These rules restrict
+access when they are encountered on a path, which means that they can restrict
+access to multiple file hierarchies at the same time, whether these hierarchies
+are the result of bind mounts or not.
+
+An OverlayFS mount point consists of upper and lower layers.  These layers are
+combined in a merge directory, result of the mount point.  This merge hierarchy
+may include files from the upper and lower layers, but modifications performed
+on the merge hierarchy are only reflected on the upper layer.  From a Landlock
+policy point of view, each OverlayFS layer and merge hierarchy is standalone
+and contains its own set of files and directories, which is different from
+bind mounts.  A policy restricting an OverlayFS layer will not restrict the
+resulting merged hierarchy, and vice versa.  Landlock users should then only
+think about file hierarchies they want to allow access to, regardless of the
+underlying filesystem.
+
+Inheritance
+-----------
+
+Every new thread resulting from a :manpage:`clone(2)` inherits Landlock domain
+restrictions from its parent.  This is similar to the seccomp inheritance (cf.
+:doc:`/userspace-api/seccomp_filter`) or any other LSM dealing with task's
+:manpage:`credentials(7)`.  For instance, one process's thread may apply
+Landlock rules to itself, but they will not be automatically applied to other
+sibling threads (unlike POSIX thread credential changes, cf.
+:manpage:`nptl(7)`).
+
+When a thread sandboxes itself, we have the guarantee that the related security
+policy will stay enforced on all this thread's descendants.  This allows
+creating standalone and modular security policies per application, which will
+automatically be composed between themselves according to their runtime parent
+policies.
+
+Ptrace restrictions
+-------------------
+
+A sandboxed process has fewer privileges than a non-sandboxed process and must
+then be subject to additional restrictions when manipulating another process.
+To be allowed to use :manpage:`ptrace(2)` and related syscalls on a target
+process, a sandboxed process should have a subset of the target process rules,
+which means the tracee must be in a sub-domain of the tracer.
+
+Kernel interface
+================
+
+Access rights
+-------------
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :identifiers: fs_access
+
+Creating a new ruleset
+----------------------
+
+.. kernel-doc:: security/landlock/syscalls.c
+    :identifiers: sys_landlock_create_ruleset
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :identifiers: landlock_ruleset_attr
+
+Extending a ruleset
+-------------------
+
+.. kernel-doc:: security/landlock/syscalls.c
+    :identifiers: sys_landlock_add_rule
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :identifiers: landlock_rule_type landlock_path_beneath_attr
+
+Enforcing a ruleset
+-------------------
+
+.. kernel-doc:: security/landlock/syscalls.c
+    :identifiers: sys_landlock_restrict_self
+
+Current limitations
+===================
+
+File renaming and linking
+-------------------------
+
+Because Landlock targets unprivileged access controls, it is needed to properly
+handle composition of rules.  Such property also implies rules nesting.
+Properly handling multiple layers of ruleset, each one of them able to restrict
+access to files, also implies to inherit the ruleset restrictions from a parent
+to its hierarchy.  Because files are identified and restricted by their
+hierarchy, moving or linking a file from one directory to another implies to
+propagate the hierarchy constraints.  To protect against privilege escalations
+through renaming or linking, and for the sake of simplicity, Landlock currently
+limits linking and renaming to the same directory.  Future Landlock evolutions
+will enable more flexibility for renaming and linking, with dedicated ruleset
+flags.
+
+Filesystem topology modification
+--------------------------------
+
+As for file renaming and linking, a sandboxed thread cannot modify its
+filesystem topology, whether via :manpage:`mount(2)` or
+:manpage:`pivot_root(2)`.  However, :manpage:`chroot(2)` calls are not denied.
+
+Special filesystems
+-------------------
+
+Access to regular files and directories can be restricted by Landlock,
+according to the handled accesses of a ruleset.  However, files that do not
+come from a user-visible filesystem (e.g. pipe, socket), but can still be
+accessed through ``/proc/<pid>/fd/*``, cannot currently be explicitly
+restricted.  Likewise, some special kernel filesystems such as nsfs, which can
+be accessed through ``/proc/<pid>/ns/*``, cannot currently be explicitly
+restricted.  However, thanks to the `ptrace restrictions`_, access to such
+sensitive ``/proc`` files is automatically restricted according to domain
+hierarchies.  Future Landlock evolutions could still enable to explicitly
+restrict such paths with dedicated ruleset flags.
+
+Ruleset layers
+--------------
+
+There is a limit of 64 layers of stacked rulesets.  This can be an issue for a
+task willing to enforce a new ruleset in complement to its 64 inherited
+rulesets.  Once this limit is reached, sys_landlock_restrict_self() returns
+E2BIG.  It is then strongly suggested to carefully build rulesets once in the
+life of a thread, especially for applications able to launch other applications
+that may also want to sandbox themselves (e.g. shells, container managers,
+etc.).
+
+Memory usage
+------------
+
+Kernel memory allocated to create rulesets is accounted and can be restricted
+by the :doc:`/admin-guide/cgroup-v1/memory`.
+
+Questions and answers
+=====================
+
+What about user space sandbox managers?
+---------------------------------------
+
+Using a user space process to enforce restrictions on kernel resources can lead
+to race conditions or inconsistent evaluations (i.e. `Incorrect mirroring of
+the OS code and state
+<https://www.ndss-symposium.org/ndss2003/traps-and-pitfalls-practical-problems-system-call-interposition-based-security-tools/>`_).
+
+What about namespaces and containers?
+-------------------------------------
+
+Namespaces can help create sandboxes but they are not designed for
+access-control and then miss useful features for such use case (e.g. no
+fine-grained restrictions).  Moreover, their complexity can lead to security
+issues, especially when untrusted processes can manipulate them (cf.
+`Controlling access to user namespaces <https://lwn.net/Articles/673597/>`_).
+
+Additional documentation
+========================
+
+* :doc:`/security/landlock`
+* https://landlock.io
+
+.. Links
+.. _samples/landlock/sandboxer.c:
+   https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/samples/landlock/sandboxer.c
index 469a630..5ec8a19 100644 (file)
@@ -148,6 +148,9 @@ measurement. Since the guest owner knows the initial contents of the guest at
 boot, the measurement can be verified by comparing it to what the guest owner
 expects.
 
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
 Parameters (in): struct  kvm_sev_launch_measure
 
 Returns: 0 on success, -negative on error
@@ -271,6 +274,9 @@ report containing the SHA-256 digest of the guest memory and VMSA passed through
 commands and signed with the PEK. The digest returned by the command should match the digest
 used by the guest owner with the KVM_SEV_LAUNCH_MEASURE.
 
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
 Parameters (in): struct kvm_sev_attestation
 
 Returns: 0 on success, -negative on error
@@ -284,6 +290,143 @@ Returns: 0 on success, -negative on error
                 __u32 len;
         };
 
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_send_start {
+                __u32 policy;                 /* guest policy */
+
+                __u64 pdh_cert_uaddr;         /* platform Diffie-Hellman certificate */
+                __u32 pdh_cert_len;
+
+                __u64 plat_certs_uaddr;        /* platform certificate chain */
+                __u32 plat_certs_len;
+
+                __u64 amd_certs_uaddr;        /* AMD certificate */
+                __u32 amd_certs_len;
+
+                __u64 session_uaddr;          /* Guest session information */
+                __u32 session_len;
+        };
+
+12. KVM_SEV_SEND_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_SEND_UPDATE_DATA command can be used by the hypervisor to encrypt the
+outgoing guest memory region with the encryption context created using
+KVM_SEV_SEND_START.
+
+If hdr_len or trans_len are zero on entry, the length of the packet header and
+transport region are written to hdr_len and trans_len respectively, and all
+other fields are not used.
+
+Parameters (in): struct kvm_sev_send_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_send_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the source memory region to be encrypted */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the destination memory region  */
+                __u32 trans_len;
+        };
+
+13. KVM_SEV_SEND_FINISH
+------------------------
+
+After completion of the migration flow, the KVM_SEV_SEND_FINISH command can be
+issued by the hypervisor to delete the encryption context.
+
+Returns: 0 on success, -negative on error
+
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
+15. KVM_SEV_RECEIVE_START
+-------------------------
+
+The KVM_SEV_RECEIVE_START command is used for creating the memory encryption
+context for an incoming SEV guest. To create the encryption context, the user must
+provide a guest policy, the platform public Diffie-Hellman (PDH) key and session
+information.
+
+Parameters: struct  kvm_sev_receive_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 pdh_uaddr;        /* userspace address pointing to the PDH key */
+                __u32 pdh_len;
+
+                __u64 session_uaddr;    /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.12.
+
+16. KVM_SEV_RECEIVE_UPDATE_DATA
+-------------------------------
+
+The KVM_SEV_RECEIVE_UPDATE_DATA command can be used by the hypervisor to copy
+the incoming buffers into the guest memory region with encryption context
+created during the KVM_SEV_RECEIVE_START.
+
+Parameters (in): struct kvm_sev_receive_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the destination guest memory region */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the incoming buffer memory region  */
+                __u32 trans_len;
+        };
+
+17. KVM_SEV_RECEIVE_FINISH
+--------------------------
+
+After completion of the migration flow, the KVM_SEV_RECEIVE_FINISH command can be
+issued by the hypervisor to make the guest ready for execution.
+
+Returns: 0 on success, -negative on error
+
 References
 ==========
 
index 245d805..22d0775 100644 (file)
@@ -204,7 +204,7 @@ Errors:
 
   ======     ============================================================
   EFAULT     the msr index list cannot be read from or written to
-  E2BIG      the msr index list is to be to fit in the array specified by
+  E2BIG      the msr index list is too big to fit in the array specified by
              the user.
   ======     ============================================================
 
@@ -3116,6 +3116,18 @@ optional features it should have.  This will cause a reset of the cpu
 registers to their initial values.  If this is not called, KVM_RUN will
 return ENOEXEC for that vcpu.
 
+The initial values are defined as:
+       - Processor state:
+               * AArch64: EL1h, D, A, I and F bits set. All other bits
+                 are cleared.
+               * AArch32: SVC, A, I and F bits set. All other bits are
+                 cleared.
+       - General Purpose registers, including PC and SP: set to 0
+       - FPSIMD/NEON registers: set to 0
+       - SVE registers: set to 0
+       - System registers: Reset to their architecturally defined
+         values as for a warm reset to EL1 (resp. SVC)
+
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.
 
@@ -3335,7 +3347,8 @@ The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
   - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_HW:        using hardware debug events [arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
@@ -3358,6 +3371,9 @@ indicating the number of supported registers.
 For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
 the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
 
+Also when supported, KVM_CAP_SET_GUEST_DEBUG2 capability indicates the
+supported KVM_GUESTDBG_* bits in the control field.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
@@ -3690,31 +3706,105 @@ which is the maximum number of possibly pending cpu-local interrupts.
 
 Queues an SMI on the thread's vcpu.
 
-4.97 KVM_CAP_PPC_MULTITCE
--------------------------
+4.97 KVM_X86_SET_MSR_FILTER
+----------------------------
 
-:Capability: KVM_CAP_PPC_MULTITCE
-:Architectures: ppc
-:Type: vm
+:Capability: KVM_CAP_X86_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
 
-This capability means the kernel is capable of handling hypercalls
-H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
-space. This significantly accelerates DMA operations for PPC KVM guests.
-User space should expect that its handlers for these hypercalls
-are not going to be called if user space previously registered LIOBN
-in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+::
 
-In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
-user space might have to advertise it for the guest. For example,
-IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
-present in the "ibm,hypertas-functions" device-tree property.
+  struct kvm_msr_filter_range {
+  #define KVM_MSR_FILTER_READ  (1 << 0)
+  #define KVM_MSR_FILTER_WRITE (1 << 1)
+       __u32 flags;
+       __u32 nmsrs; /* number of msrs in bitmap */
+       __u32 base;  /* MSR index the bitmap starts at */
+       __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+  };
 
-The hypercalls mentioned above may or may not be processed successfully
-in the kernel based fast path. If they can not be handled by the kernel,
-they will get passed on to user space. So user space still has to have
-an implementation for these despite the in kernel acceleration.
+  #define KVM_MSR_FILTER_MAX_RANGES 16
+  struct kvm_msr_filter {
+  #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+  #define KVM_MSR_FILTER_DEFAULT_DENY  (1 << 0)
+       __u32 flags;
+       struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+  };
 
-This capability is always enabled.
+flags values for ``struct kvm_msr_filter_range``:
+
+``KVM_MSR_FILTER_READ``
+
+  Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a read should immediately fail, while a 1 indicates that
+  a read for a particular MSR should be handled regardless of the default
+  filter action.
+
+``KVM_MSR_FILTER_WRITE``
+
+  Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a write should immediately fail, while a 1 indicates that
+  a write for a particular MSR should be handled regardless of the default
+  filter action.
+
+``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
+
+  Filter both read and write accesses to MSRs using the given bitmap. A 0
+  in the bitmap indicates that both reads and writes should immediately fail,
+  while a 1 indicates that reads and writes for a particular MSR are not
+  filtered by this range.
+
+flags values for ``struct kvm_msr_filter``:
+
+``KVM_MSR_FILTER_DEFAULT_ALLOW``
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to allowing access to the MSR.
+
+``KVM_MSR_FILTER_DEFAULT_DENY``
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to rejecting access to the MSR. In this mode, all MSRs that should
+  be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be explicitly filtered for or not.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+default KVM in-kernel emulation behavior is fully preserved.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
+an error.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
+x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
+and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
+register.
+
+If a bit is within one of the defined ranges, read and write accesses are
+guarded by the bitmap's value for the MSR index if the kind of access
+is included in the ``struct kvm_msr_filter_range`` flags.  If no range
+covers this particular access, the behavior is determined by the flags
+field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
+and ``KVM_MSR_FILTER_DEFAULT_DENY``.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range goes from MSR index [base .. base+nmsrs]. The flags field
+indicates whether reads, writes or both reads and writes are filtered
+by setting a 1 bit in the bitmap for the corresponding MSR index.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
+allows user space to deflect and potentially handle various MSR accesses
+into user space.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
 
 4.98 KVM_CREATE_SPAPR_TCE_64
 ----------------------------
@@ -4855,7 +4945,7 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
 KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
   Sets the exception vector used to deliver Xen event channel upcalls.
 
-4.128 KVM_XEN_HVM_GET_ATTR
+4.127 KVM_XEN_HVM_GET_ATTR
 --------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4867,7 +4957,7 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
 Allows Xen VM attributes to be read. For the structure and types,
 see KVM_XEN_HVM_SET_ATTR above.
 
-4.129 KVM_XEN_VCPU_SET_ATTR
+4.128 KVM_XEN_VCPU_SET_ATTR
 ---------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4929,7 +5019,7 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
   or RUNSTATE_offline) to set the current accounted state as of the
   adjusted state_entry_time.
 
-4.130 KVM_XEN_VCPU_GET_ATTR
+4.129 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -6233,6 +6323,45 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
 This capability can be used to check / enable 2nd DAWR feature provided
 by POWER10 processor.
 
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+Architectures: x86 SEV enabled
+Type: vm
+Parameters: args[0] is the fd of the source vm
+Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
+7.25 KVM_CAP_SGX_ATTRIBUTE
+--------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+          attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more privileged enclave attributes.  args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint.  To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
 8. Other capabilities.
 ======================
 
@@ -6727,3 +6856,38 @@ vcpu_info is set.
 The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+
+8.31 KVM_CAP_PPC_MULTITCE
+-------------------------
+
+:Capability: KVM_CAP_PPC_MULTITCE
+:Architectures: ppc
+:Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
+8.32 KVM_CAP_PTP_KVM
+--------------------
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is
+supported in the host. A VMM can check whether the service is
+available to the guest on migration.
index 3e2b2ab..78a9b67 100644 (file)
@@ -10,3 +10,4 @@ ARM
    hyp-abi
    psci
    pvtime
+   ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644 (file)
index 0000000..aecdc80
--- /dev/null
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=============================
+
+PTP_KVM is used for high precision time sync between host and guests.
+It relies on transferring the wall clock and counter value from the
+host to the guest using a KVM-specific hypercall.
+
+* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+This hypercall uses the SMC32/HVC32 calling convention:
+
+ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
+    ==============    ========    =====================================
+    Function ID:      (uint32)    0x86000001
+    Arguments:        (uint32)    KVM_PTP_VIRT_COUNTER(0)
+                                  KVM_PTP_PHYS_COUNTER(1)
+    Return Values:    (int32)     NOT_SUPPORTED(-1) on error, or
+                      (uint32)    Upper 32 bits of wall clock time (r0)
+                      (uint32)    Lower 32 bits of wall clock time (r1)
+                      (uint32)    Upper 32 bits of counter (r2)
+                      (uint32)    Lower 32 bits of counter (r3)
+    Endianness:                   No Restrictions.
+    ==============    ========    =====================================
index 6c304fd..d257edd 100644 (file)
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
     -EFAULT  Invalid guest ram access
     -EBUSY   One or more VCPUS are running
     -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-            state is not available
+            state is not available without GICv4.1
     =======  ==========================================================
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
index 5dd3bff..51e5e57 100644 (file)
@@ -228,7 +228,7 @@ Groups:
 
     KVM_DEV_ARM_VGIC_CTRL_INIT
       request the initialization of the VGIC, no additional parameter in
-      kvm_device_attr.addr.
+      kvm_device_attr.addr. Must be called after all VCPUs have been created.
     KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
       save all LPI pending bits into guest RAM pending tables.
 
index 0aa4817..1fc860c 100644 (file)
@@ -38,25 +38,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the
 following two cases:
 
 1. Access Tracking: The SPTE is not present, but it is marked for access
-   tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
-   restore the saved R/X bits. This is described in more detail later below.
+   tracking. That means we need to restore the saved R/X bits. This is
+   described in more detail later below.
 
-2. Write-Protection: The SPTE is present and the fault is
-   caused by write-protect. That means we just need to change the W bit of
-   the spte.
+2. Write-Protection: The SPTE is present and the fault is caused by
+   write-protect. That means we just need to change the W bit of the spte.
 
-What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
-SPTE_MMU_WRITEABLE bit on the spte:
+What we use to avoid all the races is the Host-writable bit and MMU-writable bit
+on the spte:
 
-- SPTE_HOST_WRITEABLE means the gfn is writable on host.
-- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
-  the gfn is writable on guest mmu and it is not write-protected by shadow
-  page write-protection.
+- Host-writable means the gfn is writable in the host kernel page tables and in
+  its KVM memslot.
+- MMU-writable means the gfn is writable in the guest's mmu and it is not
+  write-protected by shadow page write-protection.
 
 On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
-restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
-is safe because whenever changing these bits can be detected by cmpxchg.
+bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, to restore the saved
+R/X bits for an access-tracked spte, or both. This is safe because any change
+to these bits can be detected by cmpxchg.
 
 But we need carefully check these cases:
 
@@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 Lockless Access Tracking:
 
 This is used for Intel CPUs that are using EPT but do not support the EPT A/D
-bits. In this case, when the KVM MMU notifier is called to track accesses to a
-page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
-by clearing the RWX bits in the PTE and storing the original R & X bits in
-some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
-PTE (using the ignored bit 62). When the VM tries to access the page later on,
-a fault is generated and the fast page fault mechanism described above is used
-to atomically restore the PTE to a Present state. The W bit is not saved when
-the PTE is marked for access tracking and during restoration to the Present
-state, the W bit is set depending on whether or not it was a write access. If
-it wasn't, then the W bit will remain clear until a write access happens, at
-which time it will be set using the Dirty tracking mechanism described above.
+bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and
+when the KVM MMU notifier is called to track accesses to a page (via
+kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware
+by clearing the RWX bits in the PTE and storing the original R & X bits in more
+unused/ignored bits. When the VM tries to access the page later on, a fault is
+generated and the fast page fault mechanism described above is used to
+atomically restore the PTE to a Present state. The W bit is not saved when the
+PTE is marked for access tracking and during restoration to the Present state,
+the W bit is set depending on whether or not it was a write access. If it
+wasn't, then the W bit will remain clear until a write access happens, at which
+time it will be set using the Dirty tracking mechanism described above.
 
 3. Reference
 ------------
index eaac486..ca85f03 100644 (file)
@@ -84,3 +84,36 @@ If the function code specifies 0x501, breakpoint functions may be performed.
 This function code is handled by userspace.
 
 This diagnose function code has no subfunctions and uses no parameters.
+
+
+DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield
+---------------------------------------------------------
+
+General register 1 contains the target CPU address.
+
+In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs,
+DIAGNOSE with function code 0x9c may improve system performance by
+yielding the host CPU on which the guest CPU is running to be assigned
+to another guest CPU, preferably the logical CPU containing the specified
+target CPU.
+
+
+DIAG 'X'9C forwarding
++++++++++++++++++++++
+
+The guest may send a DIAGNOSE 0x9c in order to yield to a certain
+other vcpu. An example is a Linux guest that tries to yield to the vcpu
+that is currently holding a spinlock, but not running.
+
+However, on the host the real cpu backing the vcpu may itself not be
+running.
+Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to
+the backing cpu will hopefully cause that cpu, and thus subsequently
+the guest's vcpu, to be scheduled.
+
+
+diag9c_forwarding_hz
+    KVM kernel parameter that specifies the maximum number of DIAGNOSE
+    0x9c forwards per second, in order to avoid a DIAGNOSE 0x9c
+    forwarding storm.
+    A value of 0 turns the forwarding off.
index 4485641..b792bbd 100644 (file)
@@ -6,9 +6,9 @@
 
 Overview
 ========
-Original x86-64 was limited by 4-level paing to 256 TiB of virtual address
+Original x86-64 was limited by 4-level paging to 256 TiB of virtual address
 space and 64 TiB of physical address space. We are already bumping into
-this limit: some vendors offers servers with 64 TiB of memory today.
+this limit: some vendors offer servers with 64 TiB of memory today.
 
 To overcome the limitation upcoming hardware will introduce support for
 5-level paging. It is a straight-forward extension of the current page
index 5ce47bc..bd7aff0 100644 (file)
@@ -624,6 +624,7 @@ F:  fs/affs/
 
 AFS FILESYSTEM
 M:     David Howells <dhowells@redhat.com>
+M:     Marc Dionne <marc.dionne@auristor.com>
 L:     linux-afs@lists.infradead.org
 S:     Supported
 W:     https://www.infradead.org/~dhowells/kafs/
@@ -1782,6 +1783,8 @@ F:        Documentation/ABI/testing/sysfs-bus-coresight-devices-*
 F:     Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
 F:     Documentation/devicetree/bindings/arm/coresight-cti.yaml
 F:     Documentation/devicetree/bindings/arm/coresight.txt
+F:     Documentation/devicetree/bindings/arm/ete.yaml
+F:     Documentation/devicetree/bindings/arm/trbe.yaml
 F:     Documentation/trace/coresight/*
 F:     drivers/hwtracing/coresight/*
 F:     include/dt-bindings/arm/coresight-cti-dt.h
@@ -3205,6 +3208,22 @@ F:       Documentation/filesystems/bfs.rst
 F:     fs/bfs/
 F:     include/uapi/linux/bfs_fs.h
 
+BITMAP API
+M:     Yury Norov <yury.norov@gmail.com>
+R:     Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+R:     Rasmus Villemoes <linux@rasmusvillemoes.dk>
+S:     Maintained
+F:     include/asm-generic/bitops/find.h
+F:     include/linux/bitmap.h
+F:     lib/bitmap.c
+F:     lib/find_bit.c
+F:     lib/find_bit_benchmark.c
+F:     lib/test_bitmap.c
+F:     tools/include/asm-generic/bitops/find.h
+F:     tools/include/linux/bitmap.h
+F:     tools/lib/bitmap.c
+F:     tools/lib/find_bit.c
+
 BLINKM RGB LED DRIVER
 M:     Jan-Simon Moeller <jansimon.moeller@gmx.de>
 S:     Maintained
@@ -8226,7 +8245,6 @@ F:        drivers/crypto/hisilicon/zip/
 
 HISILICON ROCE DRIVER
 M:     Lijun Ou <oulijun@huawei.com>
-M:     Wei Hu(Xavier) <huwei87@hisilicon.com>
 M:     Weihang Li <liweihang@huawei.com>
 L:     linux-rdma@vger.kernel.org
 S:     Maintained
@@ -8387,6 +8405,13 @@ S:       Maintained
 F:     mm/hwpoison-inject.c
 F:     mm/memory-failure.c
 
+HYCON HY46XX TOUCHSCREEN SUPPORT
+M:     Giulio Benetti <giulio.benetti@benettiengineering.com>
+L:     linux-input@vger.kernel.org
+S:     Maintained
+F:     Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml
+F:     drivers/input/touchscreen/hycon-hy46xx.c
+
 HYGON PROCESSOR SUPPORT
 M:     Pu Wen <puwen@hygon.cn>
 L:     linux-kernel@vger.kernel.org
@@ -9528,6 +9553,7 @@ F:        fs/io-wq.h
 F:     fs/io_uring.c
 F:     include/linux/io_uring.h
 F:     include/uapi/linux/io_uring.h
+F:     tools/io_uring/
 
 IPMI SUBSYSTEM
 M:     Corey Minyard <minyard@acm.org>
@@ -9950,10 +9976,10 @@ F:      virt/kvm/*
 KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
 M:     Marc Zyngier <maz@kernel.org>
 R:     James Morse <james.morse@arm.com>
-R:     Julien Thierry <julien.thierry.kdev@gmail.com>
+R:     Alexandru Elisei <alexandru.elisei@arm.com>
 R:     Suzuki K Poulose <suzuki.poulose@arm.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L:     kvmarm@lists.cs.columbia.edu
+L:     kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
 F:     arch/arm64/include/asm/kvm*
@@ -10169,6 +10195,12 @@ S:     Maintained
 F:     Documentation/devicetree/bindings/leds/backlight/kinetic,ktd253.yaml
 F:     drivers/video/backlight/ktd253-backlight.c
 
+KTEST
+M:     Steven Rostedt <rostedt@goodmis.org>
+M:     John Hawley <warthog9@eaglescrag.net>
+S:     Maintained
+F:     tools/testing/ktest
+
 L3MDEV
 M:     David Ahern <dsahern@kernel.org>
 L:     netdev@vger.kernel.org
@@ -10190,6 +10222,21 @@ F:     net/core/sock_map.c
 F:     net/ipv4/tcp_bpf.c
 F:     net/ipv4/udp_bpf.c
 
+LANDLOCK SECURITY MODULE
+M:     Mickaël Salaün <mic@digikod.net>
+L:     linux-security-module@vger.kernel.org
+S:     Supported
+W:     https://landlock.io
+T:     git https://github.com/landlock-lsm/linux.git
+F:     Documentation/security/landlock.rst
+F:     Documentation/userspace-api/landlock.rst
+F:     include/uapi/linux/landlock.h
+F:     samples/landlock/
+F:     security/landlock/
+F:     tools/testing/selftests/landlock/
+K:     landlock
+K:     LANDLOCK
+
 LANTIQ / INTEL Ethernet drivers
 M:     Hauke Mehrtens <hauke@hauke-m.de>
 L:     netdev@vger.kernel.org
@@ -13961,6 +14008,14 @@ S:     Maintained
 F:     Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
 F:     drivers/pci/controller/dwc/*imx6*
 
+PCI DRIVER FOR FU740
+M:     Paul Walmsley <paul.walmsley@sifive.com>
+M:     Greentime Hu <greentime.hu@sifive.com>
+L:     linux-pci@vger.kernel.org
+S:     Maintained
+F:     Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml
+F:     drivers/pci/controller/dwc/pcie-fu740.c
+
 PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD)
 M:     Jonathan Derrick <jonathan.derrick@intel.com>
 L:     linux-pci@vger.kernel.org
@@ -14045,13 +14100,6 @@ F:     Documentation/devicetree/bindings/pci/ti-pci.txt
 F:     drivers/pci/controller/cadence/pci-j721e.c
 F:     drivers/pci/controller/dwc/pci-dra7xx.c
 
-PCI DRIVER FOR TI KEYSTONE
-M:     Murali Karicheri <m-karicheri2@ti.com>
-L:     linux-pci@vger.kernel.org
-L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-S:     Maintained
-F:     drivers/pci/controller/dwc/pci-keystone.c
-
 PCI DRIVER FOR V3 SEMICONDUCTOR V360EPC
 M:     Linus Walleij <linus.walleij@linaro.org>
 L:     linux-pci@vger.kernel.org
@@ -14168,7 +14216,6 @@ PCIE DRIVER FOR HISILICON
 M:     Zhou Wang <wangzhou1@hisilicon.com>
 L:     linux-pci@vger.kernel.org
 S:     Maintained
-F:     Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
 F:     drivers/pci/controller/dwc/pcie-hisi.c
 
 PCIE DRIVER FOR HISILICON KIRIN
@@ -14188,6 +14235,7 @@ F:      drivers/pci/controller/dwc/pcie-histb.c
 
 PCIE DRIVER FOR MEDIATEK
 M:     Ryder Lee <ryder.lee@mediatek.com>
+M:     Jianjun Wang <jianjun.wang@mediatek.com>
 L:     linux-pci@vger.kernel.org
 L:     linux-mediatek@lists.infradead.org
 S:     Supported
@@ -14289,8 +14337,10 @@ R:     Mark Rutland <mark.rutland@arm.com>
 R:     Alexander Shishkin <alexander.shishkin@linux.intel.com>
 R:     Jiri Olsa <jolsa@redhat.com>
 R:     Namhyung Kim <namhyung@kernel.org>
+L:     linux-perf-users@vger.kernel.org
 L:     linux-kernel@vger.kernel.org
 S:     Supported
+W:     https://perf.wiki.kernel.org/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
 F:     arch/*/events/*
 F:     arch/*/events/*/*
@@ -15173,6 +15223,7 @@ F:      include/linux/if_rmnet.h
 
 QUALCOMM TSENS THERMAL DRIVER
 M:     Amit Kucheria <amitk@kernel.org>
+M:     Thara Gopinath <thara.gopinath@linaro.org>
 L:     linux-pm@vger.kernel.org
 L:     linux-arm-msm@vger.kernel.org
 S:     Maintained
@@ -15826,14 +15877,15 @@ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8xxxu-deve
 F:     drivers/net/wireless/realtek/rtl8xxxu/
 
 RTRS TRANSPORT DRIVERS
-M:     Danil Kipnis <danil.kipnis@cloud.ionos.com>
-M:     Jack Wang <jinpu.wang@cloud.ionos.com>
+M:     Md. Haris Iqbal <haris.iqbal@ionos.com>
+M:     Jack Wang <jinpu.wang@ionos.com>
 L:     linux-rdma@vger.kernel.org
 S:     Maintained
 F:     drivers/infiniband/ulp/rtrs/
 
 RXRPC SOCKETS (AF_RXRPC)
 M:     David Howells <dhowells@redhat.com>
+M:     Marc Dionne <marc.dionne@auristor.com>
 L:     linux-afs@lists.infradead.org
 S:     Supported
 W:     https://www.infradead.org/~dhowells/kafs/
@@ -18077,7 +18129,7 @@ THERMAL/CPU_COOLING
 M:     Amit Daniel Kachhap <amit.kachhap@gmail.com>
 M:     Daniel Lezcano <daniel.lezcano@linaro.org>
 M:     Viresh Kumar <viresh.kumar@linaro.org>
-M:     Javi Merino <javi.merino@kernel.org>
+R:     Lukasz Luba <lukasz.luba@arm.com>
 L:     linux-pm@vger.kernel.org
 S:     Supported
 F:     Documentation/driver-api/thermal/cpu-cooling-api.rst
@@ -18250,13 +18302,6 @@ S:     Maintained
 F:     sound/soc/codecs/isabelle*
 F:     sound/soc/codecs/lm49453*
 
-TI NETCP ETHERNET DRIVER
-M:     Wingman Kwok <w-kwok2@ti.com>
-M:     Murali Karicheri <m-karicheri2@ti.com>
-L:     netdev@vger.kernel.org
-S:     Maintained
-F:     drivers/net/ethernet/ti/netcp*
-
 TI PCM3060 ASoC CODEC DRIVER
 M:     Kirill Marinushkin <kmarinushkin@birdec.com>
 L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
index 9b9a003..15b6476 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1506,9 +1506,9 @@ MRPROPER_FILES += include/config include/generated          \
                  debian snap tar-install \
                  .config .config.old .version \
                  Module.symvers \
-                 signing_key.pem signing_key.priv signing_key.x509     \
-                 x509.genkey extra_certificates signing_key.x509.keyid \
-                 signing_key.x509.signer vmlinux-gdb.py \
+                 certs/signing_key.pem certs/signing_key.x509 \
+                 certs/x509.genkey \
+                 vmlinux-gdb.py \
                  *.spec
 
 # clean - Delete most, but leave enough to build external modules
index bf27159..c45b770 100644 (file)
@@ -1068,6 +1068,13 @@ config COMPAT_32BIT_TIME
 config ARCH_NO_PREEMPT
        bool
 
+config ARCH_EPHEMERAL_INODES
+       def_bool n
+       help
+         An arch should select this symbol if it doesn't keep track of inode
+         instances on its own, but instead relies on something else (e.g. the
+         host kernel for an UML kernel).
+
 config ARCH_SUPPORTS_RT
        bool
 
index 1f6a909..0fab5ac 100644 (file)
@@ -602,11 +602,6 @@ extern void outsl (unsigned long port, const void *src, unsigned long count);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #endif /* __KERNEL__ */
 
 #endif /* __ALPHA_IO_H */
index 63aee5d..82b19c9 100644 (file)
@@ -13,12 +13,12 @@ static char *pc873xx_names[] = {
 static unsigned int base, model;
 
 
-unsigned int __init pc873xx_get_base()
+unsigned int __init pc873xx_get_base(void)
 {
        return base;
 }
 
-char *__init pc873xx_get_model()
+char *__init pc873xx_get_model(void)
 {
        return pc873xx_names[model];
 }
index c5f7e59..5622578 100644 (file)
 551    common  epoll_pwait2                    sys_epoll_pwait2
 552    common  mount_setattr                   sys_mount_setattr
 553    common  quotactl_path                   sys_quotactl_path
+554    common  landlock_create_ruleset         sys_landlock_create_ruleset
+555    common  landlock_add_rule               sys_landlock_add_rule
+556    common  landlock_restrict_self          sys_landlock_restrict_self
index dc68efb..1931a04 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <net/checksum.h>
 
 
 #define ldq_u(x,y) \
index bc8d6ae..2d98501 100644 (file)
@@ -6,6 +6,7 @@
 config ARC
        def_bool y
        select ARC_TIMERS
+       select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DMA_PREP_COHERENT
        select ARCH_HAS_PTE_SPECIAL
@@ -28,6 +29,7 @@ config ARC
        select GENERIC_SMP_IDLE_THREAD
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_FUTEX_CMPXCHG if FUTEX
@@ -48,9 +50,6 @@ config ARC
        select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
        select SET_FS
 
-config ARCH_HAS_CACHE_LINE_SIZE
-       def_bool y
-
 config TRACE_IRQFLAGS_SUPPORT
        def_bool y
 
@@ -86,10 +85,6 @@ config STACKTRACE_SUPPORT
        def_bool y
        select STACKTRACE
 
-config HAVE_ARCH_TRANSPARENT_HUGEPAGE
-       def_bool y
-       depends on ARC_MMU_V4
-
 menu "ARC Architecture Configuration"
 
 menu "ARC Platform/SoC/Board"
index 085c830..24804f1 100644 (file)
@@ -31,6 +31,7 @@ config ARM
        select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
        select ARCH_SUPPORTS_ATOMIC_RMW
+       select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
        select ARCH_USE_BUILTIN_BSWAP
        select ARCH_USE_CMPXCHG_LOCKREF
        select ARCH_USE_MEMTEST
@@ -77,6 +78,7 @@ config ARM
        select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
        select HAVE_ARCH_THREAD_STRUCT_WHITELIST
        select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
        select HAVE_ARM_SMCCC if CPU_V7
        select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
        select HAVE_CONTEXT_TRACKING
@@ -1511,14 +1513,6 @@ config HW_PERF_EVENTS
        def_bool y
        depends on ARM_PMU
 
-config SYS_SUPPORTS_HUGETLBFS
-       def_bool y
-       depends on ARM_LPAE
-
-config HAVE_ARCH_TRANSPARENT_HUGEPAGE
-       def_bool y
-       depends on ARM_LPAE
-
 config ARCH_WANT_GENERAL_HUGETLB
        def_bool y
 
index 182b300..8eb70c1 100644 (file)
@@ -111,8 +111,8 @@ asflags-y := -DZIMAGE
 
 # Supply kernel BSS size to the decompressor via a linker symbol.
 KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \
-               sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \
-                      -e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) )
+               sed -n -e 's/^\([^ ]*\) [ABD] __bss_start$$/-0x\1/p' \
+                      -e 's/^\([^ ]*\) [ABD] __bss_stop$$/+0x\1/p') )) )
 LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ)
 # Supply ZRELADDR to the decompressor via a linker symbol.
 ifneq ($(CONFIG_AUTO_ZRELADDR),y)
index 47a787a..e24230d 100644 (file)
                reg = <0x20050000 0x10>;
                #pwm-cells = <3>;
                clocks = <&cru PCLK_PWM>;
-               clock-names = "pwm";
                pinctrl-names = "default";
                pinctrl-0 = <&pwm0_pin>;
                status = "disabled";
                reg = <0x20050010 0x10>;
                #pwm-cells = <3>;
                clocks = <&cru PCLK_PWM>;
-               clock-names = "pwm";
                pinctrl-names = "default";
                pinctrl-0 = <&pwm1_pin>;
                status = "disabled";
                reg = <0x20050020 0x10>;
                #pwm-cells = <3>;
                clocks = <&cru PCLK_PWM>;
-               clock-names = "pwm";
                pinctrl-names = "default";
                pinctrl-0 = <&pwm2_pin>;
                status = "disabled";
                reg = <0x20050030 0x10>;
                #pwm-cells = <2>;
                clocks = <&cru PCLK_PWM>;
-               clock-names = "pwm";
                pinctrl-names = "default";
                pinctrl-0 = <&pwm3_pin>;
                status = "disabled";
index ea7416c..05557ad 100644 (file)
                pinctrl-names = "default";
                pinctrl-0 = <&pwm0_pin>;
                clocks = <&cru PCLK_RKPWM>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm1_pin>;
                clocks = <&cru PCLK_RKPWM>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm2_pin>;
                clocks = <&cru PCLK_RKPWM>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm3_pin>;
                clocks = <&cru PCLK_RKPWM>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
index e70c997..b935162 100644 (file)
@@ -63,7 +63,6 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_MOUSE_PS2 is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=16
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_RUNTIME_UARTS=2
index 3a7938f..2aa3ebe 100644 (file)
@@ -7,7 +7,6 @@ CONFIG_EXPERT=y
 CONFIG_MODULES=y
 CONFIG_ARCH_FOOTBRIDGE=y
 CONFIG_ARCH_CATS=y
-CONFIG_ARCH_PERSONAL_SERVER=y
 CONFIG_ARCH_EBSA285_HOST=y
 CONFIG_ARCH_NETWINDER=y
 CONFIG_LEDS=y
index b4670d4..abde1fb 100644 (file)
@@ -72,7 +72,6 @@ CONFIG_INPUT_TOUCHSCREEN=y
 CONFIG_INPUT_MISC=y
 CONFIG_INPUT_UINPUT=m
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_PXA=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
index 6834e97..eacc089 100644 (file)
@@ -79,7 +79,6 @@ CONFIG_INPUT_EVBUG=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_NR_UARTS=1
index 1d923db..89f4a6f 100644 (file)
@@ -69,7 +69,6 @@ CONFIG_SMSC911X=y
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_MPS2_UART_CONSOLE=y
 CONFIG_SERIAL_MPS2_UART=y
 # CONFIG_HW_RANDOM is not set
index 4f16716..d57ff30 100644 (file)
@@ -100,7 +100,6 @@ CONFIG_INPUT_EVDEV=y
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
 CONFIG_LEGACY_PTY_COUNT=16
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_RUNTIME_UARTS=2
index f1fbdfc..4d8e7f2 100644 (file)
@@ -53,7 +53,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_PXA=y
 CONFIG_SERIAL_PXA_CONSOLE=y
 # CONFIG_LEGACY_PTYS is not set
index 673c7dd..ba8d9d7 100644 (file)
@@ -88,5 +88,6 @@ extern asmlinkage void c_backtrace(unsigned long fp, int pmode,
 struct mm_struct;
 void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr);
 extern void __show_regs(struct pt_regs *);
+extern void __show_regs_alloc_free(struct pt_regs *regs);
 
 #endif
index df85243..bd61502 100644 (file)
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
index fc74812..f74944c 100644 (file)
@@ -430,11 +430,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #include <asm-generic/io.h>
 
 #ifdef CONFIG_MMU
index 22751b5..e62832d 100644 (file)
@@ -56,9 +56,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
        }
 }
 
-/* Function pointer to optional machine-specific reinitialization */
-extern void (*kexec_reinit)(void);
-
 static inline unsigned long phys_to_boot_phys(phys_addr_t phys)
 {
        return phys_to_idmap(phys);
index 2f841cb..a711322 100644 (file)
@@ -150,21 +150,6 @@ extern unsigned long vectors_base;
  */
 #define PLAT_PHYS_OFFSET       UL(CONFIG_PHYS_OFFSET)
 
-#ifdef CONFIG_XIP_KERNEL
-/*
- * When referencing data in RAM from the XIP region in a relative manner
- * with the MMU off, we need the relative offset between the two physical
- * addresses.  The macro below achieves this, which is:
- *    __pa(v_data) - __xip_pa(v_text)
- */
-#define PHYS_RELATIVE(v_data, v_text) \
-       (((v_data) - PAGE_OFFSET + PLAT_PHYS_OFFSET) - \
-        ((v_text) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR) + \
-          CONFIG_XIP_PHYS_ADDR))
-#else
-#define PHYS_RELATIVE(v_data, v_text) ((v_data) - (v_text))
-#endif
-
 #ifndef __ASSEMBLY__
 
 /*
index a1ceff4..ec17fc0 100644 (file)
@@ -18,12 +18,4 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
 static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
-#ifdef CONFIG_STRICT_KERNEL_RWX
-void set_kernel_text_rw(void);
-void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
-
 #endif
index ce85731..63748af 100644 (file)
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
-generated-y += unistd-common.h
 generated-y += unistd-oabi.h
 generated-y += unistd-eabi.h
 generic-y += kvm_para.h
index 93ecf8a..ae7749e 100644 (file)
@@ -24,7 +24,6 @@
 #include <asm/unistd-oabi.h>
 #endif
 
-#include <asm/unistd-common.h>
 #define __NR_sync_file_range2          __NR_arm_sync_file_range
 
 /*
index be8050b..70993af 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/vdso_datapage.h>
 #include <asm/hardware/cache-l2x0.h>
 #include <linux/kbuild.h>
+#include <linux/arm-smccc.h>
 #include "signal.h"
 
 /*
@@ -148,6 +149,8 @@ int main(void)
   DEFINE(SLEEP_SAVE_SP_PHYS,   offsetof(struct sleep_save_sp, save_ptr_stash_phys));
   DEFINE(SLEEP_SAVE_SP_VIRT,   offsetof(struct sleep_save_sp, save_ptr_stash));
 #endif
+  DEFINE(ARM_SMCCC_QUIRK_ID_OFFS,      offsetof(struct arm_smccc_quirk, id));
+  DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS,   offsetof(struct arm_smccc_quirk, state));
   BLANK();
   DEFINE(DMA_BIDIRECTIONAL,    DMA_BIDIRECTIONAL);
   DEFINE(DMA_TO_DEVICE,                DMA_TO_DEVICE);
index e0d7833..7f0b7ab 100644 (file)
@@ -344,20 +344,19 @@ ENTRY(\sym)
        .size   \sym, . - \sym
        .endm
 
-#define NATIVE(nr, func) syscall nr, func
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, native)
+#define __SYSCALL(nr, func) syscall nr, func
 
 /*
  * This is the syscall table declaration for native ABI syscalls.
  * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall.
  */
        syscall_table_start sys_call_table
-#define COMPAT(nr, native, compat) syscall nr, native
 #ifdef CONFIG_AEABI
 #include <calls-eabi.S>
 #else
 #include <calls-oabi.S>
 #endif
-#undef COMPAT
        syscall_table_end sys_call_table
 
 /*============================================================================
@@ -455,7 +454,8 @@ ENDPROC(sys_oabi_readahead)
  * using the compatibility syscall entries.
  */
        syscall_table_start sys_oabi_call_table
-#define COMPAT(nr, native, compat) syscall nr, compat
+#undef __SYSCALL_WITH_COMPAT
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, compat)
 #include <calls-oabi.S>
        syscall_table_end sys_oabi_call_table
 
index 08660ae..b1423fb 100644 (file)
@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
                        info->trigger = addr;
                        pr_debug("breakpoint fired: address = 0x%x\n", addr);
                        perf_bp_event(bp, regs);
-                       if (!bp->overflow_handler)
+                       if (is_default_overflow_handler(bp))
                                enable_single_step(bp, addr);
                        goto unlock;
                }
index 2b09dad..f567032 100644 (file)
@@ -147,11 +147,6 @@ void machine_crash_shutdown(struct pt_regs *regs)
        pr_info("Loading crashdump kernel...\n");
 }
 
-/*
- * Function pointer to optional machine-specific reinitialization
- */
-void (*kexec_reinit)(void);
-
 void machine_kexec(struct kimage *image)
 {
        unsigned long page_list, reboot_entry_phys;
@@ -187,9 +182,6 @@ void machine_kexec(struct kimage *image)
 
        pr_info("Bye!\n");
 
-       if (kexec_reinit)
-               kexec_reinit();
-
        soft_restart(reboot_entry_phys);
 }
 
index 5199a2b..6324f4d 100644 (file)
@@ -92,6 +92,17 @@ void arch_cpu_idle_exit(void)
        ledtrig_cpu(CPU_LED_IDLE_END);
 }
 
+void __show_regs_alloc_free(struct pt_regs *regs)
+{
+       int i;
+
+       /* check for r0 - r12 only */
+       for (i = 0; i < 13; i++) {
+               pr_alert("Register r%d information:", i);
+               mem_dump_obj((void *)regs->uregs[i]);
+       }
+}
+
 void __show_regs(struct pt_regs *regs)
 {
        unsigned long flags;
index 00664c7..931df62 100644 (file)
@@ -3,7 +3,9 @@
  * Copyright (c) 2015, Linaro Limited
  */
 #include <linux/linkage.h>
+#include <linux/arm-smccc.h>
 
+#include <asm/asm-offsets.h>
 #include <asm/opcodes-sec.h>
 #include <asm/opcodes-virt.h>
 #include <asm/unwind.h>
@@ -27,7 +29,14 @@ UNWIND(      .fnstart)
 UNWIND(        .save   {r4-r7})
        ldm     r12, {r4-r7}
        \instr
-       pop     {r4-r7}
+       ldr     r4, [sp, #36]
+       cmp     r4, #0
+       beq     1f                      // No quirk structure
+       ldr     r5, [r4, #ARM_SMCCC_QUIRK_ID_OFFS]
+       cmp     r5, #ARM_SMCCC_QUIRK_QCOM_A6
+       bne     1f                      // No quirk present
+       str     r6, [r4, #ARM_SMCCC_QUIRK_STATE_OFFS]
+1:     pop     {r4-r7}
        ldr     r12, [sp, #(4 * 4)]
        stm     r12, {r0-r3}
        bx      lr
index 24bd205..43f0a3e 100644 (file)
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/mm_types.h>
@@ -25,6 +26,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
        if (!idmap_pgd)
                return -EINVAL;
 
+       /*
+        * Function graph tracer state gets inconsistent when the kernel
+        * calls functions that never return (aka suspend finishers) hence
+        * disable graph tracing during their execution.
+        */
+       pause_graph_tracing();
+
        /*
         * Provide a temporary page table with an identity mapping for
         * the MMU-enable code, required for resuming.  On successful
@@ -32,6 +40,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
         * back to the correct page tables.
         */
        ret = __cpu_suspend(arg, fn, __mpidr);
+
+       unpause_graph_tracing();
+
        if (ret == 0) {
                cpu_switch_mm(mm->pgd, mm);
                local_flush_bp_all();
@@ -45,7 +56,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
        u32 __mpidr = cpu_logical_map(smp_processor_id());
-       return __cpu_suspend(arg, fn, __mpidr);
+       int ret;
+
+       pause_graph_tracing();
+       ret = __cpu_suspend(arg, fn, __mpidr);
+       unpause_graph_tracing();
+
+       return ret;
 }
 #define        idmap_pgd       NULL
 #endif
index 17d5a78..64308e3 100644 (file)
@@ -287,6 +287,7 @@ static int __die(const char *str, int err, struct pt_regs *regs)
 
        print_modules();
        __show_regs(regs);
+       __show_regs_alloc_free(regs);
        pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n",
                 TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), end_of_stack(tsk));
 
index 844aa58..728aff9 100644 (file)
@@ -16,27 +16,6 @@ config ARCH_CATS
 
          Saying N will reduce the size of the Footbridge kernel.
 
-config ARCH_PERSONAL_SERVER
-       bool "Compaq Personal Server"
-       select FOOTBRIDGE_HOST
-       select ISA
-       select ISA_DMA
-       select FORCE_PCI
-       help
-         Say Y here if you intend to run this kernel on the Compaq
-         Personal Server.
-
-         Saying N will reduce the size of the Footbridge kernel.
-
-         The Compaq Personal Server is not available for purchase.
-         There are no product plans beyond the current research
-         prototypes at this time.  Information is available at:
-
-         <http://www.crl.hpl.hp.com/projects/personalserver/>
-
-         If you have any questions or comments about the  Compaq Personal
-         Server, send e-mail to <skiff@crl.dec.com>.
-
 config ARCH_EBSA285_ADDIN
        bool "EBSA285 (addin mode)"
        select ARCH_EBSA285
index a09f104..6262993 100644 (file)
@@ -11,12 +11,10 @@ pci-y                       += dc21285.o
 pci-$(CONFIG_ARCH_CATS) += cats-pci.o
 pci-$(CONFIG_ARCH_EBSA285_HOST) += ebsa285-pci.o
 pci-$(CONFIG_ARCH_NETWINDER) += netwinder-pci.o
-pci-$(CONFIG_ARCH_PERSONAL_SERVER) += personal-pci.o
 
 obj-$(CONFIG_ARCH_CATS) += cats-hw.o isa-timer.o
 obj-$(CONFIG_ARCH_EBSA285) += ebsa285.o dc21285-timer.o
 obj-$(CONFIG_ARCH_NETWINDER) += netwinder-hw.o isa-timer.o
-obj-$(CONFIG_ARCH_PERSONAL_SERVER) += personal.o dc21285-timer.o
 
 obj-$(CONFIG_PCI)      +=$(pci-y)
 
diff --git a/arch/arm/mach-footbridge/personal-pci.c b/arch/arm/mach-footbridge/personal-pci.c
deleted file mode 100644 (file)
index 9d19aa9..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/arch/arm/mach-footbridge/personal-pci.c
- *
- * PCI bios-type initialisation for PCI machines
- *
- * Bits taken from various places.
- */
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-
-#include <asm/irq.h>
-#include <asm/mach/pci.h>
-#include <asm/mach-types.h>
-
-static int irqmap_personal_server[] = {
-       IRQ_IN0, IRQ_IN1, IRQ_IN2, IRQ_IN3, 0, 0, 0,
-       IRQ_DOORBELLHOST, IRQ_DMA1, IRQ_DMA2, IRQ_PCI
-};
-
-static int personal_server_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
-{
-       unsigned char line;
-
-       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-
-       if (line > 0x40 && line <= 0x5f) {
-               /* line corresponds to the bit controlling this interrupt
-                * in the footbridge.  Ignore the first 8 interrupt bits,
-                * look up the rest in the map.  IN0 is bit number 8
-                */
-               return irqmap_personal_server[(line & 0x1f) - 8];
-       } else if (line == 0) {
-               /* no interrupt */
-               return 0;
-       } else
-               return irqmap_personal_server[(line - 1) & 3];
-}
-
-static struct hw_pci personal_server_pci __initdata = {
-       .map_irq                = personal_server_map_irq,
-       .nr_controllers         = 1,
-       .ops                    = &dc21285_ops,
-       .setup                  = dc21285_setup,
-       .preinit                = dc21285_preinit,
-       .postinit               = dc21285_postinit,
-};
-
-static int __init personal_pci_init(void)
-{
-       if (machine_is_personal_server())
-               pci_common_init(&personal_server_pci);
-       return 0;
-}
-
-subsys_initcall(personal_pci_init);
diff --git a/arch/arm/mach-footbridge/personal.c b/arch/arm/mach-footbridge/personal.c
deleted file mode 100644 (file)
index ca71575..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/arch/arm/mach-footbridge/personal.c
- *
- * Personal server (Skiff) machine fixup
- */
-#include <linux/init.h>
-#include <linux/spinlock.h>
-
-#include <asm/hardware/dec21285.h>
-#include <asm/mach-types.h>
-
-#include <asm/mach/arch.h>
-
-#include "common.h"
-
-MACHINE_START(PERSONAL_SERVER, "Compaq-PersonalServer")
-       /* Maintainer: Jamey Hicks / George France */
-       .atag_offset    = 0x100,
-       .map_io         = footbridge_map_io,
-       .init_irq       = footbridge_init_irq,
-       .init_time      = footbridge_timer_init,
-       .restart        = footbridge_restart,
-MACHINE_END
-
index 78b9a5e..bf99e71 100644 (file)
@@ -116,16 +116,16 @@ static struct hw_pci n2100_pci __initdata = {
 };
 
 /*
- * Both r8169 chips on the n2100 exhibit PCI parity problems.  Set
- * the ->broken_parity_status flag for both ports so that the r8169
- * driver knows it should ignore error interrupts.
+ * Both r8169 chips on the n2100 exhibit PCI parity problems.  Turn
+ * off parity reporting for both ports so we don't get error interrupts
+ * for them.
  */
 static void n2100_fixup_r8169(struct pci_dev *dev)
 {
        if (dev->bus->number == 0 &&
            (dev->devfn == PCI_DEVFN(1, 0) ||
             dev->devfn == PCI_DEVFN(2, 0)))
-               dev->broken_parity_status = 1;
+               pci_disable_parity(dev);
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REALTEK, PCI_ANY_ID, n2100_fixup_r8169);
 
index dc8f152..830bbfb 100644 (file)
@@ -33,41 +33,41 @@ icache_size:
  * processor.  We fix this by performing an invalidate, rather than a
  * clean + invalidate, before jumping into the kernel.
  *
- * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs
- * to be called for both secondary cores startup and primary core resume
- * procedures.
+ * This function needs to be called for both secondary cores startup and
+ * primary core resume procedures.
  */
 ENTRY(v7_invalidate_l1)
-       mov     r0, #0
-       mcr     p15, 2, r0, c0, c0, 0
-       mrc     p15, 1, r0, c0, c0, 0
-
-       movw    r1, #0x7fff
-       and     r2, r1, r0, lsr #13
+       mov     r0, #0
+       mcr     p15, 2, r0, c0, c0, 0   @ select L1 data cache in CSSELR
+       isb
+       mrc     p15, 1, r0, c0, c0, 0   @ read cache geometry from CCSIDR
 
-       movw    r1, #0x3ff
+       movw    r3, #0x3ff
+       and     r3, r3, r0, lsr #3      @ 'Associativity' in CCSIDR[12:3]
+       clz     r1, r3                  @ WayShift
+       mov     r2, #1
+       mov     r3, r3, lsl r1          @ NumWays-1 shifted into bits [31:...]
+       movs    r1, r2, lsl r1          @ #1 shifted left by same amount
+       moveq   r1, #1                  @ r1 needs value > 0 even if only 1 way
 
-       and     r3, r1, r0, lsr #3      @ NumWays - 1
-       add     r2, r2, #1              @ NumSets
+       and     r2, r0, #0x7
+       add     r2, r2, #4              @ SetShift
 
-       and     r0, r0, #0x7
-       add     r0, r0, #4      @ SetShift
+1:     movw    ip, #0x7fff
+       and     r0, ip, r0, lsr #13     @ 'NumSets' in CCSIDR[27:13]
 
-       clz     r1, r3          @ WayShift
-       add     r4, r3, #1      @ NumWays
-1:     sub     r2, r2, #1      @ NumSets--
-       mov     r3, r4          @ Temp = NumWays
-2:     subs    r3, r3, #1      @ Temp--
-       mov     r5, r3, lsl r1
-       mov     r6, r2, lsl r0
-       orr     r5, r5, r6      @ Reg = (Temp<<WayShift)|(NumSets<<SetShift)
-       mcr     p15, 0, r5, c7, c6, 2
-       bgt     2b
-       cmp     r2, #0
-       bgt     1b
-       dsb     st
-       isb
-       ret     lr
+2:     mov     ip, r0, lsl r2          @ NumSet << SetShift
+       orr     ip, ip, r3              @ Reg = (Way<<WayShift)|(Set<<SetShift)
+       mcr     p15, 0, ip, c7, c6, 2
+       subs    r0, r0, #1              @ Set--
+       bpl     2b
+       subs    r3, r3, r1              @ Way--
+       bcc     3f
+       mrc     p15, 1, r0, c0, c0, 0   @ re-read cache geometry from CCSIDR
+       b       1b
+3:     dsb     st
+       isb
+       ret     lr
 ENDPROC(v7_invalidate_l1)
 
 /*
index 93ff009..fb68800 100644 (file)
@@ -420,7 +420,7 @@ void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info)
        note_page(&st, 0, 0, 0, NULL);
 }
 
-static void ptdump_initialize(void)
+static void __init ptdump_initialize(void)
 {
        unsigned i, j;
 
@@ -466,7 +466,7 @@ void ptdump_check_wx(void)
                pr_info("Checked W+X mappings: passed, no W+X pages found\n");
 }
 
-static int ptdump_init(void)
+static int __init ptdump_init(void)
 {
        ptdump_initialize();
        ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables");
index 7022b7b..9d4744a 100644 (file)
@@ -301,7 +301,11 @@ static void __init free_highpages(void)
 void __init mem_init(void)
 {
 #ifdef CONFIG_ARM_LPAE
-       swiotlb_init(1);
+       if (swiotlb_force == SWIOTLB_FORCE ||
+           max_pfn > arm_dma_pfn_limit)
+               swiotlb_init(1);
+       else
+               swiotlb_force = SWIOTLB_NO_FORCE;
 #endif
 
        set_max_mapnr(pfn_to_page(max_pfn) - mem_map);
@@ -485,33 +489,12 @@ static int __mark_rodata_ro(void *unused)
        return 0;
 }
 
-static int kernel_set_to_readonly __read_mostly;
-
 void mark_rodata_ro(void)
 {
-       kernel_set_to_readonly = 1;
        stop_machine(__mark_rodata_ro, NULL, NULL);
        debug_checkwx();
 }
 
-void set_kernel_text_rw(void)
-{
-       if (!kernel_set_to_readonly)
-               return;
-
-       set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false,
-                               current->active_mm);
-}
-
-void set_kernel_text_ro(void)
-{
-       if (!kernel_set_to_readonly)
-               return;
-
-       set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true,
-                               current->active_mm);
-}
-
 #else
 static inline void fix_kernmem_perms(void) { }
 #endif /* CONFIG_STRICT_KERNEL_RWX */
index 28c9d32..26d726a 100644 (file)
@@ -256,6 +256,20 @@ ENDPROC(cpu_pj4b_do_resume)
 
 #endif
 
+       @
+       @ Invoke the v7_invalidate_l1() function, which adheres to the AAPCS
+       @ rules, and so it may corrupt registers that we need to preserve.
+       @
+       .macro  do_invalidate_l1
+       mov     r6, r1
+       mov     r7, r2
+       mov     r10, lr
+       bl      v7_invalidate_l1                @ corrupts {r0-r3, ip, lr}
+       mov     r1, r6
+       mov     r2, r7
+       mov     lr, r10
+       .endm
+
 /*
  *     __v7_setup
  *
@@ -277,6 +291,7 @@ __v7_ca5mp_setup:
 __v7_ca9mp_setup:
 __v7_cr7mp_setup:
 __v7_cr8mp_setup:
+       do_invalidate_l1
        mov     r10, #(1 << 0)                  @ Cache/TLB ops broadcasting
        b       1f
 __v7_ca7mp_setup:
@@ -284,13 +299,9 @@ __v7_ca12mp_setup:
 __v7_ca15mp_setup:
 __v7_b15mp_setup:
 __v7_ca17mp_setup:
+       do_invalidate_l1
        mov     r10, #0
-1:     adr     r0, __v7_setup_stack_ptr
-       ldr     r12, [r0]
-       add     r12, r12, r0                    @ the local stack
-       stmia   r12, {r1-r6, lr}                @ v7_invalidate_l1 touches r0-r6
-       bl      v7_invalidate_l1
-       ldmia   r12, {r1-r6, lr}
+1:
 #ifdef CONFIG_SMP
        orr     r10, r10, #(1 << 6)             @ Enable SMP/nAMP mode
        ALT_SMP(mrc     p15, 0, r0, c1, c0, 1)
@@ -471,12 +482,7 @@ __v7_pj4b_setup:
 #endif /* CONFIG_CPU_PJ4B */
 
 __v7_setup:
-       adr     r0, __v7_setup_stack_ptr
-       ldr     r12, [r0]
-       add     r12, r12, r0                    @ the local stack
-       stmia   r12, {r1-r6, lr}                @ v7_invalidate_l1 touches r0-r6
-       bl      v7_invalidate_l1
-       ldmia   r12, {r1-r6, lr}
+       do_invalidate_l1
 
 __v7_setup_cont:
        and     r0, r9, #0xff000000             @ ARM?
@@ -548,17 +554,8 @@ __errata_finish:
        orr     r0, r0, r6                      @ set them
  THUMB(        orr     r0, r0, #1 << 30        )       @ Thumb exceptions
        ret     lr                              @ return to head.S:__ret
-
-       .align  2
-__v7_setup_stack_ptr:
-       .word   PHYS_RELATIVE(__v7_setup_stack, .)
 ENDPROC(__v7_setup)
 
-       .bss
-       .align  2
-__v7_setup_stack:
-       .space  4 * 7                           @ 7 registers
-
        __INITDATA
 
        .weak cpu_v7_bugs_init
index 598b636..318de96 100644 (file)
@@ -11,20 +11,9 @@ static int ptdump_show(struct seq_file *m, void *v)
        ptdump_walk_pgd(m, info);
        return 0;
 }
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ptdump_show, inode->i_private);
-}
-
-static const struct file_operations ptdump_fops = {
-       .open           = ptdump_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-void ptdump_debugfs_register(struct ptdump_info *info, const char *name)
+void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name)
 {
        debugfs_create_file(name, 0400, NULL, info, &ptdump_fops);
 }
index 977369f..a0dae35 100644 (file)
@@ -55,25 +55,25 @@ void kprobe_arm_test_cases(void)
        TEST_GROUP("Data-processing (register), (register-shifted register), (immediate)")
 
 #define _DATA_PROCESSING_DNM(op,s,val)                                         \
-       TEST_RR(  op "eq" s "   r0,  r",1, VAL1,", r",2, val, "")               \
-       TEST_RR(  op "ne" s "   r1,  r",1, VAL1,", r",2, val, ", lsl #3")       \
-       TEST_RR(  op "cs" s "   r2,  r",3, VAL1,", r",2, val, ", lsr #4")       \
-       TEST_RR(  op "cc" s "   r3,  r",3, VAL1,", r",2, val, ", asr #5")       \
-       TEST_RR(  op "mi" s "   r4,  r",5, VAL1,", r",2, N(val),", asr #6")     \
-       TEST_RR(  op "pl" s "   r5,  r",5, VAL1,", r",2, val, ", ror #7")       \
-       TEST_RR(  op "vs" s "   r6,  r",7, VAL1,", r",2, val, ", rrx")          \
-       TEST_R(   op "vc" s "   r6,  r",7, VAL1,", pc, lsl #3")                 \
-       TEST_R(   op "vc" s "   r6,  r",7, VAL1,", sp, lsr #4")                 \
-       TEST_R(   op "vc" s "   r6,  pc, r",7, VAL1,", asr #5")                 \
-       TEST_R(   op "vc" s "   r6,  sp, r",7, VAL1,", ror #6")                 \
-       TEST_RRR( op "hi" s "   r8,  r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\
-       TEST_RRR( op "ls" s "   r9,  r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\
-       TEST_RRR( op "ge" s "   r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\
-       TEST_RRR( op "lt" s "   r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\
-       TEST_RR(  op "gt" s "   r12, r13"       ", r",14,val, ", ror r",14,7,"")\
-       TEST_RR(  op "le" s "   r14, r",0, val, ", r13"       ", lsl r",14,8,"")\
-       TEST_R(   op "eq" s "   r0,  r",11,VAL1,", #0xf5")                      \
-       TEST_R(   op "ne" s "   r11, r",0, VAL1,", #0xf5000000")                \
+       TEST_RR(  op s "eq      r0,  r",1, VAL1,", r",2, val, "")               \
+       TEST_RR(  op s "ne      r1,  r",1, VAL1,", r",2, val, ", lsl #3")       \
+       TEST_RR(  op s "cs      r2,  r",3, VAL1,", r",2, val, ", lsr #4")       \
+       TEST_RR(  op s "cc      r3,  r",3, VAL1,", r",2, val, ", asr #5")       \
+       TEST_RR(  op s "mi      r4,  r",5, VAL1,", r",2, N(val),", asr #6")     \
+       TEST_RR(  op s "pl      r5,  r",5, VAL1,", r",2, val, ", ror #7")       \
+       TEST_RR(  op s "vs      r6,  r",7, VAL1,", r",2, val, ", rrx")          \
+       TEST_R(   op s "vc      r6,  r",7, VAL1,", pc, lsl #3")                 \
+       TEST_R(   op s "vc      r6,  r",7, VAL1,", sp, lsr #4")                 \
+       TEST_R(   op s "vc      r6,  pc, r",7, VAL1,", asr #5")                 \
+       TEST_R(   op s "vc      r6,  sp, r",7, VAL1,", ror #6")                 \
+       TEST_RRR( op s "hi      r8,  r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\
+       TEST_RRR( op s "ls      r9,  r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\
+       TEST_RRR( op s "ge      r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\
+       TEST_RRR( op s "lt      r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\
+       TEST_RR(  op s "gt      r12, r13"       ", r",14,val, ", ror r",14,7,"")\
+       TEST_RR(  op s "le      r14, r",0, val, ", r13"       ", lsl r",14,8,"")\
+       TEST_R(   op s "eq      r0,  r",11,VAL1,", #0xf5")                      \
+       TEST_R(   op s "ne      r11, r",0, VAL1,", #0xf5000000")                \
        TEST_R(   op s "        r7,  r",8, VAL2,", #0x000af000")                \
        TEST(     op s "        r4,  pc"        ", #0x00005a00")
 
@@ -104,23 +104,23 @@ void kprobe_arm_test_cases(void)
        TEST_R(   op "  r",8, VAL2,", #0x000af000")
 
 #define _DATA_PROCESSING_DM(op,s,val)                                  \
-       TEST_R(   op "eq" s "   r0,  r",1, val, "")                     \
-       TEST_R(   op "ne" s "   r1,  r",1, val, ", lsl #3")             \
-       TEST_R(   op "cs" s "   r2,  r",3, val, ", lsr #4")             \
-       TEST_R(   op "cc" s "   r3,  r",3, val, ", asr #5")             \
-       TEST_R(   op "mi" s "   r4,  r",5, N(val),", asr #6")           \
-       TEST_R(   op "pl" s "   r5,  r",5, val, ", ror #7")             \
-       TEST_R(   op "vs" s "   r6,  r",10,val, ", rrx")                \
-       TEST(     op "vs" s "   r7,  pc, lsl #3")                       \
-       TEST(     op "vs" s "   r7,  sp, lsr #4")                       \
-       TEST_RR(  op "vc" s "   r8,  r",7, val, ", lsl r",0, 3,"")      \
-       TEST_RR(  op "hi" s "   r9,  r",9, val, ", lsr r",7, 4,"")      \
-       TEST_RR(  op "ls" s "   r10, r",9, val, ", asr r",7, 5,"")      \
-       TEST_RR(  op "ge" s "   r11, r",11,N(val),", asr r",7, 6,"")    \
-       TEST_RR(  op "lt" s "   r12, r",11,val, ", ror r",14,7,"")      \
-       TEST_R(   op "gt" s "   r14, r13"       ", lsl r",14,8,"")      \
-       TEST(     op "eq" s "   r0,  #0xf5")                            \
-       TEST(     op "ne" s "   r11, #0xf5000000")                      \
+       TEST_R(   op s "eq      r0,  r",1, val, "")                     \
+       TEST_R(   op s "ne      r1,  r",1, val, ", lsl #3")             \
+       TEST_R(   op s "cs      r2,  r",3, val, ", lsr #4")             \
+       TEST_R(   op s "cc      r3,  r",3, val, ", asr #5")             \
+       TEST_R(   op s "mi      r4,  r",5, N(val),", asr #6")           \
+       TEST_R(   op s "pl      r5,  r",5, val, ", ror #7")             \
+       TEST_R(   op s "vs      r6,  r",10,val, ", rrx")                \
+       TEST(     op s "vs      r7,  pc, lsl #3")                       \
+       TEST(     op s "vs      r7,  sp, lsr #4")                       \
+       TEST_RR(  op s "vc      r8,  r",7, val, ", lsl r",0, 3,"")      \
+       TEST_RR(  op s "hi      r9,  r",9, val, ", lsr r",7, 4,"")      \
+       TEST_RR(  op s "ls      r10, r",9, val, ", asr r",7, 5,"")      \
+       TEST_RR(  op s "ge      r11, r",11,N(val),", asr r",7, 6,"")    \
+       TEST_RR(  op s "lt      r12, r",11,val, ", ror r",14,7,"")      \
+       TEST_R(   op s "gt      r14, r13"       ", lsl r",14,8,"")      \
+       TEST(     op s "eq      r0,  #0xf5")                            \
+       TEST(     op s "ne      r11, #0xf5000000")                      \
        TEST(     op s "        r7,  #0x000af000")                      \
        TEST(     op s "        r4,  #0x00005a00")
 
@@ -166,10 +166,10 @@ void kprobe_arm_test_cases(void)
 
        /* Data-processing with PC as a target and status registers updated */
        TEST_UNSUPPORTED("movs  pc, r1")
-       TEST_UNSUPPORTED("movs  pc, r1, lsl r2")
+       TEST_UNSUPPORTED(__inst_arm(0xe1b0f211) "       @movs   pc, r1, lsl r2")
        TEST_UNSUPPORTED("movs  pc, #0x10000")
        TEST_UNSUPPORTED("adds  pc, lr, r1")
-       TEST_UNSUPPORTED("adds  pc, lr, r1, lsl r2")
+       TEST_UNSUPPORTED(__inst_arm(0xe09ef211) "       @adds   pc, lr, r1, lsl r2")
        TEST_UNSUPPORTED("adds  pc, lr, #4")
 
        /* Data-processing with SP as target */
@@ -352,7 +352,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe000029f) " @ mul r0, pc, r2")
        TEST_UNSUPPORTED(__inst_arm(0xe0000f91) " @ mul r0, r1, pc")
        TEST_RR(    "muls       r0, r",1, VAL1,", r",2, VAL2,"")
-       TEST_RR(    "mullss     r7, r",8, VAL2,", r",9, VAL2,"")
+       TEST_RR(    "mulsls     r7, r",8, VAL2,", r",9, VAL2,"")
        TEST_R(     "muls       lr, r",4, VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe01f0291) " @ muls pc, r1, r2")
 
@@ -361,7 +361,7 @@ void kprobe_arm_test_cases(void)
        TEST_RR(     "mla       lr, r",1, VAL2,", r",2, VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe02f3291) " @ mla pc, r1, r2, r3")
        TEST_RRR(    "mlas      r0, r",1, VAL1,", r",2, VAL2,", r",3,  VAL3,"")
-       TEST_RRR(    "mlahis    r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"")
+       TEST_RRR(    "mlashi    r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"")
        TEST_RR(     "mlas      lr, r",1, VAL2,", r",2, VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe03f3291) " @ mlas pc, r1, r2, r3")
 
@@ -394,7 +394,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe081f392) " @ umull pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe08f1392) " @ umull r1, pc, r2, r3")
        TEST_RR(  "umulls       r0, r1, r",2, VAL1,", r",3, VAL2,"")
-       TEST_RR(  "umulllss     r7, r8, r",9, VAL2,", r",10, VAL1,"")
+       TEST_RR(  "umullsls     r7, r8, r",9, VAL2,", r",10, VAL1,"")
        TEST_R(   "umulls       lr, r12, r",11,VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe091f392) " @ umulls pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe09f1392) " @ umulls r1, pc, r2, r3")
@@ -405,7 +405,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0af1392) " @ umlal pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0a1f392) " @ umlal r1, pc, r2, r3")
        TEST_RRRR(  "umlals     r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4)
-       TEST_RRRR(  "umlalles   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
+       TEST_RRRR(  "umlalsle   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
        TEST_RRR(   "umlals     r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe0bf1392) " @ umlals pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0b1f392) " @ umlals r1, pc, r2, r3")
@@ -416,7 +416,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0c1f392) " @ smull pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0cf1392) " @ smull r1, pc, r2, r3")
        TEST_RR(  "smulls       r0, r1, r",2, VAL1,", r",3, VAL2,"")
-       TEST_RR(  "smulllss     r7, r8, r",9, VAL2,", r",10, VAL1,"")
+       TEST_RR(  "smullsls     r7, r8, r",9, VAL2,", r",10, VAL1,"")
        TEST_R(   "smulls       lr, r12, r",11,VAL3,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe0d1f392) " @ smulls pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0df1392) " @ smulls r1, pc, r2, r3")
@@ -427,7 +427,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0ef1392) " @ smlal pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0e1f392) " @ smlal r1, pc, r2, r3")
        TEST_RRRR(  "smlals     r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4)
-       TEST_RRRR(  "smlalles   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
+       TEST_RRRR(  "smlalsle   r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3)
        TEST_RRR(   "smlals     r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13")
        TEST_UNSUPPORTED(__inst_arm(0xe0ff1392) " @ smlals pc, r1, r2, r3")
        TEST_UNSUPPORTED(__inst_arm(0xe0f0f392) " @ smlals r0, pc, r2, r3")
@@ -450,7 +450,7 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe10f0091) " @ swp r0, r1, [pc]")
 #if __LINUX_ARM_ARCH__ < 6
        TEST_RP("swpb   lr, r",7,VAL2,", [r",8,0,"]")
-       TEST_R( "swpvsb r0, r",1,VAL1,", [sp]")
+       TEST_R( "swpbvs r0, r",1,VAL1,", [sp]")
 #else
        TEST_UNSUPPORTED(__inst_arm(0xe148e097) " @ swpb        lr, r7, [r8]")
        TEST_UNSUPPORTED(__inst_arm(0x614d0091) " @ swpvsb      r0, r1, [sp]")
@@ -477,11 +477,11 @@ void kprobe_arm_test_cases(void)
        TEST_GROUP("Extra load/store instructions")
 
        TEST_RPR(  "strh        r",0, VAL1,", [r",1, 48,", -r",2, 24,"]")
-       TEST_RPR(  "streqh      r",14,VAL2,", [r",11,0, ", r",12, 48,"]")
-       TEST_UNSUPPORTED(  "streqh      r14, [r13, r12]")
-       TEST_UNSUPPORTED(  "streqh      r14, [r12, r13]")
+       TEST_RPR(  "strheq      r",14,VAL2,", [r",11,0, ", r",12, 48,"]")
+       TEST_UNSUPPORTED(  "strheq      r14, [r13, r12]")
+       TEST_UNSUPPORTED(  "strheq      r14, [r12, r13]")
        TEST_RPR(  "strh        r",1, VAL1,", [r",2, 24,", r",3,  48,"]!")
-       TEST_RPR(  "strneh      r",12,VAL2,", [r",11,48,", -r",10,24,"]!")
+       TEST_RPR(  "strhne      r",12,VAL2,", [r",11,48,", -r",10,24,"]!")
        TEST_RPR(  "strh        r",2, VAL1,", [r",3, 24,"], r",4, 48,"")
        TEST_RPR(  "strh        r",10,VAL2,", [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1afc0ba) "       @ strh r12, [pc, r10]!")
@@ -489,9 +489,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe089a0bf) "       @ strh r10, [r9], pc")
 
        TEST_PR(   "ldrh        r0, [r",0,  48,", -r",2, 24,"]")
-       TEST_PR(   "ldrcsh      r14, [r",13,0, ", r",12, 48,"]")
+       TEST_PR(   "ldrhcs      r14, [r",13,0, ", r",12, 48,"]")
        TEST_PR(   "ldrh        r1, [r",2,  24,", r",3,  48,"]!")
-       TEST_PR(   "ldrcch      r12, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrhcc      r12, [r",11,48,", -r",10,24,"]!")
        TEST_PR(   "ldrh        r2, [r",3,  24,"], r",4, 48,"")
        TEST_PR(   "ldrh        r10, [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1bfc0ba) "       @ ldrh r12, [pc, r10]!")
@@ -499,9 +499,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe099a0bf) "       @ ldrh r10, [r9], pc")
 
        TEST_RP(   "strh        r",0, VAL1,", [r",1, 24,", #-2]")
-       TEST_RP(   "strmih      r",14,VAL2,", [r",13,0, ", #2]")
+       TEST_RP(   "strhmi      r",14,VAL2,", [r",13,0, ", #2]")
        TEST_RP(   "strh        r",1, VAL1,", [r",2, 24,", #4]!")
-       TEST_RP(   "strplh      r",12,VAL2,", [r",11,24,", #-4]!")
+       TEST_RP(   "strhpl      r",12,VAL2,", [r",11,24,", #-4]!")
        TEST_RP(   "strh        r",2, VAL1,", [r",3, 24,"], #48")
        TEST_RP(   "strh        r",10,VAL2,", [r",9, 64,"], #-48")
        TEST_RP(   "strh        r",3, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!")
@@ -511,9 +511,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0c9f3b0) "       @ strh pc, [r9], #48")
 
        TEST_P(    "ldrh        r0, [r",0,  24,", #-2]")
-       TEST_P(    "ldrvsh      r14, [r",13,0, ", #2]")
+       TEST_P(    "ldrhvs      r14, [r",13,0, ", #2]")
        TEST_P(    "ldrh        r1, [r",2,  24,", #4]!")
-       TEST_P(    "ldrvch      r12, [r",11,24,", #-4]!")
+       TEST_P(    "ldrhvc      r12, [r",11,24,", #-4]!")
        TEST_P(    "ldrh        r2, [r",3,  24,"], #48")
        TEST_P(    "ldrh        r10, [r",9, 64,"], #-48")
        TEST(      "ldrh        r0, [pc, #0]")
@@ -521,18 +521,18 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0d9f3b0) "       @ ldrh pc, [r9], #48")
 
        TEST_PR(   "ldrsb       r0, [r",0,  48,", -r",2, 24,"]")
-       TEST_PR(   "ldrhisb     r14, [r",13,0,", r",12,  48,"]")
+       TEST_PR(   "ldrsbhi     r14, [r",13,0,", r",12,  48,"]")
        TEST_PR(   "ldrsb       r1, [r",2,  24,", r",3,  48,"]!")
-       TEST_PR(   "ldrlssb     r12, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrsbls     r12, [r",11,48,", -r",10,24,"]!")
        TEST_PR(   "ldrsb       r2, [r",3,  24,"], r",4, 48,"")
        TEST_PR(   "ldrsb       r10, [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1bfc0da) "       @ ldrsb r12, [pc, r10]!")
        TEST_UNSUPPORTED(__inst_arm(0xe099f0db) "       @ ldrsb pc, [r9], r11")
 
        TEST_P(    "ldrsb       r0, [r",0,  24,", #-1]")
-       TEST_P(    "ldrgesb     r14, [r",13,0, ", #1]")
+       TEST_P(    "ldrsbge     r14, [r",13,0, ", #1]")
        TEST_P(    "ldrsb       r1, [r",2,  24,", #4]!")
-       TEST_P(    "ldrltsb     r12, [r",11,24,", #-4]!")
+       TEST_P(    "ldrsblt     r12, [r",11,24,", #-4]!")
        TEST_P(    "ldrsb       r2, [r",3,  24,"], #48")
        TEST_P(    "ldrsb       r10, [r",9, 64,"], #-48")
        TEST(      "ldrsb       r0, [pc, #0]")
@@ -540,18 +540,18 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe0d9f3d0) "       @ ldrsb pc, [r9], #48")
 
        TEST_PR(   "ldrsh       r0, [r",0,  48,", -r",2, 24,"]")
-       TEST_PR(   "ldrgtsh     r14, [r",13,0, ", r",12, 48,"]")
+       TEST_PR(   "ldrshgt     r14, [r",13,0, ", r",12, 48,"]")
        TEST_PR(   "ldrsh       r1, [r",2,  24,", r",3,  48,"]!")
-       TEST_PR(   "ldrlesh     r12, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrshle     r12, [r",11,48,", -r",10,24,"]!")
        TEST_PR(   "ldrsh       r2, [r",3,  24,"], r",4, 48,"")
        TEST_PR(   "ldrsh       r10, [r",9, 48,"], -r",11,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1bfc0fa) "       @ ldrsh r12, [pc, r10]!")
        TEST_UNSUPPORTED(__inst_arm(0xe099f0fb) "       @ ldrsh pc, [r9], r11")
 
        TEST_P(    "ldrsh       r0, [r",0,  24,", #-1]")
-       TEST_P(    "ldreqsh     r14, [r",13,0 ,", #1]")
+       TEST_P(    "ldrsheq     r14, [r",13,0 ,", #1]")
        TEST_P(    "ldrsh       r1, [r",2,  24,", #4]!")
-       TEST_P(    "ldrnesh     r12, [r",11,24,", #-4]!")
+       TEST_P(    "ldrshne     r12, [r",11,24,", #-4]!")
        TEST_P(    "ldrsh       r2, [r",3,  24,"], #48")
        TEST_P(    "ldrsh       r10, [r",9, 64,"], #-48")
        TEST(      "ldrsh       r0, [pc, #0]")
@@ -571,30 +571,30 @@ void kprobe_arm_test_cases(void)
 
 #if __LINUX_ARM_ARCH__ >= 5
        TEST_RPR(  "strd        r",0, VAL1,", [r",1, 48,", -r",2,24,"]")
-       TEST_RPR(  "strccd      r",8, VAL2,", [r",11,0, ", r",12,48,"]")
-       TEST_UNSUPPORTED(  "strccd r8, [r13, r12]")
-       TEST_UNSUPPORTED(  "strccd r8, [r12, r13]")
+       TEST_RPR(  "strdcc      r",8, VAL2,", [r",11,0, ", r",12,48,"]")
+       TEST_UNSUPPORTED(  "strdcc r8, [r13, r12]")
+       TEST_UNSUPPORTED(  "strdcc r8, [r12, r13]")
        TEST_RPR(  "strd        r",4, VAL1,", [r",2, 24,", r",3, 48,"]!")
-       TEST_RPR(  "strcsd      r",12,VAL2,", [r",11,48,", -r",10,24,"]!")
-       TEST_RPR(  "strd        r",2, VAL1,", [r",5, 24,"], r",4,48,"")
-       TEST_RPR(  "strd        r",10,VAL2,", [r",9, 48,"], -r",7,24,"")
+       TEST_RPR(  "strdcs      r",12,VAL2,", r13, [r",11,48,", -r",10,24,"]!")
+       TEST_RPR(  "strd        r",2, VAL1,", r3, [r",5, 24,"], r",4,48,"")
+       TEST_RPR(  "strd        r",10,VAL2,", r11, [r",9, 48,"], -r",7,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1afc0fa) "       @ strd r12, [pc, r10]!")
 
        TEST_PR(   "ldrd        r0, [r",0, 48,", -r",2,24,"]")
-       TEST_PR(   "ldrmid      r8, [r",13,0, ", r",12,48,"]")
+       TEST_PR(   "ldrdmi      r8, [r",13,0, ", r",12,48,"]")
        TEST_PR(   "ldrd        r4, [r",2, 24,", r",3, 48,"]!")
-       TEST_PR(   "ldrpld      r6, [r",11,48,", -r",10,24,"]!")
-       TEST_PR(   "ldrd        r2, [r",5, 24,"], r",4,48,"")
-       TEST_PR(   "ldrd        r10, [r",9,48,"], -r",7,24,"")
+       TEST_PR(   "ldrdpl      r6, [r",11,48,", -r",10,24,"]!")
+       TEST_PR(   "ldrd        r2, r3, [r",5, 24,"], r",4,48,"")
+       TEST_PR(   "ldrd        r10, r11, [r",9,48,"], -r",7,24,"")
        TEST_UNSUPPORTED(__inst_arm(0xe1afc0da) "       @ ldrd r12, [pc, r10]!")
        TEST_UNSUPPORTED(__inst_arm(0xe089f0db) "       @ ldrd pc, [r9], r11")
        TEST_UNSUPPORTED(__inst_arm(0xe089e0db) "       @ ldrd lr, [r9], r11")
        TEST_UNSUPPORTED(__inst_arm(0xe089c0df) "       @ ldrd r12, [r9], pc")
 
        TEST_RP(   "strd        r",0, VAL1,", [r",1, 24,", #-8]")
-       TEST_RP(   "strvsd      r",8, VAL2,", [r",13,0, ", #8]")
+       TEST_RP(   "strdvs      r",8, VAL2,", [r",13,0, ", #8]")
        TEST_RP(   "strd        r",4, VAL1,", [r",2, 24,", #16]!")
-       TEST_RP(   "strvcd      r",12,VAL2,", [r",11,24,", #-16]!")
+       TEST_RP(   "strdvc      r",12,VAL2,", r13, [r",11,24,", #-16]!")
        TEST_RP(   "strd        r",2, VAL1,", [r",4, 24,"], #48")
        TEST_RP(   "strd        r",10,VAL2,", [r",9, 64,"], #-48")
        TEST_RP(   "strd        r",6, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!")
@@ -603,9 +603,9 @@ void kprobe_arm_test_cases(void)
        TEST_UNSUPPORTED(__inst_arm(0xe1efc3f0) "       @ strd r12, [pc, #48]!")
 
        TEST_P(    "ldrd        r0, [r",0, 24,", #-8]")
-       TEST_P(    "ldrhid      r8, [r",13,0, ", #8]")
+       TEST_P(    "ldrdhi      r8, [r",13,0, ", #8]")
        TEST_P(    "ldrd        r4, [r",2, 24,", #16]!")
-       TEST_P(    "ldrlsd      r6, [r",11,24,", #-16]!")
+       TEST_P(    "ldrdls      r6, [r",11,24,", #-16]!")
        TEST_P(    "ldrd        r2, [r",5, 24,"], #48")
        TEST_P(    "ldrd        r10, [r",9,6,"], #-48")
        TEST_UNSUPPORTED(__inst_arm(0xe1efc3d0) "       @ ldrd r12, [pc, #48]!")
@@ -1084,63 +1084,63 @@ void kprobe_arm_test_cases(void)
        TEST_GROUP("Branch, branch with link, and block data transfer")
 
        TEST_P(   "stmda        r",0, 16*4,", {r0}")
-       TEST_P(   "stmeqda      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmneda      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmdaeq      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmdane      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmda        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmda        r",13,0,   "!, {pc}")
 
        TEST_P(   "ldmda        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmcsda      r",4, 15*4,", {r0-r15}")
-       TEST_BF_P("ldmccda      r",7, 15*4,"!, {r8-r15}")
+       TEST_BF_P("ldmdacs      r",4, 15*4,", {r0-r15}")
+       TEST_BF_P("ldmdacc      r",7, 15*4,"!, {r8-r15}")
        TEST_P(   "ldmda        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmda        r",14,15*4,"!, {pc}")
 
        TEST_P(   "stmia        r",0, 16*4,", {r0}")
-       TEST_P(   "stmmiia      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmplia      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmiami      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmiapl      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmia        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmia        r",14,0,   "!, {pc}")
 
        TEST_P(   "ldmia        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmvsia      r",4, 0,   ", {r0-r15}")
-       TEST_BF_P("ldmvcia      r",7, 8*4, "!, {r8-r15}")
+       TEST_BF_P("ldmiavs      r",4, 0,   ", {r0-r15}")
+       TEST_BF_P("ldmiavc      r",7, 8*4, "!, {r8-r15}")
        TEST_P(   "ldmia        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmia        r",14,15*4,"!, {pc}")
 
        TEST_P(   "stmdb        r",0, 16*4,", {r0}")
-       TEST_P(   "stmhidb      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmlsdb      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmdbhi      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmdbls      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmdb        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmdb        r",13,4,   "!, {pc}")
 
        TEST_P(   "ldmdb        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmgedb      r",4, 16*4,", {r0-r15}")
-       TEST_BF_P("ldmltdb      r",7, 16*4,"!, {r8-r15}")
+       TEST_BF_P("ldmdbge      r",4, 16*4,", {r0-r15}")
+       TEST_BF_P("ldmdblt      r",7, 16*4,"!, {r8-r15}")
        TEST_P(   "ldmdb        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmdb        r",14,16*4,"!, {pc}")
 
        TEST_P(   "stmib        r",0, 16*4,", {r0}")
-       TEST_P(   "stmgtib      r",4, 16*4,", {r0-r15}")
-       TEST_P(   "stmleib      r",8, 16*4,"!, {r8-r15}")
+       TEST_P(   "stmibgt      r",4, 16*4,", {r0-r15}")
+       TEST_P(   "stmible      r",8, 16*4,"!, {r8-r15}")
        TEST_P(   "stmib        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_P(   "stmib        r",13,-4,  "!, {pc}")
 
        TEST_P(   "ldmib        r",0, 16*4,", {r0}")
-       TEST_BF_P("ldmeqib      r",4, -4,", {r0-r15}")
-       TEST_BF_P("ldmneib      r",7, 7*4,"!, {r8-r15}")
+       TEST_BF_P("ldmibeq      r",4, -4,", {r0-r15}")
+       TEST_BF_P("ldmibne      r",7, 7*4,"!, {r8-r15}")
        TEST_P(   "ldmib        r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}")
        TEST_BF_P("ldmib        r",14,14*4,"!, {pc}")
 
        TEST_P(   "stmdb        r",13,16*4,"!, {r3-r12,lr}")
-       TEST_P(   "stmeqdb      r",13,16*4,"!, {r3-r12}")
-       TEST_P(   "stmnedb      r",2, 16*4,", {r3-r12,lr}")
+       TEST_P(   "stmdbeq      r",13,16*4,"!, {r3-r12}")
+       TEST_P(   "stmdbne      r",2, 16*4,", {r3-r12,lr}")
        TEST_P(   "stmdb        r",13,16*4,"!, {r2-r12,lr}")
        TEST_P(   "stmdb        r",0, 16*4,", {r0-r12}")
        TEST_P(   "stmdb        r",0, 16*4,", {r0-r12,lr}")
 
        TEST_BF_P("ldmia        r",13,5*4, "!, {r3-r12,pc}")
-       TEST_P(   "ldmccia      r",13,5*4, "!, {r3-r12}")
-       TEST_BF_P("ldmcsia      r",2, 5*4, "!, {r3-r12,pc}")
+       TEST_P(   "ldmiacc      r",13,5*4, "!, {r3-r12}")
+       TEST_BF_P("ldmiacs      r",2, 5*4, "!, {r3-r12,pc}")
        TEST_BF_P("ldmia        r",13,4*4, "!, {r2-r12,pc}")
        TEST_P(   "ldmia        r",0, 16*4,", {r0-r12}")
        TEST_P(   "ldmia        r",0, 16*4,", {r0-r12,lr}")
@@ -1174,80 +1174,80 @@ void kprobe_arm_test_cases(void)
 #define TEST_COPROCESSOR(code) TEST_UNSUPPORTED(code)
 
 #define COPROCESSOR_INSTRUCTIONS_ST_LD(two,cc)                                 \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("stc"two"      0, cr0, [r13], {1}")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("stc"two"l     0, cr0, [r13], {1}")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("ldc"two"      0, cr0, [r13], {1}")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #4]")                     \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #-4]")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #4]!")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13, #-4]!")                   \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13], #4")                     \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13], #-4")                    \
-       TEST_COPROCESSOR("ldc"two"l     0, cr0, [r13], {1}")                    \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("stc"two"      p0, cr0, [r13], {1}")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("stc"two"l     p0, cr0, [r13], {1}")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("ldc"two"      p0, cr0, [r13], {1}")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #4]")                    \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #-4]")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #4]!")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13, #-4]!")                  \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13], #4")                    \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13], #-4")                   \
+       TEST_COPROCESSOR("ldc"two"l     p0, cr0, [r13], {1}")                   \
                                                                                \
-       TEST_COPROCESSOR( "stc"two"     0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "stc"two"     0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "stc"two"     p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "stc"two"     p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##daf0001) "  @ stc"two"      0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d2f0001) "  @ stc"two"      0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##caf0001) "  @ stc"two"      0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c2f0001) "  @ stc"two"      0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "stc"two"     0, cr0, [r15], {1}")                    \
-       TEST_COPROCESSOR( "stc"two"l    0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "stc"two"l    0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "stc"two"     p0, cr0, [r15], {1}")                   \
+       TEST_COPROCESSOR( "stc"two"l    p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "stc"two"l    p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##def0001) "  @ stc"two"l     0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d6f0001) "  @ stc"two"l     0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##cef0001) "  @ stc"two"l     0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c6f0001) "  @ stc"two"l     0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "stc"two"l    0, cr0, [r15], {1}")                    \
-       TEST_COPROCESSOR( "ldc"two"     0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "ldc"two"     0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "stc"two"l    p0, cr0, [r15], {1}")                   \
+       TEST_COPROCESSOR( "ldc"two"     p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "ldc"two"     p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##dbf0001) "  @ ldc"two"      0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d3f0001) "  @ ldc"two"      0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##cbf0001) "  @ ldc"two"      0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c3f0001) "  @ ldc"two"      0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "ldc"two"     0, cr0, [r15], {1}")                    \
-       TEST_COPROCESSOR( "ldc"two"l    0, cr0, [r15, #4]")                     \
-       TEST_COPROCESSOR( "ldc"two"l    0, cr0, [r15, #-4]")                    \
+       TEST_COPROCESSOR( "ldc"two"     p0, cr0, [r15], {1}")                   \
+       TEST_COPROCESSOR( "ldc"two"l    p0, cr0, [r15, #4]")                    \
+       TEST_COPROCESSOR( "ldc"two"l    p0, cr0, [r15, #-4]")                   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##dff0001) "  @ ldc"two"l     0, cr0, [r15, #4]!")    \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##d7f0001) "  @ ldc"two"l     0, cr0, [r15, #-4]!")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##cff0001) "  @ ldc"two"l     0, cr0, [r15], #4")     \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c7f0001) "  @ ldc"two"l     0, cr0, [r15], #-4")    \
-       TEST_COPROCESSOR( "ldc"two"l    0, cr0, [r15], {1}")
+       TEST_COPROCESSOR( "ldc"two"l    p0, cr0, [r15], {1}")
 
 #define COPROCESSOR_INSTRUCTIONS_MC_MR(two,cc)                                 \
                                                                                \
-       TEST_COPROCESSOR( "mcrr"two"    0, 15, r0, r14, cr0")                   \
-       TEST_COPROCESSOR( "mcrr"two"    15, 0, r14, r0, cr15")                  \
+       TEST_COPROCESSOR( "mcrr"two"    p0, 15, r0, r14, cr0")                  \
+       TEST_COPROCESSOR( "mcrr"two"    p15, 0, r14, r0, cr15")                 \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c4f00f0) "  @ mcrr"two"     0, 15, r0, r15, cr0")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c40ff0f) "  @ mcrr"two"     15, 0, r15, r0, cr15")  \
-       TEST_COPROCESSOR( "mrrc"two"    0, 15, r0, r14, cr0")                   \
-       TEST_COPROCESSOR( "mrrc"two"    15, 0, r14, r0, cr15")                  \
+       TEST_COPROCESSOR( "mrrc"two"    p0, 15, r0, r14, cr0")                  \
+       TEST_COPROCESSOR( "mrrc"two"    p15, 0, r14, r0, cr15")                 \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c5f00f0) "  @ mrrc"two"     0, 15, r0, r15, cr0")   \
        TEST_UNSUPPORTED(__inst_arm(0x##cc##c50ff0f) "  @ mrrc"two"     15, 0, r15, r0, cr15")  \
-       TEST_COPROCESSOR( "cdp"two"     15, 15, cr15, cr15, cr15, 7")           \
-       TEST_COPROCESSOR( "cdp"two"     0, 0, cr0, cr0, cr0, 0")                \
-       TEST_COPROCESSOR( "mcr"two"     15, 7, r15, cr15, cr15, 7")             \
-       TEST_COPROCESSOR( "mcr"two"     0, 0, r0, cr0, cr0, 0")                 \
-       TEST_COPROCESSOR( "mrc"two"     15, 7, r15, cr15, cr15, 7")             \
-       TEST_COPROCESSOR( "mrc"two"     0, 0, r0, cr0, cr0, 0")
+       TEST_COPROCESSOR( "cdp"two"     p15, 15, cr15, cr15, cr15, 7")          \
+       TEST_COPROCESSOR( "cdp"two"     p0, 0, cr0, cr0, cr0, 0")               \
+       TEST_COPROCESSOR( "mcr"two"     p15, 7, r15, cr15, cr15, 7")            \
+       TEST_COPROCESSOR( "mcr"two"     p0, 0, r0, cr0, cr0, 0")                \
+       TEST_COPROCESSOR( "mrc"two"     p15, 7, r14, cr15, cr15, 7")            \
+       TEST_COPROCESSOR( "mrc"two"     p0, 0, r0, cr0, cr0, 0")
 
        COPROCESSOR_INSTRUCTIONS_ST_LD("",e)
 #if __LINUX_ARM_ARCH__ >= 5
index 19a5b2a..f1d5583 100644 (file)
@@ -108,6 +108,7 @@ struct test_arg_end {
 
 #define TESTCASE_START(title)                                  \
        __asm__ __volatile__ (                                  \
+       ".syntax unified                                \n\t"   \
        "bl     __kprobes_test_case_start               \n\t"   \
        ".pushsection .rodata                           \n\t"   \
        "10:                                            \n\t"   \
index 3654f97..87de1f6 100644 (file)
@@ -8,16 +8,15 @@
 gen := arch/$(ARCH)/include/generated
 kapi := $(gen)/asm
 uapi := $(gen)/uapi/asm
-syshdr := $(srctree)/$(src)/syscallhdr.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
 sysnr := $(srctree)/$(src)/syscallnr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
 syscall := $(src)/syscall.tbl
 
 gen-y := $(gen)/calls-oabi.S
 gen-y += $(gen)/calls-eabi.S
 kapi-hdrs-y := $(kapi)/unistd-nr.h
 kapi-hdrs-y += $(kapi)/mach-types.h
-uapi-hdrs-y := $(uapi)/unistd-common.h
 uapi-hdrs-y += $(uapi)/unistd-oabi.h
 uapi-hdrs-y += $(uapi)/unistd-eabi.h
 
@@ -41,28 +40,21 @@ $(kapi)/mach-types.h: $(src)/gen-mach-types $(src)/mach-types FORCE
        $(call if_changed,gen_mach)
 
 quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
-                  '$(syshdr_abi_$(basetarget))' \
-                  '$(syshdr_pfx_$(basetarget))' \
-                  '__NR_SYSCALL_BASE'
+      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) \
+                  --offset __NR_SYSCALL_BASE $< $@
 
 quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \
-                  '$(systbl_abi_$(basetarget))'
+      cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
 
 quiet_cmd_sysnr  = SYSNR   $@
       cmd_sysnr  = $(CONFIG_SHELL) '$(sysnr)' '$<' '$@' \
                   '$(syshdr_abi_$(basetarget))'
 
-syshdr_abi_unistd-common := common
-$(uapi)/unistd-common.h: $(syscall) $(syshdr) FORCE
-       $(call if_changed,syshdr)
-
-syshdr_abi_unistd-oabi := oabi
+$(uapi)/unistd-oabi.h: abis := common,oabi
 $(uapi)/unistd-oabi.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
-syshdr_abi_unistd-eabi := eabi
+$(uapi)/unistd-eabi.h: abis := common,eabi
 $(uapi)/unistd-eabi.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
@@ -70,10 +62,10 @@ sysnr_abi_unistd-nr := common,oabi,eabi,compat
 $(kapi)/unistd-nr.h: $(syscall) $(sysnr) FORCE
        $(call if_changed,sysnr)
 
-systbl_abi_calls-oabi := common,oabi
+$(gen)/calls-oabi.S: abis := common,oabi
 $(gen)/calls-oabi.S: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
 
-systbl_abi_calls-eabi := common,eabi
+$(gen)/calls-eabi.S: abis := common,eabi
 $(gen)/calls-eabi.S: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
index 90cbe20..c7679d7 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
diff --git a/arch/arm/tools/syscallhdr.sh b/arch/arm/tools/syscallhdr.sh
deleted file mode 100644 (file)
index 6b2f25c..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_ASM_ARM_`basename "$out" | sed \
-    -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
-    -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-if echo $out | grep -q uapi; then
-    fileguard="_UAPI$fileguard"
-fi
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-    echo "#ifndef ${fileguard}"
-    echo "#define ${fileguard} 1"
-    echo ""
-
-    while read nr abi name entry ; do
-       if [ -z "$offset" ]; then
-           echo "#define __NR_${prefix}${name} $nr"
-       else
-           echo "#define __NR_${prefix}${name} ($offset + $nr)"
-        fi
-    done
-
-    echo ""
-    echo "#endif /* ${fileguard} */"
-) > "$out"
diff --git a/arch/arm/tools/syscalltbl.sh b/arch/arm/tools/syscalltbl.sh
deleted file mode 100644 (file)
index ae7e93c..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-    while read nr abi name entry compat; do
-        if [ "$abi" = "eabi" -a -n "$compat" ]; then
-            echo "$in: error: a compat entry for an EABI syscall ($name) makes no sense" >&2
-            exit 1
-        fi
-
-       if [ -n "$entry" ]; then
-            if [ -z "$compat" ]; then
-                echo "NATIVE($nr, $entry)"
-            else
-                echo "COMPAT($nr, $entry, $compat)"
-            fi
-        fi
-    done
-) > "$out"
index e1b12b2..f8f0746 100644 (file)
@@ -152,7 +152,7 @@ static int __init xen_mm_init(void)
        struct gnttab_cache_flush cflush;
        if (!xen_swiotlb_detect())
                return 0;
-       xen_swiotlb_init(1, false);
+       xen_swiotlb_init();
 
        cflush.op = 0;
        cflush.a.dev_bus_addr = 0;
index 7f2a800..9f1d856 100644 (file)
@@ -11,6 +11,12 @@ config ARM64
        select ACPI_PPTT if ACPI
        select ARCH_HAS_DEBUG_WX
        select ARCH_BINFMT_ELF_STATE
+       select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+       select ARCH_ENABLE_MEMORY_HOTPLUG
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
+       select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+       select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
+       select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DMA_PREP_COHERENT
@@ -72,6 +78,7 @@ config ARM64
        select ARCH_USE_QUEUED_SPINLOCKS
        select ARCH_USE_SYM_ANNOTATIONS
        select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       select ARCH_SUPPORTS_HUGETLBFS
        select ARCH_SUPPORTS_MEMORY_FAILURE
        select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK
        select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN
@@ -163,7 +170,6 @@ config ARM64
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
        select HAVE_CONTEXT_TRACKING
-       select HAVE_DEBUG_BUGVERBOSE
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_CONTIGUOUS
        select HAVE_DYNAMIC_FTRACE
@@ -213,6 +219,7 @@ config ARM64
        select SWIOTLB
        select SYSCTL_EXCEPTION_TRACE
        select THREAD_INFO_IN_TASK
+       select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
        help
          ARM 64-bit (AArch64) Linux support.
 
@@ -308,10 +315,7 @@ config ZONE_DMA32
        bool "Support DMA32 zone" if EXPERT
        default y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
        def_bool y
 
 config SMP
@@ -1056,32 +1060,15 @@ source "kernel/Kconfig.hz"
 config ARCH_SPARSEMEM_ENABLE
        def_bool y
        select SPARSEMEM_VMEMMAP_ENABLE
-
-config ARCH_SPARSEMEM_DEFAULT
-       def_bool ARCH_SPARSEMEM_ENABLE
-
-config ARCH_SELECT_MEMORY_MODEL
-       def_bool ARCH_SPARSEMEM_ENABLE
-
-config ARCH_FLATMEM_ENABLE
-       def_bool !NUMA
+       select SPARSEMEM_VMEMMAP
 
 config HW_PERF_EVENTS
        def_bool y
        depends on ARM_PMU
 
-config SYS_SUPPORTS_HUGETLBFS
-       def_bool y
-
-config ARCH_HAS_CACHE_LINE_SIZE
-       def_bool y
-
 config ARCH_HAS_FILTER_PGPROT
        def_bool y
 
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y if PGTABLE_LEVELS > 2
-
 # Supported by clang >= 7.0
 config CC_HAVE_SHADOW_CALL_STACK
        def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
@@ -1923,14 +1910,6 @@ config SYSVIPC_COMPAT
        def_bool y
        depends on COMPAT && SYSVIPC
 
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
-       def_bool y
-       depends on HUGETLB_PAGE && MIGRATION
-
-config ARCH_ENABLE_THP_MIGRATION
-       def_bool y
-       depends on TRANSPARENT_HUGEPAGE
-
 menu "Power management options"
 
 source "kernel/power/Kconfig"
index 242f821..dfc6376 100644 (file)
                pinctrl-names = "default";
                pinctrl-0 = <&pwm0_pin>;
                clocks = <&cru PCLK_PWM1>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm1_pin>;
                clocks = <&cru PCLK_PWM1>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                reg = <0x0 0xff680020 0x0 0x10>;
                #pwm-cells = <3>;
                clocks = <&cru PCLK_PWM1>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm3_pin>;
                clocks = <&cru PCLK_PWM1>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
index 0f2879c..634a91a 100644 (file)
                pinctrl-names = "default";
                pinctrl-0 = <&pwm0_pin>;
                clocks = <&pmucru PCLK_RKPWM_PMU>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm1_pin>;
                clocks = <&pmucru PCLK_RKPWM_PMU>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm2_pin>;
                clocks = <&pmucru PCLK_RKPWM_PMU>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
                pinctrl-names = "default";
                pinctrl-0 = <&pwm3a_pin>;
                clocks = <&pmucru PCLK_RKPWM_PMU>;
-               clock-names = "pwm";
                status = "disabled";
        };
 
index ab569b0..8418c1b 100644 (file)
@@ -16,6 +16,7 @@
 
 #include <asm/asm-offsets.h>
 #include <asm/alternative.h>
+#include <asm/asm-bug.h>
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/debug-monitors.h>
@@ -279,12 +280,24 @@ alternative_endif
  * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
  */
        .macro  read_ctr, reg
+#ifndef __KVM_NVHE_HYPERVISOR__
 alternative_if_not ARM64_MISMATCHED_CACHE_TYPE
        mrs     \reg, ctr_el0                   // read CTR
        nop
 alternative_else
        ldr_l   \reg, arm64_ftr_reg_ctrel0 + ARM64_FTR_SYSVAL
 alternative_endif
+#else
+alternative_if_not ARM64_KVM_PROTECTED_MODE
+       ASM_BUG()
+alternative_else_nop_endif
+alternative_cb kvm_compute_final_ctr_el0
+       movz    \reg, #0
+       movk    \reg, #0, lsl #16
+       movk    \reg, #0, lsl #32
+       movk    \reg, #0, lsl #48
+alternative_cb_end
+#endif
        .endm
 
 
@@ -685,11 +698,11 @@ USER(\label, ic   ivau, \tmp2)                    // invalidate I line PoU
        .endm
 
 /*
- * Set SCTLR_EL1 to the passed value, and invalidate the local icache
+ * Set SCTLR_ELx to the @reg value, and invalidate the local icache
  * in the process. This is called when setting the MMU on.
  */
-.macro set_sctlr_el1, reg
-       msr     sctlr_el1, \reg
+.macro set_sctlr, sreg, reg
+       msr     \sreg, \reg
        isb
        /*
         * Invalidate the local I-cache so that any instructions fetched
@@ -701,6 +714,14 @@ USER(\label, ic    ivau, \tmp2)                    // invalidate I line PoU
        isb
 .endm
 
+.macro set_sctlr_el1, reg
+       set_sctlr sctlr_el1, \reg
+.endm
+
+.macro set_sctlr_el2, reg
+       set_sctlr sctlr_el2, \reg
+.endm
+
        /*
         * Check whether preempt/bh-disabled asm code should yield as soon as
         * it is able. This is the case if we are currently running in task
index 065ba48..2175ec0 100644 (file)
@@ -23,6 +23,7 @@
 #define dsb(opt)       asm volatile("dsb " #opt : : : "memory")
 
 #define psb_csync()    asm volatile("hint #17" : : : "memory")
+#define tsb_csync()    asm volatile("hint #18" : : : "memory")
 #define csdb()         asm volatile("hint #20" : : : "memory")
 
 #ifdef CONFIG_ARM64_PSEUDO_NMI
index 5eb7af9..55f57df 100644 (file)
@@ -131,6 +131,9 @@ static inline void local_daif_inherit(struct pt_regs *regs)
        if (interrupts_enabled(regs))
                trace_hardirqs_on();
 
+       if (system_uses_irq_prio_masking())
+               gic_write_pmr(regs->pmr_save);
+
        /*
         * We can't use local_daif_restore(regs->pstate) here as
         * system_has_prio_mask_debugging() won't restore the I bit if it can
index b3f2d3b..21fa330 100644 (file)
                                                // use EL1&0 translation.
 
 .Lskip_spe_\@:
+       /* Trace buffer */
+       ubfx    x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4
+       cbz     x0, .Lskip_trace_\@             // Skip if TraceBuffer is not present
+
+       mrs_s   x0, SYS_TRBIDR_EL1
+       and     x0, x0, TRBIDR_PROG
+       cbnz    x0, .Lskip_trace_\@             // If TRBE is available at EL2
+
+       mov     x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+       orr     x2, x2, x0                      // allow the EL1&0 translation
+                                               // to own it.
+
+.Lskip_trace_\@:
        msr     mdcr_el2, x2                    // Configure debug traps
 .endm
 
index ebb263b..2599504 100644 (file)
@@ -131,6 +131,15 @@ static inline void sve_user_enable(void)
        sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN);
 }
 
+#define sve_cond_update_zcr_vq(val, reg)               \
+       do {                                            \
+               u64 __zcr = read_sysreg_s((reg));       \
+               u64 __new = __zcr & ~ZCR_ELx_LEN_MASK;  \
+               __new |= (val) & ZCR_ELx_LEN_MASK;      \
+               if (__zcr != __new)                     \
+                       write_sysreg_s(__new, (reg));   \
+       } while (0)
+
 /*
  * Probing and setup functions.
  * Calls to these functions must be serialised with one another.
@@ -160,6 +169,8 @@ static inline int sve_get_current_vl(void)
 static inline void sve_user_disable(void) { BUILD_BUG(); }
 static inline void sve_user_enable(void) { BUILD_BUG(); }
 
+#define sve_cond_update_zcr_vq(val, reg) do { } while (0)
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
index af43367..a256399 100644 (file)
@@ -6,6 +6,8 @@
  * Author: Catalin Marinas <catalin.marinas@arm.com>
  */
 
+#include <asm/assembler.h>
+
 .macro fpsimd_save state, tmpnr
        stp     q0, q1, [\state, #16 * 0]
        stp     q2, q3, [\state, #16 * 2]
                str             w\nxtmp, [\xpfpsr, #4]
 .endm
 
-.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
-               sve_load_vq     \xvqminus1, x\nxtmp, \xtmp2
+.macro __sve_load nxbase, xpfpsr, nxtmp
  _for n, 0, 31,        _sve_ldr_v      \n, \nxbase, \n - 34
                _sve_ldr_p      0, \nxbase
                _sve_wrffr      0
                ldr             w\nxtmp, [\xpfpsr, #4]
                msr             fpcr, x\nxtmp
 .endm
+
+.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
+               sve_load_vq     \xvqminus1, x\nxtmp, \xtmp2
+               __sve_load      \nxbase, \xpfpsr, \nxtmp
+.endm
index 737ded6..b4b3076 100644 (file)
 #define __HYP_CONCAT(a, b)     a ## b
 #define HYP_CONCAT(a, b)       __HYP_CONCAT(a, b)
 
+#ifndef __KVM_NVHE_HYPERVISOR__
 /*
  * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
  * to separate it from the kernel proper.
  */
 #define kvm_nvhe_sym(sym)      __kvm_nvhe_##sym
+#else
+#define kvm_nvhe_sym(sym)      sym
+#endif
 
 #ifdef LINKER_SCRIPT
 
@@ -56,6 +60,9 @@
  */
 #define KVM_NVHE_ALIAS(sym)    kvm_nvhe_sym(sym) = sym;
 
+/* Defines a linker script alias for KVM nVHE hyp symbols */
+#define KVM_NVHE_ALIAS_HYP(first, sec) kvm_nvhe_sym(first) = kvm_nvhe_sym(sec);
+
 #endif /* LINKER_SCRIPT */
 
 #endif /* __ARM64_HYP_IMAGE_H__ */
index f9cc1d0..0ae427f 100644 (file)
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
index 587c504..d44df9d 100644 (file)
  * has a direct correspondence, and needs to appear sufficiently aligned
  * in the virtual address space.
  */
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
+#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
 #define ARM64_MEMSTART_ALIGN   (1UL << SECTION_SIZE_BITS)
 #else
 #define ARM64_MEMSTART_ALIGN   (1UL << ARM64_MEMSTART_SHIFT)
index 94d4025..692c904 100644 (file)
 #define CPTR_EL2_DEFAULT       CPTR_EL2_RES1
 
 /* Hyp Debug Configuration Register bits */
+#define MDCR_EL2_E2TB_MASK     (UL(0x3))
+#define MDCR_EL2_E2TB_SHIFT    (UL(24))
 #define MDCR_EL2_TTRF          (1 << 19)
 #define MDCR_EL2_TPMS          (1 << 14)
 #define MDCR_EL2_E2PB_MASK     (UL(0x3))
index a7ab84f..cf8df03 100644 (file)
 #define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2               12
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs              13
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs           14
+#define __KVM_HOST_SMCCC_FUNC___pkvm_init                      15
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings           16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping    17
+#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector            18
+#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize             19
+#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp                  20
 
 #ifndef __ASSEMBLY__
 
@@ -154,6 +160,9 @@ struct kvm_nvhe_init_params {
        unsigned long tpidr_el2;
        unsigned long stack_hyp_va;
        phys_addr_t pgd_pa;
+       unsigned long hcr_el2;
+       unsigned long vttbr;
+       unsigned long vtcr;
 };
 
 /* Translate a kernel address @ptr into its equivalent linear mapping */
index 3d10e65..7cd7d5c 100644 (file)
@@ -94,7 +94,7 @@ struct kvm_s2_mmu {
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
 
-       struct kvm *kvm;
+       struct kvm_arch *arch;
 };
 
 struct kvm_arch_memory_slot {
@@ -315,6 +315,8 @@ struct kvm_vcpu_arch {
                struct kvm_guest_debug_arch regs;
                /* Statistical profiling extension */
                u64 pmscr_el1;
+               /* Self-hosted trace */
+               u64 trfcr_el1;
        } host_debug_state;
 
        /* VGIC state */
@@ -372,8 +374,10 @@ struct kvm_vcpu_arch {
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
-#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
-                                     sve_ffr_offset((vcpu)->arch.sve_max_vl)))
+#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +     \
+                            sve_ffr_offset((vcpu)->arch.sve_max_vl))
+
+#define vcpu_sve_max_vq(vcpu)  sve_vq_from_vl((vcpu)->arch.sve_max_vl)
 
 #define vcpu_sve_state_size(vcpu) ({                                   \
        size_t __size_ret;                                              \
@@ -382,7 +386,7 @@ struct kvm_vcpu_arch {
        if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) {          \
                __size_ret = 0;                                         \
        } else {                                                        \
-               __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl);    \
+               __vcpu_vq = vcpu_sve_max_vq(vcpu);                      \
                __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq);              \
        }                                                               \
                                                                        \
@@ -400,7 +404,13 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_GUEST_HAS_PTRAUTH    (1 << 7) /* PTRAUTH exposed to guest */
 #define KVM_ARM64_PENDING_EXCEPTION    (1 << 8) /* Exception pending */
 #define KVM_ARM64_EXCEPT_MASK          (7 << 9) /* Target EL/MODE */
+#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active  */
+#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE        (1 << 13) /* Save TRBE context if active  */
 
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
+                                KVM_GUESTDBG_USE_SW_BP | \
+                                KVM_GUESTDBG_USE_HW | \
+                                KVM_GUESTDBG_SINGLESTEP)
 /*
  * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
  * take the following values:
@@ -582,15 +592,11 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
+#ifndef __KVM_NVHE_HYPERVISOR__
 #define kvm_call_hyp_nvhe(f, ...)                                              \
        ({                                                              \
                struct arm_smccc_res res;                               \
@@ -630,9 +636,13 @@ void kvm_arm_resume_guest(struct kvm *kvm);
                                                                        \
                ret;                                                    \
        })
+#else /* __KVM_NVHE_HYPERVISOR__ */
+#define kvm_call_hyp(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
+#endif /* __KVM_NVHE_HYPERVISOR__ */
 
 void force_vm_exit(const cpumask_t *mask);
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
 void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
@@ -692,19 +702,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
        ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
 }
 
-static inline bool kvm_arch_requires_vhe(void)
-{
-       /*
-        * The Arm architecture specifies that implementation of SVE
-        * requires VHE also to be implemented.  The KVM code for arm64
-        * relies on this when SVE is present:
-        */
-       if (system_supports_sve())
-               return true;
-
-       return false;
-}
-
 void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
 
 static inline void kvm_arch_hardware_unsetup(void) {}
@@ -713,6 +710,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 void kvm_arm_init_debug(void);
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
@@ -734,6 +732,10 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
        return (!has_vhe() && attr->exclude_host);
 }
 
+/* Flags for host debug state */
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);
+
 #ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */
 static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 {
@@ -771,5 +773,12 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
        (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
 int kvm_trng_call(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM
+extern phys_addr_t hyp_mem_base;
+extern phys_addr_t hyp_mem_size;
+void __init kvm_hyp_reserve(void);
+#else
+static inline void kvm_hyp_reserve(void) { }
+#endif
 
 #endif /* __ARM64_KVM_HOST_H__ */
index 32ae676..9d60b30 100644 (file)
@@ -90,6 +90,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
 
 void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+void __sve_save_state(void *sve_pffr, u32 *fpsr);
+void __sve_restore_state(void *sve_pffr, u32 *fpsr);
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
@@ -100,10 +102,20 @@ u64 __guest_enter(struct kvm_vcpu *vcpu);
 
 bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
 
-void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
                               u64 elr, u64 par);
 #endif
 
+#ifdef __KVM_NVHE_HYPERVISOR__
+void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size,
+                           phys_addr_t pgd, void *sp, void *cont_fn);
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+               unsigned long *per_cpu_base, u32 hyp_va_bits);
+void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+#endif
+
+extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
+extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
+
 #endif /* __ARM64_KVM_HYP_H__ */
index 9087385..25ed956 100644 (file)
@@ -121,6 +121,8 @@ void kvm_update_va_mask(struct alt_instr *alt,
 void kvm_compute_layout(void);
 void kvm_apply_hyp_relocations(void);
 
+#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)
+
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
        asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
@@ -166,7 +168,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
-int kvm_mmu_init(void);
+int kvm_mmu_init(u32 *hyp_va_bits);
+
+static inline void *__kvm_vector_slot2addr(void *base,
+                                          enum arm64_hyp_spectre_vector slot)
+{
+       int idx = slot - (slot != HYP_VECTOR_DIRECT);
+
+       return base + (idx * SZ_2K);
+}
 
 struct kvm;
 
@@ -262,9 +272,9 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
  * Must be called from hyp code running at EL2 with an updated VTTBR
  * and interrupts disabled.
  */
-static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
 {
-       write_sysreg(kern_hyp_va(mmu->kvm)->arch.vtcr, vtcr_el2);
+       write_sysreg(vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
 
        /*
@@ -275,5 +285,14 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
        asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
 }
 
+static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+{
+       __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
+}
+
+static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
+{
+       return container_of(mmu->arch, struct kvm, arch);
+}
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
index 8886d43..c3674c4 100644 (file)
 #include <linux/kvm_host.h>
 #include <linux/types.h>
 
+#define KVM_PGTABLE_MAX_LEVELS         4U
+
+static inline u64 kvm_get_parange(u64 mmfr0)
+{
+       u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
+                               ID_AA64MMFR0_PARANGE_SHIFT);
+       if (parange > ID_AA64MMFR0_PARANGE_MAX)
+               parange = ID_AA64MMFR0_PARANGE_MAX;
+
+       return parange;
+}
+
 typedef u64 kvm_pte_t;
 
+/**
+ * struct kvm_pgtable_mm_ops - Memory management callbacks.
+ * @zalloc_page:       Allocate a single zeroed memory page. The @arg parameter
+ *                     can be used by the walker to pass a memcache. The
+ *                     initial refcount of the page is 1.
+ * @zalloc_pages_exact:        Allocate an exact number of zeroed memory pages. The
+ *                     @size parameter is in bytes, and is rounded-up to the
+ *                     next page boundary. The resulting allocation is
+ *                     physically contiguous.
+ * @free_pages_exact:  Free an exact number of memory pages previously
+ *                     allocated by zalloc_pages_exact.
+ * @get_page:          Increment the refcount on a page.
+ * @put_page:          Decrement the refcount on a page. When the refcount
+ *                     reaches 0 the page is automatically freed.
+ * @page_count:                Return the refcount of a page.
+ * @phys_to_virt:      Convert a physical address into a virtual address mapped
+ *                     in the current context.
+ * @virt_to_phys:      Convert a virtual address mapped in the current context
+ *                     into a physical address.
+ */
+struct kvm_pgtable_mm_ops {
+       void*           (*zalloc_page)(void *arg);
+       void*           (*zalloc_pages_exact)(size_t size);
+       void            (*free_pages_exact)(void *addr, size_t size);
+       void            (*get_page)(void *addr);
+       void            (*put_page)(void *addr);
+       int             (*page_count)(void *addr);
+       void*           (*phys_to_virt)(phys_addr_t phys);
+       phys_addr_t     (*virt_to_phys)(void *addr);
+};
+
+/**
+ * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags.
+ * @KVM_PGTABLE_S2_NOFWB:      Don't enforce Normal-WB even if the CPUs have
+ *                             ARM64_HAS_STAGE2_FWB.
+ * @KVM_PGTABLE_S2_IDMAP:      Only use identity mappings.
+ */
+enum kvm_pgtable_stage2_flags {
+       KVM_PGTABLE_S2_NOFWB                    = BIT(0),
+       KVM_PGTABLE_S2_IDMAP                    = BIT(1),
+};
+
 /**
  * struct kvm_pgtable - KVM page-table.
  * @ia_bits:           Maximum input address size, in bits.
  * @start_level:       Level at which the page-table walk starts.
  * @pgd:               Pointer to the first top-level entry of the page-table.
+ * @mm_ops:            Memory management callbacks.
  * @mmu:               Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
  */
 struct kvm_pgtable {
        u32                                     ia_bits;
        u32                                     start_level;
        kvm_pte_t                               *pgd;
+       struct kvm_pgtable_mm_ops               *mm_ops;
 
        /* Stage-2 only */
        struct kvm_s2_mmu                       *mmu;
+       enum kvm_pgtable_stage2_flags           flags;
 };
 
 /**
@@ -49,6 +106,16 @@ enum kvm_pgtable_prot {
 #define PAGE_HYP_RO            (KVM_PGTABLE_PROT_R)
 #define PAGE_HYP_DEVICE                (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
 
+/**
+ * struct kvm_mem_range - Range of Intermediate Physical Addresses
+ * @start:     Start of the range.
+ * @end:       End of the range.
+ */
+struct kvm_mem_range {
+       u64 start;
+       u64 end;
+};
+
 /**
  * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
  * @KVM_PGTABLE_WALK_LEAF:             Visit leaf entries, including invalid
@@ -86,10 +153,12 @@ struct kvm_pgtable_walker {
  * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
  * @va_bits:   Maximum virtual address bits.
+ * @mm_ops:    Memory management callbacks.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+                        struct kvm_pgtable_mm_ops *mm_ops);
 
 /**
  * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
@@ -123,17 +192,41 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                        enum kvm_pgtable_prot prot);
 
 /**
- * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * kvm_get_vtcr() - Helper to construct VTCR_EL2
+ * @mmfr0:     Sanitized value of SYS_ID_AA64MMFR0_EL1 register.
+ * @mmfr1:     Sanitized value of SYS_ID_AA64MMFR1_EL1 register.
+ * @phys_shift:        Value to set in VTCR_EL2.T0SZ.
+ *
+ * The VTCR value is common across all the physical CPUs on the system.
+ * We use system wide sanitised values to fill in different fields,
+ * except for Hardware Management of Access Flags. HA Flag is set
+ * unconditionally on all CPUs, as it is safe to run with or without
+ * the feature and the bit is RES0 on CPUs that don't support it.
+ *
+ * Return: VTCR_EL2 value
+ */
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
+
+/**
+ * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
- * @kvm:       KVM structure representing the guest virtual machine.
+ * @arch:      Arch-specific KVM structure representing the guest virtual
+ *             machine.
+ * @mm_ops:    Memory management callbacks.
+ * @flags:     Stage-2 configuration flags.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                                 struct kvm_pgtable_mm_ops *mm_ops,
+                                 enum kvm_pgtable_stage2_flags flags);
+
+#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
+       kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
 
 /**
  * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  *
  * The page-table is assumed to be unreachable by any hardware walkers prior
  * to freeing and therefore no TLB invalidation is performed.
@@ -142,13 +235,13 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
 /**
  * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address at which to place the mapping.
  * @size:      Size of the mapping.
  * @phys:      Physical address of the memory to map.
  * @prot:      Permissions and attributes for the mapping.
- * @mc:                Cache of pre-allocated GFP_PGTABLE_USER memory from which to
- *             allocate page-table pages.
+ * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
+ *             page-table pages.
  *
  * The offset of @addr within a page is ignored, @size is rounded-up to
  * the next page boundary and @phys is rounded-down to the previous page
@@ -170,11 +263,31 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  */
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
-                          struct kvm_mmu_memory_cache *mc);
+                          void *mc);
+
+/**
+ * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
+ *                                 track ownership.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:      Base intermediate physical address to annotate.
+ * @size:      Size of the annotated range.
+ * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
+ *             page-table pages.
+ * @owner_id:  Unique identifier for the owner of the page.
+ *
+ * By default, all page-tables are owned by identifier 0. This function can be
+ * used to mark portions of the IPA space as owned by other entities. When a
+ * stage 2 is used with identity-mappings, these annotations allow the
+ * page-table data structure to be used as a simple rmap.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                                void *mc, u8 owner_id);
 
 /**
  * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to remove the mapping.
  * @size:      Size of the mapping.
  *
@@ -194,7 +307,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
 /**
  * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
  *                                  without TLB invalidation.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to write-protect,
  * @size:      Size of the range.
  *
@@ -211,7 +324,7 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
 /**
  * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -225,7 +338,7 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
 
 /**
  * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -244,7 +357,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
 /**
  * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
  *                                   page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  * @prot:      Additional permissions to grant for the mapping.
  *
@@ -263,7 +376,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 /**
  * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
  *                                access flag set.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -276,7 +389,7 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
  * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
  *                                   of Coherency for guest stage-2 address
  *                                   range.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to flush.
  * @size:      Size of the range.
  *
@@ -311,4 +424,23 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
                     struct kvm_pgtable_walker *walker);
 
+/**
+ * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
+ *                                  Addresses with compatible permission
+ *                                  attributes.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:      Address that must be covered by the range.
+ * @prot:      Protection attributes that the range must be compatible with.
+ * @range:     Range structure used to limit the search space at call time and
+ *             that will hold the result.
+ *
+ * The offset of @addr within a page is ignored. An IPA is compatible with @prot
+ * iff its corresponding stage-2 page-table entry has default ownership and, if
+ * valid, is mapped with protection attributes identical to @prot.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+                                 enum kvm_pgtable_prot prot,
+                                 struct kvm_mem_range *range);
 #endif /* __ARM64_KVM_PGTABLE_H__ */
index 6d9915d..87b90dc 100644 (file)
@@ -345,7 +345,7 @@ static inline void *phys_to_virt(phys_addr_t x)
  */
 #define ARCH_PFN_OFFSET                ((unsigned long)PHYS_PFN_OFFSET)
 
-#if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL)
+#if defined(CONFIG_DEBUG_VIRTUAL)
 #define page_to_virt(x)        ({                                              \
        __typeof__(x) __page = x;                                       \
        void *__addr = __va(page_to_phys(__page));                      \
@@ -365,7 +365,7 @@ static inline void *phys_to_virt(phys_addr_t x)
        u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page));     \
        (struct page *)__addr;                                          \
 })
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP || CONFIG_DEBUG_VIRTUAL */
+#endif /* CONFIG_DEBUG_VIRTUAL */
 
 #define virt_addr_valid(addr)  ({                                      \
        __typeof__(addr) __addr = __tag_reset(addr);                    \
index fab2f57..938092d 100644 (file)
@@ -71,10 +71,10 @@ extern bool arm64_use_ng_mappings;
 #define PAGE_KERNEL_EXEC       __pgprot(PROT_NORMAL & ~PTE_PXN)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
 
-#define PAGE_S2_MEMATTR(attr)                                          \
+#define PAGE_S2_MEMATTR(attr, has_fwb)                                 \
        ({                                                              \
                u64 __val;                                              \
-               if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))          \
+               if (has_fwb)                                            \
                        __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr);     \
                else                                                    \
                        __val = PTE_S2_MEMATTR(MT_S2_ ## attr);         \
index 2f36b16..e4ad9db 100644 (file)
@@ -13,6 +13,7 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 extern char __hyp_text_start[], __hyp_text_end[];
 extern char __hyp_rodata_start[], __hyp_rodata_end[];
 extern char __hyp_reloc_begin[], __hyp_reloc_end[];
+extern char __hyp_bss_start[], __hyp_bss_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
index eb4a75d..4b73463 100644 (file)
@@ -5,7 +5,6 @@
 #ifndef __ASM_SPARSEMEM_H
 #define __ASM_SPARSEMEM_H
 
-#ifdef CONFIG_SPARSEMEM
 #define MAX_PHYSMEM_BITS       CONFIG_ARM64_PA_BITS
 
 /*
@@ -27,6 +26,4 @@
 #define SECTION_SIZE_BITS 27
 #endif /* CONFIG_ARM64_64K_PAGES */
 
-#endif /* CONFIG_SPARSEMEM*/
-
 #endif
index 012a0b8..65d1570 100644 (file)
 #define SYS_PMSIRR_EL1_INTERVAL_MASK   0xffffffUL
 
 /* Filtering controls */
+#define SYS_PMSNEVFR_EL1               sys_reg(3, 0, 9, 9, 1)
+
 #define SYS_PMSFCR_EL1                 sys_reg(3, 0, 9, 9, 4)
 #define SYS_PMSFCR_EL1_FE_SHIFT                0
 #define SYS_PMSFCR_EL1_FT_SHIFT                1
 
 /*** End of Statistical Profiling Extension ***/
 
+/*
+ * TRBE Registers. The *_MASK field values below are expressed relative to bit 0 and are meant to be paired with the matching *_SHIFT.
+ */
+#define SYS_TRBLIMITR_EL1              sys_reg(3, 0, 9, 11, 0)
+#define SYS_TRBPTR_EL1                 sys_reg(3, 0, 9, 11, 1)
+#define SYS_TRBBASER_EL1               sys_reg(3, 0, 9, 11, 2)
+#define SYS_TRBSR_EL1                  sys_reg(3, 0, 9, 11, 3)
+#define SYS_TRBMAR_EL1                 sys_reg(3, 0, 9, 11, 4)
+#define SYS_TRBTRG_EL1                 sys_reg(3, 0, 9, 11, 6)
+#define SYS_TRBIDR_EL1                 sys_reg(3, 0, 9, 11, 7)
+
+#define TRBLIMITR_LIMIT_MASK           GENMASK_ULL(51, 0)
+#define TRBLIMITR_LIMIT_SHIFT          12
+#define TRBLIMITR_NVM                  BIT(5)
+#define TRBLIMITR_TRIG_MODE_MASK       GENMASK(1, 0)
+#define TRBLIMITR_TRIG_MODE_SHIFT      3
+#define TRBLIMITR_FILL_MODE_MASK       GENMASK(1, 0)
+#define TRBLIMITR_FILL_MODE_SHIFT      1
+#define TRBLIMITR_ENABLE               BIT(0)
+#define TRBPTR_PTR_MASK                        GENMASK_ULL(63, 0)
+#define TRBPTR_PTR_SHIFT               0
+#define TRBBASER_BASE_MASK             GENMASK_ULL(51, 0)
+#define TRBBASER_BASE_SHIFT            12
+#define TRBSR_EC_MASK                  GENMASK(5, 0)
+#define TRBSR_EC_SHIFT                 26
+#define TRBSR_IRQ                      BIT(22)
+#define TRBSR_TRG                      BIT(21)
+#define TRBSR_WRAP                     BIT(20)
+#define TRBSR_ABORT                    BIT(18)
+#define TRBSR_STOP                     BIT(17)
+#define TRBSR_MSS_MASK                 GENMASK(15, 0)
+#define TRBSR_MSS_SHIFT                        0
+#define TRBSR_BSC_MASK                 GENMASK(5, 0) /* NOTE(review): BSC and FSC both cover bits 5:0, inside MSS - presumably alternative decodings of MSS; confirm against the Arm ARM */
+#define TRBSR_BSC_SHIFT                        0
+#define TRBSR_FSC_MASK                 GENMASK(5, 0)
+#define TRBSR_FSC_SHIFT                        0
+#define TRBMAR_SHARE_MASK              GENMASK(1, 0)
+#define TRBMAR_SHARE_SHIFT             8
+#define TRBMAR_OUTER_MASK              GENMASK(3, 0)
+#define TRBMAR_OUTER_SHIFT             4
+#define TRBMAR_INNER_MASK              GENMASK(3, 0)
+#define TRBMAR_INNER_SHIFT             0
+#define TRBTRG_TRG_MASK                        GENMASK(31, 0)
+#define TRBTRG_TRG_SHIFT               0
+#define TRBIDR_FLAG                    BIT(5)
+#define TRBIDR_PROG                    BIT(4)
+#define TRBIDR_ALIGN_MASK              GENMASK(3, 0)
+#define TRBIDR_ALIGN_SHIFT             0
+
 #define SYS_PMINTENSET_EL1             sys_reg(3, 0, 9, 14, 1)
 #define SYS_PMINTENCLR_EL1             sys_reg(3, 0, 9, 14, 2)
 
 #define SCTLR_ELx_A    (BIT(1))
 #define SCTLR_ELx_M    (BIT(0))
 
-#define SCTLR_ELx_FLAGS        (SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
-                        SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
-
 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1 ((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
                         (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
 #define ENDIAN_SET_EL2         0
 #endif
 
+#define INIT_SCTLR_EL2_MMU_ON                                          \
+       (SCTLR_ELx_M  | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I |      \
+        SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
+
 #define INIT_SCTLR_EL2_MMU_OFF \
        (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
 
 #define ID_AA64MMFR2_CNP_SHIFT         0
 
 /* id_aa64dfr0 */
+#define ID_AA64DFR0_TRBE_SHIFT         44
 #define ID_AA64DFR0_TRACE_FILT_SHIFT   40
 #define ID_AA64DFR0_DOUBLELOCK_SHIFT   36
 #define ID_AA64DFR0_PMSVER_SHIFT       32
index d1f7d35..727bfc3 100644 (file)
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls           444
+#define __NR_compat_syscalls           447
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
index 8361c51..7859749 100644 (file)
@@ -895,6 +895,12 @@ __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
 __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 #define __NR_quotactl_path 443
 __SYSCALL(__NR_quotactl_path, sys_quotactl_path)
+#define __NR_landlock_create_ruleset 444
+__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
+#define __NR_landlock_add_rule 445
+__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
+#define __NR_landlock_restrict_self 446
+__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
 
 /*
  * Please add new compat syscalls above this comment and update
index abc8463..c906d20 100644 (file)
@@ -133,11 +133,10 @@ static void clean_dcache_range_nopatch(u64 start, u64 end)
        } while (cur += d_size, cur < end);
 }
 
-static void __nocfi __apply_alternatives(void *alt_region,  bool is_module,
-                                        unsigned long *feature_mask)
+static void __nocfi __apply_alternatives(struct alt_region *region, bool is_module,
+                                unsigned long *feature_mask)
 {
        struct alt_instr *alt;
-       struct alt_region *region = alt_region;
        __le32 *origptr, *updptr;
        alternative_cb_t alt_cb;
 
index e797603..0cb34cc 100644 (file)
@@ -123,6 +123,9 @@ int main(void)
   DEFINE(NVHE_INIT_TPIDR_EL2,  offsetof(struct kvm_nvhe_init_params, tpidr_el2));
   DEFINE(NVHE_INIT_STACK_HYP_VA,       offsetof(struct kvm_nvhe_init_params, stack_hyp_va));
   DEFINE(NVHE_INIT_PGD_PA,     offsetof(struct kvm_nvhe_init_params, pgd_pa));
+  DEFINE(NVHE_INIT_HCR_EL2,    offsetof(struct kvm_nvhe_init_params, hcr_el2));
+  DEFINE(NVHE_INIT_VTTBR,      offsetof(struct kvm_nvhe_init_params, vttbr));
+  DEFINE(NVHE_INIT_VTCR,       offsetof(struct kvm_nvhe_init_params, vtcr));
 #endif
 #ifdef CONFIG_CPU_PM
   DEFINE(CPU_CTX_SP,           offsetof(struct cpu_suspend_ctx, sp));
index 37721eb..d47ff63 100644 (file)
  * flat identity mapping.
  */
 SYM_CODE_START(__cpu_soft_restart)
-       /* Clear sctlr_el1 flags. */
-       mrs     x12, sctlr_el1
-       mov_q   x13, SCTLR_ELx_FLAGS
-       bic     x12, x12, x13
+       mov_q   x12, INIT_SCTLR_EL1_MMU_OFF
        pre_disable_mmu_workaround
        /*
         * either disable EL1&0 translation regime or disable EL2&0 translation
index 30c82d3..efed283 100644 (file)
@@ -68,6 +68,7 @@
 #include <linux/sort.h>
 #include <linux/stop_machine.h>
 #include <linux/types.h>
+#include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/kasan.h>
@@ -694,14 +695,14 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
                ret = ftrp->safe_val;
                break;
        case FTR_LOWER_SAFE:
-               ret = new < cur ? new : cur;
+               ret = min(new, cur);
                break;
        case FTR_HIGHER_OR_ZERO_SAFE:
                if (!cur || !new)
                        break;
                fallthrough;
        case FTR_HIGHER_SAFE:
-               ret = new > cur ? new : cur;
+               ret = max(new, cur);
                break;
        default:
                BUG();
index b512b55..03991ee 100644 (file)
@@ -29,7 +29,7 @@ int arm_cpuidle_init(unsigned int cpu)
 
 /**
  * arm_cpuidle_suspend() - function to enter a low-power idle state
- * @arg: argument to pass to CPU suspend operations
+ * @index: argument to pass to CPU suspend operations
  *
  * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU
  * operations back-end error code otherwise.
index a1ec351..340d04e 100644 (file)
@@ -230,14 +230,6 @@ static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr)
 {
        unsigned long far = read_sysreg(far_el1);
 
-       /*
-        * The CPU masked interrupts, and we are leaving them masked during
-        * do_debug_exception(). Update PMR as if we had called
-        * local_daif_mask().
-        */
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        arm64_enter_el1_dbg(regs);
        if (!cortex_a76_erratum_1463225_debug_handler(regs))
                do_debug_exception(far, esr, regs);
@@ -404,9 +396,6 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
        /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */
        unsigned long far = read_sysreg(far_el1);
 
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        enter_from_user_mode();
        do_debug_exception(far, esr, regs);
        local_daif_restore(DAIF_PROCCTX_NOIRQ);
@@ -414,9 +403,6 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
 
 static void noinstr el0_svc(struct pt_regs *regs)
 {
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        enter_from_user_mode();
        cortex_a76_erratum_1463225_svc_handler();
        do_el0_svc(regs);
@@ -492,9 +478,6 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr)
 
 static void noinstr el0_svc_compat(struct pt_regs *regs)
 {
-       if (system_uses_irq_prio_masking())
-               gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
        enter_from_user_mode();
        cortex_a76_erratum_1463225_svc_handler();
        do_el0_svc_compat(regs);
index 4ac5455..3513984 100644 (file)
@@ -285,16 +285,16 @@ alternative_else_nop_endif
        stp     lr, x21, [sp, #S_LR]
 
        /*
-        * For exceptions from EL0, terminate the callchain here.
+        * For exceptions from EL0, create a terminal frame record.
         * For exceptions from EL1, create a synthetic frame record so the
         * interrupted code shows up in the backtrace.
         */
        .if \el == 0
-       mov     x29, xzr
+       stp     xzr, xzr, [sp, #S_STACKFRAME]
        .else
        stp     x29, x22, [sp, #S_STACKFRAME]
-       add     x29, sp, #S_STACKFRAME
        .endif
+       add     x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 alternative_if_not ARM64_HAS_PAN
@@ -314,6 +314,8 @@ alternative_else_nop_endif
 alternative_if ARM64_HAS_IRQ_PRIO_MASKING
        mrs_s   x20, SYS_ICC_PMR_EL1
        str     x20, [sp, #S_PMR_SAVE]
+       mov     x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
+       msr_s   SYS_ICC_PMR_EL1, x20
 alternative_else_nop_endif
 
        /* Re-enable tag checking (TCO set on exception entry) */
@@ -550,17 +552,7 @@ tsk        .req    x28             // current thread_info
 #endif
        .endm
 
-       .macro  gic_prio_irq_setup, pmr:req, tmp:req
-#ifdef CONFIG_ARM64_PSEUDO_NMI
-       alternative_if ARM64_HAS_IRQ_PRIO_MASKING
-       orr     \tmp, \pmr, #GIC_PRIO_PSR_I_SET
-       msr_s   SYS_ICC_PMR_EL1, \tmp
-       alternative_else_nop_endif
-#endif
-       .endm
-
        .macro el1_interrupt_handler, handler:req
-       gic_prio_irq_setup pmr=x20, tmp=x1
        enable_da
 
        mov     x0, sp
@@ -590,7 +582,6 @@ alternative_else_nop_endif
        .endm
 
        .macro el0_interrupt_handler, handler:req
-       gic_prio_irq_setup pmr=x20, tmp=x0
        user_exit_irqoff
        enable_da
 
@@ -788,7 +779,6 @@ SYM_CODE_END(el0_fiq)
 SYM_CODE_START_LOCAL(el1_error)
        kernel_entry 1
        mrs     x1, esr_el1
-       gic_prio_kentry_setup tmp=x2
        enable_dbg
        mov     x0, sp
        bl      do_serror
@@ -799,7 +789,6 @@ SYM_CODE_START_LOCAL(el0_error)
        kernel_entry 0
 el0_error_naked:
        mrs     x25, esr_el1
-       gic_prio_kentry_setup tmp=x2
        user_exit_irqoff
        enable_dbg
        mov     x0, sp
index 74ad3db..43d2126 100644 (file)
@@ -115,9 +115,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
        mrs_s   x0, SYS_VBAR_EL12
        msr     vbar_el1, x0
 
-       // Use EL2 translations for SPE and disable access from EL1
+       // Use EL2 translations for SPE & TRBE and disable access from EL1
        mrs     x0, mdcr_el2
        bic     x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+       bic     x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
        msr     mdcr_el2, x0
 
        // Transfer the MM state from EL1 to EL2
index 5aa9ed1..bcf3c27 100644 (file)
@@ -65,13 +65,13 @@ __efistub__ctype            = _ctype;
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
 KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
+KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0);
 
 /* Global kernel state accessed by nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
 /* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */
-KVM_NVHE_ALIAS(__hyp_panic_string);
-KVM_NVHE_ALIAS(panic);
+KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
 
 /* Vectors installed by hyp-init on reset HVC. */
 KVM_NVHE_ALIAS(__hyp_stub_vectors);
@@ -104,6 +104,36 @@ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
 /* PMU available static key */
 KVM_NVHE_ALIAS(kvm_arm_pmu_available);
 
+/* Position-independent library routines */
+KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page);
+KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page);
+KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(memset, __pi_memset);
+
+#ifdef CONFIG_KASAN
+KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
+#endif
+
+/* Kernel memory sections */
+KVM_NVHE_ALIAS(__start_rodata);
+KVM_NVHE_ALIAS(__end_rodata);
+KVM_NVHE_ALIAS(__bss_start);
+KVM_NVHE_ALIAS(__bss_stop);
+
+/* Hyp memory sections */
+KVM_NVHE_ALIAS(__hyp_idmap_text_start);
+KVM_NVHE_ALIAS(__hyp_idmap_text_end);
+KVM_NVHE_ALIAS(__hyp_text_start);
+KVM_NVHE_ALIAS(__hyp_text_end);
+KVM_NVHE_ALIAS(__hyp_bss_start);
+KVM_NVHE_ALIAS(__hyp_bss_end);
+KVM_NVHE_ALIAS(__hyp_rodata_start);
+KVM_NVHE_ALIAS(__hyp_rodata_end);
+
+/* pKVM static key */
+KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
+
 #endif /* CONFIG_KVM */
 
 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
index cbf5210..b4bb67f 100644 (file)
@@ -294,13 +294,10 @@ void __show_regs(struct pt_regs *regs)
        i = top_reg;
 
        while (i >= 0) {
-               printk("x%-2d: %016llx ", i, regs->regs[i]);
-               i--;
+               printk("x%-2d: %016llx", i, regs->regs[i]);
 
-               if (i % 2 == 0) {
-                       pr_cont("x%-2d: %016llx ", i, regs->regs[i]);
-                       i--;
-               }
+               while (i-- % 3)
+                       pr_cont(" x%-2d: %016llx", i, regs->regs[i]);
 
                pr_cont("\n");
        }
index 84b676b..de07147 100644 (file)
@@ -68,10 +68,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
        unsigned long fp = frame->fp;
        struct stack_info info;
 
-       /* Terminal record; nothing to unwind */
-       if (!fp)
-               return -ENOENT;
-
        if (fp & 0xf)
                return -EINVAL;
 
@@ -132,6 +128,12 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 
        frame->pc = ptrauth_strip_insn_pac(frame->pc);
 
+       /*
+        * This is a terminal record, so we have finished unwinding.
+        */
+       if (!frame->fp && !frame->pc)
+               return -ENOENT;
+
        return 0;
 }
 NOKPROBE_SYMBOL(unwind_frame);
index 61dbb4c..a5e61e0 100644 (file)
@@ -31,6 +31,13 @@ SECTIONS
        .gnu.version_d  : { *(.gnu.version_d) }
        .gnu.version_r  : { *(.gnu.version_r) }
 
+       /*
+        * Discard .note.gnu.property sections, which are unused and have a
+        * different alignment requirement from the vDSO note sections.
+        */
+       /DISCARD/       : {
+               *(.note.GNU-stack .note.gnu.property)
+       }
        .note           : { *(.note.*) }                :text   :note
 
        . = ALIGN(16);
@@ -48,7 +55,6 @@ SECTIONS
        PROVIDE(end = .);
 
        /DISCARD/       : {
-               *(.note.GNU-stack)
                *(.data .data.* .gnu.linkonce.d.* .sdata*)
                *(.bss .sbss .dynbss .dynsbss)
                *(.eh_frame .eh_frame_hdr)
index 789ad42..3dba0c4 100644 (file)
@@ -10,15 +10,7 @@ include $(srctree)/lib/vdso/Makefile
 
 # Same as cc-*option, but using CC_COMPAT instead of CC
 ifeq ($(CONFIG_CC_IS_CLANG), y)
-COMPAT_GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE_COMPAT)elfedit))
-COMPAT_GCC_TOOLCHAIN := $(realpath $(COMPAT_GCC_TOOLCHAIN_DIR)/..)
-
 CC_COMPAT_CLANG_FLAGS := --target=$(notdir $(CROSS_COMPILE_COMPAT:%-=%))
-CC_COMPAT_CLANG_FLAGS += --prefix=$(COMPAT_GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE_COMPAT))
-CC_COMPAT_CLANG_FLAGS += -no-integrated-as -Qunused-arguments
-ifneq ($(COMPAT_GCC_TOOLCHAIN),)
-CC_COMPAT_CLANG_FLAGS += --gcc-toolchain=$(COMPAT_GCC_TOOLCHAIN)
-endif
 
 CC_COMPAT ?= $(CC)
 CC_COMPAT += $(CC_COMPAT_CLANG_FLAGS)
index 7eea788..709d2c4 100644 (file)
@@ -5,24 +5,7 @@
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  */
 
-#define RO_EXCEPTION_TABLE_ALIGN       8
-#define RUNTIME_DISCARD_EXIT
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/cache.h>
 #include <asm/hyp_image.h>
-#include <asm/kernel-pgtable.h>
-#include <asm/memory.h>
-#include <asm/page.h>
-
-#include "image.h"
-
-OUTPUT_ARCH(aarch64)
-ENTRY(_text)
-
-jiffies = jiffies_64;
-
-
 #ifdef CONFIG_KVM
 #define HYPERVISOR_EXTABLE                                     \
        . = ALIGN(SZ_8);                                        \
@@ -32,9 +15,11 @@ jiffies = jiffies_64;
 
 #define HYPERVISOR_DATA_SECTIONS                               \
        HYP_SECTION_NAME(.rodata) : {                           \
+               . = ALIGN(PAGE_SIZE);                           \
                __hyp_rodata_start = .;                         \
                *(HYP_SECTION_NAME(.data..ro_after_init))       \
                *(HYP_SECTION_NAME(.rodata))                    \
+               . = ALIGN(PAGE_SIZE);                           \
                __hyp_rodata_end = .;                           \
        }
 
@@ -51,29 +36,52 @@ jiffies = jiffies_64;
                __hyp_reloc_end = .;                            \
        }
 
+#define BSS_FIRST_SECTIONS                                     \
+       __hyp_bss_start = .;                                    \
+       *(HYP_SECTION_NAME(.bss))                               \
+       . = ALIGN(PAGE_SIZE);                                   \
+       __hyp_bss_end = .;
+
+/*
+ * We require that __hyp_bss_start and __bss_start are aligned, and enforce it
+ * with an assertion. But the BSS_SECTION macro places an empty .sbss section
+ * between them, which can in some cases cause the linker to misalign them. To
+ * work around the issue, force a page alignment for __bss_start.
+ */
+#define SBSS_ALIGN                     PAGE_SIZE
 #else /* CONFIG_KVM */
 #define HYPERVISOR_EXTABLE
 #define HYPERVISOR_DATA_SECTIONS
 #define HYPERVISOR_PERCPU_SECTION
 #define HYPERVISOR_RELOC_SECTION
+#define SBSS_ALIGN                     0
 #endif
 
+#define RO_EXCEPTION_TABLE_ALIGN       8
+#define RUNTIME_DISCARD_EXIT
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+
+#include "image.h"
+
+OUTPUT_ARCH(aarch64)
+ENTRY(_text)
+
+jiffies = jiffies_64;
+
 #define HYPERVISOR_TEXT                                        \
-       /*                                              \
-        * Align to 4 KB so that                        \
-        * a) the HYP vector table is at its minimum    \
-        *    alignment of 2048 bytes                   \
-        * b) the HYP init code will not cross a page   \
-        *    boundary if its size does not exceed      \
-        *    4 KB (see related ASSERT() below)         \
-        */                                             \
-       . = ALIGN(SZ_4K);                               \
+       . = ALIGN(PAGE_SIZE);                           \
        __hyp_idmap_text_start = .;                     \
        *(.hyp.idmap.text)                              \
        __hyp_idmap_text_end = .;                       \
        __hyp_text_start = .;                           \
        *(.hyp.text)                                    \
        HYPERVISOR_EXTABLE                              \
+       . = ALIGN(PAGE_SIZE);                           \
        __hyp_text_end = .;
 
 #define IDMAP_TEXT                                     \
@@ -276,7 +284,7 @@ SECTIONS
        __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
        _edata = .;
 
-       BSS_SECTION(0, 0, 0)
+       BSS_SECTION(SBSS_ALIGN, 0, 0)
 
        . = ALIGN(PAGE_SIZE);
        init_pg_dir = .;
@@ -309,11 +317,12 @@ SECTIONS
 #include "image-vars.h"
 
 /*
- * The HYP init code and ID map text can't be longer than a page each,
- * and should not cross a page boundary.
+ * The HYP init code and ID map text can't be longer than a page each. The
+ * former is page-aligned, but the latter may not be with 16K or 64K pages, so
+ * it should also not cross a page boundary.
  */
-ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
-       "HYP init code too big or misaligned")
+ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE,
+       "HYP init code too big")
 ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
        "ID map text too big or misaligned")
 #ifdef CONFIG_HIBERNATION
@@ -324,6 +333,9 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
 ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
        "Entry trampoline text too big")
 #endif
+#ifdef CONFIG_KVM
+ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned")
+#endif
 /*
  * If padding is applied before .head.text, virt<->phys conversions will fail.
  */
index 7f06ba7..1cb39c0 100644 (file)
@@ -206,8 +206,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ARM_INJECT_EXT_DABT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
+       case KVM_CAP_PTP_KVM:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               return KVM_GUESTDBG_VALID_MASK;
        case KVM_CAP_ARM_SET_DEVICE_ADDR:
                r = 1;
                break;
@@ -416,10 +419,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (vcpu_has_ptrauth(vcpu))
                vcpu_ptrauth_disable(vcpu);
+       kvm_arch_vcpu_load_debug_state_flags(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+       kvm_arch_vcpu_put_debug_state_flags(vcpu);
        kvm_arch_vcpu_put_fp(vcpu);
        if (has_vhe())
                kvm_vcpu_put_sysregs_vhe(vcpu);
@@ -580,6 +585,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.has_run_once = true;
 
+       kvm_arm_vcpu_init_debug(vcpu);
+
        if (likely(irqchip_in_kernel(kvm))) {
                /*
                 * Map the VGIC hardware resources before running a vcpu the
@@ -1268,7 +1275,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
        kvm_flush_remote_tlbs(kvm);
 }
@@ -1350,16 +1357,9 @@ static unsigned long nvhe_percpu_order(void)
 /* A lookup table holding the hypervisor VA for each vector slot */
 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
 
-static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot)
-{
-       return slot - (slot != HYP_VECTOR_DIRECT);
-}
-
 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
 {
-       int idx = __kvm_vector_slot2idx(slot);
-
-       hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K);
+       hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
 }
 
 static int kvm_init_vector_slots(void)
@@ -1388,22 +1388,18 @@ static int kvm_init_vector_slots(void)
        return 0;
 }
 
-static void cpu_init_hyp_mode(void)
+static void cpu_prepare_hyp_mode(int cpu)
 {
-       struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params);
-       struct arm_smccc_res res;
+       struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
        unsigned long tcr;
 
-       /* Switch from the HYP stub to our own HYP init vector */
-       __hyp_set_vectors(kvm_get_idmap_vector());
-
        /*
         * Calculate the raw per-cpu offset without a translation from the
         * kernel's mapping to the linear mapping, and store it in tpidr_el2
         * so that we can use adr_l to access per-cpu variables in EL2.
         * Also drop the KASAN tag which gets in the way...
         */
-       params->tpidr_el2 = (unsigned long)kasan_reset_tag(this_cpu_ptr_nvhe_sym(__per_cpu_start)) -
+       params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
                            (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
 
        params->mair_el2 = read_sysreg(mair_el1);
@@ -1427,14 +1423,28 @@ static void cpu_init_hyp_mode(void)
        tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
        params->tcr_el2 = tcr;
 
-       params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE);
+       params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
        params->pgd_pa = kvm_mmu_get_httbr();
+       if (is_protected_kvm_enabled())
+               params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
+       else
+               params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+       params->vttbr = params->vtcr = 0;
 
        /*
         * Flush the init params from the data cache because the struct will
         * be read while the MMU is off.
         */
        kvm_flush_dcache_to_poc(params, sizeof(*params));
+}
+
+static void hyp_install_host_vector(void)
+{
+       struct kvm_nvhe_init_params *params;
+       struct arm_smccc_res res;
+
+       /* Switch from the HYP stub to our own HYP init vector */
+       __hyp_set_vectors(kvm_get_idmap_vector());
 
        /*
         * Call initialization code, and switch to the full blown HYP code.
@@ -1443,8 +1453,14 @@ static void cpu_init_hyp_mode(void)
         * cpus_have_const_cap() wrapper.
         */
        BUG_ON(!system_capabilities_finalized());
+       params = this_cpu_ptr_nvhe_sym(kvm_init_params);
        arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
        WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
+}
+
+static void cpu_init_hyp_mode(void)
+{
+       hyp_install_host_vector();
 
        /*
         * Disabling SSBD on a non-VHE system requires us to enable SSBS
@@ -1487,7 +1503,10 @@ static void cpu_set_hyp_vector(void)
        struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
        void *vector = hyp_spectre_vector_selector[data->slot];
 
-       *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+       if (!is_protected_kvm_enabled())
+               *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+       else
+               kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
 }
 
 static void cpu_hyp_reinit(void)
@@ -1495,13 +1514,14 @@ static void cpu_hyp_reinit(void)
        kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
 
        cpu_hyp_reset();
-       cpu_set_hyp_vector();
 
        if (is_kernel_in_hyp_mode())
                kvm_timer_init_vhe();
        else
                cpu_init_hyp_mode();
 
+       cpu_set_hyp_vector();
+
        kvm_arm_init_debug();
 
        if (vgic_present)
@@ -1697,18 +1717,62 @@ static void teardown_hyp_mode(void)
        }
 }
 
+static int do_pkvm_init(u32 hyp_va_bits) /* installs the hyp init vector, then returns the result of the __pkvm_init hypercall */
+{
+       void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
+       int ret;
+
+       preempt_disable(); /* hyp_install_host_vector() uses this CPU's init params, so stay on one CPU until the hypercall returns */
+       hyp_install_host_vector();
+       ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
+                               num_possible_cpus(), kern_hyp_va(per_cpu_base),
+                               hyp_va_bits);
+       preempt_enable();
+
+       return ret;
+}
+
+static int kvm_hyp_init_protection(u32 hyp_va_bits) /* returns 0 on success, else the error code of the first failing step */
+{
+       void *addr = phys_to_virt(hyp_mem_base);
+       int ret;
+
+       kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); /* copy the sanitised ID register values into the nVHE hyp symbols */
+       kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+
+       ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); /* map the hyp memory pool (hyp_mem_base, hyp_mem_size bytes) */
+       if (ret)
+               return ret;
+
+       ret = do_pkvm_init(hyp_va_bits);
+       if (ret)
+               return ret;
+
+       free_hyp_pgds(); /* NOTE(review): presumably the host-built hyp page tables are obsolete once __pkvm_init has succeeded - confirm */
+
+       return 0;
+}
+
 /**
  * Inits Hyp-mode on all online CPUs
  */
 static int init_hyp_mode(void)
 {
+       u32 hyp_va_bits;
        int cpu;
-       int err = 0;
+       int err = -ENOMEM;
+
+       /*
+        * The protected Hyp-mode cannot be initialized if the memory pool
+        * allocation has failed.
+        */
+       if (is_protected_kvm_enabled() && !hyp_mem_base)
+               goto out_err;
 
        /*
         * Allocate Hyp PGD and setup Hyp identity mapping
         */
-       err = kvm_mmu_init();
+       err = kvm_mmu_init(&hyp_va_bits);
        if (err)
                goto out_err;
 
@@ -1769,7 +1833,19 @@ static int init_hyp_mode(void)
                goto out_err;
        }
 
-       err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+       /*
+        * .hyp.bss is guaranteed to be placed at the beginning of the .bss
+        * section thanks to an assertion in the linker script. Map it RW and
+        * the rest of .bss RO.
+        */
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
+                                 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
+       if (err) {
+               kvm_err("Cannot map hyp bss section: %d\n", err);
+               goto out_err;
+       }
+
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
                                  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map bss section\n");
@@ -1790,26 +1866,36 @@ static int init_hyp_mode(void)
                }
        }
 
-       /*
-        * Map Hyp percpu pages
-        */
        for_each_possible_cpu(cpu) {
                char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
                char *percpu_end = percpu_begin + nvhe_percpu_size();
 
+               /* Map Hyp percpu pages */
                err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
-
                if (err) {
                        kvm_err("Cannot map hyp percpu region\n");
                        goto out_err;
                }
+
+               /* Prepare the CPU initialization parameters */
+               cpu_prepare_hyp_mode(cpu);
        }
 
        if (is_protected_kvm_enabled()) {
                init_cpu_logical_map();
 
-               if (!init_psci_relay())
+               if (!init_psci_relay()) {
+                       err = -ENODEV;
+                       goto out_err;
+               }
+       }
+
+       if (is_protected_kvm_enabled()) {
+               err = kvm_hyp_init_protection(hyp_va_bits);
+               if (err) {
+                       kvm_err("Failed to init hyp memory protection\n");
                        goto out_err;
+               }
        }
 
        return 0;
@@ -1820,6 +1906,72 @@ out_err:
        return err;
 }
 
+/*
+ * Per-CPU callback: issue the __pkvm_prot_finalize hypercall and WARN if it
+ * reports a non-zero value. The argument is unused; it only exists so the
+ * function can be handed to on_each_cpu() (see finalize_hyp_mode()).
+ */
+static void _kvm_host_prot_finalize(void *discard)
+{
+       WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
+}
+
+/*
+ * Pass the physical range delimited by @start and @end to the
+ * __pkvm_mark_hyp hypercall and return the hypercall's result.
+ */
+static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+       return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
+}
+
+/*
+ * Mark a section by name: pastes the _start/_end suffixes onto the given
+ * prefix and passes the physical addresses of both symbols to
+ * pkvm_mark_hyp().
+ */
+#define pkvm_mark_hyp_section(__section)               \
+       pkvm_mark_hyp(__pa_symbol(__section##_start),   \
+                       __pa_symbol(__section##_end))
+
+/*
+ * Last step of protected-mode setup: report the hyp sections, the
+ * hyp_mem_base/hyp_mem_size range and the per-CPU hyp pages through
+ * pkvm_mark_hyp(), then run __pkvm_prot_finalize on every CPU.
+ *
+ * Returns 0 immediately when protected KVM is not enabled. Otherwise returns
+ * the first non-zero value reported by pkvm_mark_hyp(); in that case the
+ * static key is not flipped and no CPU is finalized.
+ */
+static int finalize_hyp_mode(void)
+{
+       int cpu, ret;
+
+       if (!is_protected_kvm_enabled())
+               return 0;
+
+       /* Hyp sections, delimited by their <section>_start/_end symbols */
+       ret = pkvm_mark_hyp_section(__hyp_idmap_text);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_text);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_rodata);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_bss);
+       if (ret)
+               return ret;
+
+       /* The range starting at hyp_mem_base, hyp_mem_size bytes long */
+       ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
+       if (ret)
+               return ret;
+
+       for_each_possible_cpu(cpu) {
+               /* Per-CPU region: PAGE_SIZE << nvhe_percpu_order() bytes */
+               phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
+               phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
+
+               ret = pkvm_mark_hyp(start, end);
+               if (ret)
+                       return ret;
+
+               /* Per-CPU hyp stack: a single page */
+               start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
+               end = start + PAGE_SIZE;
+               ret = pkvm_mark_hyp(start, end);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Flip the static key upfront as that may no longer be possible
+        * once the host stage 2 is installed.
+        */
+       static_branch_enable(&kvm_protected_mode_initialized);
+       on_each_cpu(_kvm_host_prot_finalize, NULL, 1);
+
+       return 0;
+}
+
 static void check_kvm_target_cpu(void *ret)
 {
        *(int *)ret = kvm_target_cpu();
@@ -1894,11 +2046,6 @@ int kvm_arch_init(void *opaque)
 
        in_hyp_mode = is_kernel_in_hyp_mode();
 
-       if (!in_hyp_mode && kvm_arch_requires_vhe()) {
-               kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
-               return -ENODEV;
-       }
-
        if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
            cpus_have_final_cap(ARM64_WORKAROUND_1508412))
                kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
@@ -1936,8 +2083,15 @@ int kvm_arch_init(void *opaque)
        if (err)
                goto out_hyp;
 
+       if (!in_hyp_mode) {
+               err = finalize_hyp_mode();
+               if (err) {
+                       kvm_err("Failed to finalize Hyp protection\n");
+                       goto out_hyp;
+               }
+       }
+
        if (is_protected_kvm_enabled()) {
-               static_branch_enable(&kvm_protected_mode_initialized);
                kvm_info("Protected nVHE mode initialized successfully\n");
        } else if (in_hyp_mode) {
                kvm_info("VHE mode initialized successfully\n");
index dbc8905..d5e79d7 100644 (file)
@@ -68,6 +68,65 @@ void kvm_arm_init_debug(void)
        __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2));
 }
 
+/**
+ * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * Start from the HPMN field of this CPU's mdcr_el2 value and make sure
+ * accesses to the following are trapped:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
+ *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
+ *
+ * MDCR_EL2_TDE and MDCR_EL2_TDA are then added depending on the debug state
+ * of the vcpu.
+ */
+static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+{
+       bool hw_debug = vcpu->guest_debug & KVM_GUESTDBG_USE_HW;
+       bool regs_clean = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
+
+       /*
+        * Only the HPMN field is kept, so MDCR_EL2_E2PB_MASK and
+        * MDCR_EL2_E2TB_MASK end up clear, which disables guest access to
+        * the profiling and trace buffers.
+        */
+       vcpu->arch.mdcr_el2 = (__this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK) |
+                             MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TTRF |
+                             MDCR_EL2_TPMCR | MDCR_EL2_TDRA | MDCR_EL2_TDOSA;
+
+       /* Userspace debugging the VM: route software debug exceptions to EL2 */
+       if (vcpu->guest_debug)
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+       /*
+        * Debug register accesses trap when userspace drives the debug
+        * hardware (KVM_GUESTDBG_USE_HW is set) or when the guest is not
+        * using debug (KVM_ARM64_DEBUG_DIRTY is clear).
+        */
+       if (hw_debug || regs_clean)
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+}
+
+/**
+ * kvm_arm_vcpu_init_debug - setup vcpu debug traps
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * Set vcpu initial mdcr_el2 value. Preemption is disabled around the call
+ * because kvm_arm_setup_mdcr_el2() reads the per-CPU mdcr_el2 variable with
+ * __this_cpu_read().
+ */
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       kvm_arm_setup_mdcr_el2(vcpu);
+       preempt_enable();
+}
+
 /**
  * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
  */
@@ -83,13 +142,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
  * @vcpu:      the vcpu pointer
  *
  * This is called before each entry into the hypervisor to setup any
- * debug related registers. Currently this just ensures we will trap
- * access to:
- *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
- *  - Debug ROM Address (MDCR_EL2_TDRA)
- *  - OS related registers (MDCR_EL2_TDOSA)
- *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
- *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ * debug related registers.
  *
  * Additionally, KVM only traps guest accesses to the debug registers if
  * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
@@ -101,28 +154,14 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
 
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 {
-       bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
        unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2;
 
        trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
 
-       /*
-        * This also clears MDCR_EL2_E2PB_MASK to disable guest access
-        * to the profiling buffer.
-        */
-       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
-       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
-                               MDCR_EL2_TPMS |
-                               MDCR_EL2_TTRF |
-                               MDCR_EL2_TPMCR |
-                               MDCR_EL2_TDRA |
-                               MDCR_EL2_TDOSA);
+       kvm_arm_setup_mdcr_el2(vcpu);
 
        /* Is Guest debugging in effect? */
        if (vcpu->guest_debug) {
-               /* Route all software debug exceptions to EL2 */
-               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
-
                /* Save guest debug state */
                save_guest_debug_regs(vcpu);
 
@@ -176,7 +215,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 
                        vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
                        vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
-                       trap_debug = true;
 
                        trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
                                                &vcpu->arch.debug_ptr->dbg_bcr[0],
@@ -191,10 +229,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
        BUG_ON(!vcpu->guest_debug &&
                vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
 
-       /* Trap debug register access */
-       if (trap_debug)
-               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
-
        /* If KDE or MDE are set, perform a full save/restore cycle. */
        if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
                vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
@@ -203,7 +237,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
        if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2)
                write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 
-       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
        trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
 }
 
@@ -231,3 +264,32 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
                }
        }
 }
+
+/*
+ * Flag which host debug state (SPE and/or TRBE) may need saving for this
+ * vcpu. The flags are only computed without VHE; with VHE nothing is set.
+ */
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+       u64 features;
+
+       /* For VHE, there is nothing to do */
+       if (has_vhe())
+               return;
+
+       features = read_sysreg(id_aa64dfr0_el1);
+
+       /*
+        * SPE: PMBIDR_EL1 is only read once the ID register reports the
+        * feature; the flag is set when its P bit is clear.
+        */
+       if (cpuid_feature_extract_unsigned_field(features, ID_AA64DFR0_PMSVER_SHIFT)) {
+               if (!(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT)))
+                       vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE;
+       }
+
+       /*
+        * TRBE: same scheme, TRBIDR_EL1 is only read when the feature is
+        * reported and the flag is set when TRBIDR_PROG is clear.
+        */
+       if (cpuid_feature_extract_unsigned_field(features, ID_AA64DFR0_TRBE_SHIFT)) {
+               if (!(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG))
+                       vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE;
+       }
+}
+
+/* Clear both "save host SPE/TRBE state" flags on the vcpu. */
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.flags &= ~KVM_ARM64_DEBUG_STATE_SAVE_SPE;
+       vcpu->arch.flags &= ~KVM_ARM64_DEBUG_STATE_SAVE_TRBE;
+}
index 3e081d5..5621020 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kvm_host.h>
 #include <asm/fpsimd.h>
 #include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 #include <asm/sysreg.h>
 
@@ -42,6 +43,17 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
        if (ret)
                goto error;
 
+       if (vcpu->arch.sve_state) {
+               void *sve_end;
+
+               sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu);
+
+               ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end,
+                                         PAGE_HYP);
+               if (ret)
+                       goto error;
+       }
+
        vcpu->arch.host_thread_info = kern_hyp_va(ti);
        vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
 error:
@@ -109,11 +121,17 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
        local_irq_save(flags);
 
        if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
-               fpsimd_save_and_flush_cpu_state();
+               if (guest_has_sve) {
+                       __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+
+                       /* Restore the VL that was saved when bound to the CPU */
+                       if (!has_vhe())
+                               sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
+                                                      SYS_ZCR_EL1);
+               }
 
-               if (guest_has_sve)
-                       __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_s(SYS_ZCR_EL12);
-       } else if (host_has_sve) {
+               fpsimd_save_and_flush_cpu_state();
+       } else if (has_vhe() && host_has_sve) {
                /*
                 * The FPSIMD/SVE state in the CPU has not been touched, and we
                 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
index 9bbd30e..5cb4a1c 100644 (file)
@@ -299,7 +299,7 @@ static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 
        memset(vqs, 0, sizeof(vqs));
 
-       max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+       max_vq = vcpu_sve_max_vq(vcpu);
        for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
                if (sve_vq_available(vq))
                        vqs[vq_word(vq)] |= vq_mask(vq);
@@ -427,7 +427,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
                if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                        return -ENOENT;
 
-               vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+               vq = vcpu_sve_max_vq(vcpu);
 
                reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
@@ -437,7 +437,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
                if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                        return -ENOENT;
 
-               vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+               vq = vcpu_sve_max_vq(vcpu);
 
                reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
@@ -888,11 +888,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return -EINVAL;
 }
 
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
-                           KVM_GUESTDBG_USE_SW_BP | \
-                           KVM_GUESTDBG_USE_HW | \
-                           KVM_GUESTDBG_SINGLESTEP)
-
 /**
  * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
  * @kvm:       pointer to the KVM struct
index cebe39f..6f48336 100644 (file)
@@ -291,3 +291,48 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
        if (exception_index == ARM_EXCEPTION_EL1_SERROR)
                kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
 }
+
+/*
+ * Host-side handler for a panic raised by the nVHE hypervisor.
+ *
+ * Reports where hyp stopped (file:line when the exception is a BUG brk and
+ * the bug table has an entry for it, otherwise an address), prints the hyp
+ * offset, and then panics the kernel with the register values it was given.
+ * Never returns.
+ */
+void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
+                                             u64 par, uintptr_t vcpu,
+                                             u64 far, u64 hpfar)
+{
+       u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
+       u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
+       u64 mode = spsr & PSR_MODE_MASK;
+
+       /*
+        * The nVHE hyp symbols are not included by kallsyms to avoid issues
+        * with aliasing. That means that the symbols cannot be printed with the
+        * "%pS" format specifier, so fall back to the vmlinux address if
+        * there's no better option.
+        */
+       if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
+               kvm_err("Invalid host exception to nVHE hyp!\n");
+       } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
+                  (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+               struct bug_entry *bug = find_bug(elr_in_kimg);
+               const char *file = NULL;
+               unsigned int line = 0;
+
+               /* All hyp bugs, including warnings, are treated as fatal. */
+               if (bug)
+                       bug_get_file_line(bug, &file, &line);
+
+               if (file)
+                       kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
+               else
+                       kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
+       } else {
+               kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
+       }
+
+       /*
+        * Hyp has panicked and we're going to handle that by panicking the
+        * kernel. The kernel offset will be revealed in the panic so we're
+        * also safe to reveal the hyp offset as a debugging aid for translating
+        * hyp VAs to vmlinux addresses.
+        */
+       kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
+
+       panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
+             spsr, elr, esr, far, hpfar, par, vcpu);
+}
index 687598e..b726332 100644 (file)
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir)                               \
                    -DDISABLE_BRANCH_PROFILING          \
                    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o
index 01f114a..3c63592 100644 (file)
@@ -19,3 +19,13 @@ SYM_FUNC_START(__fpsimd_restore_state)
        fpsimd_restore  x0, 1
        ret
 SYM_FUNC_END(__fpsimd_restore_state)
+
+/*
+ * NOTE(review): the C caller added by this patch passes the SVE state pointer
+ * (vcpu_sve_pffr()) and a pointer to fpsr; presumably they arrive in x0 and
+ * x1, and the macro operands are the x0 register number, the fpsr pointer
+ * and a scratch register number - confirm against the __sve_load macro.
+ */
+SYM_FUNC_START(__sve_restore_state)
+       __sve_load 0, x1, 2
+       ret
+SYM_FUNC_END(__sve_restore_state)
+
+/*
+ * NOTE(review): counterpart of __sve_restore_state; the C caller passes
+ * sve_pffr(thread) and a pointer to fpsr. Operand meaning presumably matches
+ * the restore path - confirm against the sve_save macro.
+ */
+SYM_FUNC_START(__sve_save_state)
+       sve_save 0, x1, 2
+       ret
+SYM_FUNC_END(__sve_save_state)
index 6c1f51f..e4a2f29 100644 (file)
@@ -30,8 +30,6 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
-extern const char __hyp_panic_string[];
-
 extern struct exception_table_entry __start___kvm_ex_table;
 extern struct exception_table_entry __stop___kvm_ex_table;
 
@@ -160,18 +158,10 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
        return true;
 }
 
-static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
 {
-       u8 ec;
-       u64 esr;
        u64 hpfar, far;
 
-       esr = vcpu->arch.fault.esr_el2;
-       ec = ESR_ELx_EC(esr);
-
-       if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
-               return true;
-
        far = read_sysreg_el2(SYS_FAR);
 
        /*
@@ -194,33 +184,59 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
                hpfar = read_sysreg(hpfar_el2);
        }
 
-       vcpu->arch.fault.far_el2 = far;
-       vcpu->arch.fault.hpfar_el2 = hpfar;
+       fault->far_el2 = far;
+       fault->hpfar_el2 = hpfar;
        return true;
 }
 
+/*
+ * Fill in the vcpu's fault information when the recorded exception is a
+ * data or instruction abort from a lower EL. Any other exception class
+ * needs no extra information and simply reports success.
+ */
+static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+{
+       u64 esr = vcpu->arch.fault.esr_el2;
+       u8 ec = ESR_ELx_EC(esr);
+
+       switch (ec) {
+       case ESR_ELx_EC_DABT_LOW:
+       case ESR_ELx_EC_IABT_LOW:
+               return __get_fault_info(esr, &vcpu->arch.fault);
+       default:
+               return true;
+       }
+}
+
+/*
+ * Save the host's SVE state. The owning thread_struct is recovered from the
+ * host_fpsimd_state pointer, which points at its uw.fpsimd_state member.
+ */
+static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu)
+{
+       struct thread_struct *host_thread =
+               container_of(vcpu->arch.host_fpsimd_state,
+                            struct thread_struct, uw.fpsimd_state);
+
+       __sve_save_state(sve_pffr(host_thread),
+                        &vcpu->arch.host_fpsimd_state->fpsr);
+}
+
+/*
+ * Load the guest's SVE state onto the CPU. The steps run in this order:
+ * ZCR_EL2 is set up for the vcpu's maximum VQ (via sve_cond_update_zcr_vq()),
+ * the state is restored from the vcpu's SVE area and fp_regs.fpsr, and
+ * finally the vcpu's saved ZCR_EL1 value is written back.
+ */
+static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
+{
+       sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
+       __sve_restore_state(vcpu_sve_pffr(vcpu),
+                           &vcpu->arch.ctxt.fp_regs.fpsr);
+       write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+}
+
 /* Check for an FPSIMD/SVE trap and handle as appropriate */
 static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
 {
-       bool vhe, sve_guest, sve_host;
+       bool sve_guest, sve_host;
        u8 esr_ec;
+       u64 reg;
 
        if (!system_supports_fpsimd())
                return false;
 
-       /*
-        * Currently system_supports_sve() currently implies has_vhe(),
-        * so the check is redundant. However, has_vhe() can be determined
-        * statically and helps the compiler remove dead code.
-        */
-       if (has_vhe() && system_supports_sve()) {
+       if (system_supports_sve()) {
                sve_guest = vcpu_has_sve(vcpu);
                sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
-               vhe = true;
        } else {
                sve_guest = false;
                sve_host = false;
-               vhe = has_vhe();
        }
 
        esr_ec = kvm_vcpu_trap_get_class(vcpu);
@@ -229,53 +245,38 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
                return false;
 
        /* Don't handle SVE traps for non-SVE vcpus here: */
-       if (!sve_guest)
-               if (esr_ec != ESR_ELx_EC_FP_ASIMD)
-                       return false;
+       if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
+               return false;
 
        /* Valid trap.  Switch the context: */
-
-       if (vhe) {
-               u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN;
-
+       if (has_vhe()) {
+               reg = CPACR_EL1_FPEN;
                if (sve_guest)
                        reg |= CPACR_EL1_ZEN;
 
-               write_sysreg(reg, cpacr_el1);
+               sysreg_clear_set(cpacr_el1, 0, reg);
        } else {
-               write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP,
-                            cptr_el2);
-       }
+               reg = CPTR_EL2_TFP;
+               if (sve_guest)
+                       reg |= CPTR_EL2_TZ;
 
+               sysreg_clear_set(cptr_el2, reg, 0);
+       }
        isb();
 
        if (vcpu->arch.flags & KVM_ARM64_FP_HOST) {
-               /*
-                * In the SVE case, VHE is assumed: it is enforced by
-                * Kconfig and kvm_arch_init().
-                */
-               if (sve_host) {
-                       struct thread_struct *thread = container_of(
-                               vcpu->arch.host_fpsimd_state,
-                               struct thread_struct, uw.fpsimd_state);
-
-                       sve_save_state(sve_pffr(thread),
-                                      &vcpu->arch.host_fpsimd_state->fpsr);
-               } else {
+               if (sve_host)
+                       __hyp_sve_save_host(vcpu);
+               else
                        __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
-               }
 
                vcpu->arch.flags &= ~KVM_ARM64_FP_HOST;
        }
 
-       if (sve_guest) {
-               sve_load_state(vcpu_sve_pffr(vcpu),
-                              &vcpu->arch.ctxt.fp_regs.fpsr,
-                              sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1);
-               write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12);
-       } else {
+       if (sve_guest)
+               __hyp_sve_restore_guest(vcpu);
+       else
                __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
-       }
 
        /* Skip restoring fpexc32 for AArch64 guests */
        if (!(read_sysreg(hcr_el2) & HCR_RW))
diff --git a/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h
new file mode 100644 (file)
index 0000000..dc61aaa
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_EARLY_ALLOC_H
+#define __KVM_HYP_EARLY_ALLOC_H
+
+#include <asm/kvm_pgtable.h>
+
+void hyp_early_alloc_init(void *virt, unsigned long size);
+unsigned long hyp_early_alloc_nr_used_pages(void);
+void *hyp_early_alloc_page(void *arg);
+void *hyp_early_alloc_contig(unsigned int nr_pages);
+
+extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+
+#endif /* __KVM_HYP_EARLY_ALLOC_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
new file mode 100644 (file)
index 0000000..18a4494
--- /dev/null
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_GFP_H
+#define __KVM_HYP_GFP_H
+
+#include <linux/list.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_NO_ORDER   UINT_MAX
+
+struct hyp_pool {
+       /*
+        * Spinlock protecting concurrent changes to the memory pool as well as
+        * the struct hyp_page of the pool's pages until we have a proper atomic
+        * API at EL2.
+        */
+       hyp_spinlock_t lock;
+       struct list_head free_area[MAX_ORDER];
+       phys_addr_t range_start;
+       phys_addr_t range_end;
+       unsigned int max_order;
+};
+
+/* Take one more reference on @p while holding the owning pool's lock. */
+static inline void hyp_page_ref_inc(struct hyp_page *p)
+{
+       hyp_spinlock_t *lock = &hyp_page_to_pool(p)->lock;
+
+       hyp_spin_lock(lock);
+       ++p->refcount;
+       hyp_spin_unlock(lock);
+}
+
+/*
+ * Drop one reference on @p while holding the owning pool's lock.
+ *
+ * Returns non-zero when the count reached zero. Dropping a reference on a
+ * page whose count is already zero is a bug: the unsigned counter would
+ * silently wrap around, so BUG() instead, with the lock released first as
+ * hyp_set_page_refcounted() does.
+ */
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+       int ret;
+
+       hyp_spin_lock(&pool->lock);
+       if (!p->refcount) {
+               hyp_spin_unlock(&pool->lock);
+               BUG();
+       }
+       p->refcount--;
+       ret = (p->refcount == 0);
+       hyp_spin_unlock(&pool->lock);
+
+       return ret;
+}
+
+/*
+ * Give @p its first reference. The page must not be referenced yet; if it
+ * is, the count is left alone and BUG() fires after the lock is released.
+ */
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+       hyp_spinlock_t *lock = &hyp_page_to_pool(p)->lock;
+       bool already_referenced;
+
+       hyp_spin_lock(lock);
+       already_referenced = p->refcount != 0;
+       if (!already_referenced)
+               p->refcount = 1;
+       hyp_spin_unlock(lock);
+
+       if (already_referenced)
+               BUG();
+}
+
+/* Allocation */
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
+void hyp_get_page(void *addr);
+void hyp_put_page(void *addr);
+
+/* Used pages cannot be freed */
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+                 unsigned int reserved_pages);
+#endif /* __KVM_HYP_GFP_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
new file mode 100644 (file)
index 0000000..42d81ec
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#ifndef __KVM_NVHE_MEM_PROTECT__
+#define __KVM_NVHE_MEM_PROTECT__
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/virt.h>
+#include <nvhe/spinlock.h>
+
+struct host_kvm {
+       struct kvm_arch arch;
+       struct kvm_pgtable pgt;
+       struct kvm_pgtable_mm_ops mm_ops;
+       hyp_spinlock_t lock;
+};
+extern struct host_kvm host_kvm;
+
+int __pkvm_prot_finalize(void);
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
+
+/*
+ * Install the host's stage 2 configuration once protected mode has been
+ * initialized; until then write 0 to VTTBR_EL2.
+ */
+static __always_inline void __load_host_stage2(void)
+{
+       if (!static_branch_likely(&kvm_protected_mode_initialized)) {
+               write_sysreg(0, vttbr_el2);
+               return;
+       }
+
+       __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+}
+#endif /* __KVM_NVHE_MEM_PROTECT__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
new file mode 100644 (file)
index 0000000..fd78bde
--- /dev/null
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MEMORY_H
+#define __KVM_HYP_MEMORY_H
+
+#include <asm/kvm_mmu.h>
+#include <asm/page.h>
+
+#include <linux/types.h>
+
+struct hyp_pool;
+/* Per-page metadata; entries live in hyp_vmemmap and are indexed by pfn. */
+struct hyp_page {
+       unsigned int refcount;  /* value returned by hyp_page_count() */
+       unsigned int order;
+       struct hyp_pool *pool;  /* value returned by hyp_page_to_pool() */
+       struct list_head node;
+};
+
+extern u64 __hyp_vmemmap;
+#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap)
+
+#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset))
+
+/* Physical address to hyp virtual address, using __hyp_va(). */
+static inline void *hyp_phys_to_virt(phys_addr_t phys)
+{
+       return __hyp_va(phys);
+}
+
+/* Hyp virtual address to physical address, using __hyp_pa(). */
+static inline phys_addr_t hyp_virt_to_phys(void *addr)
+{
+       return __hyp_pa(addr);
+}
+
+/*
+ * Conversions between physical addresses, page frame numbers, hyp virtual
+ * addresses and hyp_vmemmap entries.
+ */
+#define hyp_phys_to_pfn(phys)  ((phys) >> PAGE_SHIFT)
+/* Widen to phys_addr_t before shifting so that a narrow pfn cannot overflow */
+#define hyp_pfn_to_phys(pfn)   ((phys_addr_t)(pfn) << PAGE_SHIFT)
+#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)])
+#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt))
+#define hyp_virt_to_pfn(virt)  hyp_phys_to_pfn(__hyp_pa(virt))
+
+#define hyp_page_to_pfn(page)  ((struct hyp_page *)(page) - hyp_vmemmap)
+#define hyp_page_to_phys(page)  hyp_pfn_to_phys((hyp_page_to_pfn(page)))
+#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
+#define hyp_page_to_pool(page) (((struct hyp_page *)(page))->pool)
+
+/* Current reference count of the page backing hyp virtual address @addr. */
+static inline int hyp_page_count(void *addr)
+{
+       return hyp_virt_to_page(addr)->refcount;
+}
+
+#endif /* __KVM_HYP_MEMORY_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
new file mode 100644 (file)
index 0000000..0095f62
--- /dev/null
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MM_H
+#define __KVM_HYP_MM_H
+
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+#include <linux/memblock.h>
+#include <linux/types.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_MEMBLOCK_REGIONS 128
+extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
+extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
+extern struct kvm_pgtable pkvm_pgtable;
+extern hyp_spinlock_t pkvm_pgd_lock;
+extern struct hyp_pool hpool;
+extern u64 __io_map_base;
+
+int hyp_create_idmap(u32 hyp_va_bits);
+int hyp_map_vectors(void);
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                          unsigned long phys, enum kvm_pgtable_prot prot);
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                           enum kvm_pgtable_prot prot);
+
+/*
+ * Compute the page-aligned virtual range of the hyp_vmemmap entries that
+ * describe @size bytes of memory starting at physical address @phys.
+ * *start is rounded down and *end rounded up to a page boundary.
+ */
+static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
+                                    unsigned long *start, unsigned long *end)
+{
+       /* Unaligned bounds of the struct hyp_page array slice first... */
+       *start = (unsigned long)hyp_phys_to_page(phys);
+       *end = *start + (size >> PAGE_SHIFT) * sizeof(struct hyp_page);
+
+       /* ...then widen both ends to whole pages */
+       *start = ALIGN_DOWN(*start, PAGE_SIZE);
+       *end = ALIGN(*end, PAGE_SIZE);
+}
+
+/*
+ * Worst-case number of page-table pages needed to map @nr_pages pages: one
+ * DIV_ROUND_UP(..., PTRS_PER_PTE) step per level, summed over
+ * KVM_PGTABLE_MAX_LEVELS levels.
+ */
+static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
+{
+       unsigned long levels_left = KVM_PGTABLE_MAX_LEVELS;
+       unsigned long sum = 0;
+
+       while (levels_left--) {
+               nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
+               sum += nr_pages;
+       }
+
+       return sum;
+}
+
+/*
+ * Sum of the worst-case page-table page counts of every region recorded in
+ * hyp_memory, each region mapped with page granularity.
+ */
+static inline unsigned long __hyp_pgtable_total_pages(void)
+{
+       struct memblock_region *reg = kvm_nvhe_sym(hyp_memory);
+       struct memblock_region *stop = reg + kvm_nvhe_sym(hyp_memblock_nr);
+       unsigned long pages = 0;
+
+       for (; reg < stop; reg++)
+               pages += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT);
+
+       return pages;
+}
+
+/*
+ * Page-table pages to provision for the hyp stage 1: enough to cover all of
+ * hyp_memory, plus an allowance of 1 GiB for private mappings.
+ */
+static inline unsigned long hyp_s1_pgtable_pages(void)
+{
+       return __hyp_pgtable_total_pages() +
+              __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+}
+
+/* Page-table pages to provision for mapping memory in the host stage 2. */
+static inline unsigned long host_s2_mem_pgtable_pages(void)
+{
+       unsigned long pages = __hyp_pgtable_total_pages();
+
+       /*
+        * 16 extra pages safely upper-bound the worst case of concatenated
+        * pgds.
+        */
+       pages += 16;
+
+       return pages;
+}
+
+/* Page-table pages to provision for MMIO mappings in the host stage 2. */
+static inline unsigned long host_s2_dev_pgtable_pages(void)
+{
+       /* The allowance covers 1 GiB worth of mappings */
+       unsigned long mmio_pages = SZ_1G >> PAGE_SHIFT;
+
+       return __hyp_pgtable_max_pages(mmio_pages);
+}
+
+#endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
new file mode 100644 (file)
index 0000000..76b537f
--- /dev/null
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * A stand-alone ticket spinlock implementation for use by the non-VHE
+ * KVM hypervisor code running at EL2.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ *
+ * Heavily based on the implementation removed by c11090474d70 which was:
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__
+#define __ARM64_KVM_NVHE_SPINLOCK_H__
+
+#include <asm/alternative.h>
+#include <asm/lse.h>
+
+/*
+ * Ticket lock state. The 32-bit __val overlays two 16-bit tickets. The field
+ * order is swapped on big-endian builds (__AARCH64EB__) so that 'next' always
+ * sits in the upper half of __val, which is the half hyp_spin_lock()
+ * increments with "add ..., #(1 << 16)".
+ */
+typedef union hyp_spinlock {
+       u32     __val;
+       struct {
+#ifdef __AARCH64EB__
+               u16 next, owner;
+#else
+               u16 owner, next;
+#endif
+       };
+} hyp_spinlock_t;
+
+/* Initialise the lock pointed to by @l by zeroing it (owner == next == 0). */
+#define hyp_spin_lock_init(l)                                          \
+do {                                                                   \
+       *(l) = (hyp_spinlock_t){ .__val = 0 };                          \
+} while (0)
+
+/*
+ * Acquire @lock: atomically take a ticket by incrementing 'next' (LL/SC or
+ * LSE variant, selected by ARM64_LSE_ATOMIC_INSN), then, if the ticket is not
+ * the current 'owner', wait with WFE until 'owner' matches the ticket taken.
+ */
+static inline void hyp_spin_lock(hyp_spinlock_t *lock)
+{
+       u32 tmp;
+       hyp_spinlock_t lockval, newval;
+
+       asm volatile(
+       /* Atomically increment the next ticket. */
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+"      prfm    pstl1strm, %3\n"
+"1:    ldaxr   %w0, %3\n"
+"      add     %w1, %w0, #(1 << 16)\n"
+"      stxr    %w2, %w1, %3\n"
+"      cbnz    %w2, 1b\n",
+       /* LSE atomics */
+"      mov     %w2, #(1 << 16)\n"
+"      ldadda  %w2, %w0, %3\n"
+       __nops(3))
+
+       /* Did we get the lock? */
+"      eor     %w1, %w0, %w0, ror #16\n"
+"      cbz     %w1, 3f\n"
+       /*
+        * No: spin on the owner. Send a local event to avoid missing an
+        * unlock before the exclusive load.
+        */
+"      sevl\n"
+"2:    wfe\n"
+"      ldaxrh  %w2, %4\n"
+"      eor     %w1, %w2, %w0, lsr #16\n"
+"      cbnz    %w1, 2b\n"
+       /* We got the lock. Critical section starts here. */
+"3:"
+       : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock)
+       : "Q" (lock->owner)
+       : "memory");
+}
+
+/*
+ * Release @lock by incrementing its 16-bit 'owner' ticket, using either a
+ * load/add/store-release sequence (LL/SC variant) or a single atomic add
+ * (LSE variant).
+ */
+static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
+{
+       u64 tmp;
+
+       asm volatile(
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+       "       ldrh    %w1, %0\n"
+       "       add     %w1, %w1, #1\n"
+       "       stlrh   %w1, %0",
+       /* LSE atomics */
+       "       mov     %w1, #1\n"
+       "       staddlh %w1, %0\n"
+       __nops(1))
+       : "=Q" (lock->owner), "=&r" (tmp)
+       :
+       : "memory");
+}
+
+#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
index fb24a0f..5df6193 100644 (file)
@@ -9,10 +9,15 @@ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
 hostprogs := gen-hyprel
 HOST_EXTRACFLAGS += -I$(objtree)/include
 
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+
 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
-        hyp-main.o hyp-smp.o psci-relay.o
+        hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
+        cache.o setup.o mm.o mem_protect.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
-        ../fpsimd.o ../hyp-entry.o ../exception.o
+        ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+obj-y += $(lib-objs)
 
 ##
 ## Build rules for compiling nVHE hyp code
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
new file mode 100644 (file)
index 0000000..36cef69
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Code copied from arch/arm64/mm/cache.S.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/alternative.h>
+
+/*
+ * Apply the "civac" data cache operation, line by line, over a region.
+ * NOTE(review): x0/x1 are presumably the start address and size expected by
+ * dcache_by_line_op, with x2/x3 used as scratch -- confirm against the macro
+ * definition in <asm/assembler.h>.
+ */
+SYM_FUNC_START_PI(__flush_dcache_area)
+       dcache_by_line_op civac, sy, x0, x1, x2, x3
+       ret
+SYM_FUNC_END_PI(__flush_dcache_area)
index f401724..7d3f258 100644 (file)
@@ -21,17 +21,11 @@ static void __debug_save_spe(u64 *pmscr_el1)
        /* Clear pmscr in case of early return */
        *pmscr_el1 = 0;
 
-       /* SPE present on this CPU? */
-       if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
-                                                 ID_AA64DFR0_PMSVER_SHIFT))
-               return;
-
-       /* Yes; is it owned by EL3? */
-       reg = read_sysreg_s(SYS_PMBIDR_EL1);
-       if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
-               return;
-
-       /* No; is the host actually using the thing? */
+       /*
+        * At this point, we know that this CPU implements
+        * SPE and that it is available to the host.
+        * Check whether the host is actually using it.
+        */
        reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
        if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))
                return;
@@ -58,10 +52,43 @@ static void __debug_restore_spe(u64 pmscr_el1)
        write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
 }
 
+/*
+ * If the TRBE is enabled, save the host's TRFCR_EL1 into *@trfcr_el1, clear
+ * the register and drain the trace buffer. Otherwise *@trfcr_el1 is left at
+ * 0 and nothing else is touched.
+ */
+static void __debug_save_trace(u64 *trfcr_el1)
+{
+       *trfcr_el1 = 0;
+
+       /* Check if the TRBE is enabled */
+       if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE))
+               return;
+       /*
+        * Prohibit trace generation while we are in guest.
+        * Since access to TRFCR_EL1 is trapped, the guest can't
+        * modify the filtering set by the host.
+        */
+       *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1);
+       write_sysreg_s(0, SYS_TRFCR_EL1);
+       isb();
+       /* Drain the trace buffer to memory */
+       tsb_csync();
+       dsb(nsh);
+}
+
+/*
+ * Write @trfcr_el1 back to TRFCR_EL1. A zero value is treated as "nothing
+ * to restore" and leaves the register untouched.
+ */
+static void __debug_restore_trace(u64 trfcr_el1)
+{
+       if (trfcr_el1) {
+               /* Restore trace filter controls */
+               write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1);
+       }
+}
+
 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
        /* Disable and flush SPE data generation */
-       __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+               __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+       /* Disable and flush Self-Hosted Trace generation */
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+               __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
 }
 
 void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -71,7 +98,10 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
 
 void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
-       __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+               __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+               __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
 }
 
 void __debug_switch_to_host(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
new file mode 100644 (file)
index 0000000..1306c43
--- /dev/null
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/memory.h>
+
+struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+s64 __ro_after_init hyp_physvirt_offset;
+
+static unsigned long base;
+static unsigned long end;
+static unsigned long cur;
+
+/* Number of pages handed out so far by the early allocator. */
+unsigned long hyp_early_alloc_nr_used_pages(void)
+{
+       unsigned long used_bytes = cur - base;
+
+       return used_bytes >> PAGE_SHIFT;
+}
+
+/*
+ * Allocate @nr_pages contiguous, zeroed pages by bumping the early
+ * allocator's cursor.
+ *
+ * Returns the start of the allocation, or NULL if @nr_pages is 0 or if fewer
+ * than @nr_pages pages remain before 'end'.
+ */
+void *hyp_early_alloc_contig(unsigned int nr_pages)
+{
+       /*
+        * Widen before shifting: shifting the 32-bit nr_pages directly could
+        * wrap and yield a size smaller than what the caller asked for.
+        */
+       unsigned long size = (unsigned long)nr_pages << PAGE_SHIFT;
+       void *ret = (void *)cur;
+
+       if (!nr_pages)
+               return NULL;
+
+       if (end - cur < size)
+               return NULL;
+
+       cur += size;
+       memset(ret, 0, size);
+
+       return ret;
+}
+
+/*
+ * Single-page wrapper around hyp_early_alloc_contig(), installed as the
+ * zalloc_page callback of hyp_early_alloc_mm_ops. @arg is ignored.
+ */
+void *hyp_early_alloc_page(void *arg)
+{
+       return hyp_early_alloc_contig(1);
+}
+
+/*
+ * Set up the early allocator to hand out memory from [@virt, @virt + @size)
+ * and populate hyp_early_alloc_mm_ops with the matching callbacks.
+ */
+void hyp_early_alloc_init(void *virt, unsigned long size)
+{
+       struct kvm_pgtable_mm_ops *ops = &hyp_early_alloc_mm_ops;
+
+       /* Nothing allocated yet: the cursor starts at the base */
+       base = (unsigned long)virt;
+       cur = base;
+       end = base + size;
+
+       ops->zalloc_page = hyp_early_alloc_page;
+       ops->phys_to_virt = hyp_phys_to_virt;
+       ops->virt_to_phys = hyp_virt_to_phys;
+}
index ead02c6..6bc88a7 100644 (file)
 #ifndef R_AARCH64_ABS64
 #define R_AARCH64_ABS64                        257
 #endif
+#ifndef R_AARCH64_PREL64
+#define R_AARCH64_PREL64               260
+#endif
+#ifndef R_AARCH64_PREL32
+#define R_AARCH64_PREL32               261
+#endif
+#ifndef R_AARCH64_PREL16
+#define R_AARCH64_PREL16               262
+#endif
+#ifndef R_AARCH64_PLT32
+#define R_AARCH64_PLT32                        314
+#endif
 #ifndef R_AARCH64_LD_PREL_LO19
 #define R_AARCH64_LD_PREL_LO19         273
 #endif
@@ -371,6 +383,12 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
                case R_AARCH64_ABS64:
                        emit_rela_abs64(rela, sh_orig_name);
                        break;
+               /* Allow position-relative data relocations. */
+               case R_AARCH64_PREL64:
+               case R_AARCH64_PREL32:
+               case R_AARCH64_PREL16:
+               case R_AARCH64_PLT32:
+                       break;
                /* Allow relocations to generate PC-relative addressing. */
                case R_AARCH64_LD_PREL_LO19:
                case R_AARCH64_ADR_PREL_LO21:
index 5d94584..2b23400 100644 (file)
@@ -79,22 +79,18 @@ SYM_FUNC_START(__hyp_do_panic)
        mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
                      PSR_MODE_EL1h)
        msr     spsr_el2, lr
-       ldr     lr, =panic
+       ldr     lr, =nvhe_hyp_panic_handler
        hyp_kimg_va lr, x6
        msr     elr_el2, lr
 
        mov     x29, x0
 
-       /* Load the format string into x0 and arguments into x1-7 */
-       ldr     x0, =__hyp_panic_string
-       hyp_kimg_va x0, x6
-
-       /* Load the format arguments into x1-7. */
-       mov     x6, x3
-       get_vcpu_ptr x7, x3
-       mrs     x3, esr_el2
-       mrs     x4, far_el2
-       mrs     x5, hpfar_el2
+       /* Load the panic arguments into x0-7 */
+       mrs     x0, esr_el2
+       get_vcpu_ptr x4, x5
+       mrs     x5, far_el2
+       mrs     x6, hpfar_el2
+       mov     x7, xzr                 // Unused argument
 
        /* Enter the host, conditionally restoring the host context. */
        cbz     x29, __host_enter_without_restoring
index c631e29..c953fb4 100644 (file)
@@ -83,11 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
  * x0: struct kvm_nvhe_init_params PA
  */
 SYM_CODE_START_LOCAL(___kvm_hyp_init)
-alternative_if ARM64_KVM_PROTECTED_MODE
-       mov_q   x1, HCR_HOST_NVHE_PROTECTED_FLAGS
-       msr     hcr_el2, x1
-alternative_else_nop_endif
-
        ldr     x1, [x0, #NVHE_INIT_TPIDR_EL2]
        msr     tpidr_el2, x1
 
@@ -97,6 +92,15 @@ alternative_else_nop_endif
        ldr     x1, [x0, #NVHE_INIT_MAIR_EL2]
        msr     mair_el2, x1
 
+       ldr     x1, [x0, #NVHE_INIT_HCR_EL2]
+       msr     hcr_el2, x1
+
+       ldr     x1, [x0, #NVHE_INIT_VTTBR]
+       msr     vttbr_el2, x1
+
+       ldr     x1, [x0, #NVHE_INIT_VTCR]
+       msr     vtcr_el2, x1
+
        ldr     x1, [x0, #NVHE_INIT_PGD_PA]
        phys_to_ttbr x2, x1
 alternative_if ARM64_HAS_CNP
@@ -115,15 +119,10 @@ alternative_else_nop_endif
 
        /* Invalidate the stale TLBs from Bootloader */
        tlbi    alle2
+       tlbi    vmalls12e1
        dsb     sy
 
-       /*
-        * Preserve all the RES1 bits while setting the default flags,
-        * as well as the EE bit on BE. Drop the A flag since the compiler
-        * is allowed to generate unaligned accesses.
-        */
-       mov_q   x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE(        orr     x0, x0, #SCTLR_ELx_EE)
+       mov_q   x0, INIT_SCTLR_EL2_MMU_ON
 alternative_if ARM64_HAS_ADDRESS_AUTH
        mov_q   x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
                     SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
@@ -221,9 +220,7 @@ SYM_CODE_START(__kvm_handle_stub_hvc)
        mov     x0, xzr
 reset:
        /* Reset kvm back to the hyp stub. */
-       mrs     x5, sctlr_el2
-       mov_q   x6, SCTLR_ELx_FLAGS
-       bic     x5, x5, x6              // Clear SCTL_M and etc
+       mov_q   x5, INIT_SCTLR_EL2_MMU_OFF
        pre_disable_mmu_workaround
        msr     sctlr_el2, x5
        isb
@@ -244,4 +241,31 @@ alternative_else_nop_endif
 
 SYM_CODE_END(__kvm_handle_stub_hvc)
 
+/*
+ * Switch EL2 over to a new set of page-tables and a new stack, with the MMU
+ * turned off for the duration of the switch.
+ *
+ * x0: base address from which the new pgd PA (NVHE_INIT_PGD_PA) and the new
+ *     stack VA (NVHE_INIT_STACK_HYP_VA) are loaded
+ * x1: address branched to once the MMU is back on ("ret x1")
+ */
+SYM_FUNC_START(__pkvm_init_switch_pgd)
+       /* Turn the MMU off */
+       pre_disable_mmu_workaround
+       mrs     x2, sctlr_el2
+       bic     x3, x2, #SCTLR_ELx_M
+       msr     sctlr_el2, x3
+       isb
+
+       tlbi    alle2
+
+       /* Install the new pgtables */
+       ldr     x3, [x0, #NVHE_INIT_PGD_PA]
+       phys_to_ttbr x4, x3
+alternative_if ARM64_HAS_CNP
+       orr     x4, x4, #TTBR_CNP_BIT
+alternative_else_nop_endif
+       msr     ttbr0_el2, x4
+
+       /* Set the new stack pointer */
+       ldr     x0, [x0, #NVHE_INIT_STACK_HYP_VA]
+       mov     sp, x0
+
+       /* And turn the MMU back on! */
+       set_sctlr_el2   x2
+       ret     x1
+SYM_FUNC_END(__pkvm_init_switch_pgd)
+
        .popsection
index 9363282..f36420a 100644 (file)
@@ -6,12 +6,15 @@
 
 #include <hyp/switch.h>
 
+#include <asm/pgtable-types.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
 #include <nvhe/trap_handler.h>
 
 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -106,6 +109,61 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
        __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
 }
 
+/*
+ * Hypercall handler for __pkvm_init(): unpack the five arguments from host
+ * context registers 1-5 and store the return value in register 1.
+ */
+static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+       DECLARE_REG(unsigned long, size, host_ctxt, 2);
+       DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
+       DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
+       DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+
+       /*
+        * __pkvm_init() will return only if an error occurred, otherwise it
+        * will tail-call in __pkvm_init_finalise() which will have to deal
+        * with the host context directly.
+        */
+       cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
+                                           hyp_va_bits);
+}
+
+/*
+ * Hypercall handler for pkvm_cpu_set_vector(): the vector slot comes from
+ * host context register 1, which also receives the return value.
+ */
+static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1);
+
+       cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
+}
+
+/*
+ * Hypercall handler for __pkvm_create_mappings(): start, size, phys and prot
+ * come from host context registers 1-4; the return value goes to register 1.
+ */
+static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(unsigned long, start, host_ctxt, 1);
+       DECLARE_REG(unsigned long, size, host_ctxt, 2);
+       DECLARE_REG(unsigned long, phys, host_ctxt, 3);
+       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+}
+
+/*
+ * Hypercall handler for __pkvm_create_private_mapping(): phys, size and prot
+ * come from host context registers 1-3; the returned address (or encoded
+ * error) goes to register 1.
+ */
+static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+       DECLARE_REG(size_t, size, host_ctxt, 2);
+       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
+}
+
+/*
+ * Hypercall handler for __pkvm_prot_finalize(): takes no arguments and
+ * stores the return value in host context register 1.
+ */
+static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
+{
+       cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
+}
+
+/*
+ * Hypercall handler for __pkvm_mark_hyp(): start and end come from host
+ * context registers 1-2; the return value goes to register 1.
+ */
+static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
+       DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
+}
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -125,6 +183,12 @@ static const hcall_t host_hcall[] = {
        HANDLE_FUNC(__kvm_get_mdcr_el2),
        HANDLE_FUNC(__vgic_v3_save_aprs),
        HANDLE_FUNC(__vgic_v3_restore_aprs),
+       HANDLE_FUNC(__pkvm_init),
+       HANDLE_FUNC(__pkvm_cpu_set_vector),
+       HANDLE_FUNC(__pkvm_create_mappings),
+       HANDLE_FUNC(__pkvm_create_private_mapping),
+       HANDLE_FUNC(__pkvm_prot_finalize),
+       HANDLE_FUNC(__pkvm_mark_hyp),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -177,7 +241,16 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
        case ESR_ELx_EC_SMC64:
                handle_host_smc(host_ctxt);
                break;
+       case ESR_ELx_EC_SVE:
+               sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+               isb();
+               sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
+               break;
+       case ESR_ELx_EC_IABT_LOW:
+       case ESR_ELx_EC_DABT_LOW:
+               handle_host_mem_abort(host_ctxt);
+               break;
        default:
-               hyp_panic();
+               BUG();
        }
 }
index 8795590..9f54833 100644 (file)
@@ -18,8 +18,7 @@ u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID
 
 u64 cpu_logical_map(unsigned int cpu)
 {
-       if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map))
-               hyp_panic();
+       BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map));
 
        return hyp_cpu_logical_map[cpu];
 }
@@ -30,8 +29,7 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
        unsigned long this_cpu_base;
        unsigned long elf_base;
 
-       if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
-               hyp_panic();
+       BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base));
 
        cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
        this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
index cd119d8..f4562f4 100644 (file)
@@ -25,4 +25,5 @@ SECTIONS {
        BEGIN_HYP_SECTION(.data..percpu)
                PERCPU_INPUT(L1_CACHE_BYTES)
        END_HYP_SECTION
+       HYP_SECTION(.bss)
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
new file mode 100644 (file)
index 0000000..e342f7f
--- /dev/null
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/stage2_pgtable.h>
+
+#include <hyp/switch.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+
+#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
+
+extern unsigned long hyp_nr_cpus;
+struct host_kvm host_kvm;
+
+struct hyp_pool host_s2_mem;
+struct hyp_pool host_s2_dev;
+
+/*
+ * Copies of the host's CPU features registers holding sanitized values.
+ */
+u64 id_aa64mmfr0_el1_sys_val;
+u64 id_aa64mmfr1_el1_sys_val;
+
+static const u8 pkvm_hyp_id = 1;
+
+/*
+ * zalloc_pages_exact callback for the host stage-2: allocate from
+ * host_s2_mem at the page order covering @size bytes.
+ */
+static void *host_s2_zalloc_pages_exact(size_t size)
+{
+       int order = get_order(size);
+
+       return hyp_alloc_pages(&host_s2_mem, order);
+}
+
+/*
+ * zalloc_page callback for the host stage-2: allocate one order-0 page from
+ * @pool, which is handed to hyp_alloc_pages() as the pool to allocate from.
+ */
+static void *host_s2_zalloc_page(void *pool)
+{
+       return hyp_alloc_pages(pool, 0);
+}
+
+/*
+ * Initialise the two host stage-2 page-table pools (host_s2_mem backed by
+ * @mem_pgt_pool, host_s2_dev backed by @dev_pgt_pool) and install the
+ * host_kvm memory-management callbacks.
+ *
+ * Returns 0 on success, or the error from hyp_pool_init().
+ */
+static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+       unsigned long first_pfn, pool_pages;
+       int err;
+
+       /* Pool used for mappings of memory */
+       first_pfn = hyp_virt_to_pfn(mem_pgt_pool);
+       pool_pages = host_s2_mem_pgtable_pages();
+       err = hyp_pool_init(&host_s2_mem, first_pfn, pool_pages, 0);
+       if (err)
+               return err;
+
+       /* Pool used for everything else */
+       first_pfn = hyp_virt_to_pfn(dev_pgt_pool);
+       pool_pages = host_s2_dev_pgtable_pages();
+       err = hyp_pool_init(&host_s2_dev, first_pfn, pool_pages, 0);
+       if (err)
+               return err;
+
+       host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
+               .zalloc_pages_exact = host_s2_zalloc_pages_exact,
+               .zalloc_page = host_s2_zalloc_page,
+               .phys_to_virt = hyp_phys_to_virt,
+               .virt_to_phys = hyp_virt_to_phys,
+               .page_count = hyp_page_count,
+               .get_page = hyp_get_page,
+               .put_page = hyp_put_page,
+       };
+
+       return 0;
+}
+
+/*
+ * Compute host_kvm.arch.vtcr from the sanitized copies of ID_AA64MMFR0_EL1
+ * and ID_AA64MMFR1_EL1.
+ */
+static void prepare_host_vtcr(void)
+{
+       /* The host stage 2 is id-mapped, so use parange for T0SZ */
+       u32 parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
+       u32 phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+       host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+                                         id_aa64mmfr1_el1_sys_val,
+                                         phys_shift);
+}
+
+/*
+ * Set up the host stage-2 state held in host_kvm: VTCR, lock, page-table
+ * pools, the page-table itself and the kvm_s2_mmu descriptor.
+ *
+ * Returns 0 on success, or the first error reported by prepare_s2_pools() or
+ * kvm_pgtable_stage2_init_flags().
+ */
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+       struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+       int ret;
+
+       prepare_host_vtcr();
+       hyp_spin_lock_init(&host_kvm.lock);
+
+       ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+       if (ret)
+               return ret;
+
+       ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
+                                           &host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
+       if (ret)
+               return ret;
+
+       /* Describe the freshly created page-table; the host uses VMID 0 */
+       mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
+       mmu->arch = &host_kvm.arch;
+       mmu->pgt = &host_kvm.pgt;
+       mmu->vmid.vmid_gen = 0;
+       mmu->vmid.vmid = 0;
+
+       return 0;
+}
+
+/*
+ * Record the host stage-2 configuration (VTTBR, VTCR and HCR_EL2 with HCR_VM
+ * set) in this CPU's kvm_init_params, then program it on the current CPU and
+ * invalidate the TLBs. Always returns 0.
+ */
+int __pkvm_prot_finalize(void)
+{
+       struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+       struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+       params->vttbr = kvm_get_vttbr(mmu);
+       params->vtcr = host_kvm.arch.vtcr;
+       params->hcr_el2 |= HCR_VM;
+       /*
+        * NOTE(review): presumably flushed because the params are later read
+        * with the MMU off -- confirm against the init path.
+        */
+       kvm_flush_dcache_to_poc(params, sizeof(*params));
+
+       write_sysreg(params->hcr_el2, hcr_el2);
+       __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+
+       /*
+        * Make sure to have an ISB before the TLB maintenance below but only
+        * when __load_stage2() doesn't include one already.
+        */
+       asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+       /* Invalidate stale HCR bits that may be cached in TLBs */
+       __tlbi(vmalls12e1);
+       dsb(nsh);
+       isb();
+
+       return 0;
+}
+
+/*
+ * Unmap from the host stage-2 every address range that is not covered by a
+ * hyp_memory region, i.e. the gaps before, between and after the regions.
+ *
+ * Returns 0 on success, or the first error from kvm_pgtable_stage2_unmap().
+ */
+static int host_stage2_unmap_dev_all(void)
+{
+       struct kvm_pgtable *pgt = &host_kvm.pgt;
+       struct memblock_region *reg;
+       u64 addr = 0;
+       int i, ret;
+
+       /* Unmap all non-memory regions to recycle the pages */
+       /*
+        * 'addr' trails the loop: it is the end of the previous region (0 at
+        * first), so each iteration unmaps the gap [addr, reg->base).
+        */
+       for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
+               reg = &hyp_memory[i];
+               ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
+               if (ret)
+                       return ret;
+       }
+       /* Finally unmap from the end of the last region up to the IA limit */
+       return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
+}
+
+/*
+ * Look up @addr in the sorted hyp_memory array.
+ *
+ * Returns true if @addr falls inside a region, in which case @range is set
+ * to that region's [base, base + size). Otherwise returns false and @range
+ * is set to the gap around @addr: from the end of the closest region below
+ * it (or 0) up to the base of the closest region above it (or ULONG_MAX).
+ */
+static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+       int cur, left = 0, right = hyp_memblock_nr;
+       struct memblock_region *reg;
+       phys_addr_t end;
+
+       range->start = 0;
+       range->end = ULONG_MAX;
+
+       /* The list of memblock regions is sorted, binary search it */
+       while (left < right) {
+               cur = (left + right) >> 1;
+               reg = &hyp_memory[cur];
+               end = reg->base + reg->size;
+               if (addr < reg->base) {
+                       /* Region is above addr: it bounds the gap from above */
+                       right = cur;
+                       range->end = reg->base;
+               } else if (addr >= end) {
+                       /* Region is below addr: it bounds the gap from below */
+                       left = cur + 1;
+                       range->start = end;
+               } else {
+                       range->start = reg->base;
+                       range->end = end;
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Check whether the range [@start, @end) lies within a single hyp_memory
+ * region.
+ *
+ * @end is exclusive, so the last address of the range is @end - 1. Looking
+ * up @end itself would wrongly reject a range that finishes exactly at the
+ * end of a region.
+ */
+static bool range_is_memory(u64 start, u64 end)
+{
+       struct kvm_mem_range r1, r2;
+
+       if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2))
+               return false;
+       /* Both endpoints must belong to the same region */
+       if (r1.start != r2.start)
+               return false;
+
+       return true;
+}
+
+/*
+ * Identity-map [@start, @end) in the host stage-2 with protection @prot,
+ * passing @pool to kvm_pgtable_stage2_map().
+ */
+static inline int __host_stage2_idmap(u64 start, u64 end,
+                                     enum kvm_pgtable_prot prot,
+                                     struct hyp_pool *pool)
+{
+       u64 size = end - start;
+
+       /* Identity map: the output address is the input address */
+       return kvm_pgtable_stage2_map(&host_kvm.pgt, start, size, start,
+                                     prot, pool);
+}
+
+/*
+ * Identity-map a range containing @addr in the host stage-2, under
+ * host_kvm.lock. Memory is mapped RWX from host_s2_mem; anything else is
+ * mapped RW from host_s2_dev, with one retry after recycling the non-memory
+ * mappings if that pool runs out.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int host_stage2_idmap(u64 addr)
+{
+       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
+       struct kvm_mem_range range;
+       bool is_memory = find_mem_range(addr, &range);
+       struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
+       int ret;
+
+       if (is_memory)
+               prot |= KVM_PGTABLE_PROT_X;
+
+       hyp_spin_lock(&host_kvm.lock);
+       ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+       if (ret)
+               goto unlock;
+
+       ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+       /* Only a non-memory mapping that failed with -ENOMEM is retried */
+       if (is_memory || ret != -ENOMEM)
+               goto unlock;
+
+       /*
+        * host_s2_mem has been provided with enough pages to cover all of
+        * memory with page granularity, so we should never hit the ENOMEM case.
+        * However, it is difficult to know how much of the MMIO range we will
+        * need to cover upfront, so we may need to 'recycle' the pages if we
+        * run out.
+        */
+       ret = host_stage2_unmap_dev_all();
+       if (ret)
+               goto unlock;
+
+       ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+
+unlock:
+       hyp_spin_unlock(&host_kvm.lock);
+
+       return ret;
+}
+
+/*
+ * Set the owner of [@start, @end) to pkvm_hyp_id in the host stage-2.
+ *
+ * Returns -EINVAL if range_is_memory() rejects the range, 0 on success (an
+ * -EAGAIN from the page-table code is also reported as 0), or the error
+ * returned by kvm_pgtable_stage2_set_owner().
+ */
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+       int err;
+
+       /*
+        * host_stage2_unmap_dev_all() currently relies on MMIO mappings being
+        * non-persistent, so don't allow changing page ownership in MMIO range.
+        */
+       if (!range_is_memory(start, end))
+               return -EINVAL;
+
+       hyp_spin_lock(&host_kvm.lock);
+       err = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
+                                          &host_s2_mem, pkvm_hyp_id);
+       hyp_spin_unlock(&host_kvm.lock);
+
+       if (err == -EAGAIN)
+               return 0;
+
+       return err;
+}
+
+/*
+ * Handle a memory abort taken from the host: identity-map the faulting
+ * address in the host stage-2. Failing to read the fault information, or any
+ * mapping error other than -EAGAIN, is fatal (BUG).
+ */
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+       struct kvm_vcpu_fault_info fault;
+       u64 esr = read_sysreg_el2(SYS_ESR);
+       u64 fault_addr;
+       int err;
+
+       BUG_ON(!__get_fault_info(esr, &fault));
+
+       /* Rebuild the faulting address from the masked HPFAR_EL2 value */
+       fault_addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+
+       err = host_stage2_idmap(fault_addr);
+       BUG_ON(err && err != -EAGAIN);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
new file mode 100644 (file)
index 0000000..a8efdf0
--- /dev/null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+#include <nvhe/spinlock.h>
+
+struct kvm_pgtable pkvm_pgtable;
+hyp_spinlock_t pkvm_pgd_lock;
+u64 __io_map_base;
+
+struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
+unsigned int hyp_memblock_nr;
+
+/*
+ * Map @size bytes at hyp VA @start to physical address @phys with protection
+ * @prot in pkvm_pgtable, holding pkvm_pgd_lock around the update.
+ *
+ * Returns the result of kvm_pgtable_hyp_map().
+ */
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                         unsigned long phys, enum kvm_pgtable_prot prot)
+{
+       int err;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+       err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot);
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return err;
+}
+
+/*
+ * Map @size bytes starting at @phys into the private VA range that grows up
+ * from __io_map_base, with protection @prot.
+ *
+ * Returns the hyp VA corresponding to @phys (the in-page offset is
+ * preserved), or an ERR_PTR()-encoded error cast to unsigned long: -ENOMEM
+ * if the range would run into __hyp_vmemmap, or the error from
+ * kvm_pgtable_hyp_map().
+ */
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                           enum kvm_pgtable_prot prot)
+{
+       unsigned long addr;
+       int err;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+
+       /* Cover whole pages, including the offset of phys within its page */
+       size = PAGE_ALIGN(size + offset_in_page(phys));
+       addr = __io_map_base;
+       __io_map_base += size;
+
+       /* Are we overflowing on the vmemmap ? */
+       if (__io_map_base > __hyp_vmemmap) {
+               __io_map_base -= size;
+               addr = (unsigned long)ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       /*
+        * NOTE(review): unlike the overflow case above, __io_map_base is not
+        * rolled back when the mapping itself fails.
+        */
+       err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot);
+       if (err) {
+               addr = (unsigned long)ERR_PTR(err);
+               goto out;
+       }
+
+       addr = addr + offset_in_page(phys);
+out:
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return addr;
+}
+
+/*
+ * Map the hyp VA range [@from, @to), widened to page boundaries, with
+ * protection @prot. Each page is mapped to the physical address that
+ * hyp_virt_to_phys() gives for it.
+ *
+ * Returns 0 on success, or the first error from __pkvm_create_mappings().
+ */
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+       unsigned long va = (unsigned long)from & PAGE_MASK;
+       unsigned long va_end = PAGE_ALIGN((unsigned long)to);
+       int ret;
+
+       /* One page at a time */
+       while (va < va_end) {
+               phys_addr_t pa = hyp_virt_to_phys((void *)va);
+
+               ret = __pkvm_create_mappings(va, PAGE_SIZE, pa, prot);
+               if (ret)
+                       return ret;
+
+               va += PAGE_SIZE;
+       }
+
+       return 0;
+}
+
+/*
+ * Map the part of the hyp vmemmap that describes the @size bytes of memory
+ * starting at @phys, backing it with memory at physical address @back.
+ *
+ * Returns the result of __pkvm_create_mappings().
+ */
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
+{
+       unsigned long vmemmap_start, vmemmap_end;
+       unsigned long vmemmap_size;
+
+       hyp_vmemmap_range(phys, size, &vmemmap_start, &vmemmap_end);
+       vmemmap_size = vmemmap_end - vmemmap_start;
+
+       return __pkvm_create_mappings(vmemmap_start, vmemmap_size, back,
+                                     PAGE_HYP);
+}
+
+static void *__hyp_bp_vect_base;
+/*
+ * Select this CPU's hyp vector according to @slot and store its address in
+ * the per-CPU kvm_hyp_vector.
+ *
+ * Returns 0 on success, or -EINVAL for an unknown @slot.
+ */
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot)
+{
+       void *base;
+
+       /* Pick the vector base that matches the requested slot */
+       switch (slot) {
+       case HYP_VECTOR_DIRECT:
+               base = __kvm_hyp_vector;
+               break;
+       case HYP_VECTOR_SPECTRE_DIRECT:
+               base = __bp_harden_hyp_vecs;
+               break;
+       case HYP_VECTOR_INDIRECT:
+       case HYP_VECTOR_SPECTRE_INDIRECT:
+               base = (void *)__hyp_bp_vect_base;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       base = __kvm_vector_slot2addr(base, slot);
+       *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)base;
+
+       return 0;
+}
+
+/*
+ * When the CPUs have the ARM64_SPECTRE_V3A capability, create a private
+ * executable mapping of __bp_harden_hyp_vecs and remember its address in
+ * __hyp_bp_vect_base. Otherwise do nothing.
+ *
+ * Returns 0 on success, or a negative error code if the mapping could not be
+ * created.
+ */
+int hyp_map_vectors(void)
+{
+       phys_addr_t phys;
+       void *bp_base;
+
+       if (!cpus_have_const_cap(ARM64_SPECTRE_V3A))
+               return 0;
+
+       phys = __hyp_pa(__bp_harden_hyp_vecs);
+       bp_base = (void *)__pkvm_create_private_mapping(phys,
+                                                       __BP_HARDEN_HYP_VECS_SZ,
+                                                       PAGE_HYP_EXEC);
+       /*
+        * PTR_ERR(NULL) is 0, so a NULL mapping must be turned into a real
+        * error explicitly instead of being reported as success.
+        */
+       if (!bp_base)
+               return -ENOMEM;
+       if (IS_ERR(bp_base))
+               return PTR_ERR(bp_base);
+
+       __hyp_bp_vect_base = bp_base;
+
+       return 0;
+}
+
+/*
+ * Identity-map the page-aligned hyp idmap text as executable and, based on
+ * where it lands, choose the VA bases for private IO mappings
+ * (__io_map_base) and for the vmemmap (__hyp_vmemmap).
+ *
+ * Returns the result of __pkvm_create_mappings().
+ */
+int hyp_create_idmap(u32 hyp_va_bits)
+{
+       unsigned long start, end;
+
+       start = hyp_virt_to_phys((void *)__hyp_idmap_text_start);
+       start = ALIGN_DOWN(start, PAGE_SIZE);
+
+       end = hyp_virt_to_phys((void *)__hyp_idmap_text_end);
+       end = ALIGN(end, PAGE_SIZE);
+
+       /*
+        * One half of the VA space is reserved to linearly map portions of
+        * memory -- see va_layout.c for more details. The other half of the VA
+        * space contains the trampoline page, and needs some care. Split that
+        * second half in two and find the quarter of VA space not conflicting
+        * with the idmap to place the IOs and the vmemmap. IOs use the lower
+        * half of the quarter and the vmemmap the upper half.
+        */
+       /* Take the idmap's bit (hyp_va_bits - 2) and flip it */
+       __io_map_base = start & BIT(hyp_va_bits - 2);
+       __io_map_base ^= BIT(hyp_va_bits - 2);
+       __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3);
+
+       return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
new file mode 100644 (file)
index 0000000..237e03b
--- /dev/null
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_hyp.h>
+#include <nvhe/gfp.h>
+
+u64 __hyp_vmemmap;
+
+/*
+ * Look up the buddy of @p at @order through the hyp_vmemmap. The state of
+ * the returned page is not checked in any way.
+ *
+ * A physically contiguous pool of 4 pages gives the following buddy-tree:
+ *
+ *                 o : Page 3
+ *                /
+ *               o-o : Page 2
+ *              /
+ *             /   o : Page 1
+ *            /   /
+ *           o---o-o : Page 0
+ *    Order  2   1 0
+ *
+ * On such a pool:
+ *   __find_buddy_nocheck(pool, page 0, order 0) => page 1
+ *   __find_buddy_nocheck(pool, page 0, order 1) => page 2
+ *   __find_buddy_nocheck(pool, page 1, order 0) => page 0
+ *   __find_buddy_nocheck(pool, page 2, order 0) => page 3
+ *
+ * Returns NULL when the buddy would fall outside of the pool range.
+ */
+static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
+                                            struct hyp_page *p,
+                                            unsigned int order)
+{
+       phys_addr_t buddy_pa = hyp_page_to_phys(p);
+
+       /* Buddies only differ by the address bit matching their order */
+       buddy_pa ^= (PAGE_SIZE << order);
+
+       /*
+        * Only hand back pages sitting inside the pool range -- anything
+        * else belongs to something else and may not be mapped in
+        * hyp_vmemmap.
+        */
+       if (buddy_pa >= pool->range_start && buddy_pa < pool->range_end)
+               return hyp_phys_to_page(buddy_pa);
+
+       return NULL;
+}
+
+/*
+ * Return the buddy of @p at @order if it is currently available for
+ * allocation, i.e. it is a head page of exactly @order that is linked on a
+ * list. Returns NULL otherwise.
+ */
+static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
+                                          struct hyp_page *p,
+                                          unsigned int order)
+{
+       struct hyp_page *candidate = __find_buddy_nocheck(pool, p, order);
+
+       if (!candidate)
+               return NULL;
+       if (candidate->order != order)
+               return NULL;
+       if (list_empty(&candidate->node))
+               return NULL;
+
+       return candidate;
+}
+
+/*
+ * Give the page headed by @p back to @pool.
+ *
+ * The memory covered by @p is zeroed, then @p is merged with its buddy for as
+ * long as that buddy is available and the resulting order stays below
+ * pool->max_order. The final head page is queued on the free list of its
+ * order.
+ *
+ * No locking is done here: hyp_attach_page() calls this with pool->lock held,
+ * hyp_pool_init() calls it without taking the lock.
+ */
+static void __hyp_attach_page(struct hyp_pool *pool,
+                             struct hyp_page *p)
+{
+       unsigned int order = p->order;
+       struct hyp_page *buddy;
+
+       memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+
+       /*
+        * Only the first struct hyp_page of a high-order page (otherwise known
+        * as the 'head') should have p->order set. The non-head pages should
+        * have p->order = HYP_NO_ORDER. Here @p may no longer be the head
+        * after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
+        */
+       p->order = HYP_NO_ORDER;
+       for (; (order + 1) < pool->max_order; order++) {
+               buddy = __find_buddy_avail(pool, p, order);
+               if (!buddy)
+                       break;
+
+               /* Take the buddy out of its list, and coalesce with @p */
+               list_del_init(&buddy->node);
+               buddy->order = HYP_NO_ORDER;
+               /* The lower of the two struct hyp_page heads the merged page */
+               p = min(p, buddy);
+       }
+
+       /* Mark the new head, and insert it */
+       p->order = order;
+       list_add_tail(&p->node, &pool->free_area[order]);
+}
+
+/* Locked variant of __hyp_attach_page(): @p goes back to the pool it belongs to */
+static void hyp_attach_page(struct hyp_page *p)
+{
+       struct hyp_pool *owner;
+
+       owner = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&owner->lock);
+       __hyp_attach_page(owner, p);
+       hyp_spin_unlock(&owner->lock);
+}
+
+/*
+ * Take the free page headed by @p off its list and split it down to @order.
+ *
+ * Each iteration halves @p: the upper half becomes a head page of the
+ * reduced order and is queued on the matching free list, the lower half
+ * remains headed by @p. Returns @p, whose order is @order on exit unless it
+ * was already smaller on entry.
+ *
+ * No locking is done here; hyp_alloc_pages() calls this with pool->lock held.
+ */
+static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
+                                          struct hyp_page *p,
+                                          unsigned int order)
+{
+       struct hyp_page *buddy;
+
+       list_del_init(&p->node);
+       while (p->order > order) {
+               /*
+                * The buddy of order n - 1 currently has HYP_NO_ORDER as it
+                * is covered by a higher-level page (whose head is @p). Use
+                * __find_buddy_nocheck() to find it and inject it in the
+                * free_list[n - 1], effectively splitting @p in half.
+                */
+               p->order--;
+               buddy = __find_buddy_nocheck(pool, p, p->order);
+               /* NOTE(review): buddy is dereferenced without a NULL check */
+               buddy->order = p->order;
+               list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+       }
+
+       return p;
+}
+
+/*
+ * Drop a reference on the page backing @addr and hand the page back to its
+ * pool when hyp_page_ref_dec_and_test() reports that it can be released.
+ */
+void hyp_put_page(void *addr)
+{
+       struct hyp_page *page = hyp_virt_to_page(addr);
+
+       if (!hyp_page_ref_dec_and_test(page))
+               return;
+
+       hyp_attach_page(page);
+}
+
+/* Take an extra reference on the page backing @addr */
+void hyp_get_page(void *addr)
+{
+       hyp_page_ref_inc(hyp_virt_to_page(addr));
+}
+
+/*
+ * Allocate a page of the given @order from @pool.
+ *
+ * The smallest free page of at least @order is picked and split as needed.
+ * Returns the hyp virtual address of the page, or NULL when no free list
+ * from @order up to pool->max_order - 1 has an entry.
+ */
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+{
+       struct hyp_page *page;
+       unsigned int cur;
+
+       hyp_spin_lock(&pool->lock);
+
+       /* Walk up the orders until a non-empty free list shows up */
+       for (cur = order; cur < pool->max_order; cur++) {
+               if (!list_empty(&pool->free_area[cur]))
+                       break;
+       }
+
+       if (cur >= pool->max_order) {
+               hyp_spin_unlock(&pool->lock);
+               return NULL;
+       }
+
+       /* Pull the page out of the tree, splitting it down to @order */
+       page = list_first_entry(&pool->free_area[cur], struct hyp_page, node);
+       page = __hyp_extract_page(pool, page, order);
+
+       hyp_spin_unlock(&pool->lock);
+       hyp_set_page_refcounted(page);
+
+       return hyp_page_to_virt(page);
+}
+
+/*
+ * Set up @pool to manage the @nr_pages pages starting at @pfn.
+ *
+ * The struct hyp_page entries covering the range are zeroed and tied to
+ * @pool. The first @reserved_pages pages are kept out of the free lists; all
+ * the following ones are attached to the buddy tree.
+ *
+ * Always returns 0.
+ */
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+                 unsigned int reserved_pages)
+{
+       /*
+        * Widen before shifting: 'nr_pages << PAGE_SHIFT' would otherwise be
+        * computed on 32 bits and wrap for pools of 4GiB and above.
+        */
+       unsigned long pool_size = (unsigned long)nr_pages << PAGE_SHIFT;
+       phys_addr_t phys = hyp_pfn_to_phys(pfn);
+       struct hyp_page *p;
+       unsigned int i;
+
+       hyp_spin_lock_init(&pool->lock);
+       pool->max_order = min(MAX_ORDER, get_order(pool_size));
+       for (i = 0; i < pool->max_order; i++)
+               INIT_LIST_HEAD(&pool->free_area[i]);
+       pool->range_start = phys;
+       pool->range_end = phys + pool_size;
+
+       /* Init the vmemmap portion */
+       p = hyp_phys_to_page(phys);
+       memset(p, 0, sizeof(*p) * nr_pages);
+       for (i = 0; i < nr_pages; i++) {
+               p[i].pool = pool;
+               INIT_LIST_HEAD(&p[i].node);
+       }
+
+       /* Attach the unused pages to the buddy tree */
+       for (i = reserved_pages; i < nr_pages; i++)
+               __hyp_attach_page(pool, &p[i]);
+
+       return 0;
+}
index 63de71c..0850878 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kvm_host.h>
 #include <uapi/linux/psci.h>
 
+#include <nvhe/memory.h>
 #include <nvhe/trap_handler.h>
 
 void kvm_hyp_cpu_entry(unsigned long r0);
@@ -20,9 +21,6 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
 
 /* Config options set by the host. */
 struct kvm_host_psci_config __ro_after_init kvm_host_psci_config;
-s64 __ro_after_init hyp_physvirt_offset;
-
-#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)
 
 #define INVALID_CPU_ID UINT_MAX
 
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
new file mode 100644 (file)
index 0000000..7488f53
--- /dev/null
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
+
+struct hyp_pool hpool;
+struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
+unsigned long hyp_nr_cpus;
+
+#define hyp_percpu_size ((unsigned long)__per_cpu_end - \
+                        (unsigned long)__per_cpu_start)
+
+static void *vmemmap_base;
+static void *hyp_pgt_base;
+static void *host_s2_mem_pgt_base;
+static void *host_s2_dev_pgt_base;
+
+/*
+ * Carve the memory pool at @virt into the regions needed during init, in
+ * this order: the vmemmap backing pages, the hyp stage-1 page-table pages,
+ * and the host stage-2 page-table pages for memory and for devices.
+ *
+ * Returns 0 on success, -ENOMEM as soon as one of the carve-outs fails.
+ */
+static int divide_memory_pool(void *virt, unsigned long size)
+{
+       unsigned long vmemmap_start, vmemmap_end;
+
+       hyp_early_alloc_init(virt, size);
+
+       hyp_vmemmap_range(__hyp_pa(virt), size, &vmemmap_start, &vmemmap_end);
+       vmemmap_base = hyp_early_alloc_contig((vmemmap_end - vmemmap_start) >> PAGE_SHIFT);
+       if (!vmemmap_base)
+               return -ENOMEM;
+
+       hyp_pgt_base = hyp_early_alloc_contig(hyp_s1_pgtable_pages());
+       if (!hyp_pgt_base)
+               return -ENOMEM;
+
+       host_s2_mem_pgt_base = hyp_early_alloc_contig(host_s2_mem_pgtable_pages());
+       if (!host_s2_mem_pgt_base)
+               return -ENOMEM;
+
+       host_s2_dev_pgt_base = hyp_early_alloc_contig(host_s2_dev_pgtable_pages());
+       if (!host_s2_dev_pgt_base)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Build a fresh hyp stage-1 page-table in pkvm_pgtable.
+ *
+ * The table is allocated out of the hyp_pgt_base region through the early
+ * allocator, then populated with the idmap, the vectors, the vmemmap backing,
+ * the hyp image sections, the memory pool [@phys, @phys + @size) and, for each
+ * of the hyp_nr_cpus CPUs, its per-CPU area and one stack page.
+ *
+ * @per_cpu_base holds one base address per CPU; each is converted with
+ * kern_hyp_va() before being mapped.
+ *
+ * Returns 0 on success, or the error of the first step that failed.
+ */
+static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
+                                unsigned long *per_cpu_base,
+                                u32 hyp_va_bits)
+{
+       void *start, *end, *virt = hyp_phys_to_virt(phys);
+       unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+       int ret, i;
+
+       /* Recreate the hyp page-table using the early page allocator */
+       hyp_early_alloc_init(hyp_pgt_base, pgt_size);
+       ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits,
+                                  &hyp_early_alloc_mm_ops);
+       if (ret)
+               return ret;
+
+       ret = hyp_create_idmap(hyp_va_bits);
+       if (ret)
+               return ret;
+
+       ret = hyp_map_vectors();
+       if (ret)
+               return ret;
+
+       ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base));
+       if (ret)
+               return ret;
+
+       /* Image sections: text is executable, rodata is read-only */
+       ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       /* The hyp bss is mapped PAGE_HYP, the rest up to __bss_stop read-only */
+       ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       /* The memory pool handed over to the hypervisor */
+       ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < hyp_nr_cpus; i++) {
+               /* Per-CPU area of CPU i, rounded up to a whole number of pages */
+               start = (void *)kern_hyp_va(per_cpu_base[i]);
+               end = start + PAGE_ALIGN(hyp_percpu_size);
+               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               if (ret)
+                       return ret;
+
+               /* The single page that ends at stack_hyp_va */
+               end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va;
+               start = end - PAGE_SIZE;
+               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Point the init params of every CPU at the pgd of pkvm_pgtable, and flush
+ * each updated structure from the data cache.
+ */
+static void update_nvhe_init_params(void)
+{
+       unsigned long cpu;
+
+       for (cpu = 0; cpu < hyp_nr_cpus; cpu++) {
+               struct kvm_nvhe_init_params *params;
+
+               params = per_cpu_ptr(&kvm_init_params, cpu);
+               params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
+               __flush_dcache_area(params, sizeof(*params));
+       }
+}
+
+/*
+ * zalloc_page callback of pkvm_pgtable_mm_ops: allocate one order-0 page from
+ * hpool. @arg is unused. Returns NULL when the pool has no page left.
+ */
+static void *hyp_zalloc_hyp_page(void *arg)
+{
+       return hyp_alloc_pages(&hpool, 0);
+}
+
+/*
+ * Last stage of the pKVM initialisation.
+ *
+ * Sets up hpool over the hyp stage-1 page-table region (keeping the pages
+ * already consumed by the early allocator reserved), prepares the host
+ * stage-2, and switches pkvm_pgtable over to pkvm_pgtable_mm_ops.
+ *
+ * Never returns: the outcome (0 or a negative errno) is written to register 1
+ * of the host context before re-entering the host.
+ */
+void __noreturn __pkvm_init_finalise(void)
+{
+       struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
+       struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+       unsigned long nr_pages, reserved_pages, pfn;
+       int ret;
+
+       /* Now that the vmemmap is backed, install the full-fledged allocator */
+       pfn = hyp_virt_to_pfn(hyp_pgt_base);
+       nr_pages = hyp_s1_pgtable_pages();
+       reserved_pages = hyp_early_alloc_nr_used_pages();
+       ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages);
+       if (ret)
+               goto out;
+
+       ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+       if (ret)
+               goto out;
+
+       /* From here on, hyp page-table pages come from hpool and are refcounted */
+       pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
+               .zalloc_page = hyp_zalloc_hyp_page,
+               .phys_to_virt = hyp_phys_to_virt,
+               .virt_to_phys = hyp_virt_to_phys,
+               .get_page = hyp_get_page,
+               .put_page = hyp_put_page,
+       };
+       pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
+
+out:
+       /*
+        * We tail-called to here from handle___pkvm_init() and will not return,
+        * so make sure to propagate the return value to the host.
+        */
+       cpu_reg(host_ctxt, 1) = ret;
+
+       __host_enter(host_ctxt);
+}
+
+/*
+ * Entry point of the pKVM initialisation.
+ *
+ * @phys/@size describe the memory pool given to the hypervisor; both must be
+ * page-aligned. @nr_cpus is recorded in hyp_nr_cpus, and @per_cpu_base and
+ * @hyp_va_bits are passed on to recreate_hyp_mappings().
+ *
+ * Returns -EINVAL on misaligned input, or the error of divide_memory_pool()
+ * or recreate_hyp_mappings(). On success this function does not return: it
+ * jumps to __pkvm_init_switch_pgd through its physical address, handing it
+ * __pkvm_init_finalise as the function to continue with.
+ */
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+               unsigned long *per_cpu_base, u32 hyp_va_bits)
+{
+       struct kvm_nvhe_init_params *params;
+       void *virt = hyp_phys_to_virt(phys);
+       void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+       int ret;
+
+       if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+               return -EINVAL;
+
+       hyp_spin_lock_init(&pkvm_pgd_lock);
+       hyp_nr_cpus = nr_cpus;
+
+       ret = divide_memory_pool(virt, size);
+       if (ret)
+               return ret;
+
+       ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits);
+       if (ret)
+               return ret;
+
+       update_nvhe_init_params();
+
+       /* Jump in the idmap page to switch to the new page-tables */
+       params = this_cpu_ptr(&kvm_init_params);
+       fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
+       fn(__hyp_pa(params), __pkvm_init_finalise);
+
+       unreachable();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/stub.c b/arch/arm64/kvm/hyp/nvhe/stub.c
new file mode 100644 (file)
index 0000000..c0aa6bb
--- /dev/null
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stubs for out-of-line function calls caused by re-using kernel
+ * infrastructure at EL2.
+ *
+ * Copyright (C) 2020 - Google LLC
+ */
+
+#include <linux/list.h>
+
+#ifdef CONFIG_DEBUG_LIST
+/* EL2 stub: every list insertion is reported as valid, nothing is checked */
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+                     struct list_head *next)
+{
+       return true;
+}
+
+/* EL2 stub: every list removal is reported as valid, nothing is checked */
+bool __list_del_entry_valid(struct list_head *entry)
+{
+       return true;
+}
+#endif
index 68ab6b4..e9f6ea7 100644 (file)
@@ -28,6 +28,8 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
+#include <nvhe/mem_protect.h>
+
 /* Non-VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -41,9 +43,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
        __activate_traps_common(vcpu);
 
        val = CPTR_EL2_DEFAULT;
-       val |= CPTR_EL2_TTA | CPTR_EL2_TZ | CPTR_EL2_TAM;
+       val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
        if (!update_fp_enabled(vcpu)) {
-               val |= CPTR_EL2_TFP;
+               val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
                __activate_traps_fpsimd32(vcpu);
        }
 
@@ -68,7 +70,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 static void __deactivate_traps(struct kvm_vcpu *vcpu)
 {
        extern char __kvm_hyp_host_vector[];
-       u64 mdcr_el2;
+       u64 mdcr_el2, cptr;
 
        ___deactivate_traps(vcpu);
 
@@ -95,19 +97,17 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
 
        mdcr_el2 &= MDCR_EL2_HPMN_MASK;
        mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+       mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
 
        write_sysreg(mdcr_el2, mdcr_el2);
-       if (is_protected_kvm_enabled())
-               write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2);
-       else
-               write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
-       write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
-       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
-}
+       write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
 
-static void __load_host_stage2(void)
-{
-       write_sysreg(0, vttbr_el2);
+       cptr = CPTR_EL2_DEFAULT;
+       if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED))
+               cptr |= CPTR_EL2_TZ;
+
+       write_sysreg(cptr, cptr_el2);
+       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
 }
 
 /* Save VGICv3 state on non-VHE systems */
index 229b067..83dc3b2 100644 (file)
@@ -8,6 +8,8 @@
 #include <asm/kvm_mmu.h>
 #include <asm/tlbflush.h>
 
+#include <nvhe/mem_protect.h>
+
 struct tlb_inv_context {
        u64             tcr;
 };
@@ -43,7 +45,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
 
 static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
 {
-       write_sysreg(0, vttbr_el2);
+       __load_host_stage2();
 
        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                /* Ensure write of the host VMID */
index 926fc07..c37c1dc 100644 (file)
@@ -9,8 +9,7 @@
 
 #include <linux/bitfield.h>
 #include <asm/kvm_pgtable.h>
-
-#define KVM_PGTABLE_MAX_LEVELS         4U
+#include <asm/stage2_pgtable.h>
 
 #define KVM_PTE_VALID                  BIT(0)
 
                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
                                         KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
+#define KVM_PTE_LEAF_ATTR_S2_IGNORED   GENMASK(58, 55)
+
+#define KVM_INVALID_PTE_OWNER_MASK     GENMASK(63, 56)
+#define KVM_MAX_OWNER_ID               1
+
 struct kvm_pgtable_walk_data {
        struct kvm_pgtable              *pgt;
        struct kvm_pgtable_walker       *walker;
@@ -68,21 +72,36 @@ static u64 kvm_granule_size(u32 level)
        return BIT(kvm_granule_shift(level));
 }
 
-static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+#define KVM_PHYS_INVALID (-1ULL)
+
+static bool kvm_phys_is_valid(u64 phys)
 {
-       u64 granule = kvm_granule_size(level);
+       return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
+}
 
+static bool kvm_level_supports_block_mapping(u32 level)
+{
        /*
         * Reject invalid block mappings and don't bother with 4TB mappings for
         * 52-bit PAs.
         */
-       if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+       return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+       u64 granule = kvm_granule_size(level);
+
+       if (!kvm_level_supports_block_mapping(level))
                return false;
 
        if (granule > (end - addr))
                return false;
 
-       return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+       if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+               return false;
+
+       return IS_ALIGNED(addr, granule);
 }
 
 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
@@ -152,20 +171,20 @@ static kvm_pte_t kvm_phys_to_pte(u64 pa)
        return pte;
 }
 
-static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
 {
-       return __va(kvm_pte_to_phys(pte));
+       return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
 }
 
-static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+static void kvm_clear_pte(kvm_pte_t *ptep)
 {
-       kvm_pte_t pte = *ptep;
-       WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+       WRITE_ONCE(*ptep, 0);
 }
 
-static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
+                             struct kvm_pgtable_mm_ops *mm_ops)
 {
-       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
 
        pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
        pte |= KVM_PTE_VALID;
@@ -187,6 +206,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
        return pte;
 }
 
+/*
+ * Build a PTE that only carries @owner_id, placed in the
+ * KVM_INVALID_PTE_OWNER_MASK field (bits 63:56); no other bit is set.
+ */
+static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
+{
+       return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
+}
+
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
                                  u32 level, kvm_pte_t *ptep,
                                  enum kvm_pgtable_walk_flags flag)
@@ -228,7 +252,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
                goto out;
        }
 
-       childp = kvm_pte_follow(pte);
+       childp = kvm_pte_follow(pte, data->pgt->mm_ops);
        ret = __kvm_pgtable_walk(data, childp, level + 1);
        if (ret)
                goto out;
@@ -303,12 +327,12 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
 }
 
 struct hyp_map_data {
-       u64             phys;
-       kvm_pte_t       attr;
+       u64                             phys;
+       kvm_pte_t                       attr;
+       struct kvm_pgtable_mm_ops       *mm_ops;
 };
 
-static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
-                                struct hyp_map_data *data)
+static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
 {
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
        u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
@@ -333,7 +357,8 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
-       data->attr = attr;
+       *ptep = attr;
+
        return 0;
 }
 
@@ -359,6 +384,8 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                          enum kvm_pgtable_walk_flags flag, void * const arg)
 {
        kvm_pte_t *childp;
+       struct hyp_map_data *data = arg;
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
        if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
                return 0;
@@ -366,11 +393,11 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
                return -EINVAL;
 
-       childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!childp)
                return -ENOMEM;
 
-       kvm_set_table_pte(ptep, childp);
+       kvm_set_table_pte(ptep, childp, mm_ops);
        return 0;
 }
 
@@ -380,6 +407,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
        int ret;
        struct hyp_map_data map_data = {
                .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
+               .mm_ops = pgt->mm_ops,
        };
        struct kvm_pgtable_walker walker = {
                .cb     = hyp_map_walker,
@@ -387,7 +415,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                .arg    = &map_data,
        };
 
-       ret = hyp_map_set_prot_attr(prot, &map_data);
+       ret = hyp_set_prot_attr(prot, &map_data.attr);
        if (ret)
                return ret;
 
@@ -397,16 +425,18 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
        return ret;
 }
 
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+                        struct kvm_pgtable_mm_ops *mm_ops)
 {
        u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
 
-       pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!pgt->pgd)
                return -ENOMEM;
 
        pgt->ia_bits            = va_bits;
        pgt->start_level        = KVM_PGTABLE_MAX_LEVELS - levels;
+       pgt->mm_ops             = mm_ops;
        pgt->mmu                = NULL;
        return 0;
 }
@@ -414,7 +444,9 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
 static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                           enum kvm_pgtable_walk_flags flag, void * const arg)
 {
-       free_page((unsigned long)kvm_pte_follow(*ptep));
+       struct kvm_pgtable_mm_ops *mm_ops = arg;
+
+       mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops));
        return 0;
 }
 
@@ -423,29 +455,75 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
        struct kvm_pgtable_walker walker = {
                .cb     = hyp_free_walker,
                .flags  = KVM_PGTABLE_WALK_TABLE_POST,
+               .arg    = pgt->mm_ops,
        };
 
        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
-       free_page((unsigned long)pgt->pgd);
+       pgt->mm_ops->put_page(pgt->pgd);
        pgt->pgd = NULL;
 }
 
 struct stage2_map_data {
        u64                             phys;
        kvm_pte_t                       attr;
+       u8                              owner_id;
 
        kvm_pte_t                       *anchor;
+       kvm_pte_t                       *childp;
 
        struct kvm_s2_mmu               *mmu;
-       struct kvm_mmu_memory_cache     *memcache;
+       void                            *memcache;
+
+       struct kvm_pgtable_mm_ops       *mm_ops;
 };
 
-static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
-                                   struct stage2_map_data *data)
+/*
+ * Compute a VTCR_EL2 value from @mmfr0, @mmfr1 and @phys_shift: the PS, T0SZ,
+ * SL0, HA and VS fields are set on top of VTCR_EL2_FLAGS.
+ */
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
+{
+       u8 lvls = stage2_pgtable_levels(phys_shift);
+       u64 vtcr = VTCR_EL2_FLAGS;
+
+       vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
+       vtcr |= VTCR_EL2_T0SZ(phys_shift);
+
+       /*
+        * Never go below 2 levels of page table, so that host PMD huge
+        * pages do not have to be split at stage2.
+        */
+       if (lvls < 2)
+               lvls = 2;
+       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+       /*
+        * Hardware Access Flag management is enabled unconditionally on
+        * all CPUs. The feature is RES0 on CPUs without the support and
+        * must be ignored by the CPUs.
+        */
+       vtcr |= VTCR_EL2_HA;
+
+       /* Set the vmid bits */
+       if (get_vmid_bits(mmfr1) == 16)
+               vtcr |= VTCR_EL2_VS_16BIT;
+       else
+               vtcr |= VTCR_EL2_VS_8BIT;
+
+       return vtcr;
+}
+
+/*
+ * FWB is used for @pgt when the CPUs have ARM64_HAS_STAGE2_FWB and the
+ * page-table was not flagged with KVM_PGTABLE_S2_NOFWB.
+ */
+static bool stage2_has_fwb(struct kvm_pgtable *pgt)
+{
+       return cpus_have_const_cap(ARM64_HAS_STAGE2_FWB) &&
+              !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
+}
+
+#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+
+static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
+                               kvm_pte_t *ptep)
 {
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
-       kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
-                           PAGE_S2_MEMATTR(NORMAL);
+       kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
+                           KVM_S2_MEMATTR(pgt, NORMAL);
        u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
 
        if (!(prot & KVM_PGTABLE_PROT_X))
@@ -461,44 +539,78 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
 
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
-       data->attr = attr;
+       *ptep = attr;
+
        return 0;
 }
 
+/*
+ * Tell whether replacing @old with @new requires the PTE to be rewritten.
+ * This is always the case when either of them is invalid; two valid PTEs
+ * only need it when they differ outside of KVM_PTE_LEAF_ATTR_S2_PERMS.
+ */
+static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+       kvm_pte_t diff = old ^ new;
+
+       if (kvm_pte_valid(old) && kvm_pte_valid(new))
+               return (diff & ~KVM_PTE_LEAF_ATTR_S2_PERMS) != 0;
+
+       return true;
+}
+
+/* Any non-zero PTE holds a reference on its page-table page */
+static bool stage2_pte_is_counted(kvm_pte_t pte)
+{
+       /*
+        * Valid entries are refcounted, and so are the invalid entries that
+        * encode ownership of a page by an entity other than the page-table
+        * owner, whose id is 0.
+        */
+       return pte != 0;
+}
+
+/*
+ * Release the entry at @ptep: a valid PTE is zeroed and the TLB entry for
+ * @addr at @level is invalidated for @mmu. In all cases, the reference held
+ * through @ptep is dropped with mm_ops->put_page().
+ *
+ * An invalid PTE is left untouched in memory -- callers in this file either
+ * overwrite it right after or only call this for counted entries.
+ */
+static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
+                          u32 level, struct kvm_pgtable_mm_ops *mm_ops)
+{
+       /*
+        * Clear the existing PTE, and perform break-before-make with
+        * TLB maintenance if it was valid.
+        */
+       if (kvm_pte_valid(*ptep)) {
+               kvm_clear_pte(ptep);
+               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+       }
+
+       mm_ops->put_page(ptep);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
 {
        kvm_pte_t new, old = *ptep;
        u64 granule = kvm_granule_size(level), phys = data->phys;
-       struct page *page = virt_to_page(ptep);
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
        if (!kvm_block_mapping_supported(addr, end, phys, level))
                return -E2BIG;
 
-       new = kvm_init_valid_leaf_pte(phys, data->attr, level);
-       if (kvm_pte_valid(old)) {
+       if (kvm_phys_is_valid(phys))
+               new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+       else
+               new = kvm_init_invalid_leaf_owner(data->owner_id);
+
+       if (stage2_pte_is_counted(old)) {
                /*
                 * Skip updating the PTE if we are trying to recreate the exact
                 * same mapping or only change the access permissions. Instead,
                 * the vCPU will exit one more time from guest if still needed
                 * and then go through the path of relaxing permissions.
                 */
-               if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+               if (!stage2_pte_needs_update(old, new))
                        return -EAGAIN;
 
-               /*
-                * There's an existing different valid leaf entry, so perform
-                * break-before-make.
-                */
-               kvm_set_invalid_pte(ptep);
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-               put_page(page);
+               stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
        }
 
        smp_store_release(ptep, new);
-       get_page(page);
-       data->phys += granule;
+       if (stage2_pte_is_counted(new))
+               mm_ops->get_page(ptep);
+       if (kvm_phys_is_valid(phys))
+               data->phys += granule;
        return 0;
 }
 
@@ -512,7 +624,8 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
        if (!kvm_block_mapping_supported(addr, end, data->phys, level))
                return 0;
 
-       kvm_set_invalid_pte(ptep);
+       data->childp = kvm_pte_follow(*ptep, data->mm_ops);
+       kvm_clear_pte(ptep);
 
        /*
         * Invalidate the whole stage-2, as we may have numerous leaf
@@ -527,13 +640,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                                struct stage2_map_data *data)
 {
-       int ret;
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
        kvm_pte_t *childp, pte = *ptep;
-       struct page *page = virt_to_page(ptep);
+       int ret;
 
        if (data->anchor) {
-               if (kvm_pte_valid(pte))
-                       put_page(page);
+               if (stage2_pte_is_counted(pte))
+                       mm_ops->put_page(ptep);
 
                return 0;
        }
@@ -548,7 +661,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        if (!data->memcache)
                return -ENOMEM;
 
-       childp = kvm_mmu_memory_cache_alloc(data->memcache);
+       childp = mm_ops->zalloc_page(data->memcache);
        if (!childp)
                return -ENOMEM;
 
@@ -557,14 +670,11 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         * a table. Accesses beyond 'end' that fall within the new table
         * will be mapped lazily.
         */
-       if (kvm_pte_valid(pte)) {
-               kvm_set_invalid_pte(ptep);
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-               put_page(page);
-       }
+       if (stage2_pte_is_counted(pte))
+               stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
 
-       kvm_set_table_pte(ptep, childp);
-       get_page(page);
+       kvm_set_table_pte(ptep, childp, mm_ops);
+       mm_ops->get_page(ptep);
 
        return 0;
 }
@@ -573,19 +683,25 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
 {
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+       kvm_pte_t *childp;
        int ret = 0;
 
        if (!data->anchor)
                return 0;
 
-       free_page((unsigned long)kvm_pte_follow(*ptep));
-       put_page(virt_to_page(ptep));
-
        if (data->anchor == ptep) {
+               childp = data->childp;
                data->anchor = NULL;
+               data->childp = NULL;
                ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+       } else {
+               childp = kvm_pte_follow(*ptep, mm_ops);
        }
 
+       mm_ops->put_page(childp);
+       mm_ops->put_page(ptep);
+
        return ret;
 }
 
@@ -627,13 +743,14 @@ static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
-                          struct kvm_mmu_memory_cache *mc)
+                          void *mc)
 {
        int ret;
        struct stage2_map_data map_data = {
                .phys           = ALIGN_DOWN(phys, PAGE_SIZE),
                .mmu            = pgt->mmu,
                .memcache       = mc,
+               .mm_ops         = pgt->mm_ops,
        };
        struct kvm_pgtable_walker walker = {
                .cb             = stage2_map_walker,
@@ -643,7 +760,10 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                .arg            = &map_data,
        };
 
-       ret = stage2_map_set_prot_attr(prot, &map_data);
+       if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
+               return -EINVAL;
+
+       ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
        if (ret)
                return ret;
 
@@ -652,38 +772,63 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
        return ret;
 }
 
-static void stage2_flush_dcache(void *addr, u64 size)
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                                void *mc, u8 owner_id)
 {
-       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-               return;
+       int ret;
+       struct stage2_map_data map_data = {
+               .phys           = KVM_PHYS_INVALID,
+               .mmu            = pgt->mmu,
+               .memcache       = mc,
+               .mm_ops         = pgt->mm_ops,
+               .owner_id       = owner_id,
+       };
+       struct kvm_pgtable_walker walker = {
+               .cb             = stage2_map_walker,
+               .flags          = KVM_PGTABLE_WALK_TABLE_PRE |
+                                 KVM_PGTABLE_WALK_LEAF |
+                                 KVM_PGTABLE_WALK_TABLE_POST,
+               .arg            = &map_data,
+       };
+
+       if (owner_id > KVM_MAX_OWNER_ID)
+               return -EINVAL;
 
-       __flush_dcache_area(addr, size);
+       ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+       return ret;
 }
 
-static bool stage2_pte_cacheable(kvm_pte_t pte)
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
 {
        u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-       return memattr == PAGE_S2_MEMATTR(NORMAL);
+       return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
 }
 
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                               enum kvm_pgtable_walk_flags flag,
                               void * const arg)
 {
-       struct kvm_s2_mmu *mmu = arg;
+       struct kvm_pgtable *pgt = arg;
+       struct kvm_s2_mmu *mmu = pgt->mmu;
+       struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep, *childp = NULL;
        bool need_flush = false;
 
-       if (!kvm_pte_valid(pte))
+       if (!kvm_pte_valid(pte)) {
+               if (stage2_pte_is_counted(pte)) {
+                       kvm_clear_pte(ptep);
+                       mm_ops->put_page(ptep);
+               }
                return 0;
+       }
 
        if (kvm_pte_table(pte, level)) {
-               childp = kvm_pte_follow(pte);
+               childp = kvm_pte_follow(pte, mm_ops);
 
-               if (page_count(virt_to_page(childp)) != 1)
+               if (mm_ops->page_count(childp) != 1)
                        return 0;
-       } else if (stage2_pte_cacheable(pte)) {
-               need_flush = true;
+       } else if (stage2_pte_cacheable(pgt, pte)) {
+               need_flush = !stage2_has_fwb(pgt);
        }
 
        /*
@@ -691,17 +836,15 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         * block entry and rely on the remaining portions being faulted
         * back lazily.
         */
-       kvm_set_invalid_pte(ptep);
-       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
-       put_page(virt_to_page(ptep));
+       stage2_put_pte(ptep, mmu, addr, level, mm_ops);
 
        if (need_flush) {
-               stage2_flush_dcache(kvm_pte_follow(pte),
+               __flush_dcache_area(kvm_pte_follow(pte, mm_ops),
                                    kvm_granule_size(level));
        }
 
        if (childp)
-               free_page((unsigned long)childp);
+               mm_ops->put_page(childp);
 
        return 0;
 }
@@ -710,7 +853,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
        struct kvm_pgtable_walker walker = {
                .cb     = stage2_unmap_walker,
-               .arg    = pgt->mmu,
+               .arg    = pgt,
                .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
        };
 
@@ -842,12 +985,14 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                               enum kvm_pgtable_walk_flags flag,
                               void * const arg)
 {
+       struct kvm_pgtable *pgt = arg;
+       struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep;
 
-       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
                return 0;
 
-       stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+       __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level));
        return 0;
 }
 
@@ -856,30 +1001,35 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
        struct kvm_pgtable_walker walker = {
                .cb     = stage2_flush_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF,
+               .arg    = pgt,
        };
 
-       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+       if (stage2_has_fwb(pgt))
                return 0;
 
        return kvm_pgtable_walk(pgt, addr, size, &walker);
 }
 
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                                 struct kvm_pgtable_mm_ops *mm_ops,
+                                 enum kvm_pgtable_stage2_flags flags)
 {
        size_t pgd_sz;
-       u64 vtcr = kvm->arch.vtcr;
+       u64 vtcr = arch->vtcr;
        u32 ia_bits = VTCR_EL2_IPA(vtcr);
        u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
 
        pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
-       pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
        if (!pgt->pgd)
                return -ENOMEM;
 
        pgt->ia_bits            = ia_bits;
        pgt->start_level        = start_level;
-       pgt->mmu                = &kvm->arch.mmu;
+       pgt->mm_ops             = mm_ops;
+       pgt->mmu                = &arch->mmu;
+       pgt->flags              = flags;
 
        /* Ensure zeroed PGD pages are visible to the hardware walker */
        dsb(ishst);
@@ -890,15 +1040,16 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                              enum kvm_pgtable_walk_flags flag,
                              void * const arg)
 {
+       struct kvm_pgtable_mm_ops *mm_ops = arg;
        kvm_pte_t pte = *ptep;
 
-       if (!kvm_pte_valid(pte))
+       if (!stage2_pte_is_counted(pte))
                return 0;
 
-       put_page(virt_to_page(ptep));
+       mm_ops->put_page(ptep);
 
        if (kvm_pte_table(pte, level))
-               free_page((unsigned long)kvm_pte_follow(pte));
+               mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
 
        return 0;
 }
@@ -910,10 +1061,85 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
                .cb     = stage2_free_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF |
                          KVM_PGTABLE_WALK_TABLE_POST,
+               .arg    = pgt->mm_ops,
        };
 
        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
        pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
-       free_pages_exact(pgt->pgd, pgd_sz);
+       pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
        pgt->pgd = NULL;
 }
+
+#define KVM_PTE_LEAF_S2_COMPAT_MASK    (KVM_PTE_LEAF_ATTR_S2_PERMS | \
+                                        KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
+                                        KVM_PTE_LEAF_ATTR_S2_IGNORED)
+
+static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
+                                         kvm_pte_t *ptep,
+                                         enum kvm_pgtable_walk_flags flag,
+                                         void * const arg)
+{
+       kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
+
+       /*
+        * Compatible mappings are either invalid and owned by the page-table
+        * owner (whose id is 0), or valid with matching permission attributes.
+        */
+       if (kvm_pte_valid(pte)) {
+               old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
+               if (old_attr != *new_attr)
+                       return -EEXIST;
+       } else if (pte) {
+               return -EEXIST;
+       }
+
+       return 0;
+}
+
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+                                 enum kvm_pgtable_prot prot,
+                                 struct kvm_mem_range *range)
+{
+       kvm_pte_t attr;
+       struct kvm_pgtable_walker check_perm_walker = {
+               .cb             = stage2_check_permission_walker,
+               .flags          = KVM_PGTABLE_WALK_LEAF,
+               .arg            = &attr,
+       };
+       u64 granule, start, end;
+       u32 level;
+       int ret;
+
+       ret = stage2_set_prot_attr(pgt, prot, &attr);
+       if (ret)
+               return ret;
+       attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
+
+       for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
+               granule = kvm_granule_size(level);
+               start = ALIGN_DOWN(addr, granule);
+               end = start + granule;
+
+               if (!kvm_level_supports_block_mapping(level))
+                       continue;
+
+               if (start < range->start || range->end < end)
+                       continue;
+
+               /*
+                * Check the presence of existing mappings with incompatible
+                * permissions within the current block range, and try one level
+                * deeper if one is found.
+                */
+               ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
+               if (ret != -EEXIST)
+                       break;
+       }
+
+       if (!ret) {
+               range->start = start;
+               range->end = end;
+       }
+
+       return ret;
+}
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
new file mode 100644 (file)
index 0000000..83ca23a
--- /dev/null
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/kvm_host.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+
+static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
+static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
+
+phys_addr_t hyp_mem_base;
+phys_addr_t hyp_mem_size;
+
+static int cmp_hyp_memblock(const void *p1, const void *p2)
+{
+       const struct memblock_region *r1 = p1;
+       const struct memblock_region *r2 = p2;
+
+       return r1->base < r2->base ? -1 : (r1->base > r2->base);
+}
+
+static void __init sort_memblock_regions(void)
+{
+       sort(hyp_memory,
+            *hyp_memblock_nr_ptr,
+            sizeof(struct memblock_region),
+            cmp_hyp_memblock,
+            NULL);
+}
+
+static int __init register_memblock_regions(void)
+{
+       struct memblock_region *reg;
+
+       for_each_mem_region(reg) {
+               if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
+                       return -ENOMEM;
+
+               hyp_memory[*hyp_memblock_nr_ptr] = *reg;
+               (*hyp_memblock_nr_ptr)++;
+       }
+       sort_memblock_regions();
+
+       return 0;
+}
+
+void __init kvm_hyp_reserve(void)
+{
+       u64 nr_pages, prev, hyp_mem_pages = 0;
+       int ret;
+
+       if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
+               return;
+
+       if (kvm_get_mode() != KVM_MODE_PROTECTED)
+               return;
+
+       ret = register_memblock_regions();
+       if (ret) {
+               *hyp_memblock_nr_ptr = 0;
+               kvm_err("Failed to register hyp memblocks: %d\n", ret);
+               return;
+       }
+
+       hyp_mem_pages += hyp_s1_pgtable_pages();
+       hyp_mem_pages += host_s2_mem_pgtable_pages();
+       hyp_mem_pages += host_s2_dev_pgtable_pages();
+
+       /*
+        * The hyp_vmemmap needs to be backed by pages, but these pages
+        * themselves need to be present in the vmemmap, so compute the number
+        * of pages needed by looking for a fixed point.
+        */
+       nr_pages = 0;
+       do {
+               prev = nr_pages;
+               nr_pages = hyp_mem_pages + prev;
+               nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE);
+               nr_pages += __hyp_pgtable_max_pages(nr_pages);
+       } while (nr_pages != prev);
+       hyp_mem_pages += nr_pages;
+
+       /*
+        * Try to allocate a PMD-aligned region to reduce TLB pressure once
+        * this is unmapped from the host stage-2, and fall back to PAGE_SIZE.
+        */
+       hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
+       hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+                                             ALIGN(hyp_mem_size, PMD_SIZE),
+                                             PMD_SIZE);
+       if (!hyp_mem_base)
+               hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+                                                     hyp_mem_size, PAGE_SIZE);
+       else
+               hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
+
+       if (!hyp_mem_base) {
+               kvm_err("Failed to reserve hyp memory\n");
+               return;
+       }
+       memblock_reserve(hyp_mem_base, hyp_mem_size);
+
+       kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
+                hyp_mem_base);
+}
index af8e940..7b8f7db 100644 (file)
@@ -27,8 +27,6 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
-const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
-
 /* VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -207,7 +205,7 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
        __deactivate_traps(vcpu);
        sysreg_restore_host_state_vhe(host_ctxt);
 
-       panic(__hyp_panic_string,
+       panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n",
              spsr, elr,
              read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR),
              read_sysreg(hpfar_el2), par, vcpu);
index ead21b9..30da78f 100644 (file)
@@ -9,16 +9,65 @@
 #include <kvm/arm_hypercalls.h>
 #include <kvm/arm_psci.h>
 
+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+       struct system_time_snapshot systime_snapshot;
+       u64 cycles = ~0UL;
+       u32 feature;
+
+       /*
+        * system time and counter value must be captured at the same
+        * time to keep consistency and precision.
+        */
+       ktime_get_snapshot(&systime_snapshot);
+
+       /*
+        * This is only valid if the current clocksource is the
+        * architected counter, as this is the only one the guest
+        * can see.
+        */
+       if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+               return;
+
+       /*
+        * The guest selects one of the two reference counters
+        * (virtual or physical) with the first argument of the SMCCC
+        * call. In case the identifier is not supported, error out.
+        */
+       feature = smccc_get_arg1(vcpu);
+       switch (feature) {
+       case KVM_PTP_VIRT_COUNTER:
+               cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+               break;
+       case KVM_PTP_PHYS_COUNTER:
+               cycles = systime_snapshot.cycles;
+               break;
+       default:
+               return;
+       }
+
+       /*
+        * This relies on the top bit of val[0] never being set for
+        * valid values of system time, because that is *really* far
+        * in the future (about 292 years from 1970, and at that stage
+        * nobody will give a damn about it).
+        */
+       val[0] = upper_32_bits(systime_snapshot.real);
+       val[1] = lower_32_bits(systime_snapshot.real);
+       val[2] = upper_32_bits(cycles);
+       val[3] = lower_32_bits(cycles);
+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
        u32 func_id = smccc_get_function(vcpu);
-       long val = SMCCC_RET_NOT_SUPPORTED;
+       u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
        u32 feature;
        gpa_t gpa;
 
        switch (func_id) {
        case ARM_SMCCC_VERSION_FUNC_ID:
-               val = ARM_SMCCC_VERSION_1_1;
+               val[0] = ARM_SMCCC_VERSION_1_1;
                break;
        case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
                feature = smccc_get_arg1(vcpu);
@@ -28,10 +77,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                        case SPECTRE_VULNERABLE:
                                break;
                        case SPECTRE_MITIGATED:
-                               val = SMCCC_RET_SUCCESS;
+                               val[0] = SMCCC_RET_SUCCESS;
                                break;
                        case SPECTRE_UNAFFECTED:
-                               val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+                               val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
                                break;
                        }
                        break;
@@ -54,22 +103,35 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                                        break;
                                fallthrough;
                        case SPECTRE_UNAFFECTED:
-                               val = SMCCC_RET_NOT_REQUIRED;
+                               val[0] = SMCCC_RET_NOT_REQUIRED;
                                break;
                        }
                        break;
                case ARM_SMCCC_HV_PV_TIME_FEATURES:
-                       val = SMCCC_RET_SUCCESS;
+                       val[0] = SMCCC_RET_SUCCESS;
                        break;
                }
                break;
        case ARM_SMCCC_HV_PV_TIME_FEATURES:
-               val = kvm_hypercall_pv_features(vcpu);
+               val[0] = kvm_hypercall_pv_features(vcpu);
                break;
        case ARM_SMCCC_HV_PV_TIME_ST:
                gpa = kvm_init_stolen_time(vcpu);
                if (gpa != GPA_INVALID)
-                       val = gpa;
+                       val[0] = gpa;
+               break;
+       case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+               val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
+               val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
+               val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
+               val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+               break;
+       case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+               val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+               val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+               break;
+       case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+               kvm_ptp_get_time(vcpu, val);
                break;
        case ARM_SMCCC_TRNG_VERSION:
        case ARM_SMCCC_TRNG_FEATURES:
@@ -81,6 +143,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                return kvm_psci_call(vcpu);
        }
 
-       smccc_set_retval(vcpu, val, 0, 0, 0);
+       smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
        return 1;
 }
index 8711894..c5d1f3c 100644 (file)
@@ -88,6 +88,44 @@ static bool kvm_is_device_pfn(unsigned long pfn)
        return !pfn_valid(pfn);
 }
 
+static void *stage2_memcache_zalloc_page(void *arg)
+{
+       struct kvm_mmu_memory_cache *mc = arg;
+
+       /* Allocated with __GFP_ZERO, so no need to zero */
+       return kvm_mmu_memory_cache_alloc(mc);
+}
+
+static void *kvm_host_zalloc_pages_exact(size_t size)
+{
+       return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+}
+
+static void kvm_host_get_page(void *addr)
+{
+       get_page(virt_to_page(addr));
+}
+
+static void kvm_host_put_page(void *addr)
+{
+       put_page(virt_to_page(addr));
+}
+
+static int kvm_host_page_count(void *addr)
+{
+       return page_count(virt_to_page(addr));
+}
+
+static phys_addr_t kvm_host_pa(void *addr)
+{
+       return __pa(addr);
+}
+
+static void *kvm_host_va(phys_addr_t phys)
+{
+       return __va(phys);
+}
+
 /*
  * Unmapping vs dcache management:
  *
@@ -127,7 +165,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
                                 bool may_block)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        phys_addr_t end = start + size;
 
        assert_spin_locked(&kvm->mmu_lock);
@@ -183,15 +221,39 @@ void free_hyp_pgds(void)
        if (hyp_pgtable) {
                kvm_pgtable_hyp_destroy(hyp_pgtable);
                kfree(hyp_pgtable);
+               hyp_pgtable = NULL;
        }
        mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
+static bool kvm_host_owns_hyp_mappings(void)
+{
+       if (static_branch_likely(&kvm_protected_mode_initialized))
+               return false;
+
+       /*
+        * This can happen at boot time when __create_hyp_mappings() is called
+        * after the hyp protection has been enabled, but the static key has
+        * not been flipped yet.
+        */
+       if (!hyp_pgtable && is_protected_kvm_enabled())
+               return false;
+
+       WARN_ON(!hyp_pgtable);
+
+       return true;
+}
+
 static int __create_hyp_mappings(unsigned long start, unsigned long size,
                                 unsigned long phys, enum kvm_pgtable_prot prot)
 {
        int err;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               return kvm_call_hyp_nvhe(__pkvm_create_mappings,
+                                        start, size, phys, prot);
+       }
+
        mutex_lock(&kvm_hyp_pgd_mutex);
        err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
        mutex_unlock(&kvm_hyp_pgd_mutex);
@@ -253,6 +315,16 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
        unsigned long base;
        int ret = 0;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+                                        phys_addr, size, prot);
+               if (IS_ERR_OR_NULL((void *)base))
+                       return PTR_ERR((void *)base);
+               *haddr = base;
+
+               return 0;
+       }
+
        mutex_lock(&kvm_hyp_pgd_mutex);
 
        /*
@@ -351,6 +423,17 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
        return 0;
 }
 
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
+       .zalloc_page            = stage2_memcache_zalloc_page,
+       .zalloc_pages_exact     = kvm_host_zalloc_pages_exact,
+       .free_pages_exact       = free_pages_exact,
+       .get_page               = kvm_host_get_page,
+       .put_page               = kvm_host_put_page,
+       .page_count             = kvm_host_page_count,
+       .phys_to_virt           = kvm_host_va,
+       .virt_to_phys           = kvm_host_pa,
+};
+
 /**
  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
  * @kvm:       The pointer to the KVM structure
@@ -374,7 +457,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        if (!pgt)
                return -ENOMEM;
 
-       err = kvm_pgtable_stage2_init(pgt, kvm);
+       err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops);
        if (err)
                goto out_free_pgtable;
 
@@ -387,7 +470,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 
-       mmu->kvm = kvm;
+       mmu->arch = &kvm->arch;
        mmu->pgt = pgt;
        mmu->pgd_phys = __pa(pgt->pgd);
        mmu->vmid.vmid_gen = 0;
@@ -421,10 +504,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
         *     +--------------------------------------------+
         */
        do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               struct vm_area_struct *vma;
                hva_t vm_start, vm_end;
 
-               if (!vma || vma->vm_start >= reg_end)
+               vma = find_vma_intersection(current->mm, hva, reg_end);
+               if (!vma)
                        break;
 
                /*
@@ -469,7 +553,7 @@ void stage2_unmap_vm(struct kvm *kvm)
 
 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        struct kvm_pgtable *pgt = NULL;
 
        spin_lock(&kvm->mmu_lock);
@@ -538,7 +622,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  */
 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
 }
 
@@ -555,7 +639,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_
  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
  * serializing operations for VM memory regions.
  */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
@@ -839,13 +923,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
         * the page we just got a reference to gets unmapped before we have a
         * chance to grab the mmu_lock, which ensures that if the page gets
-        * unmapped afterwards, the call to kvm_unmap_hva will take it away
+        * unmapped afterwards, the call to kvm_unmap_gfn will take it away
         * from us again properly. This smp_rmb() interacts with the smp_wmb()
         * in kvm_mmu_notifier_invalidate_<page|range_end>.
+        *
+        * Besides, __gfn_to_pfn_memslot() is used instead of gfn_to_pfn_prot()
+        * to avoid the unnecessary overhead of locating the memory slot, which
+        * is always fixed even if @gfn is adjusted for huge pages.
         */
        smp_rmb();
 
-       pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+                                  write_fault, &writable, NULL);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
@@ -911,7 +1000,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        /* Mark the page dirty only if the fault is handled successfully */
        if (writable && !ret) {
                kvm_set_pfn_dirty(pfn);
-               mark_page_dirty(kvm, gfn);
+               mark_page_dirty_in_slot(kvm, memslot, gfn);
        }
 
 out_unlock:
@@ -1064,126 +1153,70 @@ out_unlock:
        return ret;
 }
 
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm,
-                                           gpa_t gpa, u64 size,
-                                           void *data),
-                            void *data)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gpa;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-               ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-       }
-
-       return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       unsigned flags = *(unsigned *)data;
-       bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
-
-       __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
-       return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        if (!kvm->arch.mmu.pgt)
                return 0;
 
-       trace_kvm_unmap_hva_range(start, end);
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       kvm_pfn_t *pfn = (kvm_pfn_t *)data;
-
-       WARN_ON(size != PAGE_SIZE);
+       __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
+                            (range->end - range->start) << PAGE_SHIFT,
+                            range->may_block);
 
-       /*
-        * The MMU notifiers will have unmapped a huge PMD before calling
-        * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-        * therefore we never need to clear out a huge PMD through this
-        * calling path and a memcache is not required.
-        */
-       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
-                              __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
        return 0;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       unsigned long end = hva + PAGE_SIZE;
-       kvm_pfn_t pfn = pte_pfn(pte);
+       kvm_pfn_t pfn = pte_pfn(range->pte);
 
        if (!kvm->arch.mmu.pgt)
                return 0;
 
-       trace_kvm_set_spte_hva(hva);
+       WARN_ON(range->end - range->start != 1);
 
        /*
         * We've moved a page around, probably through CoW, so let's treat it
         * just like a translation fault and clean the cache to the PoC.
         */
        clean_dcache_guest_page(pfn, PAGE_SIZE);
-       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
+
+       /*
+        * The MMU notifiers will have unmapped a huge PMD before calling
+        * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
+        * therefore we never need to clear out a huge PMD through this
+        * calling path and a memcache is not required.
+        */
+       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
+                              PAGE_SIZE, __pfn_to_phys(pfn),
+                              KVM_PGTABLE_PROT_R, NULL);
+
        return 0;
 }
 
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       pte_t pte;
+       u64 size = (range->end - range->start) << PAGE_SHIFT;
        kvm_pte_t kpte;
+       pte_t pte;
+
+       if (!kvm->arch.mmu.pgt)
+               return 0;
 
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+
+       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
+                                       range->start << PAGE_SHIFT);
        pte = __pte(kpte);
        return pte_valid(pte) && pte_young(pte);
 }
 
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        if (!kvm->arch.mmu.pgt)
                return 0;
-       trace_kvm_age_hva(start, end);
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       if (!kvm->arch.mmu.pgt)
-               return 0;
-       trace_kvm_test_age_hva(hva);
-       return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
-                                kvm_test_age_hva_handler, NULL);
+       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
+                                          range->start << PAGE_SHIFT);
 }
 
 phys_addr_t kvm_mmu_get_httbr(void)
@@ -1208,10 +1241,22 @@ static int kvm_map_idmap_text(void)
        return err;
 }
 
-int kvm_mmu_init(void)
+static void *kvm_hyp_zalloc_page(void *arg)
+{
+       return (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
+       .zalloc_page            = kvm_hyp_zalloc_page,
+       .get_page               = kvm_host_get_page,
+       .put_page               = kvm_host_put_page,
+       .phys_to_virt           = kvm_host_va,
+       .virt_to_phys           = kvm_host_pa,
+};
+
+int kvm_mmu_init(u32 *hyp_va_bits)
 {
        int err;
-       u32 hyp_va_bits;
 
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1225,8 +1270,8 @@ int kvm_mmu_init(void)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
-       kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
+       *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
@@ -1251,7 +1296,7 @@ int kvm_mmu_init(void)
                goto out;
        }
 
-       err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+       err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;
 
@@ -1329,10 +1374,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         *     +--------------------------------------------+
         */
        do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               struct vm_area_struct *vma;
                hva_t vm_start, vm_end;
 
-               if (!vma || vma->vm_start >= reg_end)
+               vma = find_vma_intersection(current->mm, hva, reg_end);
+               if (!vma)
                        break;
 
                /*
index 7391643..151c31f 100644 (file)
@@ -50,12 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 int kvm_perf_init(void)
 {
-       /*
-        * Check if HW_PERF_EVENTS are supported by checking the number of
-        * hardware performance counters. This could ensure the presence of
-        * a physical PMU and CONFIG_PERF_EVENT is selected.
-        */
-       if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0)
+       if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled())
                static_branch_enable(&kvm_arm_pmu_available);
 
        return perf_register_guest_info_callbacks(&kvm_guest_cbs);
index e32c6e1..fd167d4 100644 (file)
@@ -739,7 +739,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
        kvm_pmu_create_perf_event(vcpu, select_idx);
 }
 
-static int kvm_pmu_probe_pmuver(void)
+int kvm_pmu_probe_pmuver(void)
 {
        struct perf_event_attr attr = { };
        struct perf_event *event;
index faf32a4..03a6c1f 100644 (file)
@@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
        struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-       if (!ctx || !kvm_pmu_switch_needed(attr))
+       if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr))
                return;
 
        if (!attr->exclude_host)
@@ -49,7 +49,7 @@ void kvm_clr_pmu_events(u32 clr)
 {
        struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-       if (!ctx)
+       if (!kvm_arm_support_pmu_v3() || !ctx)
                return;
 
        ctx->pmu_events.events_host &= ~clr;
@@ -172,7 +172,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
        struct kvm_host_data *host;
        u32 events_guest, events_host;
 
-       if (!has_vhe())
+       if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        preempt_disable();
@@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
        struct kvm_host_data *host;
        u32 events_guest, events_host;
 
-       if (!has_vhe())
+       if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        host = this_cpu_ptr_hyp_sym(kvm_host_data);
index bd354cd..956cdc2 100644 (file)
@@ -74,10 +74,6 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
        if (!system_supports_sve())
                return -EINVAL;
 
-       /* Verify that KVM startup enforced this when SVE was detected: */
-       if (WARN_ON(!has_vhe()))
-               return -EINVAL;
-
        vcpu->arch.sve_max_vl = kvm_sve_max_vl;
 
        /*
@@ -242,6 +238,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 
        /* Reset core registers */
        memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
+       memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
+       vcpu->arch.ctxt.spsr_abt = 0;
+       vcpu->arch.ctxt.spsr_und = 0;
+       vcpu->arch.ctxt.spsr_irq = 0;
+       vcpu->arch.ctxt.spsr_fiq = 0;
        vcpu_gp_regs(vcpu)->pstate = pstate;
 
        /* Reset system registers */
@@ -333,19 +334,10 @@ int kvm_set_ipa_limit(void)
        return 0;
 }
 
-/*
- * Configure the VTCR_EL2 for this VM. The VTCR value is common
- * across all the physical CPUs on the system. We use system wide
- * sanitised values to fill in different fields, except for Hardware
- * Management of Access Flags. HA Flag is set unconditionally on
- * all CPUs, as it is safe to run with or without the feature and
- * the bit is RES0 on CPUs that don't support it.
- */
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
 {
-       u64 vtcr = VTCR_EL2_FLAGS, mmfr0;
-       u32 parange, phys_shift;
-       u8 lvls;
+       u64 mmfr0, mmfr1;
+       u32 phys_shift;
 
        if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
                return -EINVAL;
@@ -365,33 +357,8 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
        }
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
-       parange = cpuid_feature_extract_unsigned_field(mmfr0,
-                               ID_AA64MMFR0_PARANGE_SHIFT);
-       if (parange > ID_AA64MMFR0_PARANGE_MAX)
-               parange = ID_AA64MMFR0_PARANGE_MAX;
-       vtcr |= parange << VTCR_EL2_PS_SHIFT;
-
-       vtcr |= VTCR_EL2_T0SZ(phys_shift);
-       /*
-        * Use a minimum 2 level page table to prevent splitting
-        * host PMD huge pages at stage2.
-        */
-       lvls = stage2_pgtable_levels(phys_shift);
-       if (lvls < 2)
-               lvls = 2;
-       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
-
-       /*
-        * Enable the Hardware Access Flag management, unconditionally
-        * on all CPUs. The features is RES0 on CPUs without the support
-        * and must be ignored by the CPUs.
-        */
-       vtcr |= VTCR_EL2_HA;
+       mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
-       /* Set the vmid bits */
-       vtcr |= (kvm_get_vmid_bits() == 16) ?
-               VTCR_EL2_VS_16BIT :
-               VTCR_EL2_VS_8BIT;
-       kvm->arch.vtcr = vtcr;
        return 0;
 }
index 4f2f1e3..76ea280 100644 (file)
@@ -1063,6 +1063,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
                val = cpuid_feature_cap_perfmon_field(val,
                                                      ID_AA64DFR0_PMUVER_SHIFT,
                                                      kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
+               /* Hide SPE from guests */
+               val &= ~FEATURE(ID_AA64DFR0_PMSVER);
                break;
        case SYS_ID_DFR0_EL1:
                /* Limit guests to PMUv3 for ARMv8.4 */
@@ -1472,6 +1474,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        { SYS_DESC(SYS_GCR_EL1), undef_access },
 
        { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
+       { SYS_DESC(SYS_TRFCR_EL1), undef_access },
        { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
        { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
        { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
@@ -1501,6 +1504,19 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
        { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
 
+       { SYS_DESC(SYS_PMSCR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSICR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSIRR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSFCR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSEVFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSLATFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSIDR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBPTR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBSR_EL1), undef_access },
+       /* PMBIDR_EL1 is not trapped */
+
        { PMU_SYS_REG(SYS_PMINTENSET_EL1),
          .access = access_pminten, .reg = PMINTENSET_EL1 },
        { PMU_SYS_REG(SYS_PMINTENCLR_EL1),
index ff04443..33e4e7d 100644 (file)
@@ -135,72 +135,6 @@ TRACE_EVENT(kvm_mmio_emulate,
                  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
 );
 
-TRACE_EVENT(kvm_unmap_hva_range,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
 TRACE_EVENT(kvm_set_way_flush,
            TP_PROTO(unsigned long vcpu_pc, bool cache),
            TP_ARGS(vcpu_pc, cache),
index 9783013..acdb7b3 100644 (file)
@@ -288,3 +288,10 @@ void kvm_get_kimage_voffset(struct alt_instr *alt,
 {
        generate_mov_q(kimage_voffset, origptr, updptr, nr_inst);
 }
+
+void kvm_compute_final_ctr_el0(struct alt_instr *alt,
+                              __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+       generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0),
+                      origptr, updptr, nr_inst);
+}
index 052917d..58cbda0 100644 (file)
@@ -335,13 +335,14 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
        kfree(dist->spis);
        dist->spis = NULL;
        dist->nr_spis = 0;
+       dist->vgic_dist_base = VGIC_ADDR_UNDEF;
 
-       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
-                       list_del(&rdreg->list);
-                       kfree(rdreg);
-               }
+       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
+                       vgic_v3_free_redist_region(rdreg);
                INIT_LIST_HEAD(&dist->rd_regions);
+       } else {
+               dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
        }
 
        if (vgic_has_its(kvm))
@@ -362,6 +363,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        vgic_flush_pending_lpis(vcpu);
 
        INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+       vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
 }
 
 /* To be called with kvm->lock held */
index b9518f9..61728c5 100644 (file)
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
                /*
                 * If an LPI carries the HW bit, this means that this
                 * interrupt is controlled by GICv4, and we do not
-                * have direct access to that state. Let's simply fail
-                * the save operation...
+                * have direct access to that state without GICv4.1.
+                * Let's simply fail the save operation...
                 */
-               if (ite->irq->hw)
+               if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
                        return -EACCES;
 
                ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
index 4441967..7740995 100644 (file)
@@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                        r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
                        goto out;
                }
-               rdreg = list_first_entry(&vgic->rd_regions,
-                                        struct vgic_redist_region, list);
+               rdreg = list_first_entry_or_null(&vgic->rd_regions,
+                                                struct vgic_redist_region, list);
                if (!rdreg)
                        addr_ptr = &undef_value;
                else
@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
                u64 addr;
                unsigned long type = (unsigned long)attr->attr;
 
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
                r = kvm_vgic_addr(dev->kvm, type, &addr, false);
                if (r)
                        return (r == -ENODEV) ? -ENXIO : r;
index 2f1b156..a09cdc0 100644 (file)
@@ -251,30 +251,35 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
                vgic_enable_lpis(vcpu);
 }
 
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                             gpa_t addr, unsigned int len)
+static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu)
 {
-       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
-       int target_vcpu_id = vcpu->vcpu_id;
-       gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
-                       (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
-       u64 value;
+       struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg;
 
-       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
-       value |= ((target_vcpu_id & 0xffff) << 8);
+       if (!rdreg)
+               return false;
 
-       if (addr == last_rdist_typer)
-               value |= GICR_TYPER_LAST;
-       if (vgic_has_its(vcpu->kvm))
-               value |= GICR_TYPER_PLPIS;
+       if (vgic_cpu->rdreg_index < rdreg->free_index - 1) {
+               return false;
+       } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) {
+               struct list_head *rd_regions = &vgic->rd_regions;
+               gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
 
-       return extract_bytes(value, addr & 7, len);
+               /*
+                * the rdist is the last one of the redist region,
+                * check whether there is no other contiguous rdist region
+                */
+               list_for_each_entry(iter, rd_regions, list) {
+                       if (iter->base == end && iter->free_index > 0)
+                               return false;
+               }
+       }
+       return true;
 }
 
-static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                                gpa_t addr, unsigned int len)
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
 {
        unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
        int target_vcpu_id = vcpu->vcpu_id;
@@ -286,7 +291,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
        if (vgic_has_its(vcpu->kvm))
                value |= GICR_TYPER_PLPIS;
 
-       /* reporting of the Last bit is not supported for userspace */
+       if (vgic_mmio_vcpu_rdist_is_last(vcpu))
+               value |= GICR_TYPER_LAST;
+
        return extract_bytes(value, addr & 7, len);
 }
 
@@ -612,7 +619,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
                vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
-               vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8,
+               NULL, vgic_mmio_uaccess_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
@@ -714,6 +721,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        vgic_cpu->rdreg = rdreg;
+       vgic_cpu->rdreg_index = rdreg->free_index;
 
        rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
 
@@ -768,7 +776,7 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
 }
 
 /**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
+ * vgic_v3_alloc_redist_region - Allocate a new redistributor region
  *
  * Performs various checks before inserting the rdist region in the list.
  * Those tests depend on whether the size of the rdist region is known
@@ -782,8 +790,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
  *
  * Return 0 on success, < 0 otherwise
  */
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
-                                       gpa_t base, uint32_t count)
+static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index,
+                                      gpa_t base, uint32_t count)
 {
        struct vgic_dist *d = &kvm->arch.vgic;
        struct vgic_redist_region *rdreg;
@@ -791,10 +799,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
        size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
        int ret;
 
-       /* single rdist region already set ?*/
-       if (!count && !list_empty(rd_regions))
-               return -EINVAL;
-
        /* cross the end of memory ? */
        if (base + size < base)
                return -EINVAL;
@@ -805,11 +809,15 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
        } else {
                rdreg = list_last_entry(rd_regions,
                                        struct vgic_redist_region, list);
-               if (index != rdreg->index + 1)
+
+               /* Don't mix single region and discrete redist regions */
+               if (!count && rdreg->count)
                        return -EINVAL;
 
-               /* Cannot add an explicitly sized regions after legacy region */
-               if (!rdreg->count)
+               if (!count)
+                       return -EEXIST;
+
+               if (index != rdreg->index + 1)
                        return -EINVAL;
        }
 
@@ -848,11 +856,17 @@ free:
        return ret;
 }
 
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg)
+{
+       list_del(&rdreg->list);
+       kfree(rdreg);
+}
+
 int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
 {
        int ret;
 
-       ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+       ret = vgic_v3_alloc_redist_region(kvm, index, addr, count);
        if (ret)
                return ret;
 
@@ -861,8 +875,13 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
         * afterwards will register the iodevs when needed.
         */
        ret = vgic_register_all_redist_iodevs(kvm);
-       if (ret)
+       if (ret) {
+               struct vgic_redist_region *rdreg;
+
+               rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+               vgic_v3_free_redist_region(rdreg);
                return ret;
+       }
 
        return 0;
 }
index b2d73fc..48c6067 100644 (file)
@@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
        return region;
 }
 
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                             gpa_t addr, u32 *val)
 {
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        struct kvm_vcpu *r_vcpu;
 
@@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
        return 0;
 }
 
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                              gpa_t addr, const u32 *val)
 {
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        struct kvm_vcpu *r_vcpu;
 
@@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
                 bool is_write, int offset, u32 *val)
 {
        if (is_write)
-               return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+               return vgic_uaccess_write(vcpu, dev, offset, val);
        else
-               return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+               return vgic_uaccess_read(vcpu, dev, offset, val);
 }
 
 static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
index 6f53092..41ecf21 100644 (file)
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <kvm/arm_vgic.h>
@@ -356,6 +358,32 @@ retry:
        return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+               desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+               irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+       }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+               desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+               irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+       }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *irq;
        gpa_t last_ptr = ~(gpa_t)0;
-       int ret;
+       bool vlpi_avail = false;
+       int ret = 0;
        u8 val;
 
+       if (unlikely(!vgic_initialized(kvm)))
+               return -ENXIO;
+
+       /*
+        * A preparation for getting any VLPI states.
+        * The above vgic initialized check also ensures that the allocation
+        * and enabling of the doorbells have already been done.
+        */
+       if (kvm_vgic_global_state.has_gicv4_1) {
+               unmap_all_vpes(dist);
+               vlpi_avail = true;
+       }
+
        list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
                int byte_offset, bit_nr;
                struct kvm_vcpu *vcpu;
                gpa_t pendbase, ptr;
+               bool is_pending;
                bool stored;
 
                vcpu = irq->target_vcpu;
@@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
                if (ptr != last_ptr) {
                        ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
                        if (ret)
-                               return ret;
+                               goto out;
                        last_ptr = ptr;
                }
 
                stored = val & (1U << bit_nr);
-               if (stored == irq->pending_latch)
+
+               is_pending = irq->pending_latch;
+
+               if (irq->hw && vlpi_avail)
+                       vgic_v4_get_vlpi_state(irq, &is_pending);
+
+               if (stored == is_pending)
                        continue;
 
-               if (irq->pending_latch)
+               if (is_pending)
                        val |= 1 << bit_nr;
                else
                        val &= ~(1 << bit_nr);
 
                ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
                if (ret)
-                       return ret;
+                       goto out;
        }
-       return 0;
+
+out:
+       if (vlpi_avail)
+               map_all_vpes(dist);
+
+       return ret;
 }
 
 /**
index 66508b0..c1845d8 100644 (file)
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
        kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called on GICv4.1 with the vPE unmapped. An unmapped
+ * vPE implies that any VPT caches associated with it have been
+ * invalidated, so the VLPI state can be obtained by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+       struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+       int mask = BIT(irq->intid % BITS_PER_BYTE);
+       void *va;
+       u8 *ptr;
+
+       va = page_address(vpe->vpt_page);
+       ptr = va + irq->intid / BITS_PER_BYTE;
+
+       *val = !!(*ptr & mask);
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:       Pointer to the VM being initialized
@@ -385,6 +404,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
        struct vgic_its *its;
        struct vgic_irq *irq;
        struct its_vlpi_map map;
+       unsigned long flags;
        int ret;
 
        if (!vgic_supports_direct_msis(kvm))
@@ -430,6 +450,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
        irq->host_irq   = virq;
        atomic_inc(&map.vpe->vlpi_count);
 
+       /* Transfer pending state */
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       if (irq->pending_latch) {
+               ret = irq_set_irqchip_state(irq->host_irq,
+                                           IRQCHIP_STATE_PENDING,
+                                           irq->pending_latch);
+               WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+               /*
+                * Clear pending_latch and communicate this state
+                * change via vgic_queue_irq_unlock.
+                */
+               irq->pending_latch = false;
+               vgic_queue_irq_unlock(kvm, irq, flags);
+       } else {
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       }
+
 out:
        mutex_unlock(&its->its_lock);
        return ret;
index 64fcd75..dc1f3d1 100644 (file)
@@ -293,6 +293,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
 
 struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
                                                           u32 index);
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg);
 
 bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
 
@@ -317,5 +318,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
index 073acbf..b84b179 100644 (file)
@@ -14,7 +14,7 @@
  * Parameters:
  *     x0 - dest
  */
-SYM_FUNC_START(clear_page)
+SYM_FUNC_START_PI(clear_page)
        mrs     x1, dczid_el0
        and     w1, w1, #0xf
        mov     x2, #4
@@ -25,5 +25,5 @@ SYM_FUNC_START(clear_page)
        tst     x0, #(PAGE_SIZE - 1)
        b.ne    1b
        ret
-SYM_FUNC_END(clear_page)
+SYM_FUNC_END_PI(clear_page)
 EXPORT_SYMBOL(clear_page)
index e7a7939..29144f4 100644 (file)
@@ -17,7 +17,7 @@
  *     x0 - dest
  *     x1 - src
  */
-SYM_FUNC_START(copy_page)
+SYM_FUNC_START_PI(copy_page)
 alternative_if ARM64_HAS_NO_HW_PREFETCH
        // Prefetch three cache lines ahead.
        prfm    pldl1strm, [x1, #128]
@@ -75,5 +75,5 @@ alternative_else_nop_endif
        stnp    x16, x17, [x0, #112 - 256]
 
        ret
-SYM_FUNC_END(copy_page)
+SYM_FUNC_END_PI(copy_page)
 EXPORT_SYMBOL(copy_page)
index 55ecf6d..58987a9 100644 (file)
@@ -252,7 +252,7 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
                set_pte(ptep, pte);
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, unsigned long sz)
 {
        pgd_t *pgdp;
@@ -284,9 +284,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                 */
                ptep = pte_alloc_map(mm, pmdp, addr);
        } else if (sz == PMD_SIZE) {
-               if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
-                   pud_none(READ_ONCE(*pudp)))
-                       ptep = huge_pmd_share(mm, addr, pudp);
+               if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
+                       ptep = huge_pmd_share(mm, vma, addr, pudp);
                else
                        ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
        } else if (sz == (CONT_PMD_SIZE)) {
index ef03151..16a2b2b 100644 (file)
@@ -35,6 +35,7 @@
 #include <asm/fixmap.h>
 #include <asm/kasan.h>
 #include <asm/kernel-pgtable.h>
+#include <asm/kvm_host.h>
 #include <asm/memory.h>
 #include <asm/numa.h>
 #include <asm/sections.h>
@@ -220,6 +221,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 int pfn_valid(unsigned long pfn)
 {
        phys_addr_t addr = PFN_PHYS(pfn);
+       struct mem_section *ms;
 
        /*
         * Ensure the upper PAGE_SHIFT bits are clear in the
@@ -230,10 +232,6 @@ int pfn_valid(unsigned long pfn)
        if (PHYS_PFN(addr) != pfn)
                return 0;
 
-#ifdef CONFIG_SPARSEMEM
-{
-       struct mem_section *ms;
-
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
 
@@ -252,8 +250,7 @@ int pfn_valid(unsigned long pfn)
         */
        if (!early_section(ms))
                return pfn_section_valid(ms, pfn);
-}
-#endif
+
        return memblock_is_map_memory(addr);
 }
 EXPORT_SYMBOL(pfn_valid);
@@ -452,6 +449,8 @@ void __init bootmem_init(void)
 
        dma_pernuma_cma_reserve();
 
+       kvm_hyp_reserve();
+
        /*
         * sparse_init() tries to allocate memory from memblock, so must be
         * done after the fixed reservations
index 70fa3cd..6dd9369 100644 (file)
@@ -1113,7 +1113,6 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
 }
 #endif
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap)
@@ -1177,7 +1176,6 @@ void vmemmap_free(unsigned long start, unsigned long end,
        free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
 #endif
 }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static inline pud_t *fixmap_pud(unsigned long addr)
 {
index a50e92e..a1937df 100644 (file)
@@ -51,10 +51,8 @@ static struct addr_marker address_markers[] = {
        { FIXADDR_TOP,                  "Fixmap end" },
        { PCI_IO_START,                 "PCI I/O start" },
        { PCI_IO_END,                   "PCI I/O end" },
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
        { VMEMMAP_START,                "vmemmap start" },
        { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" },
-#endif
        { -1,                           NULL },
 };
 
index cc24bb8..904a18a 100644 (file)
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 generic-y += asm-offsets.h
+generic-y += extable.h
 generic-y += gpio.h
 generic-y += kvm_para.h
 generic-y += qrwlock.h
index ac08b0f..6ff205a 100644 (file)
@@ -37,7 +37,7 @@ void asid_new_context(struct asid_info *info, atomic64_t *pasid,
  * Check the ASID is still valid for the context. If not generate a new ASID.
  *
  * @pasid: Pointer to the current ASID batch
- * @cpu: current CPU ID. Must have been acquired throught get_cpu()
+ * @cpu: current CPU ID. Must have been acquired through get_cpu()
  */
 static inline void asid_check_context(struct asid_info *info,
                                      atomic64_t *pasid, unsigned int cpu,
index 84fc600..f4045dd 100644 (file)
@@ -64,7 +64,7 @@
 
 /*
  * sync:        completion barrier, all sync.xx instructions
- *              guarantee the last response recieved by bus transaction
+ *              guarantee the last response received by bus transaction
  *              made by ld/st instructions before sync.s
  * sync.s:      inherit from sync, but also shareable to other cores
  * sync.i:      inherit from sync, but also flush cpu pipeline
index 589e832..5bc1cc6 100644 (file)
@@ -7,11 +7,4 @@ typedef struct {
        unsigned long seg;
 } mm_segment_t;
 
-#define KERNEL_DS              ((mm_segment_t) { 0xFFFFFFFF })
-
-#define USER_DS                        ((mm_segment_t) { PAGE_OFFSET })
-#define get_fs()               (current_thread_info()->addr_limit)
-#define set_fs(x)              (current_thread_info()->addr_limit = (x))
-#define uaccess_kernel()       (get_fs().seg == KERNEL_DS.seg)
-
 #endif /* __ASM_CSKY_SEGMENT_H */
index 3dec272..ac83823 100644 (file)
 #ifndef __ASM_CSKY_UACCESS_H
 #define __ASM_CSKY_UACCESS_H
 
-/*
- * User space memory access functions
- */
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/version.h>
-#include <asm/segment.h>
+#define user_addr_max() \
+       (uaccess_kernel() ? KERNEL_DS.seg : get_fs().seg)
 
-static inline int access_ok(const void *addr, unsigned long size)
+static inline int __access_ok(unsigned long addr, unsigned long size)
 {
        unsigned long limit = current_thread_info()->addr_limit.seg;
 
-       return (((unsigned long)addr < limit) &&
-               ((unsigned long)(addr + size) < limit));
+       return ((addr < limit) && ((addr + size) < limit));
 }
-
-#define __addr_ok(addr) (access_ok(addr, 0))
-
-extern int __put_user_bad(void);
+#define __access_ok __access_ok
 
 /*
- * Tell gcc we read from memory instead of writing: this is because
- * we do not write to any memory gcc knows about, so there are no
- * aliasing issues.
+ * __put_user_fn
  */
+extern int __put_user_bad(void);
 
-/*
- * These are the main single-value transfer routines.  They automatically
- * use the right size if we just have the right pointer type.
- *
- * This gets kind of ugly. We want to return _two_ values in "get_user()"
- * and yet we don't want to do any pointers, because that is too much
- * of a performance impact. Thus we have a few rather ugly macros here,
- * and hide all the ugliness from the user.
- *
- * The "__xxx" versions of the user access functions are versions that
- * do not verify the address space, that must have been done previously
- * with a separate "access_ok()" call (this is used when we do multiple
- * accesses to the same area of user memory).
- *
- * As we use the same address space for kernel and user data on
- * Ckcore, we can just do these as direct assignments.  (Of course, the
- * exception handling means that it's no longer "just"...)
- */
-
-#define put_user(x, ptr) \
-       __put_user_check((x), (ptr), sizeof(*(ptr)))
-
-#define __put_user(x, ptr) \
-       __put_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-#define __ptr(x) ((unsigned long *)(x))
-
-#define get_user(x, ptr) \
-       __get_user_check((x), (ptr), sizeof(*(ptr)))
-
-#define __get_user(x, ptr) \
-       __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-#define __put_user_nocheck(x, ptr, size)                               \
-({                                                                     \
-       long __pu_err = 0;                                              \
-       typeof(*(ptr)) *__pu_addr = (ptr);                              \
-       typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x);                  \
-       if (__pu_addr)                                                  \
-               __put_user_size(__pu_val, (__pu_addr), (size),          \
-                               __pu_err);                              \
-       __pu_err;                                                       \
-})
-
-#define __put_user_check(x, ptr, size)                                 \
-({                                                                     \
-       long __pu_err = -EFAULT;                                        \
-       typeof(*(ptr)) *__pu_addr = (ptr);                              \
-       typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x);                  \
-       if (access_ok(__pu_addr, size) && __pu_addr)    \
-               __put_user_size(__pu_val, __pu_addr, (size), __pu_err); \
-       __pu_err;                                                       \
-})
-
-#define __put_user_size(x, ptr, size, retval)          \
-do {                                                   \
-       retval = 0;                                     \
-       switch (size) {                                 \
-       case 1:                                         \
-               __put_user_asm_b(x, ptr, retval);       \
-               break;                                  \
-       case 2:                                         \
-               __put_user_asm_h(x, ptr, retval);       \
-               break;                                  \
-       case 4:                                         \
-               __put_user_asm_w(x, ptr, retval);       \
-               break;                                  \
-       case 8:                                         \
-               __put_user_asm_64(x, ptr, retval);      \
-               break;                                  \
-       default:                                        \
-               __put_user_bad();                       \
-       }                                               \
-} while (0)
-
-/*
- * We don't tell gcc that we are accessing memory, but this is OK
- * because we do not write to any memory gcc knows about, so there
- * are no aliasing issues.
- *
- * Note that PC at a fault is the address *after* the faulting
- * instruction.
- */
 #define __put_user_asm_b(x, ptr, err)                  \
 do {                                                   \
        int errcode;                                    \
-       asm volatile(                                   \
+       __asm__ __volatile__(                           \
        "1:     stb   %1, (%2,0)        \n"             \
        "       br    3f                \n"             \
        "2:     mov   %0, %3            \n"             \
@@ -136,7 +40,7 @@ do {                                                 \
 #define __put_user_asm_h(x, ptr, err)                  \
 do {                                                   \
        int errcode;                                    \
-       asm volatile(                                   \
+       __asm__ __volatile__(                           \
        "1:     sth   %1, (%2,0)        \n"             \
        "       br    3f                \n"             \
        "2:     mov   %0, %3            \n"             \
@@ -154,7 +58,7 @@ do {                                                 \
 #define __put_user_asm_w(x, ptr, err)                  \
 do {                                                   \
        int errcode;                                    \
-       asm volatile(                                   \
+       __asm__ __volatile__(                           \
        "1:     stw   %1, (%2,0)        \n"             \
        "       br    3f                \n"             \
        "2:     mov   %0, %3            \n"             \
@@ -169,241 +73,149 @@ do {                                                     \
        : "memory");                                    \
 } while (0)
 
-#define __put_user_asm_64(x, ptr, err)                         \
-do {                                                           \
-       int tmp;                                                \
-       int errcode;                                            \
-       typeof(*(ptr))src = (typeof(*(ptr)))x;                  \
-       typeof(*(ptr))*psrc = &src;                             \
-                                                               \
-       asm volatile(                                           \
-       "     ldw     %3, (%1, 0)     \n"                       \
-       "1:   stw     %3, (%2, 0)     \n"                       \
-       "     ldw     %3, (%1, 4)     \n"                       \
-       "2:   stw     %3, (%2, 4)     \n"                       \
-       "     br      4f              \n"                       \
-       "3:   mov     %0, %4          \n"                       \
-       "     br      4f              \n"                       \
-       ".section __ex_table, \"a\"   \n"                       \
-       ".align   2                   \n"                       \
-       ".long    1b, 3b              \n"                       \
-       ".long    2b, 3b              \n"                       \
-       ".previous                    \n"                       \
-       "4:                           \n"                       \
-       : "=r"(err), "=r"(psrc), "=r"(ptr),                     \
-         "=r"(tmp), "=r"(errcode)                              \
-       : "0"(err), "1"(psrc), "2"(ptr), "3"(0), "4"(-EFAULT)   \
-       : "memory");                                            \
-} while (0)
-
-#define __get_user_nocheck(x, ptr, size)                       \
-({                                                             \
-       long  __gu_err;                                         \
-       __get_user_size(x, (ptr), (size), __gu_err);            \
-       __gu_err;                                               \
-})
-
-#define __get_user_check(x, ptr, size)                         \
-({                                                             \
-       int __gu_err = -EFAULT;                                 \
-       const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);      \
-       if (access_ok(__gu_ptr, size) && __gu_ptr)      \
-               __get_user_size(x, __gu_ptr, size, __gu_err);   \
-       __gu_err;                                               \
-})
-
-#define __get_user_size(x, ptr, size, retval)                  \
-do {                                                           \
-       switch (size) {                                         \
-       case 1:                                                 \
-               __get_user_asm_common((x), ptr, "ldb", retval); \
-               break;                                          \
-       case 2:                                                 \
-               __get_user_asm_common((x), ptr, "ldh", retval); \
-               break;                                          \
-       case 4:                                                 \
-               __get_user_asm_common((x), ptr, "ldw", retval); \
-               break;                                          \
-       default:                                                \
-               x = 0;                                          \
-               (retval) = __get_user_bad();                    \
-       }                                                       \
+#define __put_user_asm_64(x, ptr, err)                 \
+do {                                                   \
+       int tmp;                                        \
+       int errcode;                                    \
+                                                       \
+       __asm__ __volatile__(                           \
+       "     ldw     %3, (%1, 0)     \n"               \
+       "1:   stw     %3, (%2, 0)     \n"               \
+       "     ldw     %3, (%1, 4)     \n"               \
+       "2:   stw     %3, (%2, 4)     \n"               \
+       "     br      4f              \n"               \
+       "3:   mov     %0, %4          \n"               \
+       "     br      4f              \n"               \
+       ".section __ex_table, \"a\"   \n"               \
+       ".align   2                   \n"               \
+       ".long    1b, 3b              \n"               \
+       ".long    2b, 3b              \n"               \
+       ".previous                    \n"               \
+       "4:                           \n"               \
+       : "=r"(err), "=r"(x), "=r"(ptr),                \
+         "=r"(tmp), "=r"(errcode)                      \
+       : "0"(err), "1"(x), "2"(ptr), "3"(0),           \
+         "4"(-EFAULT)                                  \
+       : "memory");                                    \
 } while (0)
 
-#define __get_user_asm_common(x, ptr, ins, err)                        \
-do {                                                           \
-       int errcode;                                            \
-       asm volatile(                                           \
-       "1:   " ins " %1, (%4,0)        \n"                     \
-       "       br    3f                \n"                     \
-       /* Fix up codes */                                      \
-       "2:     mov   %0, %2            \n"                     \
-       "       movi  %1, 0             \n"                     \
-       "       br    3f                \n"                     \
-       ".section __ex_table,\"a\"      \n"                     \
-       ".align   2                     \n"                     \
-       ".long    1b, 2b                \n"                     \
-       ".previous                      \n"                     \
-       "3:                             \n"                     \
-       : "=r"(err), "=r"(x), "=r"(errcode)                     \
-       : "0"(0), "r"(ptr), "2"(-EFAULT)                        \
-       : "memory");                                            \
-} while (0)
+static inline int __put_user_fn(size_t size, void __user *ptr, void *x)
+{
+       int retval = 0;
+       u32 tmp;
+
+       switch (size) {
+       case 1:
+               tmp = *(u8 *)x;
+               __put_user_asm_b(tmp, ptr, retval);
+               break;
+       case 2:
+               tmp = *(u16 *)x;
+               __put_user_asm_h(tmp, ptr, retval);
+               break;
+       case 4:
+               tmp = *(u32 *)x;
+               __put_user_asm_w(tmp, ptr, retval);
+               break;
+       case 8:
+               __put_user_asm_64(x, (u64 *)ptr, retval);
+               break;
+       }
+
+       return retval;
+}
+#define __put_user_fn __put_user_fn
 
+/*
+ * __get_user_fn
+ */
 extern int __get_user_bad(void);
 
-#define ___copy_to_user(to, from, n)                   \
+#define __get_user_asm_common(x, ptr, ins, err)                \
 do {                                                   \
-       int w0, w1, w2, w3;                             \
-       asm volatile(                                   \
-       "0:     cmpnei  %1, 0           \n"             \
-       "       bf      8f              \n"             \
-       "       mov     %3, %1          \n"             \
-       "       or      %3, %2          \n"             \
-       "       andi    %3, 3           \n"             \
-       "       cmpnei  %3, 0           \n"             \
-       "       bf      1f              \n"             \
-       "       br      5f              \n"             \
-       "1:     cmplti  %0, 16          \n" /* 4W */    \
-       "       bt      3f              \n"             \
-       "       ldw     %3, (%2, 0)     \n"             \
-       "       ldw     %4, (%2, 4)     \n"             \
-       "       ldw     %5, (%2, 8)     \n"             \
-       "       ldw     %6, (%2, 12)    \n"             \
-       "2:     stw     %3, (%1, 0)     \n"             \
-       "9:     stw     %4, (%1, 4)     \n"             \
-       "10:    stw     %5, (%1, 8)     \n"             \
-       "11:    stw     %6, (%1, 12)    \n"             \
-       "       addi    %2, 16          \n"             \
-       "       addi    %1, 16          \n"             \
-       "       subi    %0, 16          \n"             \
-       "       br      1b              \n"             \
-       "3:     cmplti  %0, 4           \n" /* 1W */    \
-       "       bt      5f              \n"             \
-       "       ldw     %3, (%2, 0)     \n"             \
-       "4:     stw     %3, (%1, 0)     \n"             \
-       "       addi    %2, 4           \n"             \
-       "       addi    %1, 4           \n"             \
-       "       subi    %0, 4           \n"             \
-       "       br      3b              \n"             \
-       "5:     cmpnei  %0, 0           \n"  /* 1B */   \
-       "       bf      13f             \n"             \
-       "       ldb     %3, (%2, 0)     \n"             \
-       "6:     stb     %3, (%1, 0)     \n"             \
-       "       addi    %2,  1          \n"             \
-       "       addi    %1,  1          \n"             \
-       "       subi    %0,  1          \n"             \
-       "       br      5b              \n"             \
-       "7:     subi    %0,  4          \n"             \
-       "8:     subi    %0,  4          \n"             \
-       "12:    subi    %0,  4          \n"             \
-       "       br      13f             \n"             \
-       ".section __ex_table, \"a\"     \n"             \
-       ".align   2                     \n"             \
-       ".long    2b, 13f               \n"             \
-       ".long    4b, 13f               \n"             \
-       ".long    6b, 13f               \n"             \
-       ".long    9b, 12b               \n"             \
-       ".long   10b, 8b                \n"             \
-       ".long   11b, 7b                \n"             \
-       ".previous                      \n"             \
-       "13:                            \n"             \
-       : "=r"(n), "=r"(to), "=r"(from), "=r"(w0),      \
-         "=r"(w1), "=r"(w2), "=r"(w3)                  \
-       : "0"(n), "1"(to), "2"(from)                    \
+       int errcode;                                    \
+       __asm__ __volatile__(                           \
+       "1:   " ins " %1, (%4, 0)       \n"             \
+       "       br    3f                \n"             \
+       "2:     mov   %0, %2            \n"             \
+       "       movi  %1, 0             \n"             \
+       "       br    3f                \n"             \
+       ".section __ex_table,\"a\"      \n"             \
+       ".align   2                     \n"             \
+       ".long    1b, 2b                \n"             \
+       ".previous                      \n"             \
+       "3:                             \n"             \
+       : "=r"(err), "=r"(x), "=r"(errcode)             \
+       : "0"(0), "r"(ptr), "2"(-EFAULT)                \
        : "memory");                                    \
 } while (0)
 
-#define ___copy_from_user(to, from, n)                 \
+#define __get_user_asm_64(x, ptr, err)                 \
 do {                                                   \
        int tmp;                                        \
-       int nsave;                                      \
-       asm volatile(                                   \
-       "0:     cmpnei  %1, 0           \n"             \
-       "       bf      7f              \n"             \
-       "       mov     %3, %1          \n"             \
-       "       or      %3, %2          \n"             \
-       "       andi    %3, 3           \n"             \
-       "       cmpnei  %3, 0           \n"             \
-       "       bf      1f              \n"             \
-       "       br      5f              \n"             \
-       "1:     cmplti  %0, 16          \n"             \
-       "       bt      3f              \n"             \
-       "2:     ldw     %3, (%2, 0)     \n"             \
-       "10:    ldw     %4, (%2, 4)     \n"             \
-       "       stw     %3, (%1, 0)     \n"             \
-       "       stw     %4, (%1, 4)     \n"             \
-       "11:    ldw     %3, (%2, 8)     \n"             \
-       "12:    ldw     %4, (%2, 12)    \n"             \
-       "       stw     %3, (%1, 8)     \n"             \
-       "       stw     %4, (%1, 12)    \n"             \
-       "       addi    %2, 16          \n"             \
-       "       addi    %1, 16          \n"             \
-       "       subi    %0, 16          \n"             \
-       "       br      1b              \n"             \
-       "3:     cmplti  %0, 4           \n"             \
-       "       bt      5f              \n"             \
-       "4:     ldw     %3, (%2, 0)     \n"             \
-       "       stw     %3, (%1, 0)     \n"             \
-       "       addi    %2, 4           \n"             \
-       "       addi    %1, 4           \n"             \
-       "       subi    %0, 4           \n"             \
-       "       br      3b              \n"             \
-       "5:     cmpnei  %0, 0           \n"             \
-       "       bf      7f              \n"             \
-       "6:     ldb     %3, (%2, 0)     \n"             \
-       "       stb     %3, (%1, 0)     \n"             \
-       "       addi    %2,  1          \n"             \
-       "       addi    %1,  1          \n"             \
-       "       subi    %0,  1          \n"             \
-       "       br      5b              \n"             \
-       "8:     stw     %3, (%1, 0)     \n"             \
-       "       subi    %0, 4           \n"             \
-       "       bf      7f              \n"             \
-       "9:     subi    %0, 8           \n"             \
-       "       bf      7f              \n"             \
-       "13:    stw     %3, (%1, 8)     \n"             \
-       "       subi    %0, 12          \n"             \
-       "       bf      7f              \n"             \
-       ".section __ex_table, \"a\"     \n"             \
-       ".align   2                     \n"             \
-       ".long    2b, 7f                \n"             \
-       ".long    4b, 7f                \n"             \
-       ".long    6b, 7f                \n"             \
-       ".long   10b, 8b                \n"             \
-       ".long   11b, 9b                \n"             \
-       ".long   12b,13b                \n"             \
-       ".previous                      \n"             \
-       "7:                             \n"             \
-       : "=r"(n), "=r"(to), "=r"(from), "=r"(nsave),   \
-         "=r"(tmp)                                     \
-       : "0"(n), "1"(to), "2"(from)                    \
+       int errcode;                                    \
+                                                       \
+       __asm__ __volatile__(                           \
+       "1:   ldw     %3, (%2, 0)     \n"               \
+       "     stw     %3, (%1, 0)     \n"               \
+       "2:   ldw     %3, (%2, 4)     \n"               \
+       "     stw     %3, (%1, 4)     \n"               \
+       "     br      4f              \n"               \
+       "3:   mov     %0, %4          \n"               \
+       "     br      4f              \n"               \
+       ".section __ex_table, \"a\"   \n"               \
+       ".align   2                   \n"               \
+       ".long    1b, 3b              \n"               \
+       ".long    2b, 3b              \n"               \
+       ".previous                    \n"               \
+       "4:                           \n"               \
+       : "=r"(err), "=r"(x), "=r"(ptr),                \
+         "=r"(tmp), "=r"(errcode)                      \
+       : "0"(err), "1"(x), "2"(ptr), "3"(0),           \
+         "4"(-EFAULT)                                  \
        : "memory");                                    \
 } while (0)
 
+static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
+{
+       int retval;
+       u32 tmp;
+
+       switch (size) {
+       case 1:
+               __get_user_asm_common(tmp, ptr, "ldb", retval);
+               *(u8 *)x = (u8)tmp;
+               break;
+       case 2:
+               __get_user_asm_common(tmp, ptr, "ldh", retval);
+               *(u16 *)x = (u16)tmp;
+               break;
+       case 4:
+               __get_user_asm_common(tmp, ptr, "ldw", retval);
+               *(u32 *)x = (u32)tmp;
+               break;
+       case 8:
+               __get_user_asm_64(x, ptr, retval);
+               break;
+       }
+
+       return retval;
+}
+#define __get_user_fn __get_user_fn
+
 unsigned long raw_copy_from_user(void *to, const void *from, unsigned long n);
 unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n);
 
-unsigned long clear_user(void *to, unsigned long n);
 unsigned long __clear_user(void __user *to, unsigned long n);
+#define __clear_user __clear_user
 
-long strncpy_from_user(char *dst, const char *src, long count);
 long __strncpy_from_user(char *dst, const char *src, long count);
+#define __strncpy_from_user __strncpy_from_user
 
-/*
- * Return the size of a string (including the ending 0)
- *
- * Return 0 on exception, a value greater than N if too long
- */
-long strnlen_user(const char *src, long n);
-
-#define strlen_user(str) strnlen_user(str, 32767)
-
-struct exception_table_entry {
-       unsigned long insn;
-       unsigned long nextinsn;
-};
+long __strnlen_user(const char *s, long n);
+#define __strnlen_user __strnlen_user
 
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/segment.h>
+#include <asm-generic/uaccess.h>
 
 #endif /* __ASM_CSKY_UACCESS_H */
index eb5142f..bdce581 100644 (file)
@@ -16,7 +16,7 @@ struct vdso_data {
  * offset of 0, but since the linker must support setting weak undefined
  * symbols to the absolute address 0 it also happens to support other low
  * addresses even when the code model suggests those low addresses would not
- * otherwise be availiable.
+ * otherwise be available.
  */
 #define VDSO_SYMBOL(base, name)                                                        \
 ({                                                                             \
index c1bd7a6..00e3c8e 100644 (file)
@@ -9,7 +9,6 @@
 #include <asm/unistd.h>
 #include <asm/asm-offsets.h>
 #include <linux/threads.h>
-#include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/thread_info.h>
 
index 3c9bd64..c5d394a 100644 (file)
@@ -7,7 +7,70 @@
 unsigned long raw_copy_from_user(void *to, const void *from,
                        unsigned long n)
 {
-       ___copy_from_user(to, from, n);
+       int tmp, nsave; /* asm scratch: nsave is operand %3, tmp is operand %4 */
+
+       __asm__ __volatile__( /* operands: %0 = n, %1 = to, %2 = from */
+       "0:     cmpnei  %1, 0           \n" /* compare 'to' with 0; 7 is the exit label */
+       "       bf      7f              \n"
+       "       mov     %3, %1          \n" /* %3 = (to | from) & 3, the low address bits of both pointers */
+       "       or      %3, %2          \n"
+       "       andi    %3, 3           \n"
+       "       cmpnei  %3, 0           \n"
+       "       bf      1f              \n" /* 1 = 16-byte loop */
+       "       br      5f              \n" /* 5 = byte loop */
+       "1:     cmplti  %0, 16          \n" /* 16 bytes per pass: two load/load/store/store pairs */
+       "       bt      3f              \n"
+       "2:     ldw     %3, (%2, 0)     \n" /* loads from 'from' carry the fixup labels 2, 10, 11, 12 */
+       "10:    ldw     %4, (%2, 4)     \n"
+       "       stw     %3, (%1, 0)     \n"
+       "       stw     %4, (%1, 4)     \n"
+       "11:    ldw     %3, (%2, 8)     \n"
+       "12:    ldw     %4, (%2, 12)    \n"
+       "       stw     %3, (%1, 8)     \n"
+       "       stw     %4, (%1, 12)    \n"
+       "       addi    %2, 16          \n"
+       "       addi    %1, 16          \n"
+       "       subi    %0, 16          \n"
+       "       br      1b              \n"
+       "3:     cmplti  %0, 4           \n" /* 4 bytes per pass */
+       "       bt      5f              \n"
+       "4:     ldw     %3, (%2, 0)     \n"
+       "       stw     %3, (%1, 0)     \n"
+       "       addi    %2, 4           \n"
+       "       addi    %1, 4           \n"
+       "       subi    %0, 4           \n"
+       "       br      3b              \n"
+       "5:     cmpnei  %0, 0           \n" /* 1 byte per pass until n reaches 0 */
+       "       bf      7f              \n"
+       "6:     ldb     %3, (%2, 0)     \n"
+       "       stb     %3, (%1, 0)     \n"
+       "       addi    %2,  1          \n"
+       "       addi    %1,  1          \n"
+       "       subi    %0,  1          \n"
+       "       br      5b              \n"
+       "8:     stw     %3, (%1, 0)     \n" /* fixup for 10b: store the word already loaded at 2, n -= 4 */
+       "       subi    %0, 4           \n"
+       "       bf      7f              \n" /* NOTE(review): conditional branch used as the exit jump; relies on the flag state at fault time -- confirm */
+       "9:     subi    %0, 8           \n" /* fixup for 11b: the first two words were stored, n -= 8 */
+       "       bf      7f              \n"
+       "13:    stw     %3, (%1, 8)     \n" /* fixup for 12b: store the word loaded at 11, n -= 12 */
+       "       subi    %0, 12          \n"
+       "       bf      7f              \n"
+       ".section __ex_table, \"a\"     \n" /* each entry pairs a faulting-instruction label with its fixup label */
+       ".align   2                     \n"
+       ".long    2b, 7f                \n" /* faults at 2, 4 and 6 exit with n as it stood at that point */
+       ".long    4b, 7f                \n"
+       ".long    6b, 7f                \n"
+       ".long   10b, 8b                \n"
+       ".long   11b, 9b                \n"
+       ".long   12b,13b                \n"
+       ".previous                      \n"
+       "7:                             \n" /* common exit */
+       : "=r"(n), "=r"(to), "=r"(from), "=r"(nsave),
+         "=r"(tmp)
+       : "0"(n), "1"(to), "2"(from) /* n, to and from are tied in/out operands, updated in place */
+       : "memory");
+
        return n;
 }
 EXPORT_SYMBOL(raw_copy_from_user);
@@ -15,48 +78,70 @@ EXPORT_SYMBOL(raw_copy_from_user);
 unsigned long raw_copy_to_user(void *to, const void *from,
                        unsigned long n)
 {
-       ___copy_to_user(to, from, n);
+       int w0, w1, w2, w3;
+
+       __asm__ __volatile__(
+       "0:     cmpnei  %1, 0           \n"
+       "       bf      8f              \n"
+       "       mov     %3, %1          \n"
+       "       or      %3, %2          \n"
+       "       andi    %3, 3           \n"
+       "       cmpnei  %3, 0           \n"
+       "       bf      1f              \n"
+       "       br      5f              \n"
+       "1:     cmplti  %0, 16          \n" /* 4W */
+       "       bt      3f              \n"
+       "       ldw     %3, (%2, 0)     \n"
+       "       ldw     %4, (%2, 4)     \n"
+       "       ldw     %5, (%2, 8)     \n"
+       "       ldw     %6, (%2, 12)    \n"
+       "2:     stw     %3, (%1, 0)     \n"
+       "9:     stw     %4, (%1, 4)     \n"
+       "10:    stw     %5, (%1, 8)     \n"
+       "11:    stw     %6, (%1, 12)    \n"
+       "       addi    %2, 16          \n"
+       "       addi    %1, 16          \n"
+       "       subi    %0, 16          \n"
+       "       br      1b              \n"
+       "3:     cmplti  %0, 4           \n" /* 1W */
+       "       bt      5f              \n"
+       "       ldw     %3, (%2, 0)     \n"
+       "4:     stw     %3, (%1, 0)     \n"
+       "       addi    %2, 4           \n"
+       "       addi    %1, 4           \n"
+       "       subi    %0, 4           \n"
+       "       br      3b              \n"
+       "5:     cmpnei  %0, 0           \n"  /* 1B */
+       "       bf      13f             \n"
+       "       ldb     %3, (%2, 0)     \n"
+       "6:     stb     %3, (%1, 0)     \n"
+       "       addi    %2,  1          \n"
+       "       addi    %1,  1          \n"
+       "       subi    %0,  1          \n"
+       "       br      5b              \n"
+       "7:     subi    %0,  4          \n"
+       "8:     subi    %0,  4          \n"
+       "12:    subi    %0,  4          \n"
+       "       br      13f             \n"
+       ".section __ex_table, \"a\"     \n"
+       ".align   2                     \n"
+       ".long    2b, 13f               \n"
+       ".long    4b, 13f               \n"
+       ".long    6b, 13f               \n"
+       ".long    9b, 12b               \n"
+       ".long   10b, 8b                \n"
+       ".long   11b, 7b                \n"
+       ".previous                      \n"
+       "13:                            \n"
+       : "=r"(n), "=r"(to), "=r"(from), "=r"(w0),
+         "=r"(w1), "=r"(w2), "=r"(w3)
+       : "0"(n), "1"(to), "2"(from)
+       : "memory");
+
        return n;
 }
 EXPORT_SYMBOL(raw_copy_to_user);
 
-
-/*
- * copy a null terminated string from userspace.
- */
-#define __do_strncpy_from_user(dst, src, count, res)   \
-do {                                                   \
-       int tmp;                                        \
-       long faultres;                                  \
-       asm volatile(                                   \
-       "       cmpnei  %3, 0           \n"             \
-       "       bf      4f              \n"             \
-       "1:     cmpnei  %1, 0           \n"             \
-       "       bf      5f              \n"             \
-       "2:     ldb     %4, (%3, 0)     \n"             \
-       "       stb     %4, (%2, 0)     \n"             \
-       "       cmpnei  %4, 0           \n"             \
-       "       bf      3f              \n"             \
-       "       addi    %3,  1          \n"             \
-       "       addi    %2,  1          \n"             \
-       "       subi    %1,  1          \n"             \
-       "       br      1b              \n"             \
-       "3:     subu    %0, %1          \n"             \
-       "       br      5f              \n"             \
-       "4:     mov     %0, %5          \n"             \
-       "       br      5f              \n"             \
-       ".section __ex_table, \"a\"     \n"             \
-       ".align   2                     \n"             \
-       ".long    2b, 4b                \n"             \
-       ".previous                      \n"             \
-       "5:                             \n"             \
-       : "=r"(res), "=r"(count), "=r"(dst),            \
-         "=r"(src), "=r"(tmp),   "=r"(faultres)        \
-       : "5"(-EFAULT), "0"(count), "1"(count),         \
-         "2"(dst), "3"(src)                            \
-       : "memory", "cc");                              \
-} while (0)
-
 /*
  * __strncpy_from_user: - Copy a NUL terminated string from userspace,
  * with less checking.
@@ -80,43 +165,43 @@ do {                                                       \
  */
 long __strncpy_from_user(char *dst, const char *src, long count)
 {
-       long res;
+       long res, faultres; /* res is operand %0 (seeded with count); faultres is operand %5 (seeded with -EFAULT) */
+       int tmp; /* operand %4: the byte being copied */
 
-       __do_strncpy_from_user(dst, src, count, res);
-       return res;
-}
-EXPORT_SYMBOL(__strncpy_from_user);
-
-/*
- * strncpy_from_user: - Copy a NUL terminated string from userspace.
- * @dst:   Destination address, in kernel space.  This buffer must be at
- *         least @count bytes long.
- * @src:   Source address, in user space.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from userspace to kernel space.
- *
- * On success, returns the length of the string (not including the trailing
- * NUL).
- *
- * If access to userspace fails, returns -EFAULT (some data may have been
- * copied).
- *
- * If @count is smaller than the length of the string, copies @count bytes
- * and returns @count.
- */
-long strncpy_from_user(char *dst, const char *src, long count)
-{
-       long res = -EFAULT;
+       __asm__ __volatile__( /* operands: %1 = count, %2 = dst, %3 = src */
+       "       cmpnei  %3, 0           \n" /* compare src with 0; label 4 loads the -EFAULT value into res */
+       "       bf      4f              \n"
+       "1:     cmpnei  %1, 0           \n" /* compare the remaining count with 0; label 5 is the exit */
+       "       bf      5f              \n"
+       "2:     ldb     %4, (%3, 0)     \n" /* the only instruction with an exception-table entry */
+       "       stb     %4, (%2, 0)     \n"
+       "       cmpnei  %4, 0           \n" /* compare the copied byte with 0 (NUL) */
+       "       bf      3f              \n"
+       "       addi    %3,  1          \n"
+       "       addi    %2,  1          \n"
+       "       subi    %1,  1          \n"
+       "       br      1b              \n"
+       "3:     subu    %0, %1          \n" /* res = original count - remaining count */
+       "       br      5f              \n"
+       "4:     mov     %0, %5          \n" /* res = faultres, i.e. -EFAULT */
+       "       br      5f              \n"
+       ".section __ex_table, \"a\"     \n"
+       ".align   2                     \n"
+       ".long    2b, 4b                \n" /* a fault on the load at 2 resumes at 4 */
+       ".previous                      \n"
+       "5:                             \n"
+       : "=r"(res), "=r"(count), "=r"(dst),
+         "=r"(src), "=r"(tmp), "=r"(faultres)
+       : "5"(-EFAULT), "0"(count), "1"(count),
+         "2"(dst), "3"(src)
+       : "memory");
 
-       if (access_ok(src, 1))
-               __do_strncpy_from_user(dst, src, count, res);
        return res;
 }
-EXPORT_SYMBOL(strncpy_from_user);
+EXPORT_SYMBOL(__strncpy_from_user);
 
 /*
- * strlen_user: - Get the size of a string in user space.
+ * strnlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  * @n:   The maximum valid length
  *
@@ -126,14 +211,11 @@ EXPORT_SYMBOL(strncpy_from_user);
  * On exception, returns 0.
  * If the string is too long, returns a value greater than @n.
  */
-long strnlen_user(const char *s, long n)
+long __strnlen_user(const char *s, long n)
 {
        unsigned long res, tmp;
 
-       if (s == NULL)
-               return 0;
-
-       asm volatile(
+       __asm__ __volatile__(
        "       cmpnei  %1, 0           \n"
        "       bf      3f              \n"
        "1:     cmpnei  %0, 0           \n"
@@ -156,87 +238,11 @@ long strnlen_user(const char *s, long n)
        "5:                             \n"
        : "=r"(n), "=r"(s), "=r"(res), "=r"(tmp)
        : "0"(n), "1"(s), "2"(n)
-       : "memory", "cc");
+       : "memory");
 
        return res;
 }
-EXPORT_SYMBOL(strnlen_user);
-
-#define __do_clear_user(addr, size)                    \
-do {                                                   \
-       int __d0, zvalue, tmp;                          \
-                                                       \
-       asm volatile(                                   \
-       "0:     cmpnei  %1, 0           \n"             \
-       "       bf      7f              \n"             \
-       "       mov     %3, %1          \n"             \
-       "       andi    %3, 3           \n"             \
-       "       cmpnei  %3, 0           \n"             \
-       "       bf      1f              \n"             \
-       "       br      5f              \n"             \
-       "1:     cmplti  %0, 32          \n" /* 4W */    \
-       "       bt      3f              \n"             \
-       "8:     stw     %2, (%1, 0)     \n"             \
-       "10:    stw     %2, (%1, 4)     \n"             \
-       "11:    stw     %2, (%1, 8)     \n"             \
-       "12:    stw     %2, (%1, 12)    \n"             \
-       "13:    stw     %2, (%1, 16)    \n"             \
-       "14:    stw     %2, (%1, 20)    \n"             \
-       "15:    stw     %2, (%1, 24)    \n"             \
-       "16:    stw     %2, (%1, 28)    \n"             \
-       "       addi    %1, 32          \n"             \
-       "       subi    %0, 32          \n"             \
-       "       br      1b              \n"             \
-       "3:     cmplti  %0, 4           \n" /* 1W */    \
-       "       bt      5f              \n"             \
-       "4:     stw     %2, (%1, 0)     \n"             \
-       "       addi    %1, 4           \n"             \
-       "       subi    %0, 4           \n"             \
-       "       br      3b              \n"             \
-       "5:     cmpnei  %0, 0           \n" /* 1B */    \
-       "9:     bf      7f              \n"             \
-       "6:     stb     %2, (%1, 0)     \n"             \
-       "       addi    %1,  1          \n"             \
-       "       subi    %0,  1          \n"             \
-       "       br      5b              \n"             \
-       ".section __ex_table,\"a\"      \n"             \
-       ".align   2                     \n"             \
-       ".long    8b, 9b                \n"             \
-       ".long    10b, 9b               \n"             \
-       ".long    11b, 9b               \n"             \
-       ".long    12b, 9b               \n"             \
-       ".long    13b, 9b               \n"             \
-       ".long    14b, 9b               \n"             \
-       ".long    15b, 9b               \n"             \
-       ".long    16b, 9b               \n"             \
-       ".long    4b, 9b                \n"             \
-       ".long    6b, 9b                \n"             \
-       ".previous                      \n"             \
-       "7:                             \n"             \
-       : "=r"(size), "=r" (__d0),                      \
-         "=r"(zvalue), "=r"(tmp)                       \
-       : "0"(size), "1"(addr), "2"(0)                  \
-       : "memory", "cc");                              \
-} while (0)
-
-/*
- * clear_user: - Zero a block of memory in user space.
- * @to:   Destination address, in user space.
- * @n:    Number of bytes to zero.
- *
- * Zero a block of memory in user space.
- *
- * Returns number of bytes that could not be cleared.
- * On success, this will be zero.
- */
-unsigned long
-clear_user(void __user *to, unsigned long n)
-{
-       if (access_ok(to, n))
-               __do_clear_user(to, n);
-       return n;
-}
-EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__strnlen_user);
 
 /*
  * __clear_user: - Zero a block of memory in user space, with less checking.
@@ -252,7 +258,59 @@ EXPORT_SYMBOL(clear_user);
 unsigned long
 __clear_user(void __user *to, unsigned long n)
 {
-       __do_clear_user(to, n);
+       int data, value, tmp;
+
+       __asm__ __volatile__(
+       "0:     cmpnei  %1, 0           \n"
+       "       bf      7f              \n"
+       "       mov     %3, %1          \n"
+       "       andi    %3, 3           \n"
+       "       cmpnei  %3, 0           \n"
+       "       bf      1f              \n"
+       "       br      5f              \n"
+       "1:     cmplti  %0, 32          \n" /* 4W */
+       "       bt      3f              \n"
+       "8:     stw     %2, (%1, 0)     \n"
+       "10:    stw     %2, (%1, 4)     \n"
+       "11:    stw     %2, (%1, 8)     \n"
+       "12:    stw     %2, (%1, 12)    \n"
+       "13:    stw     %2, (%1, 16)    \n"
+       "14:    stw     %2, (%1, 20)    \n"
+       "15:    stw     %2, (%1, 24)    \n"
+       "16:    stw     %2, (%1, 28)    \n"
+       "       addi    %1, 32          \n"
+       "       subi    %0, 32          \n"
+       "       br      1b              \n"
+       "3:     cmplti  %0, 4           \n" /* 1W */
+       "       bt      5f              \n"
+       "4:     stw     %2, (%1, 0)     \n"
+       "       addi    %1, 4           \n"
+       "       subi    %0, 4           \n"
+       "       br      3b              \n"
+       "5:     cmpnei  %0, 0           \n" /* 1B */
+       "9:     bf      7f              \n"
+       "6:     stb     %2, (%1, 0)     \n"
+       "       addi    %1,  1          \n"
+       "       subi    %0,  1          \n"
+       "       br      5b              \n"
+       ".section __ex_table,\"a\"      \n"
+       ".align   2                     \n"
+       ".long    8b, 9b                \n"
+       ".long    10b, 9b               \n"
+       ".long    11b, 9b               \n"
+       ".long    12b, 9b               \n"
+       ".long    13b, 9b               \n"
+       ".long    14b, 9b               \n"
+       ".long    15b, 9b               \n"
+       ".long    16b, 9b               \n"
+       ".long    4b, 9b                \n"
+       ".long    6b, 9b                \n"
+       ".previous                      \n"
+       "7:                             \n"
+       : "=r"(n), "=r" (data), "=r"(value), "=r"(tmp)
+       : "0"(n), "1"(to), "2"(0)
+       : "memory");
+
        return n;
 }
 EXPORT_SYMBOL(__clear_user);
index 1482de5..466ad94 100644 (file)
@@ -12,7 +12,7 @@ int fixup_exception(struct pt_regs *regs)
 
        fixup = search_exception_tables(instruction_pointer(regs));
        if (fixup) {
-               regs->pc = fixup->nextinsn;
+               regs->pc = fixup->fixup;
 
                return 1;
        }
index ffade2f..4e51d63 100644 (file)
@@ -17,6 +17,7 @@ SYSCALL_DEFINE3(cacheflush,
                flush_icache_mm_range(current->mm,
                                (unsigned long)addr,
                                (unsigned long)addr + bytes);
+               fallthrough;
        case DCACHE:
                dcache_wb_range((unsigned long)addr,
                                (unsigned long)addr + bytes);
index 7aa16c7..c867a80 100644 (file)
@@ -9,6 +9,10 @@
 
 #include <linux/compiler.h>
 
+#include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/__fls.h>
+#include <asm-generic/bitops/fls64.h>
+
 #ifdef __KERNEL__
 
 #ifndef _LINUX_BITOPS_H
@@ -173,8 +177,4 @@ static inline unsigned long __ffs(unsigned long word)
 
 #endif /* __KERNEL__ */
 
-#include <asm-generic/bitops/fls.h>
-#include <asm-generic/bitops/__fls.h>
-#include <asm-generic/bitops/fls64.h>
-
 #endif /* _H8300_BITOPS_H */
index c168c69..74b644e 100644 (file)
@@ -10,6 +10,9 @@ LDFLAGS_vmlinux += -G0
 # Do not use single-byte enums; these will overflow.
 KBUILD_CFLAGS += -fno-short-enums
 
+# We must use long-calls:
+KBUILD_CFLAGS += -mlong-calls
+
 # Modules must use either long-calls, or use pic/plt.
 # Use long-calls for now, it's easier.  And faster.
 # KBUILD_CFLAGS_MODULE += -fPIC
@@ -30,9 +33,6 @@ TIR_NAME := r19
 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__
 KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME)
 
-LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name 2>/dev/null)
-libs-y += $(LIBGCC)
-
 head-y := arch/hexagon/kernel/head.o
 
 core-y += arch/hexagon/kernel/ \
index f19ae2a..9b2b1cc 100644 (file)
@@ -34,7 +34,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_SERIO is not set
 # CONFIG_CONSOLE_TRANSLATIONS is not set
 CONFIG_LEGACY_PTY_COUNT=64
-# CONFIG_DEVKMEM is not set
 # CONFIG_HW_RANDOM is not set
 CONFIG_SPI=y
 CONFIG_SPI_DEBUG=y
@@ -81,4 +80,3 @@ CONFIG_FRAME_WARN=0
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_FS=y
 # CONFIG_SCHED_DEBUG is not set
-CONFIG_DEBUG_INFO=y
index 6b9c554..9fb00a0 100644 (file)
@@ -21,7 +21,7 @@
        "3:\n" \
        ".section .fixup,\"ax\"\n" \
        "4: %1 = #%5;\n" \
-       "   jump 3b\n" \
+       "   jump ##3b\n" \
        ".previous\n" \
        ".section __ex_table,\"a\"\n" \
        ".long 1b,4b,2b,4b\n" \
@@ -90,7 +90,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
        "3:\n"
        ".section .fixup,\"ax\"\n"
        "4: %0 = #%6\n"
-       "   jump 3b\n"
+       "   jump ##3b\n"
        ".previous\n"
        ".section __ex_table,\"a\"\n"
        ".long 1b,4b,2b,4b\n"
index bda2a9c..c332414 100644 (file)
@@ -64,7 +64,6 @@ static inline void *phys_to_virt(unsigned long address)
  * convert a physical pointer to a virtual kernel pointer for
  * /dev/mem access.
  */
-#define xlate_dev_kmem_ptr(p)    __va(p)
 #define xlate_dev_mem_ptr(p)    __va(p)
 
 /*
index 78338d8..8d4ec76 100644 (file)
@@ -8,6 +8,7 @@
 
 #include <asm-generic/timex.h>
 #include <asm/timer-regs.h>
+#include <asm/hexagon_vm.h>
 
 /* Using TCX0 as our clock.  CLOCK_TICK_RATE scheduled to be removed. */
 #define CLOCK_TICK_RATE              TCX0_CLK_RATE
@@ -16,7 +17,7 @@
 
 static inline int read_current_timer(unsigned long *timer_val)
 {
-       *timer_val = (unsigned long) __vmgettime();
+       *timer_val = __vmgettime();
        return 0;
 }
 
index 6fb1aaa..35545a7 100644 (file)
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(_dflt_cache_att);
 DECLARE_EXPORT(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes);
 
 /* Additional functions */
-DECLARE_EXPORT(__divsi3);
-DECLARE_EXPORT(__modsi3);
-DECLARE_EXPORT(__udivsi3);
-DECLARE_EXPORT(__umodsi3);
+DECLARE_EXPORT(__hexagon_divsi3);
+DECLARE_EXPORT(__hexagon_modsi3);
+DECLARE_EXPORT(__hexagon_udivsi3);
+DECLARE_EXPORT(__hexagon_umodsi3);
 DECLARE_EXPORT(csum_tcpudp_magic);
index a5a89e9..8975f9b 100644 (file)
@@ -35,7 +35,7 @@ void user_disable_single_step(struct task_struct *child)
 
 static int genregs_get(struct task_struct *target,
                   const struct user_regset *regset,
-                  srtuct membuf to)
+                  struct membuf to)
 {
        struct pt_regs *regs = task_pt_regs(target);
 
@@ -54,7 +54,7 @@ static int genregs_get(struct task_struct *target,
        membuf_store(&to, regs->m0);
        membuf_store(&to, regs->m1);
        membuf_store(&to, regs->usr);
-       membuf_store(&to, regs->p3_0);
+       membuf_store(&to, regs->preds);
        membuf_store(&to, regs->gp);
        membuf_store(&to, regs->ugp);
        membuf_store(&to, pt_elr(regs)); // pc
index 54be529..a64641e 100644 (file)
@@ -2,4 +2,5 @@
 #
 # Makefile for hexagon-specific library files.
 #
-obj-y = checksum.o io.o memcpy.o memset.o
+obj-y = checksum.o io.o memcpy.o memset.o memcpy_likely_aligned.o \
+         divsi3.o modsi3.o udivsi3.o  umodsi3.o
diff --git a/arch/hexagon/lib/divsi3.S b/arch/hexagon/lib/divsi3.S
new file mode 100644 (file)
index 0000000..783e094
--- /dev/null
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_divsi3) /* exported via DECLARE_EXPORT(__hexagon_divsi3); presumably signed 32-bit r0 / r1 -> r0, going by the libgcc-style name -- confirm */
+        {
+                p0 = cmp.gt(r0,#-1)
+                p1 = cmp.gt(r1,#-1)
+                r3:2 = vabsw(r1:0)
+        }
+        {
+                p3 = xor(p0,p1) /* combines the two sign tests above; p3 gates the final negation */
+                r4 = sub(r2,r3)
+                r6 = cl0(r2)
+                p0 = cmp.gtu(r3,r2)
+        }
+        {
+                r0 = mux(p3,#-1,#1)
+                r7 = cl0(r3)
+                p1 = cmp.gtu(r3,r4)
+        }
+        {
+                r0 = mux(p0,#0,r0)
+                p0 = or(p0,p1)
+                if (p0.new) jumpr:nt r31 /* early exit with r0 set by the two mux instructions above */
+                r6 = sub(r7,r6)
+        }
+        {
+                r7 = r6
+                r5:4 = combine(#1,r3)
+                r6 = add(#1,lsr(r6,#1))
+                p0 = cmp.gtu(r6,#4)
+        }
+        {
+                r5:4 = vaslw(r5:4,r7)
+                if (!p0) r6 = #3
+        }
+        {
+                loop0(1f,r6) /* r6 passes over the two-packet body at label 1 */
+                r7:6 = vlsrw(r5:4,#1)
+                r1:0 = #0 /* clears r0 (accumulated by the add instructions below) and r1 (used by the final sub) */
+        }
+        .falign
+1:
+        {
+                r5:4 = vlsrw(r5:4,#2)
+                if (!p0.new) r0 = add(r0,r5)
+                if (!p0.new) r2 = sub(r2,r4)
+                p0 = cmp.gtu(r4,r2)
+        }
+        {
+                r7:6 = vlsrw(r7:6,#2)
+                if (!p0.new) r0 = add(r0,r7)
+                if (!p0.new) r2 = sub(r2,r6)
+                p0 = cmp.gtu(r6,r2)
+        }:endloop0
+        {
+                if (!p0) r0 = add(r0,r7)
+        }
+        {
+                if (p3) r0 = sub(r1,r0) /* r1 was cleared before the loop, so this is 0 - r0 */
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_divsi3)
diff --git a/arch/hexagon/lib/memcpy_likely_aligned.S b/arch/hexagon/lib/memcpy_likely_aligned.S
new file mode 100644 (file)
index 0000000..6a541fb
--- /dev/null
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes) /* r0 is the store base, r1 the load base, r2 is compared against byte counts; presumably memcpy's (dst, src, len) -- confirm */
+        {
+                p0 = bitsclr(r1,#7) /* tests the low three bits of r1 and r0 */
+                p0 = bitsclr(r0,#7)
+                if (p0.new) r5:4 = memd(r1)
+                if (p0.new) r7:6 = memd(r1+#8)
+        }
+        {
+                if (!p0) jump:nt .Lmemcpy_call /* pointer test failed: hand over to memcpy */
+                if (p0) r9:8 = memd(r1+#16)
+                if (p0) r11:10 = memd(r1+#24)
+                p0 = cmp.gtu(r2,#64)
+        }
+        {
+                if (p0) jump:nt .Lmemcpy_call /* r2 > 64: hand over to memcpy */
+                if (!p0) memd(r0) = r5:4
+                if (!p0) memd(r0+#8) = r7:6
+                p0 = cmp.gtu(r2,#32)
+        }
+        {
+                p1 = cmp.gtu(r2,#40)
+                p2 = cmp.gtu(r2,#48)
+                if (p0) r13:12 = memd(r1+#32)
+                if (p1.new) r15:14 = memd(r1+#40)
+        }
+        {
+                memd(r0+#16) = r9:8 /* offsets 16 and 24 are stored unconditionally */
+                memd(r0+#24) = r11:10
+        }
+        {
+                if (p0) memd(r0+#32) = r13:12
+                if (p1) memd(r0+#40) = r15:14
+                if (!p2) jumpr:t r31 /* r2 <= 48: done */
+        }
+        {
+                p0 = cmp.gtu(r2,#56)
+                r5:4 = memd(r1+#48)
+                if (p0.new) r7:6 = memd(r1+#56)
+        }
+        {
+                memd(r0+#48) = r5:4
+                if (p0) memd(r0+#56) = r7:6
+                jumpr r31
+        }
+
+.Lmemcpy_call:
+        jump memcpy /* jump, not call: memcpy exits on this function's behalf */
+
+SYM_FUNC_END(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes)
diff --git a/arch/hexagon/lib/modsi3.S b/arch/hexagon/lib/modsi3.S
new file mode 100644 (file)
index 0000000..9ea1c86
--- /dev/null
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_modsi3) /* exported via DECLARE_EXPORT(__hexagon_modsi3); presumably signed 32-bit r0 % r1 -> r0, going by the libgcc-style name -- confirm */
+        {
+                p2 = cmp.ge(r0,#0) /* sign test of r0; p2 decides whether the final neg is skipped */
+                r2 = abs(r0)
+                r1 = abs(r1)
+        }
+        {
+                r3 = cl0(r2)
+                r4 = cl0(r1)
+                p0 = cmp.gtu(r1,r2)
+        }
+        {
+                r3 = sub(r4,r3)
+                if (p0) jumpr r31 /* |r1| > |r0| (unsigned compare): exit with r0 untouched */
+        }
+        {
+                p1 = cmp.eq(r3,#0)
+                loop0(1f,r3) /* r3 passes over the packet at label 1 */
+                r0 = r2
+                r2 = lsl(r1,r3)
+        }
+        .falign
+1:
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r2)
+                r2 = lsr(r2,#1)
+                if (p1) r1 = #0
+        }:endloop0
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r1)
+                if (p2) jumpr r31 /* r0 was >= 0 on entry: exit without negating */
+        }
+        {
+                r0 = neg(r0)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_modsi3)
diff --git a/arch/hexagon/lib/udivsi3.S b/arch/hexagon/lib/udivsi3.S
new file mode 100644 (file)
index 0000000..477f27b
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_udivsi3) /* exported via DECLARE_EXPORT(__hexagon_udivsi3); presumably unsigned 32-bit r0 / r1 -> r0, going by the libgcc-style name -- confirm */
+        {
+                r2 = cl0(r0)
+                r3 = cl0(r1)
+                r5:4 = combine(#1,#0)
+                p0 = cmp.gtu(r1,r0)
+        }
+        {
+                r6 = sub(r3,r2)
+                r4 = r1
+                r1:0 = combine(r0,r4) /* r1 takes the old r0; r0 takes r4, set to 0 by the combine in the first packet (assuming packet reads see pre-packet values -- confirm) */
+                if (p0) jumpr r31 /* taken when r1 > r0 (unsigned) on entry */
+        }
+        {
+                r3:2 = vlslw(r5:4,r6)
+                loop0(1f,r6) /* r6 passes over the packet at label 1 */
+        }
+        .falign
+1:
+        {
+                p0 = cmp.gtu(r2,r1)
+                if (!p0.new) r1 = sub(r1,r2)
+                if (!p0.new) r0 = add(r0,r3)
+                r3:2 = vlsrw(r3:2,#1)
+        }:endloop0
+        {
+                p0 = cmp.gtu(r2,r1)
+                if (!p0.new) r0 = add(r0,r3)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_udivsi3)
diff --git a/arch/hexagon/lib/umodsi3.S b/arch/hexagon/lib/umodsi3.S
new file mode 100644 (file)
index 0000000..280bf06
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(__hexagon_umodsi3) /* exported via DECLARE_EXPORT(__hexagon_umodsi3); presumably unsigned 32-bit r0 % r1 -> r0, going by the libgcc-style name -- confirm */
+        {
+                r2 = cl0(r0)
+                r3 = cl0(r1)
+                p0 = cmp.gtu(r1,r0)
+        }
+        {
+                r2 = sub(r3,r2)
+                if (p0) jumpr r31 /* r1 > r0 (unsigned): exit with r0 untouched */
+        }
+        {
+                loop0(1f,r2) /* r2 passes over the packet at label 1 */
+                p1 = cmp.eq(r2,#0)
+                r2 = lsl(r1,r2)
+        }
+        .falign
+1:
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r2)
+                r2 = lsr(r2,#1)
+                if (p1) r1 = #0
+        }:endloop0
+        {
+                p0 = cmp.gtu(r2,r0)
+                if (!p0.new) r0 = sub(r0,r1)
+                jumpr r31
+        }
+SYM_FUNC_END(__hexagon_umodsi3)
index 81e2b89..279252e 100644 (file)
@@ -13,6 +13,8 @@ config IA64
        select ARCH_MIGHT_HAVE_PC_SERIO
        select ACPI
        select ACPI_NUMA if NUMA
+       select ARCH_ENABLE_MEMORY_HOTPLUG
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
        select ARCH_SUPPORTS_ACPI
        select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
        select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
@@ -32,6 +34,7 @@ config IA64
        select TTY
        select HAVE_ARCH_TRACEHOOK
        select HAVE_VIRT_CPU_ACCOUNTING
+       select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE
        select VIRT_TO_BUS
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
@@ -82,11 +85,6 @@ config STACKTRACE_SUPPORT
 config GENERIC_LOCKBREAK
        def_bool n
 
-config HUGETLB_PAGE_SIZE_VARIABLE
-       bool
-       depends on HUGETLB_PAGE
-       default y
-
 config GENERIC_CALIBRATE_DELAY
        bool
        default y
@@ -250,12 +248,6 @@ config HOTPLUG_CPU
          can be controlled through /sys/devices/system/cpu/cpu#.
          Say N if you want to disable CPU hotplug.
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-
 config SCHED_SMT
        bool "SMT scheduler support"
        depends on SMP
index 3d666a1..6d93b92 100644 (file)
@@ -277,7 +277,6 @@ extern void memset_io(volatile void __iomem *s, int c, long n);
 #define memcpy_fromio memcpy_fromio
 #define memcpy_toio memcpy_toio
 #define memset_io memset_io
-#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
 #define xlate_dev_mem_ptr xlate_dev_mem_ptr
 #include <asm-generic/io.h>
 #undef PCI_IOBASE
index 179243c..e19d2dc 100644 (file)
@@ -272,22 +272,4 @@ xlate_dev_mem_ptr(phys_addr_t p)
        return ptr;
 }
 
-/*
- * Convert a virtual cached kernel memory pointer to an uncached pointer
- */
-static __inline__ void *
-xlate_dev_kmem_ptr(void *p)
-{
-       struct page *page;
-       void *ptr;
-
-       page = virt_to_page((unsigned long)p);
-       if (PageUncached(page))
-               ptr = (void *)__pa(p) + __IA64_UNCACHED_OFFSET;
-       else
-               ptr = p;
-
-       return ptr;
-}
-
 #endif /* _ASM_IA64_UACCESS_H */
index c072cd4..1ee8e73 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index b331f94..f993cb3 100644 (file)
@@ -25,7 +25,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT;
 EXPORT_SYMBOL(hpage_shift);
 
 pte_t *
-huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+              unsigned long addr, unsigned long sz)
 {
        unsigned long taddr = htlbpage_to_page(addr);
        pgd_t *pgd;
index 1068670..7e44d0e 100644 (file)
@@ -317,10 +317,3 @@ int atari_tt_hwclk( int op, struct rtc_time *t )
 
     return( 0 );
 }
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
index 15c4b7a..f7c2c41 100644 (file)
@@ -68,9 +68,9 @@ static void intc_irq_mask(struct irq_data *d)
 {
        unsigned int irq = d->irq - MCFINT_VECBASE;
 
-       if (MCFINTC2_SIMR && (irq > 128))
+       if (MCFINTC2_SIMR && (irq > 127))
                __raw_writeb(irq - 128, MCFINTC2_SIMR);
-       else if (MCFINTC1_SIMR && (irq > 64))
+       else if (MCFINTC1_SIMR && (irq > 63))
                __raw_writeb(irq - 64, MCFINTC1_SIMR);
        else
                __raw_writeb(irq, MCFINTC0_SIMR);
@@ -80,9 +80,9 @@ static void intc_irq_unmask(struct irq_data *d)
 {
        unsigned int irq = d->irq - MCFINT_VECBASE;
 
-       if (MCFINTC2_CIMR && (irq > 128))
+       if (MCFINTC2_CIMR && (irq > 127))
                __raw_writeb(irq - 128, MCFINTC2_CIMR);
-       else if (MCFINTC1_CIMR && (irq > 64))
+       else if (MCFINTC1_CIMR && (irq > 63))
                __raw_writeb(irq - 64, MCFINTC1_CIMR);
        else
                __raw_writeb(irq, MCFINTC0_CIMR);
@@ -115,9 +115,9 @@ static unsigned int intc_irq_startup(struct irq_data *d)
        }
 
        irq -= MCFINT_VECBASE;
-       if (MCFINTC2_ICR0 && (irq > 128))
+       if (MCFINTC2_ICR0 && (irq > 127))
                __raw_writeb(5, MCFINTC2_ICR0 + irq - 128);
-       else if (MCFINTC1_ICR0 && (irq > 64))
+       else if (MCFINTC1_ICR0 && (irq > 63))
                __raw_writeb(5, MCFINTC1_ICR0 + irq - 64);
        else
                __raw_writeb(5, MCFINTC0_ICR0 + irq);
index 3a84f24..6d9ed21 100644 (file)
@@ -60,7 +60,6 @@ CONFIG_DM9000=y
 # CONFIG_VT is not set
 # CONFIG_UNIX98_PTYS is not set
 # CONFIG_DEVMEM is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_MCF=y
 CONFIG_SERIAL_MCF_BAUDRATE=115200
 CONFIG_SERIAL_MCF_CONSOLE=y
index 10133a9..7b41409 100644 (file)
@@ -440,8 +440,6 @@ static inline unsigned long ffz(unsigned long word)
 
 #endif
 
-#include <asm-generic/bitops/find.h>
-
 #ifdef __KERNEL__
 
 #if defined(CONFIG_CPU_HAS_NO_BITFIELDS)
@@ -525,10 +523,12 @@ static inline int __fls(int x)
 #define __clear_bit_unlock     clear_bit_unlock
 
 #include <asm-generic/bitops/ext2-atomic.h>
-#include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/le.h>
 #endif /* __KERNEL__ */
 
+#include <asm-generic/bitops/find.h>
+
 #endif /* _M68K_BITOPS_H */
index 819f611..d41fa48 100644 (file)
@@ -397,11 +397,6 @@ static inline void isa_delay(void)
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #define readb_relaxed(addr)    readb(addr)
 #define readw_relaxed(addr)    readw(addr)
 #define readl_relaxed(addr)    readl(addr)
index 5e9f810..0dd019d 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index 5db7f44..6a92bed 100644 (file)
@@ -13,7 +13,7 @@ extern void ftrace_call_graph(void);
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-/* reloction of mcount call site is the same as the address */
+/* relocation of mcount call site is the same as the address */
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
        return addr;
index 8e74d69..2ac7169 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index 49a3c9c..ed51970 100644 (file)
@@ -19,6 +19,7 @@ config MIPS
        select ARCH_USE_MEMTEST
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
+       select ARCH_SUPPORTS_HUGETLBFS if CPU_SUPPORTS_HUGEPAGES
        select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_WANT_LD_ORPHAN_WARN
@@ -1287,11 +1288,6 @@ config SYS_SUPPORTS_BIG_ENDIAN
 config SYS_SUPPORTS_LITTLE_ENDIAN
        bool
 
-config SYS_SUPPORTS_HUGETLBFS
-       bool
-       depends on CPU_SUPPORTS_HUGEPAGES
-       default y
-
 config MIPS_HUGE_TLB_SUPPORT
        def_bool HUGETLB_PAGE || TRANSPARENT_HUGEPAGE
 
index 2c13845..6f5c86d 100644 (file)
@@ -564,11 +564,6 @@ extern void (*_dma_cache_inv)(unsigned long start, unsigned long size);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 void __ioread64_copy(void *to, const void __iomem *from, size_t count);
 
 #endif /* _ASM_IO_H */
index 603ad56..fca4547 100644 (file)
@@ -740,14 +740,7 @@ struct kvm_mips_callbacks {
        int (*vcpu_init)(struct kvm_vcpu *vcpu);
        void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
        int (*vcpu_setup)(struct kvm_vcpu *vcpu);
-       void (*flush_shadow_all)(struct kvm *kvm);
-       /*
-        * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
-        * VZ root TLB, or T&E GVA page tables and corresponding root TLB
-        * mappings).
-        */
-       void (*flush_shadow_memslot)(struct kvm *kvm,
-                                    const struct kvm_memory_slot *slot);
+       void (*prepare_flush_shadow)(struct kvm *kvm);
        gpa_t (*gva_to_gpa)(gva_t gva);
        void (*queue_timer_int)(struct kvm_vcpu *vcpu);
        void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -824,11 +817,6 @@ pgd_t *kvm_pgd_alloc(void);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 /* Emulation */
 enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
@@ -916,4 +904,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+
 #endif /* __MIPS_KVM_HOST_H__ */
index 6f397e5..5e00966 100644 (file)
 441    n32     epoll_pwait2                    compat_sys_epoll_pwait2
 442    n32     mount_setattr                   sys_mount_setattr
 443    n32     quotactl_path                   sys_quotactl_path
+444    n32     landlock_create_ruleset         sys_landlock_create_ruleset
+445    n32     landlock_add_rule               sys_landlock_add_rule
+446    n32     landlock_restrict_self          sys_landlock_restrict_self
index ab85a35..9974f5f 100644 (file)
 441    n64     epoll_pwait2                    sys_epoll_pwait2
 442    n64     mount_setattr                   sys_mount_setattr
 443    n64     quotactl_path                   sys_quotactl_path
+444    n64     landlock_create_ruleset         sys_landlock_create_ruleset
+445    n64     landlock_add_rule               sys_landlock_add_rule
+446    n64     landlock_restrict_self          sys_landlock_restrict_self
index 9c4cd2b..39d6e71 100644 (file)
 441    o32     epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
 442    o32     mount_setattr                   sys_mount_setattr
 443    o32     quotactl_path                   sys_quotactl_path
+444    o32     landlock_create_ruleset         sys_landlock_create_ruleset
+445    o32     landlock_add_rule               sys_landlock_add_rule
+446    o32     landlock_restrict_self          sys_landlock_restrict_self
index 29d37ba..4d4af97 100644 (file)
@@ -197,9 +197,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        /* Flush whole GPA */
        kvm_mips_flush_gpa_pt(kvm, 0, ~0);
-
-       /* Let implementation do the rest */
-       kvm_mips_callbacks->flush_shadow_all(kvm);
+       kvm_flush_remote_tlbs(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -214,8 +212,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
        /* Flush slot from GPA */
        kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
                              slot->base_gfn + slot->npages - 1);
-       /* Let implementation do the rest */
-       kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+       kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
        spin_unlock(&kvm->mmu_lock);
 }
 
@@ -255,9 +252,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                /* Write protect GPA page table entries */
                needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
                                        new->base_gfn + new->npages - 1);
-               /* Let implementation do the rest */
                if (needs_flush)
-                       kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+                       kvm_arch_flush_remote_tlbs_memslot(kvm, new);
                spin_unlock(&kvm->mmu_lock);
        }
 }
@@ -972,11 +968,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 }
 
+int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+       kvm_mips_callbacks->prepare_flush_shadow(kvm);
+       return 1;
+}
+
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
-       /* Let implementation handle TLB/GVA invalidation */
-       kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+       kvm_flush_remote_tlbs(kvm);
 }
 
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
index 190ca24..6d1f68c 100644 (file)
@@ -439,85 +439,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
                                  end_gfn << PAGE_SHIFT);
 }
 
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm, gfn_t gfn,
-                                           gpa_t gfn_end,
-                                           struct kvm_memory_slot *memslot,
-                                           void *data),
-                            void *data)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-               ret |= handler(kvm, gfn, gfn_end, memslot, data);
-       }
-
-       return ret;
-}
-
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                                struct kvm_memory_slot *memslot, void *data)
-{
-       kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+       kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
        return 1;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
-{
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-
-       kvm_mips_callbacks->flush_shadow_all(kvm);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                               struct kvm_memory_slot *memslot, void *data)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       gpa_t gpa = gfn << PAGE_SHIFT;
-       pte_t hva_pte = *(pte_t *)data;
+       gpa_t gpa = range->start << PAGE_SHIFT;
+       pte_t hva_pte = range->pte;
        pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
        pte_t old_pte;
 
        if (!gpa_pte)
-               return 0;
+               return false;
 
        /* Mapping may need adjusting depending on memslot flags */
        old_pte = *gpa_pte;
-       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+       if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
                hva_pte = pte_mkclean(hva_pte);
-       else if (memslot->flags & KVM_MEM_READONLY)
+       else if (range->slot->flags & KVM_MEM_READONLY)
                hva_pte = pte_wrprotect(hva_pte);
 
        set_pte(gpa_pte, hva_pte);
 
        /* Replacing an absent or old page doesn't need flushes */
        if (!pte_present(old_pte) || !pte_young(old_pte))
-               return 0;
+               return false;
 
        /* Pages swapped, aged, moved, or cleaned require flushes */
        return !pte_present(hva_pte) ||
@@ -526,27 +475,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
               (pte_dirty(old_pte) && !pte_dirty(hva_pte));
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       unsigned long end = hva + PAGE_SIZE;
-       int ret;
-
-       ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
-       if (ret)
-               kvm_mips_callbacks->flush_shadow_all(kvm);
-       return 0;
+       return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
 }
 
-static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                              struct kvm_memory_slot *memslot, void *data)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
-}
-
-static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                                   struct kvm_memory_slot *memslot, void *data)
-{
-       gpa_t gpa = gfn << PAGE_SHIFT;
+       gpa_t gpa = range->start << PAGE_SHIFT;
        pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
 
        if (!gpa_pte)
@@ -554,16 +490,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
        return pte_young(*gpa_pte);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
 /**
  * _kvm_mips_map_page_fast() - Fast path GPA fault handler.
  * @vcpu:              VCPU pointer.
index d0d03bd..43cad10 100644 (file)
@@ -3210,32 +3210,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+static void kvm_vz_prepare_flush_shadow(struct kvm *kvm)
 {
-       if (cpu_has_guestid) {
-               /* Flush GuestID for each VCPU individually */
-               kvm_flush_remote_tlbs(kvm);
-       } else {
+       if (!cpu_has_guestid) {
                /*
                 * For each CPU there is a single GPA ASID used by all VCPUs in
                 * the VM, so it doesn't make sense for the VCPUs to handle
                 * invalidation of these ASIDs individually.
                 *
                 * Instead mark all CPUs as needing ASID invalidation in
-                * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+                * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will
                 * kick any running VCPUs so they check asid_flush_mask.
                 */
                cpumask_setall(&kvm->arch.asid_flush_mask);
-               kvm_flush_remote_tlbs(kvm);
        }
 }
 
-static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
-                                       const struct kvm_memory_slot *slot)
-{
-       kvm_vz_flush_shadow_all(kvm);
-}
-
 static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
@@ -3291,8 +3281,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
        .vcpu_init = kvm_vz_vcpu_init,
        .vcpu_uninit = kvm_vz_vcpu_uninit,
        .vcpu_setup = kvm_vz_vcpu_setup,
-       .flush_shadow_all = kvm_vz_flush_shadow_all,
-       .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+       .prepare_flush_shadow = kvm_vz_prepare_flush_shadow,
        .gva_to_gpa = kvm_vz_gva_to_gpa_cb,
        .queue_timer_int = kvm_vz_queue_timer_int_cb,
        .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
index b9f76f4..7eaff5b 100644 (file)
@@ -21,8 +21,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
-                     unsigned long sz)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
        p4d_t *p4d;
index 010ba5f..d4cbf06 100644 (file)
@@ -260,7 +260,6 @@ do {                                                                        \
 
 extern unsigned long __arch_clear_user(void __user * addr, unsigned long n);
 extern long strncpy_from_user(char *dest, const char __user * src, long count);
-extern __must_check long strlen_user(const char __user * str);
 extern __must_check long strnlen_user(const char __user * str, long n);
 extern unsigned long __arch_copy_from_user(void *to, const void __user * from,
                                            unsigned long n);
index 414f8a7..0e23e3a 100644 (file)
@@ -236,7 +236,7 @@ void __naked return_to_handler(void)
                "bal ftrace_return_to_handler\n\t"
                "move $lp, $r0               \n\t"
 
-               /* restore state nedded by the ABI  */
+               /* restore state needed by the ABI  */
                "lmw.bim $r0,[$sp],$r1,#0x0  \n\t");
 }
 
index a741abb..ba9340e 100644 (file)
@@ -83,7 +83,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n);
 
 extern long strncpy_from_user(char *__to, const char __user *__from,
                              long __len);
-extern __must_check long strlen_user(const char __user *str);
 extern __must_check long strnlen_user(const char __user *s, long n);
 
 /* Optimized macros */
index 75f2da3..6e1e004 100644 (file)
@@ -43,7 +43,6 @@ CONFIG_MICREL_PHY=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
index afc3b8d..bde9907 100644 (file)
@@ -12,6 +12,7 @@ config PARISC
        select ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_NO_SG_CHAIN
+       select ARCH_SUPPORTS_HUGETLBFS if PA20
        select ARCH_SUPPORTS_MEMORY_FAILURE
        select DMA_OPS
        select RTC_CLASS
@@ -138,10 +139,6 @@ config PGTABLE_LEVELS
        default 3 if 64BIT && PARISC_PAGE_SIZE_4KB
        default 2
 
-config SYS_SUPPORTS_HUGETLBFS
-       def_bool y if PA20
-
-
 menu "Processor type and features"
 
 choice
index 4406475..e6e7f74 100644 (file)
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 generated-y += syscall_table_32.h
 generated-y += syscall_table_64.h
-generated-y += syscall_table_c32.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
 generic-y += user.h
index 8a11b8c..0b52591 100644 (file)
@@ -316,11 +316,6 @@ extern void iowrite64be(u64 val, void __iomem *addr);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 extern int devmem_is_allowed(unsigned long pfn);
 
 #endif
index ae3e108..d6d82f5 100644 (file)
@@ -365,4 +365,3 @@ void parisc_pdc_chassis_init(void);
                                         PDC_CHASSIS_EOM_SET            )
 
 #endif /* _PARISC_PDC_CHASSIS_H */
-/* vim: set ts=8 */
index e320bae..3fb86ee 100644 (file)
@@ -268,7 +268,7 @@ static int __init parisc_init_resources(void)
        result = request_resource(&iomem_resource, &local_broadcast);
        if (result < 0) {
                printk(KERN_ERR 
-                      "%s: failed to claim %saddress space!\n", 
+                      "%s: failed to claim %s address space!\n",
                       __FILE__, local_broadcast.name);
                return result;
        }
index 3225037..3f24a0a 100644 (file)
@@ -919,24 +919,24 @@ ENTRY(lws_table)
 END(lws_table)
        /* End of lws table */
 
+#ifdef CONFIG_64BIT
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, compat)
+#else
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)      __SYSCALL(nr, native)
+#endif
 #define __SYSCALL(nr, entry)   ASM_ULONG_INSN entry
        .align 8
 ENTRY(sys_call_table)
        .export sys_call_table,data
-#ifdef CONFIG_64BIT
-#include <asm/syscall_table_c32.h>   /* Compat syscalls */
-#else
-#include <asm/syscall_table_32.h>    /* 32-bit native syscalls */
-#endif
+#include <asm/syscall_table_32.h>    /* 32-bit syscalls */
 END(sys_call_table)
 
 #ifdef CONFIG_64BIT
        .align 8
 ENTRY(sys_call_table64)
-#include <asm/syscall_table_64.h>    /* 64-bit native syscalls */
+#include <asm/syscall_table_64.h>    /* 64-bit syscalls */
 END(sys_call_table64)
 #endif
-#undef __SYSCALL
 
        /*
                All light-weight-syscall atomic operations 
@@ -961,5 +961,3 @@ END(lws_lock_start)
        .previous
 
 .end
-
-
index 283f644..0f2ea5b 100644 (file)
@@ -6,46 +6,34 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')     \
          $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
 
 syscall := $(src)/syscall.tbl
-syshdr := $(srctree)/$(src)/syscallhdr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
 
 quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@'       \
-                  '$(syshdr_abis_$(basetarget))'               \
-                  '$(syshdr_pfx_$(basetarget))'                \
-                  '$(syshdr_offset_$(basetarget))'
+      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@
 
 quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@'       \
-                  '$(systbl_abis_$(basetarget))'               \
-                  '$(systbl_abi_$(basetarget))'                \
-                  '$(systbl_offset_$(basetarget))'
+      cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
 
-syshdr_abis_unistd_32 := common,32
+$(uapi)/unistd_32.h: abis := common,32
 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
-syshdr_abis_unistd_64 := common,64
+$(uapi)/unistd_64.h: abis := common,64
 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE
        $(call if_changed,syshdr)
 
-systbl_abis_syscall_table_32 := common,32
+$(kapi)/syscall_table_32.h: abis := common,32
 $(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
 
-systbl_abis_syscall_table_64 := common,64
+$(kapi)/syscall_table_64.h: abis := common,64
 $(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE
        $(call if_changed,systbl)
 
-systbl_abis_syscall_table_c32 := common,32
-systbl_abi_syscall_table_c32 := c32
-$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE
-       $(call if_changed,systbl)
-
 uapisyshdr-y           += unistd_32.h unistd_64.h
 kapisyshdr-y           += syscall_table_32.h           \
-                          syscall_table_64.h           \
-                          syscall_table_c32.h
+                          syscall_table_64.h
 
 uapisyshdr-y   := $(addprefix $(uapi)/, $(uapisyshdr-y))
 kapisyshdr-y   := $(addprefix $(kapi)/, $(kapisyshdr-y))
index 80fba3f..5ac80b8 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
diff --git a/arch/parisc/kernel/syscalls/syscallhdr.sh b/arch/parisc/kernel/syscalls/syscallhdr.sh
deleted file mode 100644 (file)
index 730db28..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_UAPI_ASM_PARISC_`basename "$out" | sed \
-       -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
-       -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-       printf "#ifndef %s\n" "${fileguard}"
-       printf "#define %s\n" "${fileguard}"
-       printf "\n"
-
-       nxt=0
-       while read nr abi name entry compat ; do
-               if [ -z "$offset" ]; then
-                       printf "#define __NR_%s%s\t%s\n" \
-                               "${prefix}" "${name}" "${nr}"
-               else
-                       printf "#define __NR_%s%s\t(%s + %s)\n" \
-                               "${prefix}" "${name}" "${offset}" "${nr}"
-               fi
-               nxt=$((nr+1))
-       done
-
-       printf "\n"
-       printf "#ifdef __KERNEL__\n"
-       printf "#define __NR_syscalls\t%s\n" "${nxt}"
-       printf "#endif\n"
-       printf "\n"
-       printf "#endif /* %s */\n" "${fileguard}"
-) > "$out"
diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh
deleted file mode 100644 (file)
index f7393a7..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-my_abi="$4"
-offset="$5"
-
-emit() {
-       t_nxt="$1"
-       t_nr="$2"
-       t_entry="$3"
-
-       while [ $t_nxt -lt $t_nr ]; do
-               printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
-               t_nxt=$((t_nxt+1))
-       done
-       printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
-}
-
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
-       nxt=0
-       if [ -z "$offset" ]; then
-               offset=0
-       fi
-
-       while read nr abi name entry compat ; do
-               if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then
-                       emit $((nxt+offset)) $((nr+offset)) $compat
-               else
-                       emit $((nxt+offset)) $((nr+offset)) $entry
-               fi
-               nxt=$((nr+1))
-       done
-) > "$out"
index 43652de..d1d3990 100644 (file)
@@ -44,7 +44,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 }
 
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
index 1e6230b..088dd2a 100644 (file)
@@ -118,28 +118,31 @@ config PPC
        # Please keep this list sorted alphabetically.
        #
        select ARCH_32BIT_OFF_T if PPC32
+       select ARCH_ENABLE_MEMORY_HOTPLUG
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
+       select ARCH_HAS_COPY_MC                 if PPC64
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
+       select ARCH_HAS_DMA_MAP_DIRECT          if PPC_PSERIES
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
-       select ARCH_HAS_KCOV
        select ARCH_HAS_HUGEPD                  if HUGETLB_PAGE
+       select ARCH_HAS_KCOV
+       select ARCH_HAS_MEMBARRIER_CALLBACKS
+       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_MEMREMAP_COMPAT_ALIGN
        select ARCH_HAS_MMIOWB                  if PPC64
+       select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select ARCH_HAS_PHYS_TO_DMA
        select ARCH_HAS_PMEM_API
-       select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select ARCH_HAS_PTE_DEVMAP              if PPC_BOOK3S_64
        select ARCH_HAS_PTE_SPECIAL
-       select ARCH_HAS_MEMBARRIER_CALLBACKS
-       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_SCALED_CPUTIME          if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
        select ARCH_HAS_STRICT_KERNEL_RWX       if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION)
        select ARCH_HAS_TICK_BROADCAST          if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAS_UACCESS_FLUSHCACHE
-       select ARCH_HAS_COPY_MC                 if PPC64
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select ARCH_KEEP_MEMBLOCK
@@ -162,9 +165,8 @@ config PPC
        select BUILDTIME_TABLE_SORT
        select CLONE_BACKWARDS
        select DCACHE_WORD_ACCESS               if PPC64 && CPU_LITTLE_ENDIAN
-       select DMA_OPS                          if PPC64
        select DMA_OPS_BYPASS                   if PPC64
-       select ARCH_HAS_DMA_MAP_DIRECT          if PPC64 && PPC_PSERIES
+       select DMA_OPS                          if PPC64
        select DYNAMIC_FTRACE                   if FUNCTION_TRACER
        select EDAC_ATOMIC_SCRUB
        select EDAC_SUPPORT
@@ -184,23 +186,22 @@ config PPC
        select GENERIC_TIME_VSYSCALL
        select GENERIC_VDSO_TIME_NS
        select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_ARCH_HUGE_VMALLOC           if HAVE_ARCH_HUGE_VMAP
        select HAVE_ARCH_HUGE_VMAP              if PPC_BOOK3S_64 && PPC_RADIX_MMU
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_JUMP_LABEL_RELATIVE
        select HAVE_ARCH_KASAN                  if PPC32 && PPC_PAGE_SHIFT <= 14
        select HAVE_ARCH_KASAN_VMALLOC          if PPC32 && PPC_PAGE_SHIFT <= 14
-       select HAVE_ARCH_KGDB
        select HAVE_ARCH_KFENCE                 if PPC32
+       select HAVE_ARCH_KGDB
        select HAVE_ARCH_MMAP_RND_BITS
        select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if COMPAT
        select HAVE_ARCH_NVRAM_OPS
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ARCH_TRACEHOOK
        select HAVE_ASM_MODVERSIONS
-       select HAVE_C_RECORDMCOUNT
-       select HAVE_STACKPROTECTOR              if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
-       select HAVE_STACKPROTECTOR              if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
        select HAVE_CONTEXT_TRACKING            if PPC64
+       select HAVE_C_RECORDMCOUNT
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_DYNAMIC_FTRACE
@@ -214,10 +215,13 @@ config PPC
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS                 if GCC_VERSION >= 50200   # plugin support on gcc <= 5.1 is buggy on PPC
        select HAVE_GENERIC_VDSO
+       select HAVE_HARDLOCKUP_DETECTOR_ARCH    if PPC_BOOK3S_64 && SMP
+       select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
        select HAVE_HW_BREAKPOINT               if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
        select HAVE_IDE
        select HAVE_IOREMAP_PROT
        select HAVE_IRQ_EXIT_ON_IRQ_STACK
+       select HAVE_IRQ_TIME_ACCOUNTING
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_LZMA                 if DEFAULT_UIMAGE
        select HAVE_KERNEL_LZO                  if DEFAULT_UIMAGE
@@ -229,25 +233,25 @@ config PPC
        select HAVE_LIVEPATCH                   if HAVE_DYNAMIC_FTRACE_WITH_REGS
        select HAVE_MOD_ARCH_SPECIFIC
        select HAVE_NMI                         if PERF_EVENTS || (PPC64 && PPC_BOOK3S)
-       select HAVE_HARDLOCKUP_DETECTOR_ARCH    if PPC64 && PPC_BOOK3S && SMP
        select HAVE_OPTPROBES
        select HAVE_PERF_EVENTS
        select HAVE_PERF_EVENTS_NMI             if PPC64
-       select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
        select HAVE_PERF_REGS
        select HAVE_PERF_USER_STACK_DUMP
-       select MMU_GATHER_RCU_TABLE_FREE
-       select MMU_GATHER_PAGE_SIZE
        select HAVE_REGS_AND_STACK_ACCESS_API
        select HAVE_RELIABLE_STACKTRACE
+       select HAVE_RSEQ
        select HAVE_SOFTIRQ_ON_OWN_STACK
+       select HAVE_STACKPROTECTOR              if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
+       select HAVE_STACKPROTECTOR              if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_VIRT_CPU_ACCOUNTING
-       select HAVE_IRQ_TIME_ACCOUNTING
-       select HAVE_RSEQ
+       select HUGETLB_PAGE_SIZE_VARIABLE       if PPC_BOOK3S_64 && HUGETLB_PAGE
        select IOMMU_HELPER                     if PPC64
        select IRQ_DOMAIN
        select IRQ_FORCED_THREADING
+       select MMU_GATHER_PAGE_SIZE
+       select MMU_GATHER_RCU_TABLE_FREE
        select MODULES_USE_ELF_RELA
        select NEED_DMA_MAP_STATE               if PPC64 || NOT_COHERENT_CACHE
        select NEED_SG_DMA_LENGTH
@@ -420,11 +424,6 @@ config HIGHMEM
 
 source "kernel/Kconfig.hz"
 
-config HUGETLB_PAGE_SIZE_VARIABLE
-       bool
-       depends on HUGETLB_PAGE && PPC_BOOK3S_64
-       default y
-
 config MATH_EMULATION
        bool "Math emulation"
        depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
@@ -520,12 +519,6 @@ config ARCH_CPU_PROBE_RELEASE
        def_bool y
        depends on HOTPLUG_CPU
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-
 config PPC64_SUPPORTS_MEMORY_FAILURE
        bool "Add support for memory hwpoison"
        depends on PPC_BOOK3S_64
@@ -705,9 +698,6 @@ config ARCH_SPARSEMEM_DEFAULT
        def_bool y
        depends on PPC_BOOK3S_64
 
-config SYS_SUPPORTS_HUGETLBFS
-       bool
-
 config ILLEGAL_POINTER_VALUE
        hex
        # This is roughly half way between the top of user space and the bottom
index 30a31ad..c0fbadb 100644 (file)
@@ -7,6 +7,8 @@
 #ifndef __FSL_PAMU_STASH_H
 #define __FSL_PAMU_STASH_H
 
+struct iommu_domain;
+
 /* cache stash targets */
 enum pamu_stash_target {
        PAMU_ATTR_CACHE_L1 = 1,
@@ -14,14 +16,6 @@ enum pamu_stash_target {
        PAMU_ATTR_CACHE_L3,
 };
 
-/*
- * This attribute allows configuring stashig specific parameters
- * in the PAMU hardware.
- */
-
-struct pamu_stash_attribute {
-       u32     cpu;    /* cpu number */
-       u32     cache;  /* cache to stash to: L1,L2,L3 */
-};
+int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu);
 
 #endif  /* __FSL_PAMU_STASH_H */
index bc76970..debe8c4 100644 (file)
@@ -12,7 +12,7 @@
 
 #ifdef __ASSEMBLY__
 
-/* Based off of objdump optput from glibc */
+/* Based off of objdump output from glibc */
 
 #define MCOUNT_SAVE_FRAME                      \
        stwu    r1,-48(r1);                     \
@@ -52,7 +52,7 @@ extern void _mcount(void);
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
-       /* reloction of mcount call site is the same as the address */
+       /* relocation of mcount call site is the same as the address */
        return addr;
 }
 
index 273edd2..f130783 100644 (file)
@@ -662,11 +662,6 @@ static inline void name at                                 \
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 /*
  * We don't do relaxed operations yet, at least not with this semantic
  */
index c581215..e6b53c6 100644 (file)
@@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
                                      unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
-extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
-extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
-extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
+extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                           unsigned long gfn);
+extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                         unsigned long gfn);
+extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                              unsigned long gfn);
 extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map);
 extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
index 05fb00d..1e83359 100644 (file)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 
-extern int kvm_unmap_hva_range(struct kvm *kvm,
-                              unsigned long start, unsigned long end,
-                              unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
 #define HPTEG_CACHE_NUM                        (1 << 15)
 #define HPTEG_HASH_BITS_PTE            13
 #define HPTEG_HASH_BITS_PTE_LONG       12
index 9531b1c..5bf8ae9 100644 (file)
@@ -281,11 +281,10 @@ struct kvmppc_ops {
                                     const struct kvm_memory_slot *old,
                                     const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change);
-       int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
-                          unsigned long end);
-       int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
-       int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
-       void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+       bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
        void (*free_memslot)(struct kvm_memory_slot *slot);
        int (*init_vm)(struct kvm *kvm);
        void (*destroy_vm)(struct kvm *kvm);
index c761572..6ea9001 100644 (file)
@@ -28,9 +28,6 @@ extern struct device_node *opal_node;
 
 /* API functions */
 int64_t opal_invalid_call(void);
-int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf);
-int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr,
-                       uint64_t bdf);
 int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
                        uint64_t lpcr);
 int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t bdfn,
index d2a2a14..74424c1 100644 (file)
@@ -126,7 +126,6 @@ struct pci_controller {
 #endif /* CONFIG_PPC64 */
 
        void *private_data;
-       struct npu *npu;
 };
 
 /* These are used for config access before all the PCI probing
index 6436f0b..d1f5326 100644 (file)
@@ -119,11 +119,4 @@ extern void pcibios_scan_phb(struct pci_controller *hose);
 
 #endif /* __KERNEL__ */
 
-extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev);
-extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
-extern int pnv_npu2_init(struct pci_controller *hose);
-extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
-               unsigned long msr);
-extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
-
 #endif /* __ASM_POWERPC_PCI_H */
index fab8402..3f35c8d 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/moduleloader.h>
 #include <linux/err.h>
 #include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include <linux/bug.h>
 #include <asm/module.h>
 #include <linux/uaccess.h>
@@ -88,17 +89,22 @@ int module_finalize(const Elf_Ehdr *hdr,
        return 0;
 }
 
-#ifdef MODULES_VADDR
 static __always_inline void *
 __module_alloc(unsigned long size, unsigned long start, unsigned long end)
 {
-       return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL,
-                                   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-                                   __builtin_return_address(0));
+       /*
+        * Don't do huge page allocations for modules yet until more testing
+        * is done. STRICT_MODULE_RWX may require extra work to support this
+        * too.
+        */
+       return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL, PAGE_KERNEL_EXEC,
+                                   VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP,
+                                   NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 void *module_alloc(unsigned long size)
 {
+#ifdef MODULES_VADDR
        unsigned long limit = (unsigned long)_etext - SZ_32M;
        void *ptr = NULL;
 
@@ -112,5 +118,7 @@ void *module_alloc(unsigned long size)
                ptr = __module_alloc(size, MODULES_VADDR, MODULES_END);
 
        return ptr;
-}
+#else
+       return __module_alloc(size, VMALLOC_START, VMALLOC_END);
 #endif
+}
index f66f9c9..2e68fbb 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index f9eb49e..5056e17 100644 (file)
@@ -950,6 +950,93 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image)
        return (unsigned int)(usm_entries * sizeof(u64));
 }
 
+/**
+ * add_node_props - Reads node properties from device node structure and add
+ *                  them to fdt.
+ * @fdt:            Flattened device tree of the kernel
+ * @node_offset:    offset of the node to add a property at
+ * @dn:             device node pointer
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_node_props(void *fdt, int node_offset, const struct device_node *dn)
+{
+       int ret = 0;
+       struct property *pp;
+
+       if (!dn)
+               return -EINVAL;
+
+       for_each_property_of_node(dn, pp) {
+               ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, pp->length);
+               if (ret < 0) {
+                       pr_err("Unable to add %s property: %s\n", pp->name, fdt_strerror(ret));
+                       return ret;
+               }
+       }
+       return ret;
+}
+
+/**
+ * update_cpus_node - Update cpus node of flattened device tree using of_root
+ *                    device node.
+ * @fdt:              Flattened device tree of the kernel.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int update_cpus_node(void *fdt)
+{
+       struct device_node *cpus_node, *dn;
+       int cpus_offset, cpus_subnode_offset, ret = 0;
+
+       cpus_offset = fdt_path_offset(fdt, "/cpus");
+       if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) {
+               pr_err("Malformed device tree: error reading /cpus node: %s\n",
+                      fdt_strerror(cpus_offset));
+               return cpus_offset;
+       }
+
+       if (cpus_offset > 0) {
+               ret = fdt_del_node(fdt, cpus_offset);
+               if (ret < 0) {
+                       pr_err("Error deleting /cpus node: %s\n", fdt_strerror(ret));
+                       return -EINVAL;
+               }
+       }
+
+       /* Add cpus node to fdt */
+       cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus");
+       if (cpus_offset < 0) {
+               pr_err("Error creating /cpus node: %s\n", fdt_strerror(cpus_offset));
+               return -EINVAL;
+       }
+
+       /* Add cpus node properties */
+       cpus_node = of_find_node_by_path("/cpus");
+       ret = add_node_props(fdt, cpus_offset, cpus_node);
+       of_node_put(cpus_node);
+       if (ret < 0)
+               return ret;
+
+       /* Loop through all subnodes of cpus and add them to fdt */
+       for_each_node_by_type(dn, "cpu") {
+               cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name);
+               if (cpus_subnode_offset < 0) {
+                       pr_err("Unable to add %s subnode: %s\n", dn->full_name,
+                              fdt_strerror(cpus_subnode_offset));
+                       ret = cpus_subnode_offset;
+                       goto out;
+               }
+
+               ret = add_node_props(fdt, cpus_subnode_offset, dn);
+               if (ret < 0)
+                       goto out;
+       }
+out:
+       of_node_put(dn);
+       return ret;
+}
+
 /**
  * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
  *                       being loaded.
@@ -1006,6 +1093,11 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
                }
        }
 
+       /* Update cpus nodes information to account hotplug CPUs. */
+       ret =  update_cpus_node(fdt);
+       if (ret < 0)
+               goto out;
+
        /* Update memory reserve map */
        ret = get_reserved_memory_ranges(&rmem);
        if (ret)
index 44bf567..2b691f4 100644 (file)
@@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
        kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+       return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+       return kvm->arch.kvm_ops->age_gfn(kvm, range);
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+       return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
-       return 0;
+       return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
index 9b6323e..740e51d 100644 (file)
@@ -9,12 +9,10 @@
 
 extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
                                         struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
-                                 unsigned long end);
-extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
-                         unsigned long end);
-extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 
 extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
index bb67735..2d9193c 100644 (file)
@@ -752,51 +752,6 @@ void kvmppc_rmap_reset(struct kvm *kvm)
        srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
-typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                             unsigned long gfn);
-
-static int kvm_handle_hva_range(struct kvm *kvm,
-                               unsigned long start,
-                               unsigned long end,
-                               hva_handler_fn handler)
-{
-       int ret;
-       int retval = 0;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn, gfn+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-               for (; gfn < gfn_end; ++gfn) {
-                       ret = handler(kvm, memslot, gfn);
-                       retval |= ret;
-               }
-       }
-
-       return retval;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         hva_handler_fn handler)
-{
-       return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
-}
-
 /* Must be called with both HPTE and rmap locked */
 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
                              struct kvm_memory_slot *memslot,
@@ -840,8 +795,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
        }
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                          unsigned long gfn)
+static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                           unsigned long gfn)
 {
        unsigned long i;
        __be64 *hptep;
@@ -874,16 +829,21 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                unlock_rmap(rmapp);
                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        }
-       return 0;
 }
 
-int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       gfn_t gfn;
 
-       handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
-       kvm_handle_hva_range(kvm, start, end, handler);
-       return 0;
+       if (kvm_is_radix(kvm)) {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       kvm_unmap_radix(kvm, range->slot, gfn);
+       } else {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       kvm_unmap_rmapp(kvm, range->slot, gfn);
+       }
+
+       return false;
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
@@ -913,8 +873,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
        }
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                        unsigned long gfn)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                         unsigned long gfn)
 {
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
@@ -968,26 +928,34 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
        return ret;
 }
 
-int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       gfn_t gfn;
+       bool ret = false;
 
-       handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
-       return kvm_handle_hva_range(kvm, start, end, handler);
+       if (kvm_is_radix(kvm)) {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       ret |= kvm_age_radix(kvm, range->slot, gfn);
+       } else {
+               for (gfn = range->start; gfn < range->end; gfn++)
+                       ret |= kvm_age_rmapp(kvm, range->slot, gfn);
+       }
+
+       return ret;
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                             unsigned long gfn)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                              unsigned long gfn)
 {
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
        unsigned long *hp;
-       int ret = 1;
+       bool ret = true;
        unsigned long *rmapp;
 
        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
-               return 1;
+               return true;
 
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
@@ -1002,27 +970,33 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                                goto out;
                } while ((i = j) != head);
        }
-       ret = 0;
+       ret = false;
 
  out:
        unlock_rmap(rmapp);
        return ret;
 }
 
-int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       WARN_ON(range->start + 1 != range->end);
 
-       handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
-       return kvm_handle_hva(kvm, hva, handler);
+       if (kvm_is_radix(kvm))
+               return kvm_test_age_radix(kvm, range->slot, range->start);
+       else
+               return kvm_test_age_rmapp(kvm, range->slot, range->start);
 }
 
-void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       WARN_ON(range->start + 1 != range->end);
+
+       if (kvm_is_radix(kvm))
+               kvm_unmap_radix(kvm, range->slot, range->start);
+       else
+               kvm_unmap_rmapp(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
-       kvm_handle_hva(kvm, hva, handler);
+       return false;
 }
 
 static int vcpus_running(struct kvm *kvm)
index e603de7..d909c06 100644 (file)
@@ -993,8 +993,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                   unsigned long gfn)
+void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                    unsigned long gfn)
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
@@ -1002,24 +1002,23 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
-               return 0;
+               return;
        }
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep))
                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                 kvm->arch.lpid);
-       return 0;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                 unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                  unsigned long gfn)
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       int ref = 0;
+       bool ref = false;
        unsigned long old, *rmapp;
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
@@ -1035,26 +1034,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
-               ref = 1;
+               ref = true;
        }
        return ref;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                      unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                       unsigned long gfn)
+
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       int ref = 0;
+       bool ref = false;
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
-               ref = 1;
+               ref = true;
        return ref;
 }
 
index 4a53241..28a80d2 100644 (file)
@@ -4812,7 +4812,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
                kvmhv_release_all_nested(kvm);
        kvmppc_rmap_reset(kvm);
        kvm->arch.process_table = 0;
-       /* Mutual exclusion with kvm_unmap_hva_range etc. */
+       /* Mutual exclusion with kvm_unmap_gfn_range etc. */
        spin_lock(&kvm->mmu_lock);
        kvm->arch.radix = 0;
        spin_unlock(&kvm->mmu_lock);
@@ -4834,7 +4834,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
        if (err)
                return err;
        kvmppc_rmap_reset(kvm);
-       /* Mutual exclusion with kvm_unmap_hva_range etc. */
+       /* Mutual exclusion with kvm_unmap_gfn_range etc. */
        spin_lock(&kvm->mmu_lock);
        kvm->arch.radix = 1;
        spin_unlock(&kvm->mmu_lock);
@@ -5699,10 +5699,10 @@ static struct kvmppc_ops kvm_ops_hv = {
        .flush_memslot  = kvmppc_core_flush_memslot_hv,
        .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
        .commit_memory_region  = kvmppc_core_commit_memory_region_hv,
-       .unmap_hva_range = kvm_unmap_hva_range_hv,
-       .age_hva  = kvm_age_hva_hv,
-       .test_age_hva = kvm_test_age_hva_hv,
-       .set_spte_hva = kvm_set_spte_hva_hv,
+       .unmap_gfn_range = kvm_unmap_gfn_range_hv,
+       .age_gfn = kvm_age_gfn_hv,
+       .test_age_gfn = kvm_test_age_gfn_hv,
+       .set_spte_gfn = kvm_set_spte_gfn_hv,
        .free_memslot = kvmppc_core_free_memslot_hv,
        .init_vm =  kvmppc_core_init_vm_hv,
        .destroy_vm = kvmppc_core_destroy_vm_hv,
index 913944d..d7733b0 100644 (file)
@@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
 }
 
 /************* MMU Notifiers *************/
-static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
-                            unsigned long end)
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        long i;
        struct kvm_vcpu *vcpu;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
 
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+                                     range->end << PAGE_SHIFT);
 
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn, gfn+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
-                                             gfn_end << PAGE_SHIFT);
-       }
+       return false;
 }
 
-static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
-                                 unsigned long end)
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       do_kvm_unmap_hva(kvm, start, end);
-
-       return 0;
+       return do_kvm_unmap_gfn(kvm, range);
 }
 
-static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
-                         unsigned long end)
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* The page will get remapped properly on its next fault */
-       do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+       return do_kvm_unmap_gfn(kvm, range);
 }
 
 /*****************************************/
@@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = {
        .flush_memslot = kvmppc_core_flush_memslot_pr,
        .prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
        .commit_memory_region = kvmppc_core_commit_memory_region_pr,
-       .unmap_hva_range = kvm_unmap_hva_range_pr,
-       .age_hva  = kvm_age_hva_pr,
-       .test_age_hva = kvm_test_age_hva_pr,
-       .set_spte_hva = kvm_set_spte_hva_pr,
+       .unmap_gfn_range = kvm_unmap_gfn_range_pr,
+       .age_gfn  = kvm_age_gfn_pr,
+       .test_age_gfn = kvm_test_age_gfn_pr,
+       .set_spte_gfn = kvm_set_spte_gfn_pr,
        .free_memslot = kvmppc_core_free_memslot_pr,
        .init_vm = kvmppc_core_init_vm_pr,
        .destroy_vm = kvmppc_core_destroy_vm_pr,
index ed0c9c4..7f16afc 100644 (file)
@@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
 
 /************* MMU Notifiers *************/
 
-static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       trace_kvm_unmap_hva(hva);
-
        /*
         * Flush all shadow tlb entries everywhere. This is slow, but
         * we are 100% sure that we catch the to be unmapped page
         */
-       kvm_flush_remote_tlbs(kvm);
-
-       return 0;
+       return true;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       /* kvm_unmap_hva flushes everything anyways */
-       kvm_unmap_hva(kvm, start);
-
-       return 0;
+       return kvm_e500_mmu_unmap_gfn(kvm, range);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* The page will get remapped properly on its next fault */
-       kvm_unmap_hva(kvm, hva);
-       return 0;
+       return kvm_e500_mmu_unmap_gfn(kvm, range);
 }
 
 /*****************************************/
index 3837842..eff6e82 100644 (file)
@@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit,
                )
 );
 
-TRACE_EVENT(kvm_unmap_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
 TRACE_EVENT(kvm_booke206_stlb_write,
        TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
        TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
index f2c690e..cc1a8a0 100644 (file)
@@ -5,6 +5,9 @@
 
 ccflags-$(CONFIG_PPC64)        := $(NO_MINIMAL_TOC)
 
+CFLAGS_code-patching.o += -fno-stack-protector
+CFLAGS_feature-fixups.o += -fno-stack-protector
+
 CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
index d142b76..9a75ba0 100644 (file)
@@ -106,7 +106,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  * At this point we do the placement change only for BOOK3S 64. This would
  * possibly work on other subarchs.
  */
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, unsigned long sz)
 {
        pgd_t *pg;
        p4d_t *p4;
index e4b0566..f998e65 100644 (file)
@@ -40,8 +40,8 @@ config PPC_85xx
 
 config PPC_8xx
        bool "Freescale 8xx"
+       select ARCH_SUPPORTS_HUGETLBFS
        select FSL_SOC
-       select SYS_SUPPORTS_HUGETLBFS
        select PPC_HAVE_KUEP
        select PPC_HAVE_KUAP
        select HAVE_ARCH_VMAP_STACK
@@ -95,9 +95,11 @@ config PPC_BOOK3S_64
        bool "Server processors"
        select PPC_FPU
        select PPC_HAVE_PMU_SUPPORT
-       select SYS_SUPPORTS_HUGETLBFS
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+       select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+       select ARCH_ENABLE_PMD_SPLIT_PTLOCK
        select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
+       select ARCH_SUPPORTS_HUGETLBFS
        select ARCH_SUPPORTS_NUMA_BALANCING
        select IRQ_WORK
        select PPC_MM_SLICES
@@ -280,9 +282,9 @@ config FSL_BOOKE
 # this is for common code between PPC32 & PPC64 FSL BOOKE
 config PPC_FSL_BOOK3E
        bool
+       select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64
        select FSL_EMB_PERFMON
        select PPC_SMP_MUXED_IPI
-       select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64
        select PPC_DOORBELL
        default y if FSL_BOOKE
 
@@ -358,10 +360,6 @@ config SPE
 
          If in doubt, say Y here.
 
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y
-       depends on PPC_BOOK3S_64
-
 config PPC_RADIX_MMU
        bool "Radix MMU Support"
        depends on PPC_BOOK3S_64
@@ -421,10 +419,6 @@ config PPC_PKEY
        depends on PPC_BOOK3S_64
        depends on PPC_MEM_KEYS || PPC_KUAP || PPC_KUEP
 
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
-       def_bool y
-       depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION
-
 
 config PPC_MMU_NOHASH
        def_bool y
index 2eb6ae1..be2546b 100644 (file)
@@ -10,7 +10,7 @@ obj-$(CONFIG_SMP)     += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_FA_DUMP)  += opal-fadump.o
 obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
 obj-$(CONFIG_OPAL_CORE)        += opal-core.o
-obj-$(CONFIG_PCI)      += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI)      += pci.o pci-ioda.o pci-ioda-tce.o
 obj-$(CONFIG_PCI_IOV)   += pci-sriov.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH)      += eeh-powernv.o
index 71c1262..537a4da 100644 (file)
@@ -104,8 +104,8 @@ static void memtrace_clear_range(unsigned long start_pfn,
         * Before we go ahead and use this range as cache inhibited range
         * flush the cache.
         */
-       flush_dcache_range_chunked(PFN_PHYS(start_pfn),
-                                  PFN_PHYS(start_pfn + nr_pages),
+       flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
+                                  (unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
                                   FLUSH_CHUNK_SIZE);
 }
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
deleted file mode 100644 (file)
index b711dc3..0000000
+++ /dev/null
@@ -1,705 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * This file implements the DMA operations for NVLink devices. The NPU
- * devices all point to the same iommu table as the parent PCI device.
- *
- * Copyright Alistair Popple, IBM Corporation 2015.
- */
-
-#include <linux/mmu_notifier.h>
-#include <linux/mmu_context.h>
-#include <linux/of.h>
-#include <linux/pci.h>
-#include <linux/memblock.h>
-#include <linux/sizes.h>
-
-#include <asm/debugfs.h>
-#include <asm/powernv.h>
-#include <asm/ppc-pci.h>
-#include <asm/opal.h>
-
-#include "pci.h"
-
-static struct pci_dev *get_pci_dev(struct device_node *dn)
-{
-       struct pci_dn *pdn = PCI_DN(dn);
-       struct pci_dev *pdev;
-
-       pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
-                                          pdn->busno, pdn->devfn);
-
-       /*
-        * pci_get_domain_bus_and_slot() increased the reference count of
-        * the PCI device, but callers don't need that actually as the PE
-        * already holds a reference to the device. Since callers aren't
-        * aware of the reference count change, call pci_dev_put() now to
-        * avoid leaks.
-        */
-       if (pdev)
-               pci_dev_put(pdev);
-
-       return pdev;
-}
-
-/* Given a NPU device get the associated PCI device. */
-struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
-{
-       struct device_node *dn;
-       struct pci_dev *gpdev;
-
-       if (WARN_ON(!npdev))
-               return NULL;
-
-       if (WARN_ON(!npdev->dev.of_node))
-               return NULL;
-
-       /* Get assoicated PCI device */
-       dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
-       if (!dn)
-               return NULL;
-
-       gpdev = get_pci_dev(dn);
-       of_node_put(dn);
-
-       return gpdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
-
-/* Given the real PCI device get a linked NPU device. */
-struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
-{
-       struct device_node *dn;
-       struct pci_dev *npdev;
-
-       if (WARN_ON(!gpdev))
-               return NULL;
-
-       /* Not all PCI devices have device-tree nodes */
-       if (!gpdev->dev.of_node)
-               return NULL;
-
-       /* Get assoicated PCI device */
-       dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
-       if (!dn)
-               return NULL;
-
-       npdev = get_pci_dev(dn);
-       of_node_put(dn);
-
-       return npdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_npu_dev);
-
-#ifdef CONFIG_IOMMU_API
-/*
- * Returns the PE assoicated with the PCI device of the given
- * NPU. Returns the linked pci device if pci_dev != NULL.
- */
-static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
-                                                 struct pci_dev **gpdev)
-{
-       struct pnv_phb *phb;
-       struct pci_controller *hose;
-       struct pci_dev *pdev;
-       struct pnv_ioda_pe *pe;
-       struct pci_dn *pdn;
-
-       pdev = pnv_pci_get_gpu_dev(npe->pdev);
-       if (!pdev)
-               return NULL;
-
-       pdn = pci_get_pdn(pdev);
-       if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-               return NULL;
-
-       hose = pci_bus_to_host(pdev->bus);
-       phb = hose->private_data;
-       pe = &phb->ioda.pe_array[pdn->pe_number];
-
-       if (gpdev)
-               *gpdev = pdev;
-
-       return pe;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group,
-               int num);
-
-static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
-               struct iommu_table *tbl)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pnv_phb *phb = npe->phb;
-       int64_t rc;
-       const unsigned long size = tbl->it_indirect_levels ?
-               tbl->it_level_size : tbl->it_size;
-       const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
-       const __u64 win_size = tbl->it_size << tbl->it_page_shift;
-       int num2 = (num == 0) ? 1 : 0;
-
-       /* NPU has just one TVE so if there is another table, remove it first */
-       if (npe->table_group.tables[num2])
-               pnv_npu_unset_window(&npe->table_group, num2);
-
-       pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
-                       start_addr, start_addr + win_size - 1,
-                       IOMMU_PAGE_SIZE(tbl));
-
-       rc = opal_pci_map_pe_dma_window(phb->opal_id,
-                       npe->pe_number,
-                       npe->pe_number,
-                       tbl->it_indirect_levels + 1,
-                       __pa(tbl->it_base),
-                       size << 3,
-                       IOMMU_PAGE_SIZE(tbl));
-       if (rc) {
-               pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
-               return rc;
-       }
-       pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
-       /* Add the table to the list so its TCE cache will get invalidated */
-       pnv_pci_link_table_and_group(phb->hose->node, num,
-                       tbl, &npe->table_group);
-
-       return 0;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pnv_phb *phb = npe->phb;
-       int64_t rc;
-
-       if (!npe->table_group.tables[num])
-               return 0;
-
-       pe_info(npe, "Removing DMA window\n");
-
-       rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
-                       npe->pe_number,
-                       0/* levels */, 0/* table address */,
-                       0/* table size */, 0/* page size */);
-       if (rc) {
-               pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
-               return rc;
-       }
-       pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
-       pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
-                       &npe->table_group);
-
-       return 0;
-}
-
-/* Switch ownership from platform code to external user (e.g. VFIO) */
-static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pnv_phb *phb = npe->phb;
-       int64_t rc;
-       struct pci_dev *gpdev = NULL;
-
-       /*
-        * Note: NPU has just a single TVE in the hardware which means that
-        * while used by the kernel, it can have either 32bit window or
-        * DMA bypass but never both. So we deconfigure 32bit window only
-        * if it was enabled at the moment of ownership change.
-        */
-       if (npe->table_group.tables[0]) {
-               pnv_npu_unset_window(&npe->table_group, 0);
-               return;
-       }
-
-       /* Disable bypass */
-       rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
-                       npe->pe_number, npe->pe_number,
-                       0 /* bypass base */, 0);
-       if (rc) {
-               pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
-               return;
-       }
-       pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
-
-       get_gpu_pci_dev_and_pe(npe, &gpdev);
-       if (gpdev)
-               pnv_npu2_unmap_lpar_dev(gpdev);
-}
-
-static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
-{
-       struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
-                       table_group);
-       struct pci_dev *gpdev = NULL;
-
-       get_gpu_pci_dev_and_pe(npe, &gpdev);
-       if (gpdev)
-               pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
-}
-
-static struct iommu_table_group_ops pnv_pci_npu_ops = {
-       .set_window = pnv_npu_set_window,
-       .unset_window = pnv_npu_unset_window,
-       .take_ownership = pnv_npu_take_ownership,
-       .release_ownership = pnv_npu_release_ownership,
-};
-#endif /* !CONFIG_IOMMU_API */
-
-/*
- * NPU2 ATS
- */
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-#define NV_NPU_MAX_PE_NUM      16
-
-/*
- * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
- * up to 3 x (GPU + 2xNPUs) (POWER9).
- */
-struct npu_comp {
-       struct iommu_table_group table_group;
-       int pe_num;
-       struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
-};
-
-/* An NPU descriptor, valid for POWER9 only */
-struct npu {
-       int index;
-       struct npu_comp npucomp;
-};
-
-#ifdef CONFIG_IOMMU_API
-static long pnv_npu_peers_create_table_userspace(
-               struct iommu_table_group *table_group,
-               int num, __u32 page_shift, __u64 window_size, __u32 levels,
-               struct iommu_table **ptbl)
-{
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       if (!npucomp->pe_num || !npucomp->pe[0] ||
-                       !npucomp->pe[0]->table_group.ops ||
-                       !npucomp->pe[0]->table_group.ops->create_table)
-               return -EFAULT;
-
-       return npucomp->pe[0]->table_group.ops->create_table(
-                       &npucomp->pe[0]->table_group, num, page_shift,
-                       window_size, levels, ptbl);
-}
-
-static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
-               int num, struct iommu_table *tbl)
-{
-       int i, j;
-       long ret = 0;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               if (!pe->table_group.ops->set_window)
-                       continue;
-
-               ret = pe->table_group.ops->set_window(&pe->table_group,
-                               num, tbl);
-               if (ret)
-                       break;
-       }
-
-       if (ret) {
-               for (j = 0; j < i; ++j) {
-                       struct pnv_ioda_pe *pe = npucomp->pe[j];
-
-                       if (!pe->table_group.ops->unset_window)
-                               continue;
-
-                       ret = pe->table_group.ops->unset_window(
-                                       &pe->table_group, num);
-                       if (ret)
-                               break;
-               }
-       } else {
-               table_group->tables[num] = iommu_tce_table_get(tbl);
-       }
-
-       return ret;
-}
-
-static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
-               int num)
-{
-       int i, j;
-       long ret = 0;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               WARN_ON(npucomp->table_group.tables[num] !=
-                               table_group->tables[num]);
-               if (!npucomp->table_group.tables[num])
-                       continue;
-
-               if (!pe->table_group.ops->unset_window)
-                       continue;
-
-               ret = pe->table_group.ops->unset_window(&pe->table_group, num);
-               if (ret)
-                       break;
-       }
-
-       if (ret) {
-               for (j = 0; j < i; ++j) {
-                       struct pnv_ioda_pe *pe = npucomp->pe[j];
-
-                       if (!npucomp->table_group.tables[num])
-                               continue;
-
-                       if (!pe->table_group.ops->set_window)
-                               continue;
-
-                       ret = pe->table_group.ops->set_window(&pe->table_group,
-                                       num, table_group->tables[num]);
-                       if (ret)
-                               break;
-               }
-       } else if (table_group->tables[num]) {
-               iommu_tce_table_put(table_group->tables[num]);
-               table_group->tables[num] = NULL;
-       }
-
-       return ret;
-}
-
-static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
-{
-       int i;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               if (!pe->table_group.ops ||
-                   !pe->table_group.ops->take_ownership)
-                       continue;
-               pe->table_group.ops->take_ownership(&pe->table_group);
-       }
-}
-
-static void pnv_npu_peers_release_ownership(
-               struct iommu_table_group *table_group)
-{
-       int i;
-       struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
-                       table_group);
-
-       for (i = 0; i < npucomp->pe_num; ++i) {
-               struct pnv_ioda_pe *pe = npucomp->pe[i];
-
-               if (!pe->table_group.ops ||
-                   !pe->table_group.ops->release_ownership)
-                       continue;
-               pe->table_group.ops->release_ownership(&pe->table_group);
-       }
-}
-
-static struct iommu_table_group_ops pnv_npu_peers_ops = {
-       .get_table_size = pnv_pci_ioda2_get_table_size,
-       .create_table = pnv_npu_peers_create_table_userspace,
-       .set_window = pnv_npu_peers_set_window,
-       .unset_window = pnv_npu_peers_unset_window,
-       .take_ownership = pnv_npu_peers_take_ownership,
-       .release_ownership = pnv_npu_peers_release_ownership,
-};
-
-static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
-               struct pnv_ioda_pe *pe)
-{
-       if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
-               return;
-
-       npucomp->pe[npucomp->pe_num] = pe;
-       ++npucomp->pe_num;
-}
-
-static struct iommu_table_group *
-       pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
-{
-       struct iommu_table_group *compound_group;
-       struct npu_comp *npucomp;
-       struct pci_dev *gpdev = NULL;
-       struct pci_controller *hose;
-       struct pci_dev *npdev = NULL;
-
-       list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
-               npdev = pnv_pci_get_npu_dev(gpdev, 0);
-               if (npdev)
-                       break;
-       }
-
-       if (!npdev)
-               /* It is not an NPU attached device, skip */
-               return NULL;
-
-       hose = pci_bus_to_host(npdev->bus);
-
-       if (hose->npu) {
-               /* P9 case: compound group is per-NPU (all gpus, all links) */
-               npucomp = &hose->npu->npucomp;
-       } else {
-               /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
-               npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
-       }
-
-       compound_group = &npucomp->table_group;
-       if (!compound_group->group) {
-               compound_group->ops = &pnv_npu_peers_ops;
-               iommu_register_group(compound_group, hose->global_number,
-                               pe->pe_number);
-
-               /* Steal capabilities from a GPU PE */
-               compound_group->max_dynamic_windows_supported =
-                       pe->table_group.max_dynamic_windows_supported;
-               compound_group->tce32_start = pe->table_group.tce32_start;
-               compound_group->tce32_size = pe->table_group.tce32_size;
-               compound_group->max_levels = pe->table_group.max_levels;
-               if (!compound_group->pgsizes)
-                       compound_group->pgsizes = pe->table_group.pgsizes;
-       }
-
-       /*
-        * The gpu would have been added to the iommu group that's created
-        * for the PE. Pull it out now.
-        */
-       iommu_del_device(&gpdev->dev);
-
-       /*
-       * I'm not sure this is strictly required, but it's probably a good idea
-       * since the table_group for the PE is going to be attached to the
-       * compound table group. If we leave the PE's iommu group active then
-       * we might have the same table_group being modifiable via two sepeate
-       * iommu groups.
-       */
-       iommu_group_put(pe->table_group.group);
-
-       /* now put the GPU into the compound group */
-       pnv_comp_attach_table_group(npucomp, pe);
-       iommu_add_device(compound_group, &gpdev->dev);
-
-       return compound_group;
-}
-
-static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
-{
-       struct iommu_table_group *table_group;
-       struct npu_comp *npucomp;
-       struct pci_dev *gpdev = NULL;
-       struct pci_dev *npdev;
-       struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
-
-       WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
-       if (!gpe)
-               return NULL;
-
-       /*
-        * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
-        * but NPU bridges do not have this hook defined so we do it here.
-        * We do not setup other table group parameters as they won't be used
-        * anyway - NVLink bridges are subordinate PEs.
-        */
-       pe->table_group.ops = &pnv_pci_npu_ops;
-
-       table_group = iommu_group_get_iommudata(
-                       iommu_group_get(&gpdev->dev));
-
-       /*
-        * On P9 NPU PHB and PCI PHB support different page sizes,
-        * keep only matching. We expect here that NVLink bridge PE pgsizes is
-        * initialized by the caller.
-        */
-       table_group->pgsizes &= pe->table_group.pgsizes;
-       npucomp = container_of(table_group, struct npu_comp, table_group);
-       pnv_comp_attach_table_group(npucomp, pe);
-
-       list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
-               struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
-
-               if (gpdevtmp != gpdev)
-                       continue;
-
-               iommu_add_device(table_group, &npdev->dev);
-       }
-
-       return table_group;
-}
-
-void pnv_pci_npu_setup_iommu_groups(void)
-{
-       struct pci_controller *hose;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       /*
-        * For non-nvlink devices the IOMMU group is registered when the PE is
-        * configured and devices are added to the group when the per-device
-        * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
-        * only initialise for "normal" IODA PHBs.
-        *
-        * For NVLink devices we need to ensure the NVLinks and the GPU end up
-        * in the same IOMMU group, so that's handled here.
-        */
-       list_for_each_entry(hose, &hose_list, list_node) {
-               phb = hose->private_data;
-
-               if (phb->type == PNV_PHB_IODA2)
-                       list_for_each_entry(pe, &phb->ioda.pe_list, list)
-                               pnv_try_setup_npu_table_group(pe);
-       }
-
-       /*
-        * Now we have all PHBs discovered, time to add NPU devices to
-        * the corresponding IOMMU groups.
-        */
-       list_for_each_entry(hose, &hose_list, list_node) {
-               unsigned long  pgsizes;
-
-               phb = hose->private_data;
-
-               if (phb->type != PNV_PHB_NPU_NVLINK)
-                       continue;
-
-               pgsizes = pnv_ioda_parse_tce_sizes(phb);
-               list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-                       /*
-                        * IODA2 bridges get this set up from
-                        * pci_controller_ops::setup_bridge but NPU bridges
-                        * do not have this hook defined so we do it here.
-                        */
-                       pe->table_group.pgsizes = pgsizes;
-                       pnv_npu_compound_attach(pe);
-               }
-       }
-}
-#endif /* CONFIG_IOMMU_API */
-
-int pnv_npu2_init(struct pci_controller *hose)
-{
-       static int npu_index;
-       struct npu *npu;
-       int ret;
-
-       npu = kzalloc(sizeof(*npu), GFP_KERNEL);
-       if (!npu)
-               return -ENOMEM;
-
-       npu_index++;
-       if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
-               ret = -ENOSPC;
-               goto fail_exit;
-       }
-       npu->index = npu_index;
-       hose->npu = npu;
-
-       return 0;
-
-fail_exit:
-       kfree(npu);
-       return ret;
-}
-
-int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
-               unsigned long msr)
-{
-       int ret;
-       struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-       struct pci_controller *hose;
-       struct pnv_phb *nphb;
-
-       if (!npdev)
-               return -ENODEV;
-
-       hose = pci_bus_to_host(npdev->bus);
-       if (hose->npu == NULL) {
-               dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
-               return 0;
-       }
-
-       nphb = hose->private_data;
-
-       dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
-                       nphb->opal_id, lparid);
-       /*
-        * Currently we only support radix and non-zero LPCR only makes sense
-        * for hash tables so skiboot expects the LPCR parameter to be a zero.
-        */
-       ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
-                               0 /* LPCR bits */);
-       if (ret) {
-               dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
-               return ret;
-       }
-
-       dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
-                       nphb->opal_id, msr);
-       ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
-                                   pci_dev_id(gpdev));
-       if (ret < 0)
-               dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
-       else
-               ret = 0;
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
-
-void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
-{
-       struct pci_dev *gpdev;
-
-       list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
-               pnv_npu2_map_lpar_dev(gpdev, 0, msr);
-}
-
-int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
-{
-       int ret;
-       struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-       struct pci_controller *hose;
-       struct pnv_phb *nphb;
-
-       if (!npdev)
-               return -ENODEV;
-
-       hose = pci_bus_to_host(npdev->bus);
-       if (hose->npu == NULL) {
-               dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
-               return 0;
-       }
-
-       nphb = hose->private_data;
-
-       dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
-                       nphb->opal_id);
-       ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
-                                      pci_dev_id(gpdev));
-       if (ret < 0) {
-               dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
-               return ret;
-       }
-
-       /* Set LPID to 0 anyway, just to be safe */
-       dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
-       ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
-                               0 /* LPCR bits */);
-       if (ret)
-               dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
index 5cd0f52..01401e3 100644 (file)
@@ -267,8 +267,6 @@ OPAL_CALL(opal_xive_get_queue_state,                OPAL_XIVE_GET_QUEUE_STATE);
 OPAL_CALL(opal_xive_set_queue_state,           OPAL_XIVE_SET_QUEUE_STATE);
 OPAL_CALL(opal_xive_get_vp_state,              OPAL_XIVE_GET_VP_STATE);
 OPAL_CALL(opal_signal_system_reset,            OPAL_SIGNAL_SYSTEM_RESET);
-OPAL_CALL(opal_npu_init_context,               OPAL_NPU_INIT_CONTEXT);
-OPAL_CALL(opal_npu_destroy_context,            OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,                   OPAL_NPU_MAP_LPAR);
 OPAL_CALL(opal_imc_counters_init,              OPAL_IMC_COUNTERS_INIT);
 OPAL_CALL(opal_imc_counters_start,             OPAL_IMC_COUNTERS_START);
index 66c3c33..7de4646 100644 (file)
@@ -47,8 +47,7 @@
 #define PNV_IODA1_M64_SEGS     8       /* Segments per M64 BAR */
 #define PNV_IODA1_DMA32_SEGSIZE        0x10000000
 
-static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
-                                             "NPU_OCAPI" };
+static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
 static void pnv_pci_configure_bus(struct pci_bus *bus);
@@ -192,8 +191,6 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
        unsigned int pe_num = pe->pe_number;
 
        WARN_ON(pe->pdev);
-       WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
-       kfree(pe->npucomp);
        memset(pe, 0, sizeof(struct pnv_ioda_pe));
 
        mutex_lock(&phb->ioda.pe_alloc_mutex);
@@ -875,7 +872,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
         * Release from all parents PELT-V. NPUs don't have a PELTV
         * table
         */
-       if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+       if (phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_unset_peltv(phb, pe, parent);
 
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
@@ -946,7 +943,7 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
         * Configure PELTV. NPUs don't have a PELTV table so skip
         * configuration on them.
         */
-       if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+       if (phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_set_peltv(phb, pe, true);
 
        /* Setup reverse map */
@@ -1002,8 +999,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 
        /* NOTE: We don't get a reference for the pointer in the PE
         * data structure, both the device and PE structures should be
-        * destroyed at the same time. However, removing nvlink
-        * devices will need some work.
+        * destroyed at the same time.
         *
         * At some point we want to remove the PDN completely anyways
         */
@@ -1099,113 +1095,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
        return pe;
 }
 
-static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
-{
-       int pe_num, found_pe = false, rc;
-       long rid;
-       struct pnv_ioda_pe *pe;
-       struct pci_dev *gpu_pdev;
-       struct pci_dn *npu_pdn;
-       struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
-
-       /*
-        * Intentionally leak a reference on the npu device (for
-        * nvlink only; this is not an opencapi path) to make sure it
-        * never goes away, as it's been the case all along and some
-        * work is needed otherwise.
-        */
-       pci_dev_get(npu_pdev);
-
-       /*
-        * Due to a hardware errata PE#0 on the NPU is reserved for
-        * error handling. This means we only have three PEs remaining
-        * which need to be assigned to four links, implying some
-        * links must share PEs.
-        *
-        * To achieve this we assign PEs such that NPUs linking the
-        * same GPU get assigned the same PE.
-        */
-       gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
-       for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
-               pe = &phb->ioda.pe_array[pe_num];
-               if (!pe->pdev)
-                       continue;
-
-               if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
-                       /*
-                        * This device has the same peer GPU so should
-                        * be assigned the same PE as the existing
-                        * peer NPU.
-                        */
-                       dev_info(&npu_pdev->dev,
-                               "Associating to existing PE %x\n", pe_num);
-                       npu_pdn = pci_get_pdn(npu_pdev);
-                       rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
-                       npu_pdn->pe_number = pe_num;
-                       phb->ioda.pe_rmap[rid] = pe->pe_number;
-                       pe->device_count++;
-
-                       /* Map the PE to this link */
-                       rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
-                                       OpalPciBusAll,
-                                       OPAL_COMPARE_RID_DEVICE_NUMBER,
-                                       OPAL_COMPARE_RID_FUNCTION_NUMBER,
-                                       OPAL_MAP_PE);
-                       WARN_ON(rc != OPAL_SUCCESS);
-                       found_pe = true;
-                       break;
-               }
-       }
-
-       if (!found_pe)
-               /*
-                * Could not find an existing PE so allocate a new
-                * one.
-                */
-               return pnv_ioda_setup_dev_PE(npu_pdev);
-       else
-               return pe;
-}
-
-static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
-{
-       struct pci_dev *pdev;
-
-       list_for_each_entry(pdev, &bus->devices, bus_list)
-               pnv_ioda_setup_npu_PE(pdev);
-}
-
-static void pnv_pci_ioda_setup_nvlink(void)
-{
-       struct pci_controller *hose;
-       struct pnv_phb *phb;
-       struct pnv_ioda_pe *pe;
-
-       list_for_each_entry(hose, &hose_list, list_node) {
-               phb = hose->private_data;
-               if (phb->type == PNV_PHB_NPU_NVLINK) {
-                       /* PE#0 is needed for error reporting */
-                       pnv_ioda_reserve_pe(phb, 0);
-                       pnv_ioda_setup_npu_PEs(hose->bus);
-                       if (phb->model == PNV_PHB_MODEL_NPU2)
-                               WARN_ON_ONCE(pnv_npu2_init(hose));
-               }
-       }
-       list_for_each_entry(hose, &hose_list, list_node) {
-               phb = hose->private_data;
-               if (phb->type != PNV_PHB_IODA2)
-                       continue;
-
-               list_for_each_entry(pe, &phb->ioda.pe_list, list)
-                       pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
-       }
-
-#ifdef CONFIG_IOMMU_API
-       /* setup iommu groups so we can do nvlink pass-thru */
-       pnv_pci_npu_setup_iommu_groups();
-#endif
-}
-
 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
                                       struct pnv_ioda_pe *pe);
 
@@ -1468,18 +1357,6 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 #define PHB3_TCE_KILL_INVAL_PE         PPC_BIT(1)
 #define PHB3_TCE_KILL_INVAL_ONE                PPC_BIT(2)
 
-static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
-       __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
-       const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
-
-       mb(); /* Ensure previous TCE table stores are visible */
-       if (rm)
-               __raw_rm_writeq_be(val, invalidate);
-       else
-               __raw_writeq_be(val, invalidate);
-}
-
 static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
 {
        /* 01xb - invalidate TCEs that match the specified PE# */
@@ -1539,20 +1416,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
                struct pnv_phb *phb = pe->phb;
                unsigned int shift = tbl->it_page_shift;
 
-               /*
-                * NVLink1 can use the TCE kill register directly as
-                * it's the same as PHB3. NVLink2 is different and
-                * should go via the OPAL call.
-                */
-               if (phb->model == PNV_PHB_MODEL_NPU) {
-                       /*
-                        * The NVLink hardware does not support TCE kill
-                        * per TCE entry so we have to invalidate
-                        * the entire cache for it.
-                        */
-                       pnv_pci_phb3_tce_invalidate_entire(phb, rm);
-                       continue;
-               }
                if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
                        pnv_pci_phb3_tce_invalidate(pe, rm, shift,
                                                    index, npages);
@@ -1564,14 +1427,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
        }
 }
 
-void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
-       if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
-               pnv_pci_phb3_tce_invalidate_entire(phb, rm);
-       else
-               opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
-}
-
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
                long npages, unsigned long uaddr,
                enum dma_data_direction direction,
@@ -2451,7 +2306,6 @@ static void pnv_pci_enable_bridges(void)
 
 static void pnv_pci_ioda_fixup(void)
 {
-       pnv_pci_ioda_setup_nvlink();
        pnv_pci_ioda_create_dbgfs();
 
        pnv_pci_enable_bridges();
@@ -2824,15 +2678,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
                pnv_ioda_release_pe(pe);
 }
 
-static void pnv_npu_disable_device(struct pci_dev *pdev)
-{
-       struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
-       struct eeh_pe *eehpe = edev ? edev->pe : NULL;
-
-       if (eehpe && eeh_ops && eeh_ops->reset)
-               eeh_ops->reset(eehpe, EEH_RESET_HOT);
-}
-
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
        struct pnv_phb *phb = hose->private_data;
@@ -2874,16 +2719,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .shutdown               = pnv_pci_ioda_shutdown,
 };
 
-static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-       .setup_msi_irqs         = pnv_setup_msi_irqs,
-       .teardown_msi_irqs      = pnv_teardown_msi_irqs,
-       .enable_device_hook     = pnv_pci_enable_device_hook,
-       .window_alignment       = pnv_pci_window_alignment,
-       .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
-       .shutdown               = pnv_pci_ioda_shutdown,
-       .disable_device         = pnv_npu_disable_device,
-};
-
 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
        .enable_device_hook     = pnv_ocapi_enable_device_hook,
        .release_device         = pnv_pci_release_device,
@@ -2957,10 +2792,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
                phb->model = PNV_PHB_MODEL_P7IOC;
        else if (of_device_is_compatible(np, "ibm,power8-pciex"))
                phb->model = PNV_PHB_MODEL_PHB3;
-       else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
-               phb->model = PNV_PHB_MODEL_NPU;
-       else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
-               phb->model = PNV_PHB_MODEL_NPU2;
        else
                phb->model = PNV_PHB_MODEL_UNKNOWN;
 
@@ -3118,9 +2949,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
 
        switch (phb->type) {
-       case PNV_PHB_NPU_NVLINK:
-               hose->controller_ops = pnv_npu_ioda_controller_ops;
-               break;
        case PNV_PHB_NPU_OCAPI:
                hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
                break;
@@ -3173,11 +3001,6 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np)
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
 }
 
-void __init pnv_pci_init_npu_phb(struct device_node *np)
-{
-       pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
-}
-
 void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
 {
        pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
index 9b9bca1..b18468d 100644 (file)
@@ -926,17 +926,6 @@ void __init pnv_pci_init(void)
        for_each_compatible_node(np, NULL, "ibm,ioda3-phb")
                pnv_pci_init_ioda2_phb(np);
 
-       /* Look for NPU PHBs */
-       for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
-               pnv_pci_init_npu_phb(np);
-
-       /*
-        * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
-        * the exception of TCE kill which requires an OPAL call.
-        */
-       for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
-               pnv_pci_init_npu_phb(np);
-
        /* Look for NPU2 OpenCAPI PHBs */
        for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
                pnv_pci_init_npu2_opencapi_phb(np);
index 36d2292..c8d4f22 100644 (file)
 struct pci_dn;
 
 enum pnv_phb_type {
-       PNV_PHB_IODA1           = 0,
-       PNV_PHB_IODA2           = 1,
-       PNV_PHB_NPU_NVLINK      = 2,
-       PNV_PHB_NPU_OCAPI       = 3,
+       PNV_PHB_IODA1,
+       PNV_PHB_IODA2,
+       PNV_PHB_NPU_OCAPI,
 };
 
 /* Precise PHB model for error management */
@@ -21,8 +20,6 @@ enum pnv_phb_model {
        PNV_PHB_MODEL_UNKNOWN,
        PNV_PHB_MODEL_P7IOC,
        PNV_PHB_MODEL_PHB3,
-       PNV_PHB_MODEL_NPU,
-       PNV_PHB_MODEL_NPU2,
 };
 
 #define PNV_PCI_DIAG_BUF_SIZE  8192
@@ -81,7 +78,6 @@ struct pnv_ioda_pe {
 
        /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
        struct iommu_table_group table_group;
-       struct npu_comp         *npucomp;
 
        /* 64-bit TCE bypass region */
        bool                    tce_bypass_enabled;
@@ -289,9 +285,7 @@ extern struct iommu_table *pnv_pci_table_alloc(int nid);
 
 extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
-extern void pnv_pci_init_npu_phb(struct device_node *np);
 extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
-extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
@@ -314,11 +308,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 #define pe_info(pe, fmt, ...)                                  \
        pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
 
-/* Nvlink functions */
-extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
-extern void pnv_pci_npu_setup_iommu_groups(void);
-
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   2
 #define POWERNV_IOMMU_MAX_LEVELS       5
index 1bffbd1..3b6800f 100644 (file)
@@ -224,8 +224,6 @@ static void __init pSeries_request_regions(void)
 
 void __init pSeries_final_fixup(void)
 {
-       struct pci_controller *hose;
-
        pSeries_request_regions();
 
        eeh_show_enabled();
@@ -234,27 +232,6 @@ void __init pSeries_final_fixup(void)
        ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
        ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable;
 #endif
-       list_for_each_entry(hose, &hose_list, list_node) {
-               struct device_node *dn = hose->dn, *nvdn;
-
-               while (1) {
-                       dn = of_find_all_nodes(dn);
-                       if (!dn)
-                               break;
-                       nvdn = of_parse_phandle(dn, "ibm,nvlink", 0);
-                       if (!nvdn)
-                               continue;
-                       if (!of_device_is_compatible(nvdn, "ibm,npu-link"))
-                               continue;
-                       if (!of_device_is_compatible(nvdn->parent,
-                                               "ibm,power9-npu"))
-                               continue;
-#ifdef CONFIG_PPC_POWERNV
-                       WARN_ON_ONCE(pnv_npu2_init(hose));
-#endif
-                       break;
-               }
-       }
 }
 
 /*
index 7b739cc..1d829e2 100644 (file)
@@ -55,9 +55,9 @@ void __init svm_swiotlb_init(void)
        if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
                return;
 
-       if (io_tlb_start)
-               memblock_free_early(io_tlb_start,
-                                   PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+
+       memblock_free_early(__pa(vstart),
+                           PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
        panic("SVM: Cannot allocate SWIOTLB buffer");
 }
 
index 4515a10..a8ad8eb 100644 (file)
@@ -20,6 +20,7 @@ config RISCV
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DEBUG_VIRTUAL if MMU
        select ARCH_HAS_DEBUG_WX
+       select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
@@ -27,18 +28,23 @@ config RISCV
        select ARCH_HAS_PTE_SPECIAL
        select ARCH_HAS_SET_DIRECT_MAP
        select ARCH_HAS_SET_MEMORY
-       select ARCH_HAS_STRICT_KERNEL_RWX if MMU
+       select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
+       select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
+       select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
+       select ARCH_SUPPORTS_HUGETLBFS if MMU
        select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
        select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+       select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
        select CLONE_BACKWARDS
        select CLINT_TIMER if !MMU
        select COMMON_CLK
        select EDAC_SUPPORT
        select GENERIC_ARCH_TOPOLOGY if SMP
        select GENERIC_ATOMIC64 if !64BIT
+       select GENERIC_CLOCKEVENTS_BROADCAST if SMP
        select GENERIC_EARLY_IOREMAP
        select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO
        select GENERIC_IOREMAP
@@ -165,10 +171,6 @@ config ARCH_WANT_GENERAL_HUGETLB
 config ARCH_SUPPORTS_UPROBES
        def_bool y
 
-config SYS_SUPPORTS_HUGETLBFS
-       depends on MMU
-       def_bool y
-
 config STACKTRACE_SUPPORT
        def_bool y
 
@@ -204,6 +206,7 @@ config LOCKDEP_SUPPORT
        def_bool y
 
 source "arch/riscv/Kconfig.socs"
+source "arch/riscv/Kconfig.erratas"
 
 menu "Platform type"
 
@@ -227,7 +230,7 @@ config ARCH_RV64I
        bool "RV64I"
        select 64BIT
        select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000
-       select HAVE_DYNAMIC_FTRACE if MMU
+       select HAVE_DYNAMIC_FTRACE if MMU && $(cc-option,-fpatchable-function-entry=8)
        select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_FUNCTION_GRAPH_TRACER
@@ -386,6 +389,31 @@ config RISCV_SBI_V01
        help
          This config allows kernel to use SBI v0.1 APIs. This will be
          deprecated in future once legacy M-mode software are no longer in use.
+
+config KEXEC
+       bool "Kexec system call"
+       select KEXEC_CORE
+       select HOTPLUG_CPU if SMP
+       depends on MMU
+       help
+         kexec is a system call that implements the ability to shutdown your
+         current kernel, and to start another kernel. It is like a reboot
+         but it is independent of the system firmware. And like a reboot
+         you can start any kernel with it, not just Linux.
+
+         The name comes from the similarity to the exec system call.
+
+config CRASH_DUMP
+       bool "Build kdump crash kernel"
+       help
+         Generate crash dump after being started by kexec. This should
+         be normally only set in special crash dump kernels which are
+         loaded in the main kernel with kexec-tools into a specially
+         reserved region and then later executed after a crash by
+         kdump/kexec.
+
+         For more details see Documentation/admin-guide/kdump/kdump.rst
+
 endmenu
 
 menu "Boot options"
@@ -438,7 +466,7 @@ config EFI_STUB
 
 config EFI
        bool "UEFI runtime support"
-       depends on OF
+       depends on OF && !XIP_KERNEL
        select LIBFDT
        select UCS2_STRING
        select EFI_PARAMS_FROM_FDT
@@ -462,11 +490,63 @@ config STACKPROTECTOR_PER_TASK
        def_bool y
        depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS
 
+config PHYS_RAM_BASE_FIXED
+       bool "Explicitly specified physical RAM address"
+       default n
+
+config PHYS_RAM_BASE
+       hex "Platform Physical RAM address"
+       depends on PHYS_RAM_BASE_FIXED
+       default "0x80000000"
+       help
+         This is the physical address of RAM in the system. It has to be
+         explicitly specified to run early relocations of read-write data
+         from flash to RAM.
+
+config XIP_KERNEL
+       bool "Kernel Execute-In-Place from ROM"
+       depends on MMU && SPARSEMEM
+       # This prevents XIP from being enabled by all{yes,mod}config, which
+       # fail to build since XIP doesn't support large kernels.
+       depends on !COMPILE_TEST
+       select PHYS_RAM_BASE_FIXED
+       help
+         Execute-In-Place allows the kernel to run from non-volatile storage
+         directly addressable by the CPU, such as NOR flash. This saves RAM
+         space since the text section of the kernel is not loaded from flash
+         to RAM.  Read-write sections, such as the data section and stack,
+         are still copied to RAM.  The XIP kernel is not compressed since
+         it has to run directly from flash, so it will take more space to
+         store it.  The flash address used to link the kernel object files,
+         and for storing it, is configuration dependent. Therefore, if you
+         say Y here, you must know the proper physical address where to
+         store the kernel image depending on your own flash memory usage.
+
+         Also note that the make target becomes "make xipImage" rather than
+         "make zImage" or "make Image".  The final kernel binary to put in
+         ROM memory will be arch/riscv/boot/xipImage.
+
+         SPARSEMEM is required because the kernel text and rodata that are
+         flash resident are not backed by memmap, so any attempt to get
+         a struct page on those regions will trigger a fault.
+
+         If unsure, say N.
+
+config XIP_PHYS_ADDR
+       hex "XIP Kernel Physical Location"
+       depends on XIP_KERNEL
+       default "0x21000000"
+       help
+         This is the physical address in your flash memory the kernel will
+         be linked for and stored to.  This address is dependent on your
+         own flash usage.
+
 endmenu
 
 config BUILTIN_DTB
-       def_bool n
+       bool
        depends on OF
+       default y if XIP_KERNEL
 
 menu "Power management options"
 
diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas
new file mode 100644 (file)
index 0000000..d5d03ae
--- /dev/null
@@ -0,0 +1,44 @@
+menu "CPU errata selection"
+
+config RISCV_ERRATA_ALTERNATIVE
+       bool "RISC-V alternative scheme"
+       default y
+       help
+         This Kconfig allows the kernel to automatically patch the
+         errata required by the execution platform at run time. The
+         code patching is performed once in the boot stages. It means
+         that the overhead from this mechanism is just taken once.
+
+config ERRATA_SIFIVE
+       bool "SiFive errata"
+       depends on RISCV_ERRATA_ALTERNATIVE
+       help
+         All SiFive errata Kconfig depend on this Kconfig. Disabling
+         this Kconfig will disable all SiFive errata. Please say "Y"
+         here if your platform uses SiFive CPU cores.
+
+         Otherwise, please say "N" here to avoid unnecessary overhead.
+
+config ERRATA_SIFIVE_CIP_453
+       bool "Apply SiFive errata CIP-453"
+       depends on ERRATA_SIFIVE
+       default y
+       help
+         This will apply the SiFive CIP-453 errata to add sign extension
+         to the $badaddr when exception type is instruction page fault
+         and instruction access fault.
+
+         If you don't know what to do here, say "Y".
+
+config ERRATA_SIFIVE_CIP_1200
+       bool "Apply SiFive errata CIP-1200"
+       depends on ERRATA_SIFIVE
+       default y
+       help
+         This will apply the SiFive CIP-1200 errata to replace all
+         "sfence.vma addr" with "sfence.vma" to ensure that the addr
+         has been flushed from TLB.
+
+         If you don't know what to do here, say "Y".
+
+endmenu
index e1b2690..ed96376 100644 (file)
@@ -1,5 +1,12 @@
 menu "SoC selection"
 
+config SOC_MICROCHIP_POLARFIRE
+       bool "Microchip PolarFire SoCs"
+       select MCHP_CLK_MPFS
+       select SIFIVE_PLIC
+       help
+         This enables support for Microchip PolarFire SoC platforms.
+
 config SOC_SIFIVE
        bool "SiFive SoCs"
        select SERIAL_SIFIVE if TTY
@@ -7,6 +14,7 @@ config SOC_SIFIVE
        select CLK_SIFIVE
        select CLK_SIFIVE_PRCI
        select SIFIVE_PLIC
+       select ERRATA_SIFIVE
        help
          This enables support for SiFive SoC platform hardware.
 
index 1368d94..3eb9590 100644 (file)
@@ -82,11 +82,16 @@ CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS)
 
 # Default target when executing plain make
 boot           := arch/riscv/boot
+ifeq ($(CONFIG_XIP_KERNEL),y)
+KBUILD_IMAGE := $(boot)/xipImage
+else
 KBUILD_IMAGE   := $(boot)/Image.gz
+endif
 
 head-y := arch/riscv/kernel/head.o
 
 core-y += arch/riscv/
+core-$(CONFIG_RISCV_ERRATA_ALTERNATIVE) += arch/riscv/errata/
 
 libs-y += arch/riscv/lib/
 libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
@@ -95,12 +100,14 @@ PHONY += vdso_install
 vdso_install:
        $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
 
+ifneq ($(CONFIG_XIP_KERNEL),y)
 ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy)
 KBUILD_IMAGE := $(boot)/loader.bin
 else
 KBUILD_IMAGE := $(boot)/Image.gz
 endif
-BOOT_TARGETS := Image Image.gz loader loader.bin
+endif
+BOOT_TARGETS := Image Image.gz loader loader.bin xipImage
 
 all:   $(notdir $(KBUILD_IMAGE))
 
index 03404c8..6bf299f 100644 (file)
 KCOV_INSTRUMENT := n
 
 OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
+OBJCOPYFLAGS_xipImage :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
 
 targets := Image Image.* loader loader.o loader.lds loader.bin
+targets := Image Image.* loader loader.o loader.lds loader.bin xipImage
+
+ifeq ($(CONFIG_XIP_KERNEL),y)
+
+quiet_cmd_mkxip = $(quiet_cmd_objcopy)
+cmd_mkxip = $(cmd_objcopy)
+
+$(obj)/xipImage: vmlinux FORCE
+       $(call if_changed,mkxip)
+       @$(kecho) '  Physical Address of xipImage: $(CONFIG_XIP_PHYS_ADDR)'
+
+endif
 
 $(obj)/Image: vmlinux FORCE
        $(call if_changed,objcopy)
index 7ffd502..fe996b8 100644 (file)
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 subdir-y += sifive
 subdir-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += canaan
+subdir-y += microchip
 
 obj-$(CONFIG_BUILTIN_DTB) := $(addsuffix /, $(subdir-y))
diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile
new file mode 100644 (file)
index 0000000..622b127
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += microchip-mpfs-icicle-kit.dtb
diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
new file mode 100644 (file)
index 0000000..ec79944
--- /dev/null
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/* Copyright (c) 2020 Microchip Technology Inc */
+
+/dts-v1/;
+
+#include "microchip-mpfs.dtsi"
+
+/* Clock frequency (in Hz) of the rtcclk */
+#define RTCCLK_FREQ            1000000
+
+/ {
+       #address-cells = <2>;
+       #size-cells = <2>;
+       model = "Microchip PolarFire-SoC Icicle Kit";
+       compatible = "microchip,mpfs-icicle-kit";
+
+       chosen {
+               stdout-path = &serial0;
+       };
+
+       cpus {
+               timebase-frequency = <RTCCLK_FREQ>;
+       };
+
+       memory@80000000 {
+               device_type = "memory";
+               reg = <0x0 0x80000000 0x0 0x40000000>;
+               clocks = <&clkcfg 26>;
+       };
+
+       soc {
+       };
+};
+
+&serial0 {
+       status = "okay";
+};
+
+&serial1 {
+       status = "okay";
+};
+
+&serial2 {
+       status = "okay";
+};
+
+&serial3 {
+       status = "okay";
+};
+
+&sdcard {
+       status = "okay";
+};
+
+&emac0 {
+       phy-mode = "sgmii";
+       phy-handle = <&phy0>;
+       phy0: ethernet-phy@8 {
+               reg = <8>;
+               ti,fifo-depth = <0x01>;
+       };
+};
+
+&emac1 {
+       status = "okay";
+       phy-mode = "sgmii";
+       phy-handle = <&phy1>;
+       phy1: ethernet-phy@9 {
+               reg = <9>;
+               ti,fifo-depth = <0x01>;
+       };
+};
diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
new file mode 100644 (file)
index 0000000..b981957
--- /dev/null
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/* Copyright (c) 2020 Microchip Technology Inc */
+
+/dts-v1/;
+
+/ {
+       #address-cells = <2>;
+       #size-cells = <2>;
+       model = "Microchip MPFS Icicle Kit";
+       compatible = "microchip,mpfs-icicle-kit";
+
+       chosen {
+       };
+
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               cpu@0 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,e51", "sifive,rocket0", "riscv";
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <128>;
+                       i-cache-size = <16384>;
+                       reg = <0>;
+                       riscv,isa = "rv64imac";
+                       status = "disabled";
+
+                       cpu0_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@1 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <1>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+
+                       cpu1_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@2 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <2>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+
+                       cpu2_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@3 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <3>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+
+                       cpu3_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+
+               cpu@4 {
+                       clock-frequency = <0>;
+                       compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
+                       d-cache-block-size = <64>;
+                       d-cache-sets = <64>;
+                       d-cache-size = <32768>;
+                       d-tlb-sets = <1>;
+                       d-tlb-size = <32>;
+                       device_type = "cpu";
+                       i-cache-block-size = <64>;
+                       i-cache-sets = <64>;
+                       i-cache-size = <32768>;
+                       i-tlb-sets = <1>;
+                       i-tlb-size = <32>;
+                       mmu-type = "riscv,sv39";
+                       reg = <4>;
+                       riscv,isa = "rv64imafdc";
+                       tlb-split;
+                       status = "okay";
+                       cpu4_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               compatible = "riscv,cpu-intc";
+                               interrupt-controller;
+                       };
+               };
+       };
+
+       soc {
+               #address-cells = <2>;
+               #size-cells = <2>;
+               compatible = "simple-bus";
+               ranges;
+
+               cache-controller@2010000 {
+                       compatible = "sifive,fu540-c000-ccache", "cache";
+                       cache-block-size = <64>;
+                       cache-level = <2>;
+                       cache-sets = <1024>;
+                       cache-size = <2097152>;
+                       cache-unified;
+                       interrupt-parent = <&plic>;
+                       interrupts = <1 2 3>;
+                       reg = <0x0 0x2010000 0x0 0x1000>;
+               };
+
+               clint@2000000 {
+                       compatible = "sifive,clint0";
+                       reg = <0x0 0x2000000 0x0 0xC000>;
+                       interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7
+                                               &cpu1_intc 3 &cpu1_intc 7
+                                               &cpu2_intc 3 &cpu2_intc 7
+                                               &cpu3_intc 3 &cpu3_intc 7
+                                               &cpu4_intc 3 &cpu4_intc 7>;
+               };
+
+               plic: interrupt-controller@c000000 {
+                       #interrupt-cells = <1>;
+                       compatible = "sifive,plic-1.0.0";
+                       reg = <0x0 0xc000000 0x0 0x4000000>;
+                       riscv,ndev = <186>;
+                       interrupt-controller;
+                       interrupts-extended = <&cpu0_intc 11
+                                       &cpu1_intc 11 &cpu1_intc 9
+                                       &cpu2_intc 11 &cpu2_intc 9
+                                       &cpu3_intc 11 &cpu3_intc 9
+                                       &cpu4_intc 11 &cpu4_intc 9>;
+               };
+
+               dma@3000000 {
+                       compatible = "sifive,fu540-c000-pdma";
+                       reg = <0x0 0x3000000 0x0 0x8000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <23 24 25 26 27 28 29 30>;
+                       #dma-cells = <1>;
+               };
+
+               refclk: refclk {
+                       compatible = "fixed-clock";
+                       #clock-cells = <0>;
+                       clock-frequency = <600000000>;
+                       clock-output-names = "msspllclk";
+               };
+
+               clkcfg: clkcfg@20002000 {
+                       compatible = "microchip,mpfs-clkcfg";
+                       reg = <0x0 0x20002000 0x0 0x1000>;
+                       reg-names = "mss_sysreg";
+                       clocks = <&refclk>;
+                       #clock-cells = <1>;
+                       clock-output-names = "cpu", "axi", "ahb", "envm",       /* 0-3   */
+                                "mac0", "mac1", "mmc", "timer",                /* 4-7   */
+                               "mmuart0", "mmuart1", "mmuart2", "mmuart3",     /* 8-11  */
+                               "mmuart4", "spi0", "spi1", "i2c0",              /* 12-15 */
+                               "i2c1", "can0", "can1", "usb",                  /* 16-19 */
+                               "rsvd", "rtc", "qspi", "gpio0",                 /* 20-23 */
+                               "gpio1", "gpio2", "ddrc", "fic0",               /* 24-27 */
+                               "fic1", "fic2", "fic3", "athena", "cfm";        /* 28-32 */
+               };
+
+               serial0: serial@20000000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20000000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <90>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 8>;
+                       status = "disabled";
+               };
+
+               serial1: serial@20100000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20100000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <91>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 9>;
+                       status = "disabled";
+               };
+
+               serial2: serial@20102000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20102000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <92>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 10>;
+                       status = "disabled";
+               };
+
+               serial3: serial@20104000 {
+                       compatible = "ns16550a";
+                       reg = <0x0 0x20104000 0x0 0x400>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <93>;
+                       current-speed = <115200>;
+                       clocks = <&clkcfg 11>;
+                       status = "disabled";
+               };
+
+               emmc: mmc@20008000 {
+                       compatible = "cdns,sd4hc";
+                       reg = <0x0 0x20008000 0x0 0x1000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <88 89>;
+                       pinctrl-names = "default";
+                       clocks = <&clkcfg 6>;
+                       bus-width = <4>;
+                       cap-mmc-highspeed;
+                       mmc-ddr-3_3v;
+                       max-frequency = <200000000>;
+                       non-removable;
+                       no-sd;
+                       no-sdio;
+                       voltage-ranges = <3300 3300>;
+                       status = "disabled";
+               };
+
+               sdcard: sdhc@20008000 {
+                       compatible = "cdns,sd4hc";
+                       reg = <0x0 0x20008000 0x0 0x1000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <88>;
+                       pinctrl-names = "default";
+                       clocks = <&clkcfg 6>;
+                       bus-width = <4>;
+                       disable-wp;
+                       cap-sd-highspeed;
+                       card-detect-delay = <200>;
+                       sd-uhs-sdr12;
+                       sd-uhs-sdr25;
+                       sd-uhs-sdr50;
+                       sd-uhs-sdr104;
+                       max-frequency = <200000000>;
+                       status = "disabled";
+               };
+
+               emac0: ethernet@20110000 {
+                       compatible = "cdns,macb";
+                       reg = <0x0 0x20110000 0x0 0x2000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <64 65 66 67>;
+                       local-mac-address = [00 00 00 00 00 00];
+                       clocks = <&clkcfg 4>, <&clkcfg 2>;
+                       clock-names = "pclk", "hclk";
+                       status = "disabled";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+               };
+
+               /*
+                * Second Ethernet MAC. The all-zero address is a placeholder;
+                * it is carried in local-mac-address, the same property the
+                * sibling emac0 node uses, so both MACs are described alike.
+                */
+               emac1: ethernet@20112000 {
+                       compatible = "cdns,macb";
+                       reg = <0x0 0x20112000 0x0 0x2000>;
+                       interrupt-parent = <&plic>;
+                       interrupts = <70 71 72 73>;
+                       local-mac-address = [00 00 00 00 00 00];
+                       clocks = <&clkcfg 5>, <&clkcfg 2>;
+                       clock-names = "pclk", "hclk";
+                       status = "disabled";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+               };
+
+       };
+};
index eeb4f8c..8eef82e 100644 (file)
                        reg = <0x0 0x10000000 0x0 0x1000>;
                        clocks = <&hfclk>, <&rtcclk>;
                        #clock-cells = <1>;
+                       #reset-cells = <1>;
                };
                uart0: serial@10010000 {
                        compatible = "sifive,fu740-c000-uart", "sifive,uart0";
                        clocks = <&prci PRCI_CLK_PCLK>;
                        status = "disabled";
                };
+               /*
+                * PCIe host bridge ("sifive,fu740-pcie") with three register
+                * windows named dbi, config and mgmt.
+                * NOTE(review): "interrupts" lists nine entries (56-64) while
+                * "interrupt-names" names only five -- confirm against the
+                * binding which of the two is intended.
+                */
+               pcie@e00000000 {
+                       compatible = "sifive,fu740-pcie";
+                       #address-cells = <3>;
+                       #size-cells = <2>;
+                       #interrupt-cells = <1>;
+                       reg = <0xe 0x00000000 0x0 0x80000000>,
+                             <0xd 0xf0000000 0x0 0x10000000>,
+                             <0x0 0x100d0000 0x0 0x1000>;
+                       reg-names = "dbi", "config", "mgmt";
+                       device_type = "pci";
+                       dma-coherent;
+                       bus-range = <0x0 0xff>;
+                       ranges = <0x81000000  0x0 0x60080000  0x0 0x60080000 0x0 0x10000>,      /* I/O */
+                                <0x82000000  0x0 0x60090000  0x0 0x60090000 0x0 0xff70000>,    /* mem */
+                                <0x82000000  0x0 0x70000000  0x0 0x70000000 0x0 0x1000000>,    /* mem */
+                                <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>;  /* mem prefetchable */
+                       num-lanes = <0x8>;
+                       interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>;
+                       interrupt-names = "msi", "inta", "intb", "intc", "intd";
+                       interrupt-parent = <&plic0>;
+                       interrupt-map-mask = <0x0 0x0 0x0 0x7>;
+                       /* Legacy INTA..INTD (pins 1-4) are routed to PLIC sources 57-60. */
+                       interrupt-map = <0x0 0x0 0x0 0x1 &plic0 57>,
+                                       <0x0 0x0 0x0 0x2 &plic0 58>,
+                                       <0x0 0x0 0x0 0x3 &plic0 59>,
+                                       <0x0 0x0 0x0 0x4 &plic0 60>;
+                       clock-names = "pcie_aux";
+                       clocks = <&prci PRCI_CLK_PCIE_AUX>;
+                       pwren-gpios = <&gpio 5 0>;
+                       reset-gpios = <&gpio 8 0>;
+                       resets = <&prci 4>;
+                       status = "okay";
+               };
        };
 };
index 47a5003..62d9469 100644 (file)
@@ -1,13 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 #include <asm/page.h>
+#include <asm/pgtable.h>
 
 OUTPUT_ARCH(riscv)
 ENTRY(_start)
 
 SECTIONS
 {
-       . = PAGE_OFFSET;
+       . = KERNEL_LINK_ADDR;
 
        .payload : {
                *(.payload)
index 6c0625a..1f2be23 100644 (file)
@@ -16,6 +16,7 @@ CONFIG_EXPERT=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_SOC_SIFIVE=y
 CONFIG_SOC_VIRT=y
+CONFIG_SOC_MICROCHIP_POLARFIRE=y
 CONFIG_SMP=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_JUMP_LABEL=y
@@ -82,6 +83,9 @@ CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_OHCI_HCD_PLATFORM=y
 CONFIG_USB_STORAGE=y
 CONFIG_USB_UAS=y
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=y
+CONFIG_MMC_SDHCI_CADENCE=y
 CONFIG_MMC=y
 CONFIG_MMC_SPI=y
 CONFIG_RTC_CLASS=y
diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile
new file mode 100644 (file)
index 0000000..b8f8740
--- /dev/null
@@ -0,0 +1,2 @@
+obj-y  += alternative.o
+obj-$(CONFIG_ERRATA_SIFIVE) += sifive/
diff --git a/arch/riscv/errata/alternative.c b/arch/riscv/errata/alternative.c
new file mode 100644 (file)
index 0000000..3b15885
--- /dev/null
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * alternative runtime patching
+ * inspired by the ARM64 and x86 version
+ *
+ * Copyright (C) 2021 Sifive.
+ */
+
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/vendorid_list.h>
+#include <asm/sbi.h>
+#include <asm/csr.h>
+
+/*
+ * CPU identification values filled in by riscv_fill_cpu_mfr_info().
+ * vendor_id selects the vendor patch function; arch_id and imp_id are
+ * handed to that function.
+ */
+static struct cpu_manufacturer_info_t {
+       unsigned long vendor_id;
+       unsigned long arch_id;
+       unsigned long imp_id;
+} cpu_mfr_info;
+
+/*
+ * Vendor-specific patch routine chosen by init_alternative(); NULL when no
+ * handler matches the detected vendor ID.
+ */
+static void (*vendor_patch_func)(struct alt_entry *begin, struct alt_entry *end,
+                                unsigned long archid, unsigned long impid);
+
+/*
+ * Record the vendor, architecture and implementation IDs in cpu_mfr_info.
+ * With CONFIG_RISCV_M_MODE the mvendorid/marchid/mimpid CSRs are read
+ * directly; otherwise the values are obtained through the sbi_get_*()
+ * helpers.
+ */
+static inline void __init riscv_fill_cpu_mfr_info(void)
+{
+#ifdef CONFIG_RISCV_M_MODE
+       cpu_mfr_info.vendor_id = csr_read(CSR_MVENDORID);
+       cpu_mfr_info.arch_id = csr_read(CSR_MARCHID);
+       cpu_mfr_info.imp_id = csr_read(CSR_MIMPID);
+#else
+       cpu_mfr_info.vendor_id = sbi_get_mvendorid();
+       cpu_mfr_info.arch_id = sbi_get_marchid();
+       cpu_mfr_info.imp_id = sbi_get_mimpid();
+#endif
+}
+
+/*
+ * Read the CPU IDs and pick the vendor patch function that matches the
+ * vendor ID. Only SiFive is handled, and only when CONFIG_ERRATA_SIFIVE
+ * is set; any other vendor leaves vendor_patch_func NULL.
+ */
+static void __init init_alternative(void)
+{
+       riscv_fill_cpu_mfr_info();
+
+       switch (cpu_mfr_info.vendor_id) {
+#ifdef CONFIG_ERRATA_SIFIVE
+       case SIFIVE_VENDOR_ID:
+               vendor_patch_func = sifive_errata_patch_func;
+               break;
+#endif
+       default:
+               vendor_patch_func = NULL;
+       }
+}
+
+/*
+ * This is called very early in the boot process (directly after we run
+ * a feature detect on the boot CPU). No need to worry about other CPUs
+ * here.
+ *
+ * Selects the vendor patch function and, if one exists, runs it over the
+ * alt_entry table delimited by __alt_start and __alt_end, passing along
+ * the architecture and implementation IDs that were read.
+ */
+void __init apply_boot_alternatives(void)
+{
+       /* If called on non-boot cpu things could go wrong */
+       WARN_ON(smp_processor_id() != 0);
+
+       init_alternative();
+
+       /* No handler for this vendor: nothing to patch. */
+       if (!vendor_patch_func)
+               return;
+
+       vendor_patch_func((struct alt_entry *)__alt_start,
+                         (struct alt_entry *)__alt_end,
+                         cpu_mfr_info.arch_id, cpu_mfr_info.imp_id);
+}
+
diff --git a/arch/riscv/errata/sifive/Makefile b/arch/riscv/errata/sifive/Makefile
new file mode 100644 (file)
index 0000000..bdd5fc8
--- /dev/null
@@ -0,0 +1,2 @@
+obj-y += errata_cip_453.o
+obj-y += errata.o
diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c
new file mode 100644 (file)
index 0000000..f5e5ae7
--- /dev/null
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Sifive.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/bug.h>
+#include <asm/patch.h>
+#include <asm/alternative.h>
+#include <asm/vendorid_list.h>
+#include <asm/errata_list.h>
+
+/*
+ * Describes one erratum: a printable name and a predicate that reports
+ * whether a CPU with the given architecture/implementation IDs is affected.
+ */
+struct errata_info_t {
+       char name[ERRATA_STRING_LENGTH_MAX];
+       bool (*check_func)(unsigned long  arch_id, unsigned long impid);
+};
+
+/* Return true when the given arch_id/impid pair is affected by "cip-453". */
+static bool errata_cip_453_check_func(unsigned long  arch_id, unsigned long impid)
+{
+       /*
+        * Affected cores:
+        * Architecture ID: 0x8000000000000007
+        * Implement ID: 0x20181004 <= impid <= 0x20191105
+        */
+       if (arch_id != 0x8000000000000007 ||
+           (impid < 0x20181004 || impid > 0x20191105))
+               return false;
+       return true;
+}
+
+/* Return true when the given arch_id/impid pair is affected by "cip-1200". */
+static bool errata_cip_1200_check_func(unsigned long  arch_id, unsigned long impid)
+{
+       /*
+        * Affected cores:
+        * Architecture ID: 0x8000000000000007 or 0x1
+        * Implement ID: mimpid[23:0] <= 0x200630 and mimpid != 0x01200626
+        */
+       if (arch_id != 0x8000000000000007 && arch_id != 0x1)
+               return false;
+       /* Only the low 24 bits are range-checked; the exclusion uses the full value. */
+       if ((impid & 0xffffff) > 0x200630 || impid == 0x1200626)
+               return false;
+       return true;
+}
+
+/*
+ * Table of known SiFive errata. The array index is the errata id: bit N of
+ * the masks built in this file refers to errata_list[N], and alt_entry
+ * records carry the same id. Designated initializers keep each entry tied
+ * to its ERRATA_SIFIVE_* id even if the list is reordered or extended.
+ */
+static struct errata_info_t errata_list[ERRATA_SIFIVE_NUMBER] = {
+       [ERRATA_SIFIVE_CIP_453] = {
+               .name = "cip-453",
+               .check_func = errata_cip_453_check_func
+       },
+       [ERRATA_SIFIVE_CIP_1200] = {
+               .name = "cip-1200",
+               .check_func = errata_cip_1200_check_func
+       },
+};
+
+/*
+ * Run every check_func in errata_list against the given IDs and return a
+ * bitmask with bit idx set for each entry that reports the CPU as affected.
+ */
+static u32 __init sifive_errata_probe(unsigned long archid, unsigned long impid)
+{
+       int idx;
+       u32 cpu_req_errata = 0;
+
+       for (idx = 0; idx < ERRATA_SIFIVE_NUMBER; idx++)
+               if (errata_list[idx].check_func(archid, impid))
+                       cpu_req_errata |= (1U << idx);
+
+       return cpu_req_errata;
+}
+
+/*
+ * Print a warning banner naming every erratum whose bit is set in
+ * @miss_errata; bit i corresponds to errata_list[i].
+ */
+static void __init warn_miss_errata(u32 miss_errata)
+{
+       int i;
+
+       pr_warn("----------------------------------------------------------------\n");
+       pr_warn("WARNING: Missing the following errata may cause potential issues\n");
+       for (i = 0; i < ERRATA_SIFIVE_NUMBER; i++)
+               /* Unsigned, parenthesized mask: same form as sifive_errata_probe(). */
+               if (miss_errata & (1U << i))
+                       pr_warn("\tSiFive Errata[%d]:%s\n", i, errata_list[i].name);
+       pr_warn("Please enable the corresponding Kconfig to apply them\n");
+       pr_warn("----------------------------------------------------------------\n");
+}
+
+/*
+ * Apply the SiFive errata alternatives found in [begin, end).
+ * @begin, @end: bounds of the alt_entry table to walk
+ * @archid, @impid: CPU IDs used to decide which errata this CPU needs
+ *
+ * Entries from other vendors are skipped; entries with an unknown errata id
+ * are reported and skipped. For every needed erratum the replacement is
+ * written over the original with patch_text_nosync(). Needed errata that
+ * have no entry in the table are reported through warn_miss_errata().
+ */
+void __init sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+                                    unsigned long archid, unsigned long impid)
+{
+       struct alt_entry *alt;
+       u32 cpu_req_errata = sifive_errata_probe(archid, impid);
+       u32 cpu_apply_errata = 0;
+       u32 tmp;
+
+       for (alt = begin; alt < end; alt++) {
+               if (alt->vendor_id != SIFIVE_VENDOR_ID)
+                       continue;
+               if (alt->errata_id >= ERRATA_SIFIVE_NUMBER) {
+                       /* errata_id is unsigned int, hence %u. */
+                       WARN(1, "This errata id:%u is not in kernel errata list\n", alt->errata_id);
+                       continue;
+               }
+
+               tmp = (1U << alt->errata_id);
+               if (cpu_req_errata & tmp) {
+                       patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
+                       cpu_apply_errata |= tmp;
+               }
+       }
+       /* Bits required but never applied; a mask difference, not arithmetic. */
+       if (cpu_apply_errata != cpu_req_errata)
+               warn_miss_errata(cpu_req_errata & ~cpu_apply_errata);
+}
diff --git a/arch/riscv/errata/sifive/errata_cip_453.S b/arch/riscv/errata/sifive/errata_cip_453.S
new file mode 100644 (file)
index 0000000..f1b9623
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/alternative.h>
+
+/*
+ * ADD_SIGN_EXT pt_reg badaddr tmp_reg
+ *
+ * Load the PT_BADADDR slot addressed through \pt_reg into \badaddr. If
+ * bit 38 of that value is set, OR in ones for bits 39 and above and store
+ * the result back; otherwise the saved value is left untouched.
+ * \tmp_reg is clobbered.
+ */
+.macro ADD_SIGN_EXT pt_reg badaddr tmp_reg
+       REG_L \badaddr, PT_BADADDR(\pt_reg)
+       li \tmp_reg,1
+       slli \tmp_reg,\tmp_reg,0x26
+       and \tmp_reg,\tmp_reg,\badaddr
+       beqz \tmp_reg, 1f
+       li \tmp_reg,-1
+       slli \tmp_reg,\tmp_reg,0x27
+       or \badaddr,\tmp_reg,\badaddr
+       REG_S \badaddr, PT_BADADDR(\pt_reg)
+1:
+.endm
+
+/*
+ * Page-fault entry used as the CIP-453 replacement target: fix up the saved
+ * bad address with ADD_SIGN_EXT (a0 is passed as the pt_reg argument, t0 and
+ * t1 are scratch), then jump to do_page_fault, or to do_trap_unknown when
+ * CONFIG_MMU is not set.
+ */
+ENTRY(sifive_cip_453_page_fault_trp)
+       ADD_SIGN_EXT a0, t0, t1
+#ifdef CONFIG_MMU
+       la t0, do_page_fault
+#else
+       la t0, do_trap_unknown
+#endif
+       jr t0
+END(sifive_cip_453_page_fault_trp)
+
+/*
+ * Instruction-fault entry used as the CIP-453 replacement target: fix up the
+ * saved bad address with ADD_SIGN_EXT (a0 as pt_reg, t0/t1 scratch), then
+ * jump to do_trap_insn_fault.
+ */
+ENTRY(sifive_cip_453_insn_fault_trp)
+       ADD_SIGN_EXT a0, t0, t1
+       la t0, do_trap_insn_fault
+       jr t0
+END(sifive_cip_453_insn_fault_trp)
diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h
new file mode 100644 (file)
index 0000000..88c0870
--- /dev/null
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_ALTERNATIVE_MACROS_H
+#define __ASM_ALTERNATIVE_MACROS_H
+
+#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE
+
+#ifdef __ASSEMBLY__
+
+/*
+ * Emit one alternatives-table record: old pointer, new pointer, vendor id,
+ * replacement length and errata id, in the field order of struct alt_entry
+ * (asm/alternative.h).
+ */
+.macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len
+       RISCV_PTR \oldptr
+       RISCV_PTR \newptr
+       REG_ASM \vendor_id
+       REG_ASM \new_len
+       .word   \errata_id
+.endm
+
+/*
+ * When \enable is non-zero, record an ALT_ENTRY in the .alternative section
+ * for the code between labels 886 and 887 (defined by the caller) and place
+ * the replacement \new_c between labels 888 and 889 in subsection 1.
+ * NOTE(review): the paired .org directives look like an assemble-time check
+ * that the old and new sequences have the same length -- confirm against
+ * the assembler's .org semantics.
+ */
+.macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg
+       .if \enable
+       .pushsection .alternative, "a"
+       ALT_ENTRY 886b, 888f, \vendor_id, \errata_id, 889f - 888f
+       .popsection
+       .subsection 1
+888 :
+       \new_c
+889 :
+       .previous
+       .org    . - (889b - 888b) + (887b - 886b)
+       .org    . - (887b - 886b) + (889b - 888b)
+       .endif
+.endm
+
+/*
+ * Emit \old_c between labels 886 and 887, then let ALT_NEW_CONTENT register
+ * \new_c as its replacement for the given vendor/errata when \enable is set.
+ */
+.macro __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable
+886 :
+       \old_c
+887 :
+       ALT_NEW_CONTENT \vendor_id, \errata_id, \enable, \new_c
+.endm
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)
+
+#else /* !__ASSEMBLY__ */
+
+#include <asm/asm.h>
+#include <linux/stringify.h>
+
+/*
+ * String form of one alternatives-table record for use inside asm():
+ * old pointer, new pointer, vendor id, replacement length, errata id.
+ * All arguments must already be string literals.
+ */
+#define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \
+       RISCV_PTR " " oldptr "\n" \
+       RISCV_PTR " " newptr "\n" \
+       REG_ASM " " vendor_id "\n" \
+       REG_ASM " " newlen "\n" \
+       ".word " errata_id "\n"
+
+/*
+ * String form of the assembler ALT_NEW_CONTENT macro, for use inside asm().
+ * When enable stringifies to 1, an ALT_ENTRY for labels 886/887 (emitted by
+ * __ALTERNATIVE_CFG) is recorded in .alternative and new_c is placed between
+ * labels 888 and 889 in subsection 1.
+ * The helper carries the same name as its assembler counterpart.
+ */
+#define ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) \
+       ".if " __stringify(enable) " == 1\n"                            \
+       ".pushsection .alternative, \"a\"\n"                            \
+       ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(errata_id), "889f - 888f") \
+       ".popsection\n"                                                 \
+       ".subsection 1\n"                                               \
+       "888 :\n"                                                       \
+       new_c "\n"                                                      \
+       "889 :\n"                                                       \
+       ".previous\n"                                                   \
+       ".org   . - (887b - 886b) + (889b - 888b)\n"                    \
+       ".org   . - (889b - 888b) + (887b - 886b)\n"                    \
+       ".endif\n"
+
+/* Emit old_c between labels 886 and 887, then register new_c as its replacement. */
+#define __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, enable) \
+       "886 :\n"       \
+       old_c "\n"      \
+       "887 :\n"       \
+       ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c)
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k))
+
+#endif /* __ASSEMBLY__ */
+
+#else /* !CONFIG_RISCV_ERRATA_ALTERNATIVE*/
+#ifdef __ASSEMBLY__
+
+.macro __ALTERNATIVE_CFG old_c
+       \old_c
+.endm
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG old_c
+
+#else /* !__ASSEMBLY__ */
+
+#define __ALTERNATIVE_CFG(old_c)  \
+       old_c "\n"
+
+#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \
+       __ALTERNATIVE_CFG(old_c)
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_RISCV_ERRATA_ALTERNATIVE */
+/*
+ * Usage:
+ *   ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k)
+ * in the assembly code. Otherwise,
+ *   asm(ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k));
+ *
+ * old_content: The original content, which may be replaced with the new
+ *              content.
+ * new_content: The new content.
+ * vendor_id: The CPU vendor ID.
+ * errata_id: The errata ID.
+ * CONFIG_k: The Kconfig of this errata. When Kconfig is disabled, the old
+ *          content will always be executed.
+ */
+#define ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) \
+       _ALTERNATIVE_CFG(old_content, new_content, vendor_id, errata_id, CONFIG_k)
+
+/*
+ * A vendor wants to replace an old_content, but another vendor has used
+ * ALTERNATIVE() to patch its customized content at the same location. In
+ * this case, this vendor can create a new macro ALTERNATIVE_2() based
+ * on the following sample code and then replace ALTERNATIVE() with
+ * ALTERNATIVE_2() to append its customized content.
+ *
+ * .macro __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \
+ *                                   new_c_2, vendor_id_2, errata_id_2, enable_2
+ * 886 :
+ *      \old_c
+ * 887 :
+ *      ALT_NEW_CONTENT \vendor_id_1, \errata_id_1, \enable_1, \new_c_1
+ *      ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2
+ * .endm
+ *
+ * #define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, CONFIG_k_1, \
+ *                                   new_c_2, vendor_id_2, errata_id_2, CONFIG_k_2) \
+ *        __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, IS_ENABLED(CONFIG_k_1), \
+ *                                   new_c_2, vendor_id_2, errata_id_2, IS_ENABLED(CONFIG_k_2) \
+ *
+ * #define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \
+ *                                    new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) \
+ *         _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \
+ *                                         new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2)
+ *
+ */
+#endif
diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h
new file mode 100644 (file)
index 0000000..e625d3c
--- /dev/null
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Sifive.
+ */
+
+#ifndef __ASM_ALTERNATIVE_H
+#define __ASM_ALTERNATIVE_H
+
+#define ERRATA_STRING_LENGTH_MAX 32
+
+#include <asm/alternative-macros.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/hwcap.h>
+
+void __init apply_boot_alternatives(void);
+
+/*
+ * One record of the alternatives table. The fields appear in the same order
+ * in which the ALT_ENTRY macros emit them, and the struct is __packed.
+ */
+struct alt_entry {
+       void *old_ptr;           /* address of original instruction or data  */
+       void *alt_ptr;           /* address of replacement instruction or data */
+       unsigned long vendor_id; /* cpu vendor id */
+       unsigned long alt_len;   /* The replacement size */
+       unsigned int errata_id;  /* The errata id */
+} __packed;
+
+struct errata_checkfunc_id {
+       unsigned long vendor_id;
+       bool (*func)(struct alt_entry *alt);
+};
+
+void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+                             unsigned long archid, unsigned long impid);
+
+#endif
+#endif
index 9c992a8..618d7c5 100644 (file)
@@ -23,6 +23,7 @@
 #define REG_L          __REG_SEL(ld, lw)
 #define REG_S          __REG_SEL(sd, sw)
 #define REG_SC         __REG_SEL(sc.d, sc.w)
+#define REG_ASM                __REG_SEL(.dword, .word)
 #define SZREG          __REG_SEL(8, 4)
 #define LGREG          __REG_SEL(3, 2)
 
index caadfc1..87ac656 100644 (file)
 #define CSR_MIP                        0x344
 #define CSR_PMPCFG0            0x3a0
 #define CSR_PMPADDR0           0x3b0
+#define CSR_MVENDORID          0xf11
+#define CSR_MARCHID            0xf12
+#define CSR_MIMPID             0xf13
 #define CSR_MHARTID            0xf14
 
 #ifdef CONFIG_RISCV_M_MODE
index 5c725e1..f4b490c 100644 (file)
@@ -81,4 +81,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
        int uses_interp);
 #endif /* CONFIG_MMU */
 
+#define ELF_CORE_COPY_REGS(dest, regs)                 \
+do {                                                   \
+       *(struct user_regs_struct *)&(dest) =           \
+               *(struct user_regs_struct *)(regs);     \
+} while (0); /* NOTE(review): trailing ';' kept - callers may omit their own; confirm */
+
 #endif /* _ASM_RISCV_ELF_H */
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
new file mode 100644 (file)
index 0000000..5f1046e
--- /dev/null
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Sifive.
+ */
+#ifndef ASM_ERRATA_LIST_H
+#define ASM_ERRATA_LIST_H
+
+#include <asm/alternative.h>
+#include <asm/vendorid_list.h>
+
+#ifdef CONFIG_ERRATA_SIFIVE
+#define        ERRATA_SIFIVE_CIP_453 0
+#define        ERRATA_SIFIVE_CIP_1200 1
+#define        ERRATA_SIFIVE_NUMBER 2
+#endif
+
+#ifdef __ASSEMBLY__
+
+#define ALT_INSN_FAULT(x)                                              \
+ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault),                 \
+           __stringify(RISCV_PTR sifive_cip_453_insn_fault_trp),       \
+           SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453,                    \
+           CONFIG_ERRATA_SIFIVE_CIP_453)
+
+#define ALT_PAGE_FAULT(x)                                              \
+ALTERNATIVE(__stringify(RISCV_PTR do_page_fault),                      \
+           __stringify(RISCV_PTR sifive_cip_453_page_fault_trp),       \
+           SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453,                    \
+           CONFIG_ERRATA_SIFIVE_CIP_453)
+#else /* !__ASSEMBLY__ */
+
+#define ALT_FLUSH_TLB_PAGE(x)                                          \
+asm(ALTERNATIVE("sfence.vma %0", "sfence.vma", SIFIVE_VENDOR_ID,       \
+               ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200)  \
+               : : "r" (addr) : "memory")
+
+#endif /* __ASSEMBLY__ */
+
+#endif
index 845002c..04dad33 100644 (file)
 #endif
 #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 
+/*
+ * Clang prior to 13 had "mcount" instead of "_mcount":
+ * https://reviews.llvm.org/D98881
+ */
+#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000
+#define MCOUNT_NAME _mcount
+#else
+#define MCOUNT_NAME mcount
+#endif
+
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #ifndef __ASSEMBLY__
-void _mcount(void);
+void MCOUNT_NAME(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
        return addr;
@@ -36,7 +46,7 @@ struct dyn_arch_ftrace {
  * both auipc and jalr at the same time.
  */
 
-#define MCOUNT_ADDR            ((unsigned long)_mcount)
+#define MCOUNT_ADDR            ((unsigned long)MCOUNT_NAME)
 #define JALR_SIGN_MASK         (0x00000800)
 #define JALR_OFFSET_MASK       (0x00000fff)
 #define AUIPC_OFFSET_MASK      (0xfffff000)
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
new file mode 100644 (file)
index 0000000..1e95410
--- /dev/null
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#ifndef _RISCV_KEXEC_H
+#define _RISCV_KEXEC_H
+
+#include <asm/page.h>    /* For PAGE_SIZE */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
+
+/* Reserve a page for the control code buffer */
+#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE
+
+#define KEXEC_ARCH KEXEC_ARCH_RISCV
+
+extern void riscv_crash_save_regs(struct pt_regs *newregs);
+
+static inline void
+crash_setup_regs(struct pt_regs *newregs,
+                struct pt_regs *oldregs)
+{
+       if (oldregs)
+               memcpy(newregs, oldregs, sizeof(struct pt_regs));
+       else
+               riscv_crash_save_regs(newregs);
+}
+
+
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+       unsigned long fdt_addr;
+};
+
+extern const unsigned char riscv_kexec_relocate[];   /* relocation code, defined in kexec_relocate.S */
+extern const unsigned int riscv_kexec_relocate_size; /* its size in bytes (end - start) */
+
+typedef void (*riscv_kexec_method)(unsigned long first_ind_entry,
+                                  unsigned long jump_addr,
+                                  unsigned long fdt_addr,
+                                  unsigned long hartid,
+                                  unsigned long va_pa_off);
+
+extern riscv_kexec_method riscv_kexec_norelocate;
+
+#endif
index adc9d26..6a7761c 100644 (file)
@@ -90,15 +90,58 @@ typedef struct page *pgtable_t;
 
 #ifdef CONFIG_MMU
 extern unsigned long va_pa_offset;
+#ifdef CONFIG_64BIT
+extern unsigned long va_kernel_pa_offset;
+#endif
+#ifdef CONFIG_XIP_KERNEL
+extern unsigned long va_kernel_xip_pa_offset;
+#endif
 extern unsigned long pfn_base;
 #define ARCH_PFN_OFFSET                (pfn_base)
 #else
 #define va_pa_offset           0
+#ifdef CONFIG_64BIT
+#define va_kernel_pa_offset    0
+#endif
 #define ARCH_PFN_OFFSET                (PAGE_OFFSET >> PAGE_SHIFT)
 #endif /* CONFIG_MMU */
 
-#define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
-#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+extern unsigned long kernel_virt_addr;
+
+#ifdef CONFIG_64BIT
+#define linear_mapping_pa_to_va(x)     ((void *)((unsigned long)(x) + va_pa_offset))
+#ifdef CONFIG_XIP_KERNEL
+#define kernel_mapping_pa_to_va(y)     ({                                              \
+       unsigned long _y = y;                                                           \
+       (_y >= CONFIG_PHYS_RAM_BASE) ?                                                  \
+               (void *)((unsigned long)(_y) + va_kernel_pa_offset + XIP_OFFSET) :      \
+               (void *)((unsigned long)(_y) + va_kernel_xip_pa_offset);                \
+       })
+#else
+#define kernel_mapping_pa_to_va(x)     ((void *)((unsigned long)(x) + va_kernel_pa_offset))
+#endif
+#define __pa_to_va_nodebug(x)          linear_mapping_pa_to_va(x)
+
+#define linear_mapping_va_to_pa(x)     ((unsigned long)(x) - va_pa_offset)
+#ifdef CONFIG_XIP_KERNEL
+#define kernel_mapping_va_to_pa(y) ({                                          \
+       unsigned long _y = y;                                                   \
+       (_y < kernel_virt_addr + XIP_OFFSET) ?                                  \
+               ((unsigned long)(_y) - va_kernel_xip_pa_offset) :               \
+               ((unsigned long)(_y) - va_kernel_pa_offset - XIP_OFFSET);       \
+       })
+#else
+#define kernel_mapping_va_to_pa(x)     ((unsigned long)(x) - va_kernel_pa_offset)
+#endif
+#define __va_to_pa_nodebug(x)  ({                                              \
+       unsigned long _x = x;                                                   \
+       (_x < kernel_virt_addr) ?                                               \
+               linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x);      \
+       })
+#else
+#define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
+#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+#endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
 extern phys_addr_t __virt_to_phys(unsigned long x);
index ebf817c..9469f46 100644 (file)
 
 #include <asm/pgtable-bits.h>
 
-#ifndef __ASSEMBLY__
+#ifndef CONFIG_MMU
+#define KERNEL_LINK_ADDR       PAGE_OFFSET
+#else
 
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include <linux/mm_types.h>
+#define ADDRESS_SPACE_END      (UL(-1))
 
-#ifdef CONFIG_MMU
+#ifdef CONFIG_64BIT
+/* Leave 2GB for kernel and BPF at the end of the address space */
+#define KERNEL_LINK_ADDR       (ADDRESS_SPACE_END - SZ_2G + 1)
+#else
+#define KERNEL_LINK_ADDR       PAGE_OFFSET
+#endif
 
 #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
 
 #define BPF_JIT_REGION_SIZE    (SZ_128M)
+#ifdef CONFIG_64BIT
+/* KASLR should leave at least 128MB for BPF after the kernel */
+#define BPF_JIT_REGION_START   PFN_ALIGN((unsigned long)&_end)
+#define BPF_JIT_REGION_END     (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
+#else
 #define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
 #define BPF_JIT_REGION_END     (VMALLOC_END)
+#endif
+
+/* Modules always live before the kernel */
+#ifdef CONFIG_64BIT
+#define MODULES_VADDR  (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
+#define MODULES_END    (PFN_ALIGN((unsigned long)&_start))
+#endif
 
 /*
  * Roughly size the vmemmap space to be large enough to fit enough
 
 #endif
 
+#ifdef CONFIG_XIP_KERNEL
+#define XIP_OFFSET             SZ_8M
+#endif
+
+#ifndef __ASSEMBLY__
+
+/* Page Upper Directory not used in RISC-V */
+#include <asm-generic/pgtable-nopud.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <linux/mm_types.h>
+
 #ifdef CONFIG_64BIT
 #include <asm/pgtable-64.h>
 #else
 #include <asm/pgtable-32.h>
 #endif /* CONFIG_64BIT */
 
+#ifdef CONFIG_XIP_KERNEL
+#define XIP_FIXUP(addr) ({                                                     \
+       uintptr_t __a = (uintptr_t)(addr);                                      \
+       (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ?  \
+               __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\
+               __a;                                                            \
+       })
+#else
+#define XIP_FIXUP(addr)                (addr)
+#endif /* CONFIG_XIP_KERNEL */
+
 #ifdef CONFIG_MMU
 /* Number of entries in the page global directory */
 #define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
@@ -484,8 +522,17 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 
 #define kern_addr_valid(addr)   (1) /* FIXME */
 
-extern void *dtb_early_va;
-extern uintptr_t dtb_early_pa;
+extern char _start[];
+extern void *_dtb_early_va;
+extern uintptr_t _dtb_early_pa;
+#if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_MMU)
+#define dtb_early_va   (*(void **)XIP_FIXUP(&_dtb_early_va))
+#define dtb_early_pa   (*(uintptr_t *)XIP_FIXUP(&_dtb_early_pa))
+#else
+#define dtb_early_va   _dtb_early_va
+#define dtb_early_pa   _dtb_early_pa
+#endif /* CONFIG_XIP_KERNEL */
+
 void setup_bootmem(void);
 void paging_init(void);
 void misc_mem_init(void);
index d702741..0d42693 100644 (file)
@@ -97,6 +97,9 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
 
 void sbi_console_putchar(int ch);
 int sbi_console_getchar(void);
+long sbi_get_mvendorid(void);
+long sbi_get_marchid(void);
+long sbi_get_mimpid(void);
 void sbi_set_timer(uint64_t stime_value);
 void sbi_shutdown(void);
 void sbi_clear_ipi(void);
index 1595c5b..8a303fb 100644 (file)
@@ -11,5 +11,6 @@ extern char _start[];
 extern char _start_kernel[];
 extern char __init_data_begin[], __init_data_end[];
 extern char __init_text_begin[], __init_text_end[];
+extern char __alt_start[], __alt_end[];
 
 #endif /* __ASM_SECTIONS_H */
index 6887b3d..a9c5677 100644 (file)
@@ -17,6 +17,7 @@ int set_memory_x(unsigned long addr, int numpages);
 int set_memory_nx(unsigned long addr, int numpages);
 int set_memory_rw_nx(unsigned long addr, int numpages);
 void protect_kernel_text_data(void);
+void protect_kernel_linear_mapping_text_rodata(void);
 #else
 static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
 static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
index df1f7c4..a7d2811 100644 (file)
@@ -46,7 +46,7 @@ int riscv_hartid_to_cpuid(int hartid);
 void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out);
 
 /* Set custom IPI operations */
-void riscv_set_ipi_ops(struct riscv_ipi_ops *ops);
+void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops);
 
 /* Clear IPI for current CPU */
 void riscv_clear_ipi(void);
@@ -92,7 +92,7 @@ static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
        cpumask_set_cpu(boot_cpu_hartid, out);
 }
 
-static inline void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
+static inline void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
 {
 }
 
index 5477e7e..9090493 100644 (file)
@@ -23,5 +23,10 @@ extern asmlinkage void *__memmove(void *, const void *, size_t);
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
 #define memmove(dst, src, len) __memmove(dst, src, len)
+
+#ifndef __NO_FORTIFY
+#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
+#endif
+
 #endif
 #endif /* _ASM_RISCV_STRING_H */
index 49350c8..b933b15 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/err.h>
 
 /* The array of function pointers for syscalls. */
-extern void *sys_call_table[];
+extern void * const sys_call_table[];
 
 /*
  * Only the low 32 bits of orig_r0 are meaningful, so we return int.
index 394cfbc..c84218a 100644 (file)
@@ -9,6 +9,7 @@
 
 #include <linux/mm_types.h>
 #include <asm/smp.h>
+#include <asm/errata_list.h>
 
 #ifdef CONFIG_MMU
 static inline void local_flush_tlb_all(void)
@@ -19,7 +20,7 @@ static inline void local_flush_tlb_all(void)
 /* Flush one page from local TLB */
 static inline void local_flush_tlb_page(unsigned long addr)
 {
-       __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory");
+       ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"));
 }
 #else /* CONFIG_MMU */
 #define local_flush_tlb_all()                  do { } while (0)
index f944062..f314ff4 100644 (file)
@@ -375,7 +375,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 
 extern long strncpy_from_user(char *dest, const char __user *src, long count);
 
-extern long __must_check strlen_user(const char __user *str);
 extern long __must_check strnlen_user(const char __user *str, long n);
 
 extern
diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h
new file mode 100644 (file)
index 0000000..9d93421
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+#ifndef ASM_VENDOR_LIST_H
+#define ASM_VENDOR_LIST_H
+
+#define SIFIVE_VENDOR_ID       0x489
+
+#endif
index 647a47f..d3081e4 100644 (file)
@@ -10,6 +10,10 @@ CFLAGS_REMOVE_sbi.o  = $(CC_FLAGS_FTRACE)
 endif
 CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,)
 
+ifdef CONFIG_KEXEC
+AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax
+endif
+
 extra-y += head.o
 extra-y += vmlinux.lds
 
@@ -55,6 +59,8 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o
 endif
 obj-$(CONFIG_HOTPLUG_CPU)      += cpu-hotplug.o
 obj-$(CONFIG_KGDB)             += kgdb.o
+obj-$(CONFIG_KEXEC)            += kexec_relocate.o crash_save_regs.o machine_kexec.o
+obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
 
 obj-$(CONFIG_JUMP_LABEL)       += jump_label.o
 
diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c
new file mode 100644 (file)
index 0000000..86cc0ad
--- /dev/null
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code comes from arch/arm64/kernel/crash_dump.c
+ * Created by: AKASHI Takahiro <takahiro.akashi@linaro.org>
+ * Copyright (C) 2017 Linaro Limited
+ */
+
+#include <linux/crash_dump.h>
+#include <linux/io.h>
+
+/**
+ * copy_oldmem_page() - copy one page from old kernel memory
+ * @pfn: page frame number to be copied
+ * @buf: buffer where the copied page is placed
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page
+ * @userbuf: if set, @buf is in a user address space
+ *
+ * This function copies @csize bytes from one page of old kernel memory into
+ * the buffer pointed to by @buf. If @buf is in userspace, set @userbuf to %1.
+ * Returns number of bytes copied or negative error in case of failure.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+                        size_t csize, unsigned long offset,
+                        int userbuf)
+{
+       ssize_t ret = csize;
+       void *vaddr;
+
+       if (!csize)
+               return 0;
+
+       vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB);
+       if (!vaddr)
+               return -ENOMEM;
+
+       if (userbuf) {
+               if (copy_to_user((char __user *)buf, vaddr + offset, csize))
+                       ret = -EFAULT;
+       } else {
+               memcpy(buf, vaddr + offset, csize);
+       }
+       /* Single unmap point shared by the success and the -EFAULT path */
+       memunmap(vaddr);
+       return ret;
+}
diff --git a/arch/riscv/kernel/crash_save_regs.S b/arch/riscv/kernel/crash_save_regs.S
new file mode 100644 (file)
index 0000000..7832fb7
--- /dev/null
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <asm/asm.h>           /* For RISCV_* and REG_* macros */
+#include <asm/csr.h>           /* For CSR_* macros */
+#include <asm/asm-offsets.h>   /* For offsets on pt_regs */
+#include <linux/linkage.h>     /* For SYM_* macros */
+
+.section ".text"
+SYM_CODE_START(riscv_crash_save_regs)
+       REG_S ra,  PT_RA(a0)    /* x1 */
+       REG_S sp,  PT_SP(a0)    /* x2 */
+       REG_S gp,  PT_GP(a0)    /* x3 */
+       REG_S tp,  PT_TP(a0)    /* x4 */
+       REG_S t0,  PT_T0(a0)    /* x5 */
+       REG_S t1,  PT_T1(a0)    /* x6 */
+       REG_S t2,  PT_T2(a0)    /* x7 */
+       REG_S s0,  PT_S0(a0)    /* x8/fp */
+       REG_S s1,  PT_S1(a0)    /* x9 */
+       REG_S a0,  PT_A0(a0)    /* x10 */
+       REG_S a1,  PT_A1(a0)    /* x11 */
+       REG_S a2,  PT_A2(a0)    /* x12 */
+       REG_S a3,  PT_A3(a0)    /* x13 */
+       REG_S a4,  PT_A4(a0)    /* x14 */
+       REG_S a5,  PT_A5(a0)    /* x15 */
+       REG_S a6,  PT_A6(a0)    /* x16 */
+       REG_S a7,  PT_A7(a0)    /* x17 */
+       REG_S s2,  PT_S2(a0)    /* x18 */
+       REG_S s3,  PT_S3(a0)    /* x19 */
+       REG_S s4,  PT_S4(a0)    /* x20 */
+       REG_S s5,  PT_S5(a0)    /* x21 */
+       REG_S s6,  PT_S6(a0)    /* x22 */
+       REG_S s7,  PT_S7(a0)    /* x23 */
+       REG_S s8,  PT_S8(a0)    /* x24 */
+       REG_S s9,  PT_S9(a0)    /* x25 */
+       REG_S s10, PT_S10(a0)   /* x26 */
+       REG_S s11, PT_S11(a0)   /* x27 */
+       REG_S t3,  PT_T3(a0)    /* x28 */
+       REG_S t4,  PT_T4(a0)    /* x29 */
+       REG_S t5,  PT_T5(a0)    /* x30 */
+       REG_S t6,  PT_T6(a0)    /* x31 */
+
+       csrr t1, CSR_STATUS
+       csrr t2, CSR_EPC
+       csrr t3, CSR_TVAL
+       csrr t4, CSR_CAUSE
+
+       REG_S t1, PT_STATUS(a0)
+       REG_S t2, PT_EPC(a0)
+       REG_S t3, PT_BADADDR(a0)
+       REG_S t4, PT_CAUSE(a0)
+       ret
+SYM_CODE_END(riscv_crash_save_regs)
index 83095fa..80d5a9e 100644 (file)
@@ -12,6 +12,7 @@
 #include <asm/unistd.h>
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
+#include <asm/errata_list.h>
 
 #if !IS_ENABLED(CONFIG_PREEMPTION)
 .set resume_kernel, restore_all
@@ -454,7 +455,7 @@ ENDPROC(__switch_to)
        /* Exception vector table */
 ENTRY(excp_vect_table)
        RISCV_PTR do_trap_insn_misaligned
-       RISCV_PTR do_trap_insn_fault
+       ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault)
        RISCV_PTR do_trap_insn_illegal
        RISCV_PTR do_trap_break
        RISCV_PTR do_trap_load_misaligned
@@ -465,7 +466,8 @@ ENTRY(excp_vect_table)
        RISCV_PTR do_trap_ecall_s
        RISCV_PTR do_trap_unknown
        RISCV_PTR do_trap_ecall_m
-       RISCV_PTR do_page_fault   /* instruction page fault */
+       /* instruction page fault */
+       ALT_PAGE_FAULT(RISCV_PTR do_page_fault)
        RISCV_PTR do_page_fault   /* load page fault */
        RISCV_PTR do_trap_unknown
        RISCV_PTR do_page_fault   /* store page fault */
index f5a9bad..89cc58a 100644 (file)
@@ -9,11 +9,23 @@
 #include <linux/linkage.h>
 #include <asm/thread_info.h>
 #include <asm/page.h>
+#include <asm/pgtable.h>
 #include <asm/csr.h>
 #include <asm/hwcap.h>
 #include <asm/image.h>
 #include "efi-header.S"
 
+#ifdef CONFIG_XIP_KERNEL
+.macro XIP_FIXUP_OFFSET reg
+       REG_L t0, _xip_fixup
+       add \reg, \reg, t0
+.endm
+_xip_fixup: .dword CONFIG_PHYS_RAM_BASE - CONFIG_XIP_PHYS_ADDR - XIP_OFFSET
+#else
+.macro XIP_FIXUP_OFFSET reg
+.endm
+#endif /* CONFIG_XIP_KERNEL */
+
 __HEAD
 ENTRY(_start)
        /*
@@ -69,7 +81,9 @@ pe_head_start:
 #ifdef CONFIG_MMU
 relocate:
        /* Relocate return address */
-       li a1, PAGE_OFFSET
+       la a1, kernel_virt_addr
+       XIP_FIXUP_OFFSET a1
+       REG_L a1, 0(a1)
        la a2, _start
        sub a1, a1, a2
        add ra, ra, a1
@@ -91,6 +105,7 @@ relocate:
         * to ensure the new translations are in use.
         */
        la a0, trampoline_pg_dir
+       XIP_FIXUP_OFFSET a0
        srl a0, a0, PAGE_SHIFT
        or a0, a0, a1
        sfence.vma
@@ -144,7 +159,9 @@ secondary_start_sbi:
 
        slli a3, a0, LGREG
        la a4, __cpu_up_stack_pointer
+       XIP_FIXUP_OFFSET a4
        la a5, __cpu_up_task_pointer
+       XIP_FIXUP_OFFSET a5
        add a4, a3, a4
        add a5, a3, a5
        REG_L sp, (a4)
@@ -156,6 +173,7 @@ secondary_start_common:
 #ifdef CONFIG_MMU
        /* Enable virtual memory and relocate to virtual address */
        la a0, swapper_pg_dir
+       XIP_FIXUP_OFFSET a0
        call relocate
 #endif
        call setup_trap_vector
@@ -236,12 +254,33 @@ pmp_done:
 .Lgood_cores:
 #endif
 
+#ifndef CONFIG_XIP_KERNEL
        /* Pick one hart to run the main boot sequence */
        la a3, hart_lottery
        li a2, 1
        amoadd.w a3, a2, (a3)
        bnez a3, .Lsecondary_start
 
+#else
+       /* hart_lottery in flash contains a magic number */
+       la a3, hart_lottery
+       mv a2, a3
+       XIP_FIXUP_OFFSET a2
+       lw t1, (a3)
+       amoswap.w t0, t1, (a2)
+       /* first time here if hart_lottery in RAM is not set */
+       beq t0, t1, .Lsecondary_start
+
+       la sp, _end + THREAD_SIZE
+       XIP_FIXUP_OFFSET sp
+       mv s0, a0
+       call __copy_data
+
+       /* Restore a0 copy */
+       mv a0, s0
+#endif
+
+#ifndef CONFIG_XIP_KERNEL
        /* Clear BSS for flat non-ELF images */
        la a3, __bss_start
        la a4, __bss_stop
@@ -251,15 +290,18 @@ clear_bss:
        add a3, a3, RISCV_SZPTR
        blt a3, a4, clear_bss
 clear_bss_done:
-
+#endif
        /* Save hart ID and DTB physical address */
        mv s0, a0
        mv s1, a1
+
        la a2, boot_cpu_hartid
+       XIP_FIXUP_OFFSET a2
        REG_S a0, (a2)
 
        /* Initialize page tables and relocate to virtual addresses */
        la sp, init_thread_union + THREAD_SIZE
+       XIP_FIXUP_OFFSET sp
 #ifdef CONFIG_BUILTIN_DTB
        la a0, __dtb_start
 #else
@@ -268,6 +310,7 @@ clear_bss_done:
        call setup_vm
 #ifdef CONFIG_MMU
        la a0, early_pg_dir
+       XIP_FIXUP_OFFSET a0
        call relocate
 #endif /* CONFIG_MMU */
 
@@ -292,7 +335,9 @@ clear_bss_done:
 
        slli a3, a0, LGREG
        la a1, __cpu_up_stack_pointer
+       XIP_FIXUP_OFFSET a1
        la a2, __cpu_up_task_pointer
+       XIP_FIXUP_OFFSET a2
        add a1, a3, a1
        add a2, a3, a2
 
index b48dda3..aabbc3a 100644 (file)
@@ -12,6 +12,9 @@ extern atomic_t hart_lottery;
 
 asmlinkage void do_page_fault(struct pt_regs *regs);
 asmlinkage void __init setup_vm(uintptr_t dtb_pa);
+#ifdef CONFIG_XIP_KERNEL
+asmlinkage void __init __copy_data(void);
+#endif
 
 extern void *__cpu_up_stack_pointer[];
 extern void *__cpu_up_task_pointer[];
diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S
new file mode 100644 (file)
index 0000000..88c3bea
--- /dev/null
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <asm/asm.h>   /* For RISCV_* and REG_* macros */
+#include <asm/csr.h>   /* For CSR_* macros */
+#include <asm/page.h>  /* For PAGE_SIZE */
+#include <linux/linkage.h> /* For SYM_* macros */
+
+.section ".rodata"
+SYM_CODE_START(riscv_kexec_relocate)
+
+       /*
+        * s0: Pointer to the current entry
+        * s1: (const) Phys address to jump to after relocation
+        * s2: (const) Phys address of the FDT image
+        * s3: (const) The hartid of the current hart
+        * s4: Pointer to the destination address for the relocation
+        * s5: (const) Number of words per page
+        * s6: (const) 1, used for subtraction
+        * s7: (const) va_pa_offset, used when switching MMU off
+        * s8: (const) Physical address of the main loop
+        * s9: (debug) indirection page counter
+        * s10: (debug) entry counter
+        * s11: (debug) copied words counter
+        */
+       mv      s0, a0
+       mv      s1, a1
+       mv      s2, a2
+       mv      s3, a3
+       mv      s4, zero
+       li      s5, (PAGE_SIZE / RISCV_SZPTR)
+       li      s6, 1
+       mv      s7, a4
+       mv      s8, zero
+       mv      s9, zero
+       mv      s10, zero
+       mv      s11, zero
+
+       /* Disable / cleanup interrupts */
+       csrw    CSR_SIE, zero
+       csrw    CSR_SIP, zero
+
+       /*
+        * When we switch SATP.MODE to "Bare" we'll only
+        * play with physical addresses. However the first time
+        * we try to jump somewhere, the offset on the jump
+        * will be relative to pc which will still be on VA. To
+        * deal with this we set stvec to the physical address at
+        * the start of the loop below so that we jump there in
+        * any case.
+        */
+       la      s8, 1f
+       sub     s8, s8, s7
+       csrw    CSR_STVEC, s8
+
+       /* Process entries in a loop */
+.align 2
+1:
+       addi    s10, s10, 1
+       REG_L   t0, 0(s0)               /* t0 = *image->entry */
+       addi    s0, s0, RISCV_SZPTR     /* image->entry++ */
+
+       /* IND_DESTINATION entry ? -> save destination address */
+       andi    t1, t0, 0x1
+       beqz    t1, 2f
+       andi    s4, t0, ~0x1
+       j       1b
+
+2:
+       /* IND_INDIRECTION entry ? -> update next entry ptr (PA) */
+       andi    t1, t0, 0x2
+       beqz    t1, 2f
+       andi    s0, t0, ~0x2
+       addi    s9, s9, 1
+       csrw    CSR_SATP, zero
+       jalr    zero, s8, 0
+
+2:
+       /* IND_DONE entry ? -> jump to done label */
+       andi    t1, t0, 0x4
+       beqz    t1, 2f
+       j       4f
+
+2:
+       /*
+        * IND_SOURCE entry ? -> copy page word by word to the
+        * destination address we got from IND_DESTINATION
+        */
+       andi    t1, t0, 0x8
+       beqz    t1, 1b          /* Unknown entry type, ignore it */
+       andi    t0, t0, ~0x8
+       mv      t3, s5          /* i = num words per page */
+3:     /* copy loop */
+       REG_L   t1, (t0)        /* t1 = *src_ptr */
+       REG_S   t1, (s4)        /* *dst_ptr = *src_ptr */
+       addi    t0, t0, RISCV_SZPTR /* src_ptr++ */
+       addi    s4, s4, RISCV_SZPTR /* dst_ptr++ */
+       sub     t3, t3, s6      /* i-- */
+       addi    s11, s11, 1     /* c++ */
+       beqz    t3, 1b          /* copy done ? */
+       j       3b
+
+4:
+       /* Pass the arguments to the next kernel / Cleanup */
+       mv      a0, s3
+       mv      a1, s2
+       mv      a2, s1
+
+       /* Cleanup */
+       mv      a3, zero
+       mv      a4, zero
+       mv      a5, zero
+       mv      a6, zero
+       mv      a7, zero
+
+       mv      s0, zero
+       mv      s1, zero
+       mv      s2, zero
+       mv      s3, zero
+       mv      s4, zero
+       mv      s5, zero
+       mv      s6, zero
+       mv      s7, zero
+       mv      s8, zero
+       mv      s9, zero
+       mv      s10, zero
+       mv      s11, zero
+
+       mv      t0, zero
+       mv      t1, zero
+       mv      t2, zero
+       mv      t3, zero
+       mv      t4, zero
+       mv      t5, zero
+       mv      t6, zero
+       csrw    CSR_SEPC, zero
+       csrw    CSR_SCAUSE, zero
+       csrw    CSR_SSCRATCH, zero
+
+       /*
+        * Make sure the relocated code is visible
+        * and jump to the new kernel
+        */
+       fence.i
+
+       jalr    zero, a2, 0
+
+SYM_CODE_END(riscv_kexec_relocate)
+riscv_kexec_relocate_end:
+
+
+/* Used for jumping to crashkernel */
+.section ".text"
+SYM_CODE_START(riscv_kexec_norelocate)
+       /*
+        * s0: (const) Phys address to jump to
+        * s1: (const) Phys address of the FDT image
+        * s2: (const) The hartid of the current hart
+        * s3: (const) va_pa_offset, used when switching MMU off
+        */
+       mv      s0, a1
+       mv      s1, a2
+       mv      s2, a3
+       mv      s3, a4
+
+       /* Disable / cleanup interrupts */
+       csrw    CSR_SIE, zero
+       csrw    CSR_SIP, zero
+
+       /* Switch to physical addressing */
+       la      s4, 1f
+       sub     s4, s4, s3
+       csrw    CSR_STVEC, s4
+       csrw    CSR_SATP, zero
+
+.align 2
+1:
+       /* Pass the arguments to the next kernel / Cleanup */
+       mv      a0, s2
+       mv      a1, s1
+       mv      a2, s0
+
+       /* Cleanup */
+       mv      a3, zero
+       mv      a4, zero
+       mv      a5, zero
+       mv      a6, zero
+       mv      a7, zero
+
+       mv      s0, zero
+       mv      s1, zero
+       mv      s2, zero
+       mv      s3, zero
+       mv      s4, zero
+       mv      s5, zero
+       mv      s6, zero
+       mv      s7, zero
+       mv      s8, zero
+       mv      s9, zero
+       mv      s10, zero
+       mv      s11, zero
+
+       mv      t0, zero
+       mv      t1, zero
+       mv      t2, zero
+       mv      t3, zero
+       mv      t4, zero
+       mv      t5, zero
+       mv      t6, zero
+       csrw    CSR_SEPC, zero
+       csrw    CSR_SCAUSE, zero
+       csrw    CSR_SSCRATCH, zero
+
+       jalr    zero, a2, 0
+SYM_CODE_END(riscv_kexec_norelocate)
+
+.section ".rodata"
+SYM_DATA(riscv_kexec_relocate_size,
+       .long riscv_kexec_relocate_end - riscv_kexec_relocate)
+
diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c
new file mode 100644 (file)
index 0000000..cc04814
--- /dev/null
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <linux/kexec.h>
+#include <linux/smp.h>         /* For smp_send_stop () */
+#include <linux/libfdt.h>      /* For fdt_check_header() */
+#include <linux/compiler.h>    /* For unreachable() */
+#include <linux/cpu.h>         /* For cpu_down() */
+#include <asm/kexec.h>         /* For riscv_kexec_* symbol defines */
+#include <asm/cacheflush.h>    /* For local_flush_icache_all() */
+#include <asm/barrier.h>       /* For smp_wmb() */
+#include <asm/page.h>          /* For PAGE_MASK */
+#include <asm/set_memory.h>    /* For set_memory_x() */
+#include <asm/smp.h>           /* For cpuid_to_hartid_map() */
+
+/**
+ * kexec_image_info - Print received image details
+ * @image: the kimage to describe
+ *
+ * Emits, at debug level, the image type, start, head and segment count,
+ * followed by one entry per segment giving its memory range, its size
+ * in bytes and its size in pages.
+ */
+static void kexec_image_info(const struct kimage *image)
+{
+       unsigned long idx;
+
+       pr_debug("Kexec image info:\n");
+       pr_debug("\ttype:        %d\n", image->type);
+       pr_debug("\tstart:       %lx\n", image->start);
+       pr_debug("\thead:        %lx\n", image->head);
+       pr_debug("\tnr_segments: %lu\n", image->nr_segments);
+
+       for (idx = 0; idx < image->nr_segments; idx++) {
+               const struct kexec_segment *seg = &image->segment[idx];
+
+               pr_debug("\t    segment[%lu]: %016lx - %016lx", idx,
+                       seg->mem, seg->mem + seg->memsz);
+               pr_debug("\t\t0x%lx bytes, %lu pages\n",
+                       (unsigned long) seg->memsz,
+                       (unsigned long) seg->memsz /  PAGE_SIZE);
+       }
+}
+
+/**
+ * machine_kexec_prepare - Initialize kexec
+ * @image: the image handed over by the generic kexec code
+ *
+ * This function is called from do_kexec_load, when the user has
+ * provided us with an image to be loaded. Its goal is to validate
+ * the image and prepare the control code buffer as needed.
+ * Note that kimage_alloc_init has already been called and the
+ * control buffer has already been allocated.
+ *
+ * Return: 0 on success, -EINVAL when no FDT address was recorded or
+ * when the relocation code does not fit in the control page, or the
+ * error reported by set_memory_x().
+ */
+int
+machine_kexec_prepare(struct kimage *image)
+{
+       struct kimage_arch *internal = &image->arch;
+       struct fdt_header fdt = {0};
+       void *control_code_buffer = NULL;
+       unsigned int control_code_buffer_sz = 0;
+       unsigned long i;        /* same type as image->nr_segments */
+       int ret;
+
+       kexec_image_info(image);
+
+       /*
+        * Find the Flattened Device Tree and save its physical address.
+        * Segments that are too small to hold an FDT header, that cannot
+        * be read, or whose header fails validation are skipped.
+        */
+       for (i = 0; i < image->nr_segments; i++) {
+               if (image->segment[i].memsz <= sizeof(fdt))
+                       continue;
+
+               if (copy_from_user(&fdt, image->segment[i].buf, sizeof(fdt)))
+                       continue;
+
+               if (fdt_check_header(&fdt))
+                       continue;
+
+               internal->fdt_addr = (unsigned long) image->segment[i].mem;
+               break;
+       }
+
+       if (!internal->fdt_addr) {
+               pr_err("Device tree not included in the provided image\n");
+               return -EINVAL;
+       }
+
+       /*
+        * Copy the assembler code for relocation to the control page.
+        * Crash images are skipped here; machine_kexec() runs them through
+        * riscv_kexec_norelocate instead of the control page.
+        */
+       if (image->type != KEXEC_TYPE_CRASH) {
+               control_code_buffer = page_address(image->control_code_page);
+               control_code_buffer_sz = page_size(image->control_code_page);
+
+               if (unlikely(riscv_kexec_relocate_size > control_code_buffer_sz)) {
+                       pr_err("Relocation code doesn't fit within a control page\n");
+                       return -EINVAL;
+               }
+
+               memcpy(control_code_buffer, riscv_kexec_relocate,
+                       riscv_kexec_relocate_size);
+
+               /*
+                * Mark the control page executable. Fail the load now if
+                * that is not possible, rather than faulting later when
+                * machine_kexec() jumps into the page.
+                */
+               ret = set_memory_x((unsigned long) control_code_buffer, 1);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+
+/**
+ * machine_kexec_cleanup - Undo the arch-specific part of
+ *                        machine_kexec_prepare
+ * @image: the image being released (unused)
+ *
+ * Called by kimage_free. machine_kexec_prepare performs no allocations
+ * of its own, so there is nothing to release here; the control buffer
+ * itself is freed by kimage_free.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+       /* Intentionally empty */
+}
+
+
+/*
+ * machine_shutdown - Prepare for a kexec reboot
+ *
+ * Called by kernel_kexec right before machine_kexec below, to get the
+ * rest of the system (the other harts and possibly devices etc) ready
+ * for a kexec reboot.
+ */
+void machine_shutdown(void)
+{
+       /* This hart takes no more interrupts until we are back up. */
+       local_irq_disable();
+
+#ifdef CONFIG_HOTPLUG_CPU
+       /* Shut down every CPU other than the one we are running on */
+       smp_shutdown_nonboot_cpus(smp_processor_id());
+#endif /* CONFIG_HOTPLUG_CPU */
+}
+
+/**
+ * machine_crash_shutdown - Prepare to kexec after a kernel crash
+ * @regs: register state to record for the crashing CPU
+ *
+ * This function is called by crash_kexec just before machine_kexec
+ * below and its goal is similar to machine_shutdown, but in case of
+ * a kernel crash: it saves the register state of the current CPU with
+ * crash_save_cpu() and then goes through machine_shutdown().
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       int this_cpu = smp_processor_id();
+
+       crash_save_cpu(regs, this_cpu);
+       machine_shutdown();
+       pr_info("Starting crashdump kernel...\n");
+}
+
+/**
+ * machine_kexec - Jump to the loaded kimage
+ * @image: the pre-loaded image to execute
+ *
+ * This function is called by kernel_kexec which is called by the
+ * reboot system call when the reboot cmd is LINUX_REBOOT_CMD_KEXEC,
+ * or by crash_kernel which is called by the kernel's arch-specific
+ * trap handler in case of a kernel panic. It's the final stage of
+ * the kexec process where the pre-loaded kimage is ready to be
+ * executed. We assume at this point that all other harts are
+ * suspended and this hart will be the new boot hart.
+ *
+ * The next kernel is handed the hart id of this hart, which is looked
+ * up from the logical CPU id; the two are not interchangeable.
+ *
+ * This function does not return.
+ */
+void __noreturn
+machine_kexec(struct kimage *image)
+{
+       struct kimage_arch *internal = &image->arch;
+       unsigned long jump_addr = (unsigned long) image->start;
+       unsigned long first_ind_entry = (unsigned long) &image->head;
+       /* Translate the logical CPU id into the hart id the next kernel expects */
+       unsigned long this_cpu_id = smp_processor_id();
+       unsigned long this_hart_id = cpuid_to_hartid_map(this_cpu_id);
+       unsigned long fdt_addr = internal->fdt_addr;
+       void *control_code_buffer = page_address(image->control_code_page);
+       riscv_kexec_method kexec_method = NULL;
+
+       /*
+        * Regular images run the relocation code that
+        * machine_kexec_prepare() copied into the control page; crash
+        * images jump through the in-kernel no-relocation trampoline.
+        */
+       if (image->type != KEXEC_TYPE_CRASH)
+               kexec_method = control_code_buffer;
+       else
+               kexec_method = (riscv_kexec_method) &riscv_kexec_norelocate;
+
+       pr_notice("Will call new kernel at %08lx from hart id %lx\n",
+                 jump_addr, this_hart_id);
+       pr_notice("FDT image at %08lx\n", fdt_addr);
+
+       /* Make sure the relocation code is visible to the hart */
+       local_flush_icache_all();
+
+       /* Jump to the relocation code */
+       pr_notice("Bye...\n");
+       kexec_method(first_ind_entry, jump_addr, fdt_addr,
+                    this_hart_id, va_pa_offset);
+       unreachable();
+}
index 8a5593f..6d46268 100644 (file)
@@ -47,8 +47,8 @@
 
 ENTRY(ftrace_stub)
 #ifdef CONFIG_DYNAMIC_FTRACE
-       .global _mcount
-       .set    _mcount, ftrace_stub
+       .global MCOUNT_NAME
+       .set    MCOUNT_NAME, ftrace_stub
 #endif
        ret
 ENDPROC(ftrace_stub)
@@ -78,7 +78,7 @@ ENDPROC(return_to_handler)
 #endif
 
 #ifndef CONFIG_DYNAMIC_FTRACE
-ENTRY(_mcount)
+ENTRY(MCOUNT_NAME)
        la      t4, ftrace_stub
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        la      t0, ftrace_graph_return
@@ -124,6 +124,6 @@ do_trace:
        jalr    t5
        RESTORE_ABI_STATE
        ret
-ENDPROC(_mcount)
+ENDPROC(MCOUNT_NAME)
 #endif
-EXPORT_SYMBOL(_mcount)
+EXPORT_SYMBOL(MCOUNT_NAME)
index 104fba8..68a9e3d 100644 (file)
@@ -408,13 +408,11 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 }
 
 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-#define VMALLOC_MODULE_START \
-        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
 void *module_alloc(unsigned long size)
 {
-       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
-                                   VMALLOC_END, GFP_KERNEL,
-                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+       return __vmalloc_node_range(size, 1, MODULES_VADDR,
+                                   MODULES_END, GFP_KERNEL,
+                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
                                    __builtin_return_address(0));
 }
 #endif
index 7e2c78e..10b965c 100644 (file)
@@ -84,6 +84,14 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
        return 0;
 }
 
+/*
+ * alloc_insn_page - allocate one page for kprobes instruction slots
+ *
+ * Requests PAGE_SIZE bytes between VMALLOC_START and VMALLOC_END, mapped
+ * with PAGE_KERNEL_READ_EXEC and flagged VM_FLUSH_RESET_PERMS.
+ *
+ * Returns whatever __vmalloc_node_range() returns.
+ */
+void *alloc_insn_page(void)
+{
+       void *page;
+
+       page = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+                                   GFP_KERNEL, PAGE_KERNEL_READ_EXEC,
+                                   VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+                                   __builtin_return_address(0));
+       return page;
+}
+
 /* install breakpoint in text */
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
@@ -260,8 +268,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
 
                if (kcb->kprobe_status == KPROBE_REENTER)
                        restore_previous_kprobe(kcb);
-               else
+               else {
+                       kprobes_restore_local_irqflag(kcb, regs);
                        reset_current_kprobe();
+               }
 
                break;
        case KPROBE_HIT_ACTIVE:
index d3bf756..7402a41 100644 (file)
 #include <asm/smp.h>
 
 /* default SBI version is 0.1 */
-unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT;
+unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT;
 EXPORT_SYMBOL(sbi_spec_version);
 
-static void (*__sbi_set_timer)(uint64_t stime);
-static int (*__sbi_send_ipi)(const unsigned long *hart_mask);
+static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init;
+static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init;
 static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask,
                           unsigned long start, unsigned long size,
-                          unsigned long arg4, unsigned long arg5);
+                          unsigned long arg4, unsigned long arg5) __ro_after_init;
 
 struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
                        unsigned long arg1, unsigned long arg2,
@@ -547,6 +547,21 @@ static inline long sbi_get_firmware_version(void)
        return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION);
 }
 
+/*
+ * sbi_get_mvendorid - issue the SBI_EXT_BASE_GET_MVENDORID base call
+ *
+ * Returns the result of __sbi_base_ecall() for that function ID.
+ */
+long sbi_get_mvendorid(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_MVENDORID);
+}
+
+/*
+ * sbi_get_marchid - issue the SBI_EXT_BASE_GET_MARCHID base call
+ *
+ * Returns the result of __sbi_base_ecall() for that function ID.
+ */
+long sbi_get_marchid(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_MARCHID);
+}
+
+/*
+ * sbi_get_mimpid - issue the SBI_EXT_BASE_GET_MIMPID base call
+ *
+ * Returns the result of __sbi_base_ecall() for that function ID.
+ */
+long sbi_get_mimpid(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_MIMPID);
+}
+
 static void sbi_send_cpumask_ipi(const struct cpumask *target)
 {
        struct cpumask hartid_mask;
@@ -556,7 +571,7 @@ static void sbi_send_cpumask_ipi(const struct cpumask *target)
        sbi_send_ipi(cpumask_bits(&hartid_mask));
 }
 
-static struct riscv_ipi_ops sbi_ipi_ops = {
+static const struct riscv_ipi_ops sbi_ipi_ops = {
        .ipi_inject = sbi_send_cpumask_ipi
 };
 
@@ -577,19 +592,19 @@ void __init sbi_init(void)
                        sbi_get_firmware_id(), sbi_get_firmware_version());
                if (sbi_probe_extension(SBI_EXT_TIME) > 0) {
                        __sbi_set_timer = __sbi_set_timer_v02;
-                       pr_info("SBI v0.2 TIME extension detected\n");
+                       pr_info("SBI TIME extension detected\n");
                } else {
                        __sbi_set_timer = __sbi_set_timer_v01;
                }
                if (sbi_probe_extension(SBI_EXT_IPI) > 0) {
                        __sbi_send_ipi  = __sbi_send_ipi_v02;
-                       pr_info("SBI v0.2 IPI extension detected\n");
+                       pr_info("SBI IPI extension detected\n");
                } else {
                        __sbi_send_ipi  = __sbi_send_ipi_v01;
                }
                if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) {
                        __sbi_rfence    = __sbi_rfence_v02;
-                       pr_info("SBI v0.2 RFENCE extension detected\n");
+                       pr_info("SBI RFENCE extension detected\n");
                } else {
                        __sbi_rfence    = __sbi_rfence_v01;
                }
index f8f1533..7b31779 100644 (file)
 #include <linux/swiotlb.h>
 #include <linux/smp.h>
 #include <linux/efi.h>
+#include <linux/crash_dump.h>
 
 #include <asm/cpu_ops.h>
 #include <asm/early_ioremap.h>
+#include <asm/pgtable.h>
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/sections.h>
@@ -50,7 +52,11 @@ struct screen_info screen_info __section(".data") = {
  * This is used before the kernel initializes the BSS so it can't be in the
  * BSS.
  */
-atomic_t hart_lottery __section(".sdata");
+atomic_t hart_lottery __section(".sdata")
+#ifdef CONFIG_XIP_KERNEL
+= ATOMIC_INIT(0xC001BEEF)
+#endif
+;
 unsigned long boot_cpu_hartid;
 static DEFINE_PER_CPU(struct cpu, cpu_devices);
 
@@ -60,10 +66,14 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * also add "System RAM" regions for compatibility with other
  * archs, and the rest of the known regions for completeness.
  */
+static struct resource kimage_res = { .name = "Kernel image", };
 static struct resource code_res = { .name = "Kernel code", };
 static struct resource data_res = { .name = "Kernel data", };
 static struct resource rodata_res = { .name = "Kernel rodata", };
 static struct resource bss_res = { .name = "Kernel bss", };
+#ifdef CONFIG_CRASH_DUMP
+static struct resource elfcorehdr_res = { .name = "ELF Core hdr", };
+#endif
 
 static int __init add_resource(struct resource *parent,
                                struct resource *res)
@@ -80,45 +90,54 @@ static int __init add_resource(struct resource *parent,
        return 1;
 }
 
-static int __init add_kernel_resources(struct resource *res)
+static int __init add_kernel_resources(void)
 {
        int ret = 0;
 
        /*
         * The memory region of the kernel image is continuous and
-        * was reserved on setup_bootmem, find it here and register
-        * it as a resource, then register the various segments of
-        * the image as child nodes
+        * was reserved on setup_bootmem, register it here as a
+        * resource, with the various segments of the image as
+        * child nodes.
         */
-       if (!(res->start <= code_res.start && res->end >= data_res.end))
-               return 0;
 
-       res->name = "Kernel image";
-       res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       code_res.start = __pa_symbol(_text);
+       code_res.end = __pa_symbol(_etext) - 1;
+       code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       /*
-        * We removed a part of this region on setup_bootmem so
-        * we need to expand the resource for the bss to fit in.
-        */
-       res->end = bss_res.end;
+       rodata_res.start = __pa_symbol(__start_rodata);
+       rodata_res.end = __pa_symbol(__end_rodata) - 1;
+       rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       data_res.start = __pa_symbol(_data);
+       data_res.end = __pa_symbol(_edata) - 1;
+       data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       bss_res.start = __pa_symbol(__bss_start);
+       bss_res.end = __pa_symbol(__bss_stop) - 1;
+       bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       kimage_res.start = code_res.start;
+       kimage_res.end = bss_res.end;
+       kimage_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       ret = add_resource(&iomem_resource, res);
+       ret = add_resource(&iomem_resource, &kimage_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &code_res);
+       ret = add_resource(&kimage_res, &code_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &rodata_res);
+       ret = add_resource(&kimage_res, &rodata_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &data_res);
+       ret = add_resource(&kimage_res, &data_res);
        if (ret < 0)
                return ret;
 
-       ret = add_resource(res, &bss_res);
+       ret = add_resource(&kimage_res, &bss_res);
 
        return ret;
 }
@@ -129,54 +148,59 @@ static void __init init_resources(void)
        struct resource *res = NULL;
        struct resource *mem_res = NULL;
        size_t mem_res_sz = 0;
-       int ret = 0, i = 0;
-
-       code_res.start = __pa_symbol(_text);
-       code_res.end = __pa_symbol(_etext) - 1;
-       code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-       rodata_res.start = __pa_symbol(__start_rodata);
-       rodata_res.end = __pa_symbol(__end_rodata) - 1;
-       rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-       data_res.start = __pa_symbol(_data);
-       data_res.end = __pa_symbol(_edata) - 1;
-       data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-       bss_res.start = __pa_symbol(__bss_start);
-       bss_res.end = __pa_symbol(__bss_stop) - 1;
-       bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       int num_resources = 0, res_idx = 0;
+       int ret = 0;
 
        /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */
-       mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt + 1) * sizeof(*mem_res);
+       num_resources = memblock.memory.cnt + memblock.reserved.cnt + 1;
+       res_idx = num_resources - 1;
+
+       mem_res_sz = num_resources * sizeof(*mem_res);
        mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES);
        if (!mem_res)
                panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz);
+
        /*
         * Start by adding the reserved regions, if they overlap
         * with /memory regions, insert_resource later on will take
         * care of it.
         */
+       ret = add_kernel_resources();
+       if (ret < 0)
+               goto error;
+
+#ifdef CONFIG_KEXEC_CORE
+       if (crashk_res.start != crashk_res.end) {
+               ret = add_resource(&iomem_resource, &crashk_res);
+               if (ret < 0)
+                       goto error;
+       }
+#endif
+
+#ifdef CONFIG_CRASH_DUMP
+       if (elfcorehdr_size > 0) {
+               elfcorehdr_res.start = elfcorehdr_addr;
+               elfcorehdr_res.end = elfcorehdr_addr + elfcorehdr_size - 1;
+               elfcorehdr_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+               add_resource(&iomem_resource, &elfcorehdr_res);
+       }
+#endif
+
        for_each_reserved_mem_region(region) {
-               res = &mem_res[i++];
+               res = &mem_res[res_idx--];
 
                res->name = "Reserved";
                res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region));
                res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1;
 
-               ret = add_kernel_resources(res);
-               if (ret < 0)
-                       goto error;
-               else if (ret)
-                       continue;
-
                /*
                 * Ignore any other reserved regions within
                 * system memory.
                 */
                if (memblock_is_memory(res->start)) {
-                       memblock_free((phys_addr_t) res, sizeof(struct resource));
+                       /* Re-use this pre-allocated resource */
+                       res_idx++;
                        continue;
                }
 
@@ -187,7 +211,7 @@ static void __init init_resources(void)
 
        /* Add /memory regions to the resource tree */
        for_each_mem_region(region) {
-               res = &mem_res[i++];
+               res = &mem_res[res_idx--];
 
                if (unlikely(memblock_is_nomap(region))) {
                        res->name = "Reserved";
@@ -205,6 +229,9 @@ static void __init init_resources(void)
                        goto error;
        }
 
+       /*
+        * Clean-up any unused pre-allocated resources. Slots are handed
+        * out from the top of mem_res downwards, so the unused ones are
+        * mem_res[0..res_idx]; there are none when res_idx went negative.
+        * memblock_free() takes a physical address, hence __pa().
+        */
+       if (res_idx >= 0)
+               memblock_free(__pa(mem_res), (res_idx + 1) * sizeof(*mem_res));
        return;
 
  error:
@@ -251,21 +278,26 @@ void __init setup_arch(char **cmdline_p)
        efi_init();
        setup_bootmem();
        paging_init();
-       init_resources();
 #if IS_ENABLED(CONFIG_BUILTIN_DTB)
        unflatten_and_copy_device_tree();
 #else
-       if (early_init_dt_verify(__va(dtb_early_pa)))
+       if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa))))
                unflatten_device_tree();
        else
                pr_err("No DTB found in kernel mappings\n");
 #endif
        misc_mem_init();
 
+       init_resources();
        sbi_init();
 
-       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+       if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
                protect_kernel_text_data();
+#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) && !defined(CONFIG_XIP_KERNEL)
+               protect_kernel_linear_mapping_text_rodata();
+#endif
+       }
+
 #ifdef CONFIG_SWIOTLB
        swiotlb_init(1);
 #endif
index ea028d9..921d9d7 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 #include <linux/cpu.h>
+#include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/profile.h>
@@ -27,10 +28,11 @@ enum ipi_message_type {
        IPI_CALL_FUNC,
        IPI_CPU_STOP,
        IPI_IRQ_WORK,
+       IPI_TIMER,
        IPI_MAX
 };
 
-unsigned long __cpuid_to_hartid_map[NR_CPUS] = {
+unsigned long __cpuid_to_hartid_map[NR_CPUS] __ro_after_init = {
        [0 ... NR_CPUS-1] = INVALID_HARTID
 };
 
@@ -54,7 +56,7 @@ int riscv_hartid_to_cpuid(int hartid)
                        return i;
 
        pr_err("Couldn't find cpu id for hartid [%d]\n", hartid);
-       return i;
+       return -ENOENT;
 }
 
 void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out)
@@ -85,9 +87,9 @@ static void ipi_stop(void)
                wait_for_interrupt();
 }
 
-static struct riscv_ipi_ops *ipi_ops;
+static const struct riscv_ipi_ops *ipi_ops __ro_after_init;
 
-void riscv_set_ipi_ops(struct riscv_ipi_ops *ops)
+void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops)
 {
        ipi_ops = ops;
 }
@@ -176,6 +178,12 @@ void handle_IPI(struct pt_regs *regs)
                        irq_work_run();
                }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+               if (ops & (1 << IPI_TIMER)) {
+                       stats[IPI_TIMER]++;
+                       tick_receive_broadcast();
+               }
+#endif
                BUG_ON((ops >> IPI_MAX) != 0);
 
                /* Order data access and bit testing. */
@@ -192,6 +200,7 @@ static const char * const ipi_names[] = {
        [IPI_CALL_FUNC]         = "Function call interrupts",
        [IPI_CPU_STOP]          = "CPU stop interrupts",
        [IPI_IRQ_WORK]          = "IRQ work interrupts",
+       [IPI_TIMER]             = "Timer broadcast interrupts",
 };
 
 void show_ipi_stats(struct seq_file *p, int prec)
@@ -217,6 +226,13 @@ void arch_send_call_function_single_ipi(int cpu)
        send_ipi_single(cpu, IPI_CALL_FUNC);
 }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+/*
+ * tick_broadcast - send an IPI_TIMER message to the CPUs in @mask
+ * @mask: CPUs to notify
+ *
+ * handle_IPI() in this file answers IPI_TIMER by calling
+ * tick_receive_broadcast().
+ */
+void tick_broadcast(const struct cpumask *mask)
+{
+       send_ipi_mask(mask, IPI_TIMER);
+}
+#endif
+
 void smp_send_stop(void)
 {
        unsigned long timeout;
index 5e276c2..9a408e2 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/sections.h>
 #include <asm/sbi.h>
 #include <asm/smp.h>
+#include <asm/alternative.h>
 
 #include "head.h"
 
@@ -40,6 +41,9 @@ static DECLARE_COMPLETION(cpu_running);
 void __init smp_prepare_boot_cpu(void)
 {
        init_cpu_topology();
+#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE
+       apply_boot_alternatives();
+#endif
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
index f1ead9d..a63c667 100644 (file)
@@ -13,7 +13,7 @@
 #undef __SYSCALL
 #define __SYSCALL(nr, call)    [nr] = (call),
 
-void *sys_call_table[__NR_syscalls] = {
+void * const sys_call_table[__NR_syscalls] = {
        [0 ... __NR_syscalls - 1] = sys_ni_syscall,
 #include <asm/unistd.h>
 };
index 1b43226..8217b0f 100644 (file)
@@ -11,7 +11,7 @@
 #include <asm/processor.h>
 #include <asm/timex.h>
 
-unsigned long riscv_timebase;
+unsigned long riscv_timebase __ro_after_init;
 EXPORT_SYMBOL_GPL(riscv_timebase);
 
 void __init time_init(void)
index 1357abf..07fdded 100644 (file)
@@ -197,6 +197,6 @@ int is_valid_bugaddr(unsigned long pc)
 #endif /* CONFIG_GENERIC_BUG */
 
 /* stvec & scratch is already set from head.S */
-void trap_init(void)
+void __init trap_init(void)
 {
 }
index 3f1d35e..25a3b88 100644 (file)
@@ -20,8 +20,8 @@
 
 extern char vdso_start[], vdso_end[];
 
-static unsigned int vdso_pages;
-static struct page **vdso_pagelist;
+static unsigned int vdso_pages __ro_after_init;
+static struct page **vdso_pagelist __ro_after_init;
 
 /*
  * The vDSO data page.
index 71a315e..24d936c 100644 (file)
@@ -23,7 +23,7 @@ ifneq ($(c-gettimeofday-y),)
 endif
 
 # Build rules
-targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-dummy.o
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-syms.S
 obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
 
 obj-y += vdso.o vdso-syms.o
@@ -41,11 +41,10 @@ KASAN_SANITIZE := n
 $(obj)/vdso.o: $(obj)/vdso.so
 
 # link rule for the .so file, .lds has to be first
-SYSCFLAGS_vdso.so.dbg = $(c_flags)
-$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE
+$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE
        $(call if_changed,vdsold)
-SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \
-       -Wl,--build-id=sha1 -Wl,--hash-style=both
+LDFLAGS_vdso.so.dbg = -shared -s -soname=linux-vdso.so.1 \
+       --build-id=sha1 --hash-style=both --eh-frame-hdr
 
 # We also create a special relocatable object that should mirror the symbol
 # table and layout of the linked DSO. With ld --just-symbols we can then
@@ -60,13 +59,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
 
 # actual build commands
 # The DSO images are built using a special linker script
-# Add -lgcc so rv32 gets static muldi3 and lshrdi3 definitions.
 # Make sure only to export the intended __vdso_xxx symbol offsets.
 quiet_cmd_vdsold = VDSOLD  $@
-      cmd_vdsold = $(CC) $(KBUILD_CFLAGS) $(call cc-option, -no-pie) -nostdlib -nostartfiles $(SYSCFLAGS_$(@F)) \
-                           -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp && \
-                   $(CROSS_COMPILE)objcopy \
-                           $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \
+      cmd_vdsold = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \
+                   $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \
                    rm $@.tmp
 
 # Extracts symbol offsets from the VDSO, converting them into an assembly file
diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S
new file mode 100644 (file)
index 0000000..4b29b99
--- /dev/null
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2012 Regents of the University of California
+ * Copyright (C) 2017 SiFive
+ * Copyright (C) 2020 Vitaly Wool, Konsulko AB
+ */
+
+#include <asm/pgtable.h>
+#define LOAD_OFFSET KERNEL_LINK_ADDR
+/* No __ro_after_init data in the .rodata section - which will always be ro */
+#define RO_AFTER_INIT_DATA
+
+#include <asm/vmlinux.lds.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+
+OUTPUT_ARCH(riscv)
+ENTRY(_start)
+
+jiffies = jiffies_64;
+
+SECTIONS
+{
+       /* Beginning of code and text segment */
+       . = LOAD_OFFSET;
+       _xiprom = .;
+       _start = .;
+       HEAD_TEXT_SECTION
+       INIT_TEXT_SECTION(PAGE_SIZE)
+       /* we have to discard exit text and such at runtime, not link time */
+       .exit.text :
+       {
+               EXIT_TEXT
+       }
+
+       .text : {
+               _text = .;
+               _stext = .;
+               TEXT_TEXT
+               SCHED_TEXT
+               CPUIDLE_TEXT
+               LOCK_TEXT
+               KPROBES_TEXT
+               ENTRY_TEXT
+               IRQENTRY_TEXT
+               SOFTIRQENTRY_TEXT
+               *(.fixup)
+               _etext = .;
+       }
+       RO_DATA(L1_CACHE_BYTES)
+       .srodata : {
+               *(.srodata*)
+       }
+       .init.rodata : {
+               INIT_SETUP(16)
+               INIT_CALLS
+               CON_INITCALL
+               INIT_RAM_FS
+       }
+       _exiprom = .;                   /* End of XIP ROM area */
+
+
+/*
+ * From this point, stuff is considered writable and will be copied to RAM
+ */
+       __data_loc = ALIGN(16);         /* location in file */
+       . = LOAD_OFFSET + XIP_OFFSET;   /* location in memory */
+
+       _sdata = .;                     /* Start of data section */
+       _data = .;
+       RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+       _edata = .;
+       __start_ro_after_init = .;
+       .data.ro_after_init : AT(ADDR(.data.ro_after_init) - LOAD_OFFSET) {
+               *(.data..ro_after_init)
+       }
+       __end_ro_after_init = .;
+
+       . = ALIGN(PAGE_SIZE);
+       __init_begin = .;
+       .init.data : {
+               INIT_DATA
+       }
+       .exit.data : {
+               EXIT_DATA
+       }
+       . = ALIGN(8);
+       __soc_early_init_table : {
+               __soc_early_init_table_start = .;
+               KEEP(*(__soc_early_init_table))
+               __soc_early_init_table_end = .;
+       }
+       __soc_builtin_dtb_table : {
+               __soc_builtin_dtb_table_start = .;
+               KEEP(*(__soc_builtin_dtb_table))
+               __soc_builtin_dtb_table_end = .;
+       }
+       PERCPU_SECTION(L1_CACHE_BYTES)
+
+       . = ALIGN(PAGE_SIZE);
+       __init_end = .;
+
+       .sdata : {
+               __global_pointer$ = . + 0x800;
+               *(.sdata*)
+               *(.sbss*)
+       }
+
+       BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
+       EXCEPTION_TABLE(0x10)
+
+       .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) {
+               *(.rel.dyn*)
+       }
+
+       /*
+        * End of copied data. We need a dummy section to get its LMA.
+        * Also located before final ALIGN() as trailing padding is not stored
+        * in the resulting binary file and useless to copy.
+        */
+       .data.endmark : AT(ADDR(.data.endmark) - LOAD_OFFSET) { }
+       _edata_loc = LOADADDR(.data.endmark);
+
+       . = ALIGN(PAGE_SIZE);
+       _end = .;
+
+       STABS_DEBUG
+       DWARF_DEBUG
+
+       DISCARDS
+}
index de03cb2..891742f 100644 (file)
@@ -4,7 +4,13 @@
  * Copyright (C) 2017 SiFive
  */
 
-#define LOAD_OFFSET PAGE_OFFSET
+#ifdef CONFIG_XIP_KERNEL
+#include "vmlinux-xip.lds.S"
+#else
+
+#include <asm/pgtable.h>
+#define LOAD_OFFSET KERNEL_LINK_ADDR
+
 #include <asm/vmlinux.lds.h>
 #include <asm/page.h>
 #include <asm/cache.h>
@@ -90,6 +96,13 @@ SECTIONS
        }
 
        __init_data_end = .;
+
+       . = ALIGN(8);
+       .alternative : {
+               __alt_start = .;
+               *(.alternative)
+               __alt_end = .;
+       }
        __init_end = .;
 
        /* Start of data section */
@@ -132,3 +145,4 @@ SECTIONS
 
        DISCARDS
 }
+#endif /* CONFIG_XIP_KERNEL */
index c5dbd55..096463c 100644 (file)
@@ -231,6 +231,19 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
                return;
        }
 
+#ifdef CONFIG_64BIT
+       /*
+        * Modules in 64bit kernels lie in their own virtual region which is not
+        * in the vmalloc region, but dealing with page faults in this region
+        * or the vmalloc region amounts to doing the same thing: checking that
+        * the mapping exists in init_mm.pgd and updating user page table, so
+        * just use vmalloc_fault.
+        */
+       if (unlikely(addr >= MODULES_VADDR && addr < MODULES_END)) {
+               vmalloc_fault(regs, code, addr);
+               return;
+       }
+#endif
        /* Enable interrupts if they were enabled in the parent context. */
        if (likely(regs->status & SR_PIE))
                local_irq_enable();
index 92e39cf..dfb5e4f 100644 (file)
@@ -2,6 +2,8 @@
 /*
  * Copyright (C) 2012 Regents of the University of California
  * Copyright (C) 2019 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2020 FORTH-ICS/CARV
+ *  Nick Kossifidis <mick@ics.forth.gr>
  */
 
 #include <linux/init.h>
 #include <linux/swap.h>
 #include <linux/sizes.h>
 #include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/libfdt.h>
 #include <linux/set_memory.h>
 #include <linux/dma-map-ops.h>
+#include <linux/crash_dump.h>
 
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
 
 #include "../kernel/head.h"
 
+unsigned long kernel_virt_addr = KERNEL_LINK_ADDR;
+EXPORT_SYMBOL(kernel_virt_addr);
+#ifdef CONFIG_XIP_KERNEL
+#define kernel_virt_addr       (*((unsigned long *)XIP_FIXUP(&kernel_virt_addr)))
+#endif
+
 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
                                                        __page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
 
 extern char _start[];
 #define DTB_EARLY_BASE_VA      PGDIR_SIZE
-void *dtb_early_va __initdata;
-uintptr_t dtb_early_pa __initdata;
+void *_dtb_early_va __initdata;
+uintptr_t _dtb_early_pa __initdata;
 
 struct pt_alloc_ops {
        pte_t *(*get_pte_virt)(phys_addr_t pa);
@@ -57,7 +67,7 @@ static void __init zone_sizes_init(void)
        free_area_init(max_zone_pfns);
 }
 
-static void setup_zero_page(void)
+static void __init setup_zero_page(void)
 {
        memset((void *)empty_zero_page, 0, PAGE_SIZE);
 }
@@ -75,7 +85,7 @@ static inline void print_mlm(char *name, unsigned long b, unsigned long t)
                  (((t) - (b)) >> 20));
 }
 
-static void print_vm_layout(void)
+static void __init print_vm_layout(void)
 {
        pr_notice("Virtual kernel memory layout:\n");
        print_mlk("fixmap", (unsigned long)FIXADDR_START,
@@ -88,6 +98,10 @@ static void print_vm_layout(void)
                  (unsigned long)VMALLOC_END);
        print_mlm("lowmem", (unsigned long)PAGE_OFFSET,
                  (unsigned long)high_memory);
+#ifdef CONFIG_64BIT
+       print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR,
+                 (unsigned long)ADDRESS_SPACE_END);
+#endif
 }
 #else
 static void print_vm_layout(void) { }
@@ -112,11 +126,20 @@ void __init setup_bootmem(void)
        phys_addr_t dram_end = memblock_end_of_DRAM();
        phys_addr_t max_mapped_addr = __pa(~(ulong)0);
 
+#ifdef CONFIG_XIP_KERNEL
+       vmlinux_start = __pa_symbol(&_sdata);
+#endif
+
        /* The maximal physical memory size is -PAGE_OFFSET. */
        memblock_enforce_memory_limit(-PAGE_OFFSET);
 
-       /* Reserve from the start of the kernel to the end of the kernel */
-       memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
+       /*
+        * Reserve from the start of the kernel to the end of the kernel
+        * and make sure we align the reservation on PMD_SIZE since we will
+        * map the kernel in the linear mapping as read-only: we do not want
+        * any allocation to happen between _end and the next pmd aligned page.
+        */
+       memblock_reserve(vmlinux_start, (vmlinux_end - vmlinux_start + PMD_SIZE - 1) & PMD_MASK);
 
        /*
         * memblock allocator is not aware of the fact that last 4K bytes of
@@ -127,8 +150,9 @@ void __init setup_bootmem(void)
        if (max_mapped_addr == (dram_end - 1))
                memblock_set_current_limit(max_mapped_addr - 4096);
 
-       max_pfn = PFN_DOWN(dram_end);
-       max_low_pfn = max_pfn;
+       min_low_pfn = PFN_UP(memblock_start_of_DRAM());
+       max_low_pfn = max_pfn = PFN_DOWN(dram_end);
+
        dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn));
        set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
 
@@ -147,12 +171,42 @@ void __init setup_bootmem(void)
        memblock_allow_resize();
 }
 
+#ifdef CONFIG_XIP_KERNEL
+
+extern char _xiprom[], _exiprom[];
+extern char _sdata[], _edata[];
+
+#endif /* CONFIG_XIP_KERNEL */
+
 #ifdef CONFIG_MMU
-static struct pt_alloc_ops pt_ops;
+static struct pt_alloc_ops _pt_ops __ro_after_init;
+
+#ifdef CONFIG_XIP_KERNEL
+#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
+#else
+#define pt_ops _pt_ops
+#endif
 
-unsigned long va_pa_offset;
+/* Offset between linear mapping virtual address and kernel load address */
+unsigned long va_pa_offset __ro_after_init;
 EXPORT_SYMBOL(va_pa_offset);
-unsigned long pfn_base;
+#ifdef CONFIG_XIP_KERNEL
+#define va_pa_offset   (*((unsigned long *)XIP_FIXUP(&va_pa_offset)))
+#endif
+/* Offset between kernel mapping virtual address and kernel load address */
+#ifdef CONFIG_64BIT
+unsigned long va_kernel_pa_offset;
+EXPORT_SYMBOL(va_kernel_pa_offset);
+#endif
+#ifdef CONFIG_XIP_KERNEL
+#define va_kernel_pa_offset    (*((unsigned long *)XIP_FIXUP(&va_kernel_pa_offset)))
+#endif
+unsigned long va_kernel_xip_pa_offset;
+EXPORT_SYMBOL(va_kernel_xip_pa_offset);
+#ifdef CONFIG_XIP_KERNEL
+#define va_kernel_xip_pa_offset        (*((unsigned long *)XIP_FIXUP(&va_kernel_xip_pa_offset)))
+#endif
+unsigned long pfn_base __ro_after_init;
 EXPORT_SYMBOL(pfn_base);
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
@@ -161,6 +215,12 @@ pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
 
 pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
 
+#ifdef CONFIG_XIP_KERNEL
+#define trampoline_pg_dir      ((pgd_t *)XIP_FIXUP(trampoline_pg_dir))
+#define fixmap_pte             ((pte_t *)XIP_FIXUP(fixmap_pte))
+#define early_pg_dir           ((pgd_t *)XIP_FIXUP(early_pg_dir))
+#endif /* CONFIG_XIP_KERNEL */
+
 void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
 {
        unsigned long addr = __fix_to_virt(idx);
@@ -212,8 +272,8 @@ static phys_addr_t alloc_pte_late(uintptr_t va)
        unsigned long vaddr;
 
        vaddr = __get_free_page(GFP_KERNEL);
-       if (!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr)))
-               BUG();
+       BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr)));
+
        return __pa(vaddr);
 }
 
@@ -236,6 +296,12 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
 pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 
+#ifdef CONFIG_XIP_KERNEL
+#define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd))
+#define fixmap_pmd     ((pmd_t *)XIP_FIXUP(fixmap_pmd))
+#define early_pmd      ((pmd_t *)XIP_FIXUP(early_pmd))
+#endif /* CONFIG_XIP_KERNEL */
+
 static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
 {
        /* Before MMU is enabled */
@@ -255,7 +321,7 @@ static pmd_t *get_pmd_virt_late(phys_addr_t pa)
 
 static phys_addr_t __init alloc_pmd_early(uintptr_t va)
 {
-       BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT);
+       BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
 
        return (uintptr_t)early_pmd;
 }
@@ -352,6 +418,19 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
        return PMD_SIZE;
 }
 
+#ifdef CONFIG_XIP_KERNEL
+/* Called from head.S with MMU off: copy [_sdata, _end) to CONFIG_PHYS_RAM_BASE. */
+asmlinkage void __init __copy_data(void)
+{
+       void *from = (void *)(&_sdata);
+       void *to = (void *)CONFIG_PHYS_RAM_BASE;
+       /* _end is an exclusive bound, so no "+ 1"; same range as load_sz. */
+       size_t sz = (size_t)((uintptr_t)(&_end) - (uintptr_t)(&_sdata));
+
+       memcpy(to, from, sz);
+}
+#endif
+
 /*
  * setup_vm() is called from head.S with MMU-off.
  *
@@ -370,17 +449,74 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
 #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
 #endif
 
+uintptr_t load_pa, load_sz;
+#ifdef CONFIG_XIP_KERNEL
+#define load_pa        (*((uintptr_t *)XIP_FIXUP(&load_pa)))
+#define load_sz        (*((uintptr_t *)XIP_FIXUP(&load_sz)))
+#endif
+
+#ifdef CONFIG_XIP_KERNEL
+uintptr_t xiprom, xiprom_sz;
+#define xiprom_sz      (*((uintptr_t *)XIP_FIXUP(&xiprom_sz)))
+#define xiprom         (*((uintptr_t *)XIP_FIXUP(&xiprom)))
+
+static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
+{
+       uintptr_t va, end_va;
+
+       /* Map the flash resident part */
+       end_va = kernel_virt_addr + xiprom_sz;
+       for (va = kernel_virt_addr; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  xiprom + (va - kernel_virt_addr),
+                                  map_size, PAGE_KERNEL_EXEC);
+
+       /* Map the data in RAM */
+       end_va = kernel_virt_addr + XIP_OFFSET + load_sz;
+       for (va = kernel_virt_addr + XIP_OFFSET; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  load_pa + (va - (kernel_virt_addr + XIP_OFFSET)),
+                                  map_size, PAGE_KERNEL);
+}
+#else
+static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
+{
+       uintptr_t va, end_va;
+
+       end_va = kernel_virt_addr + load_sz;
+       for (va = kernel_virt_addr; va < end_va; va += map_size)
+               create_pgd_mapping(pgdir, va,
+                                  load_pa + (va - kernel_virt_addr),
+                                  map_size, PAGE_KERNEL_EXEC);
+}
+#endif
+
 asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 {
-       uintptr_t va, pa, end_va;
-       uintptr_t load_pa = (uintptr_t)(&_start);
-       uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
+       uintptr_t __maybe_unused pa;
        uintptr_t map_size;
 #ifndef __PAGETABLE_PMD_FOLDED
        pmd_t fix_bmap_spmd, fix_bmap_epmd;
 #endif
 
+#ifdef CONFIG_XIP_KERNEL
+       xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
+       xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom);
+
+       load_pa = (uintptr_t)CONFIG_PHYS_RAM_BASE;
+       load_sz = (uintptr_t)(&_end) - (uintptr_t)(&_sdata);
+
+       va_kernel_xip_pa_offset = kernel_virt_addr - xiprom;
+#else
+       load_pa = (uintptr_t)(&_start);
+       load_sz = (uintptr_t)(&_end) - load_pa;
+#endif
+
        va_pa_offset = PAGE_OFFSET - load_pa;
+#ifdef CONFIG_64BIT
+       va_kernel_pa_offset = kernel_virt_addr - load_pa;
+#endif
+
        pfn_base = PFN_DOWN(load_pa);
 
        /*
@@ -408,26 +544,27 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
        create_pmd_mapping(fixmap_pmd, FIXADDR_START,
                           (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
        /* Setup trampoline PGD and PMD */
-       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
                           (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
-       create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
+#ifdef CONFIG_XIP_KERNEL
+       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
+                          xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
+#else
+       create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
                           load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
+#endif
 #else
        /* Setup trampoline PGD */
-       create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+       create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
                           load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
 #endif
 
        /*
-        * Setup early PGD covering entire kernel which will allows
+        * Setup early PGD covering entire kernel which will allow
         * us to reach paging_init(). We map all memory banks later
         * in setup_vm_final() below.
         */
-       end_va = PAGE_OFFSET + load_sz;
-       for (va = PAGE_OFFSET; va < end_va; va += map_size)
-               create_pgd_mapping(early_pg_dir, va,
-                                  load_pa + (va - PAGE_OFFSET),
-                                  map_size, PAGE_KERNEL_EXEC);
+       create_kernel_page_table(early_pg_dir, map_size);
 
 #ifndef __PAGETABLE_PMD_FOLDED
        /* Setup early PMD for DTB */
@@ -442,7 +579,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
                           pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL);
        dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
+#ifdef CONFIG_64BIT
+       /*
+        * __va can't be used since it would return a linear mapping address
+        * whereas dtb_early_va will be used before setup_vm_final installs
+        * the linear mapping.
+        */
+       dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa));
+#else
        dtb_early_va = __va(dtb_pa);
+#endif /* CONFIG_64BIT */
 #endif /* CONFIG_BUILTIN_DTB */
 #else
 #ifndef CONFIG_BUILTIN_DTB
@@ -454,7 +600,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
                           pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL);
        dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
+#ifdef CONFIG_64BIT
+       dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa));
+#else
        dtb_early_va = __va(dtb_pa);
+#endif /* CONFIG_64BIT */
 #endif /* CONFIG_BUILTIN_DTB */
 #endif
        dtb_early_pa = dtb_pa;
@@ -490,6 +640,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 #endif
 }
 
+#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
+void __init protect_kernel_linear_mapping_text_rodata(void)
+{
+       unsigned long text_start = (unsigned long)lm_alias(_start);
+       unsigned long init_text_start = (unsigned long)lm_alias(__init_text_begin);
+       unsigned long rodata_start = (unsigned long)lm_alias(__start_rodata);
+       unsigned long data_start = (unsigned long)lm_alias(_data);
+
+       set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT);
+       set_memory_nx(text_start, (init_text_start - text_start) >> PAGE_SHIFT);
+       /* Likewise for the rodata alias: [__start_rodata, _data). */
+       set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+}
+#endif
+
 static void __init setup_vm_final(void)
 {
        uintptr_t va, map_size;
@@ -511,7 +677,7 @@ static void __init setup_vm_final(void)
                           __pa_symbol(fixmap_pgd_next),
                           PGDIR_SIZE, PAGE_TABLE);
 
-       /* Map all memory banks */
+       /* Map all memory banks in the linear mapping */
        for_each_mem_range(i, &start, &end) {
                if (start >= end)
                        break;
@@ -523,10 +689,22 @@ static void __init setup_vm_final(void)
                for (pa = start; pa < end; pa += map_size) {
                        va = (uintptr_t)__va(pa);
                        create_pgd_mapping(swapper_pg_dir, va, pa,
-                                          map_size, PAGE_KERNEL_EXEC);
+                                          map_size,
+#ifdef CONFIG_64BIT
+                                          PAGE_KERNEL
+#else
+                                          PAGE_KERNEL_EXEC
+#endif
+                                       );
+
                }
        }
 
+#ifdef CONFIG_64BIT
+       /* Map the kernel */
+       create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
+#endif
+
        /* Clear fixmap PTE and PMD mappings */
        clear_fixmap(FIX_PTE);
        clear_fixmap(FIX_PMD);
@@ -556,7 +734,7 @@ static inline void setup_vm_final(void)
 #endif /* CONFIG_MMU */
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-void protect_kernel_text_data(void)
+void __init protect_kernel_text_data(void)
 {
        unsigned long text_start = (unsigned long)_start;
        unsigned long init_text_start = (unsigned long)__init_text_begin;
@@ -584,6 +762,103 @@ void mark_rodata_ro(void)
 }
 #endif
 
+#ifdef CONFIG_KEXEC_CORE
+/*
+ * reserve_crashkernel() - reserves memory for crash kernel
+ *
+ * This function reserves memory area given in "crashkernel=" kernel command
+ * line parameter. The memory reserved is used by dump capture kernel when
+ * primary kernel is crashing.
+ */
+static void __init reserve_crashkernel(void)
+{
+       unsigned long long crash_base = 0;
+       unsigned long long crash_size = 0;
+       unsigned long search_start = memblock_start_of_DRAM();
+       unsigned long search_end = memblock_end_of_DRAM();
+
+       int ret = 0;
+
+       /*
+        * Don't reserve a region for a crash kernel on a crash kernel
+        * since it doesn't make much sense and we have limited memory
+        * resources.
+        */
+#ifdef CONFIG_CRASH_DUMP
+       if (is_kdump_kernel()) {
+               pr_info("crashkernel: ignoring reservation request\n");
+               return;
+       }
+#endif
+
+       ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+                               &crash_size, &crash_base);
+       if (ret || !crash_size)
+               return;
+
+       crash_size = PAGE_ALIGN(crash_size);
+
+       if (crash_base == 0) {
+               /*
+                * Current riscv boot protocol requires 2MB alignment for
+                * RV64 and 4MB alignment for RV32 (hugepage size)
+                */
+               crash_base = memblock_find_in_range(search_start, search_end,
+                                                   crash_size, PMD_SIZE);
+
+               if (crash_base == 0) {
+                       pr_warn("crashkernel: couldn't allocate %lldKB\n",
+                               crash_size >> 10);
+                       return;
+               }
+       } else {
+               /* User specifies base address explicitly. */
+               if (!memblock_is_region_memory(crash_base, crash_size)) {
+                       pr_warn("crashkernel: requested region is not memory\n");
+                       return;
+               }
+
+               if (memblock_is_region_reserved(crash_base, crash_size)) {
+                       pr_warn("crashkernel: requested region is reserved\n");
+                       return;
+               }
+
+
+               if (!IS_ALIGNED(crash_base, PMD_SIZE)) {
+                       pr_warn("crashkernel: requested region is misaligned\n");
+                       return;
+               }
+       }
+       memblock_reserve(crash_base, crash_size);
+
+       pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n",
+               crash_base, crash_base + crash_size, crash_size >> 20);
+
+       crashk_res.start = crash_base;
+       crashk_res.end = crash_base + crash_size - 1;
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * We keep track of the ELF core header of the crashed
+ * kernel with a reserved-memory region with compatible
+ * string "linux,elfcorehdr". Here we register a callback
+ * to populate elfcorehdr_addr/size when this region is
+ * present. Note that this region will be marked as
+ * reserved once we call early_init_fdt_scan_reserved_mem()
+ * later on.
+ */
+static int __init elfcore_hdr_setup(struct reserved_mem *rmem)
+{
+       elfcorehdr_addr = rmem->base;
+       elfcorehdr_size = rmem->size;
+       return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(elfcorehdr, "linux,elfcorehdr", elfcore_hdr_setup);
+#endif
+
 void __init paging_init(void)
 {
        setup_vm_final();
@@ -592,9 +867,13 @@ void __init paging_init(void)
 
 void __init misc_mem_init(void)
 {
+       early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT);
        arch_numa_init();
        sparse_init();
        zone_sizes_init();
+#ifdef CONFIG_KEXEC_CORE
+       reserve_crashkernel();
+#endif
        memblock_dump_all();
 }
 
index 937d13c..9daacae 100644 (file)
 #include <asm/fixmap.h>
 #include <asm/pgalloc.h>
 
-static __init void *early_alloc(size_t size, int node)
-{
-       void *ptr = memblock_alloc_try_nid(size, size,
-               __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node);
-
-       if (!ptr)
-               panic("%pS: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n",
-                       __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS));
-
-       return ptr;
-}
-
 extern pgd_t early_pg_dir[PTRS_PER_PGD];
 asmlinkage void __init kasan_early_init(void)
 {
@@ -60,7 +48,7 @@ asmlinkage void __init kasan_early_init(void)
        local_flush_tlb_all();
 }
 
-static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
 {
        phys_addr_t phys_addr;
        pte_t *ptep, *base_pte;
@@ -82,7 +70,7 @@ static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long en
        set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
 }
 
-static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
 {
        phys_addr_t phys_addr;
        pmd_t *pmdp, *base_pmd;
@@ -117,7 +105,7 @@ static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long en
        set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
 }
 
-static void kasan_populate_pgd(unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end)
 {
        phys_addr_t phys_addr;
        pgd_t *pgdp = pgd_offset_k(vaddr);
@@ -155,39 +143,27 @@ static void __init kasan_populate(void *start, void *end)
        memset(start, KASAN_SHADOW_INIT, end - start);
 }
 
+/* Give every PGD entry still pointing at kasan_early_shadow_pmd its own page. */
+static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
+{
+       void *p;
+       pgd_t *pgd_k = pgd_offset_k(vaddr);
+
+       do {
+               if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
+                       p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+                       BUG_ON(!p);     /* never install __pa(NULL) as a page table */
+                       set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
+               }
+       } while (pgd_k++, vaddr = pgd_addr_end(vaddr, end), vaddr != end);
+}
+
 static void __init kasan_shallow_populate(void *start, void *end)
 {
        unsigned long vaddr = (unsigned long)start & PAGE_MASK;
        unsigned long vend = PAGE_ALIGN((unsigned long)end);
-       unsigned long pfn;
-       int index;
-       void *p;
-       pud_t *pud_dir, *pud_k;
-       pgd_t *pgd_dir, *pgd_k;
-       p4d_t *p4d_dir, *p4d_k;
-
-       while (vaddr < vend) {
-               index = pgd_index(vaddr);
-               pfn = csr_read(CSR_SATP) & SATP_PPN;
-               pgd_dir = (pgd_t *)pfn_to_virt(pfn) + index;
-               pgd_k = init_mm.pgd + index;
-               pgd_dir = pgd_offset_k(vaddr);
-               set_pgd(pgd_dir, *pgd_k);
-
-               p4d_dir = p4d_offset(pgd_dir, vaddr);
-               p4d_k  = p4d_offset(pgd_k, vaddr);
-
-               vaddr = (vaddr + PUD_SIZE) & PUD_MASK;
-               pud_dir = pud_offset(p4d_dir, vaddr);
-               pud_k = pud_offset(p4d_k, vaddr);
-
-               if (pud_present(*pud_dir)) {
-                       p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
-                       pud_populate(&init_mm, pud_dir, p);
-               }
-               vaddr += PAGE_SIZE;
-       }
 
+       kasan_shallow_populate_pgd(vaddr, vend);
        local_flush_tlb_all();
 }
 
@@ -196,6 +172,10 @@ void __init kasan_init(void)
        phys_addr_t _start, _end;
        u64 i;
 
+       /*
+        * Populate all kernel virtual address space with kasan_early_shadow_page
+        * except for the linear mapping and the modules/kernel/BPF mapping.
+        */
        kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
                                    (void *)kasan_mem_to_shadow((void *)
                                                                VMEMMAP_END));
@@ -208,6 +188,7 @@ void __init kasan_init(void)
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_START),
                        (void *)kasan_mem_to_shadow((void *)VMALLOC_END));
 
+       /* Populate the linear mapping */
        for_each_mem_range(i, &_start, &_end) {
                void *start = (void *)__va(_start);
                void *end = (void *)__va(_end);
@@ -218,6 +199,10 @@ void __init kasan_init(void)
                kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end));
        }
 
+       /* Populate kernel, BPF, modules mapping */
+       kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR),
+                      kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END));
+
        for (i = 0; i < PTRS_PER_PTE; i++)
                set_pte(&kasan_early_shadow_pte[i],
                        mk_pte(virt_to_page(kasan_early_shadow_page),
index e8e4dcd..35703d5 100644 (file)
@@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);
 
 phys_addr_t __phys_addr_symbol(unsigned long x)
 {
-       unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
+       unsigned long kernel_start = (unsigned long)kernel_virt_addr;
        unsigned long kernel_end = (unsigned long)_end;
 
        /*
index ace74de..0536ac8 100644 (file)
@@ -58,29 +58,56 @@ struct ptd_mm_info {
        unsigned long end;
 };
 
+enum address_markers_idx {
+#ifdef CONFIG_KASAN
+       KASAN_SHADOW_START_NR,
+       KASAN_SHADOW_END_NR,
+#endif
+       FIXMAP_START_NR,
+       FIXMAP_END_NR,
+       PCI_IO_START_NR,
+       PCI_IO_END_NR,
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       VMEMMAP_START_NR,
+       VMEMMAP_END_NR,
+#endif
+       VMALLOC_START_NR,
+       VMALLOC_END_NR,
+       PAGE_OFFSET_NR,
+#ifdef CONFIG_64BIT
+       MODULES_MAPPING_NR,
+       KERNEL_MAPPING_NR,
+#endif
+       END_OF_SPACE_NR
+};
+
 static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
-       {KASAN_SHADOW_START,    "Kasan shadow start"},
-       {KASAN_SHADOW_END,      "Kasan shadow end"},
+       {0, "Kasan shadow start"},
+       {0, "Kasan shadow end"},
 #endif
-       {FIXADDR_START,         "Fixmap start"},
-       {FIXADDR_TOP,           "Fixmap end"},
-       {PCI_IO_START,          "PCI I/O start"},
-       {PCI_IO_END,            "PCI I/O end"},
+       {0, "Fixmap start"},
+       {0, "Fixmap end"},
+       {0, "PCI I/O start"},
+       {0, "PCI I/O end"},
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-       {VMEMMAP_START,         "vmemmap start"},
-       {VMEMMAP_END,           "vmemmap end"},
+       {0, "vmemmap start"},
+       {0, "vmemmap end"},
+#endif
+       {0, "vmalloc() area"},
+       {0, "vmalloc() end"},
+       {0, "Linear mapping"},
+#ifdef CONFIG_64BIT
+       {0, "Modules mapping"},
+       {0, "Kernel mapping (kernel, BPF)"},
 #endif
-       {VMALLOC_START,         "vmalloc() area"},
-       {VMALLOC_END,           "vmalloc() end"},
-       {PAGE_OFFSET,           "Linear mapping"},
        {-1, NULL},
 };
 
 static struct ptd_mm_info kernel_ptd_info = {
        .mm             = &init_mm,
        .markers        = address_markers,
-       .base_addr      = KERN_VIRT_START,
+       .base_addr      = 0,
        .end            = ULONG_MAX,
 };
 
@@ -331,10 +358,32 @@ static int ptdump_show(struct seq_file *m, void *v)
 
 DEFINE_SHOW_ATTRIBUTE(ptdump);
 
-static int ptdump_init(void)
+static int __init ptdump_init(void)
 {
        unsigned int i, j;
 
+#ifdef CONFIG_KASAN
+       address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+       address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
+       address_markers[FIXMAP_START_NR].start_address = FIXADDR_START;
+       address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP;
+       address_markers[PCI_IO_START_NR].start_address = PCI_IO_START;
+       address_markers[PCI_IO_END_NR].start_address = PCI_IO_END;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+       address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END;
+#endif
+       address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+       address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+       address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET;
+#ifdef CONFIG_64BIT
+       address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR;
+       address_markers[KERNEL_MAPPING_NR].start_address = kernel_virt_addr;
+#endif
+
+       kernel_ptd_info.base_addr = KERN_VIRT_START;
+
        for (i = 0; i < ARRAY_SIZE(pg_level); i++)
                for (j = 0; j < ARRAY_SIZE(pte_bits); j++)
                        pg_level[i].mask |= pte_bits[j].mask;
index b44ff52..87e3bf5 100644 (file)
@@ -1148,16 +1148,3 @@ void bpf_jit_build_epilogue(struct rv_jit_context *ctx)
 {
        __build_epilogue(false, ctx);
 }
-
-void *bpf_jit_alloc_exec(unsigned long size)
-{
-       return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
-                                   BPF_JIT_REGION_END, GFP_KERNEL,
-                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-                                   __builtin_return_address(0));
-}
-
-void bpf_jit_free_exec(void *addr)
-{
-       return vfree(addr);
-}
index 3630d44..fed86f4 100644 (file)
@@ -152,6 +152,7 @@ skip_init_ctx:
        bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
 
        if (!prog->is_func || extra_pass) {
+               bpf_jit_binary_lock_ro(jit_data->header);
 out_offset:
                kfree(ctx->offset);
                kfree(jit_data);
@@ -164,3 +165,16 @@ out:
                                           tmp : orig_prog);
        return prog;
 }
+
+void *bpf_jit_alloc_exec(unsigned long size)
+{
+       return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
+                                   BPF_JIT_REGION_END, GFP_KERNEL,
+                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
+                                   __builtin_return_address(0));
+}
+
+void bpf_jit_free_exec(void *addr)
+{
+       vfree(addr);
+}
index c1ff874..b4c7c34 100644 (file)
@@ -60,6 +60,9 @@ config S390
        imply IMA_SECURE_AND_OR_TRUSTED_BOOT
        select ARCH_32BIT_USTAT_F_TINODE
        select ARCH_BINFMT_ELF_STATE
+       select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
+       select ARCH_ENABLE_MEMORY_HOTREMOVE
+       select ARCH_ENABLE_SPLIT_PMD_PTLOCK
        select ARCH_HAS_DEBUG_VM_PGTABLE
        select ARCH_HAS_DEBUG_WX
        select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -137,6 +140,7 @@ config S390
        select HAVE_ARCH_JUMP_LABEL_RELATIVE
        select HAVE_ARCH_KASAN
        select HAVE_ARCH_KASAN_VMALLOC
+       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ARCH_SOFT_DIRTY
        select HAVE_ARCH_TRACEHOOK
@@ -626,15 +630,6 @@ config ARCH_SPARSEMEM_ENABLE
 config ARCH_SPARSEMEM_DEFAULT
        def_bool y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y if SPARSEMEM
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y
-
 config MAX_PHYSMEM_BITS
        int "Maximum size of supported physical memory in bits (42-53)"
        range 42 53
index 6422618..86afcc6 100644 (file)
@@ -387,6 +387,7 @@ CONFIG_CGROUP_NET_PRIO=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_PCI=y
+CONFIG_PCI_IOV=y
 # CONFIG_PCIEASPM is not set
 CONFIG_PCI_DEBUG=y
 CONFIG_HOTPLUG_PCI=y
@@ -548,7 +549,7 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=m
 CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_RAW_DRIVER=m
 CONFIG_HANGCHECK_TIMER=m
index 371a529..71b49ea 100644 (file)
@@ -377,6 +377,7 @@ CONFIG_CGROUP_NET_PRIO=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
 CONFIG_PCI=y
+CONFIG_PCI_IOV=y
 # CONFIG_PCIEASPM is not set
 CONFIG_HOTPLUG_PCI=y
 CONFIG_HOTPLUG_PCI_S390=y
@@ -540,7 +541,7 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=0
-CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=m
 CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_RAW_DRIVER=m
 CONFIG_HANGCHECK_TIMER=m
index 649b9fc..3e4cbcb 100644 (file)
@@ -123,4 +123,6 @@ static inline int stccm_avail(void)
        return test_facility(142);
 }
 
+size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
+                          struct cpumf_ctr_info *info);
 #endif /* _ASM_S390_CPU_MCF_H */
index 9cceb26..baa8005 100644 (file)
@@ -4,9 +4,11 @@
 
 #include <linux/sched.h>
 #include <linux/audit.h>
+#include <linux/randomize_kstack.h>
 #include <linux/tracehook.h>
 #include <linux/processor.h>
 #include <linux/uaccess.h>
+#include <asm/timex.h>
 #include <asm/fpu/api.h>
 
 #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
@@ -48,6 +50,14 @@ static __always_inline void arch_exit_to_user_mode(void)
 
 #define arch_exit_to_user_mode arch_exit_to_user_mode
 
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+                                                 unsigned long ti_work)
+{
+       choose_random_kstack_offset(get_tod_clock_fast() & 0xff);
+}
+
+#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
+
 static inline bool on_thread_stack(void)
 {
        return !(((unsigned long)(current->stack) ^ current_stack_pointer()) & ~(THREAD_SIZE - 1));
index 28664ee..e3882b0 100644 (file)
@@ -20,11 +20,6 @@ void *xlate_dev_mem_ptr(phys_addr_t phys);
 #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #define IO_SPACE_LIMIT 0
 
 void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot);
index 6bcfc56..8925f39 100644 (file)
@@ -454,6 +454,7 @@ struct kvm_vcpu_stat {
        u64 diagnose_44;
        u64 diagnose_9c;
        u64 diagnose_9c_ignored;
+       u64 diagnose_9c_forward;
        u64 diagnose_258;
        u64 diagnose_308;
        u64 diagnose_500;
@@ -700,6 +701,10 @@ struct kvm_hw_bp_info_arch {
 #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
                (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
 
+#define KVM_GUESTDBG_VALID_MASK \
+               (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+               KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
 struct kvm_guestdbg_info_arch {
        unsigned long cr0;
        unsigned long cr9;
index 35c2af9..10b67f8 100644 (file)
@@ -204,7 +204,7 @@ extern unsigned int s390_pci_no_rid;
 struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
 int zpci_enable_device(struct zpci_dev *);
 int zpci_disable_device(struct zpci_dev *);
-int zpci_configure_device(struct zpci_dev *zdev, u32 fh);
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
 int zpci_deconfigure_device(struct zpci_dev *zdev);
 
 int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64);
index 01e3600..e317fd4 100644 (file)
@@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void);
 extern void __cpu_die(unsigned int cpu);
 extern int __cpu_disable(void);
 extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
 
 #endif /* __ASM_SMP_H */
index b3beef6..31a605b 100644 (file)
@@ -230,9 +230,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
                /* No support for kernel space counters only */
                } else if (!attr->exclude_kernel && attr->exclude_user) {
                        return -EOPNOTSUPP;
-
-               /* Count user and kernel space */
-               } else {
+               } else {        /* Count user and kernel space */
                        if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
                                return -EOPNOTSUPP;
                        ev = cpumf_generic_events_basic[ev];
@@ -402,12 +400,12 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags)
                 */
                if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
                        ctr_set_stop(&cpuhw->state, hwc->config_base);
-               event->hw.state |= PERF_HES_STOPPED;
+               hwc->state |= PERF_HES_STOPPED;
        }
 
        if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
                hw_perf_event_update(event);
-               event->hw.state |= PERF_HES_UPTODATE;
+               hwc->state |= PERF_HES_UPTODATE;
        }
 }
 
@@ -430,8 +428,6 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
        if (flags & PERF_EF_START)
                cpumf_pmu_start(event, PERF_EF_RELOAD);
 
-       perf_event_update_userpage(event);
-
        return 0;
 }
 
@@ -451,8 +447,6 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
         */
        if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
                ctr_set_disable(&cpuhw->state, event->hw.config_base);
-
-       perf_event_update_userpage(event);
 }
 
 /*
index 3bced89..6d53215 100644 (file)
@@ -170,6 +170,52 @@ static int cpum_cf_offline_cpu(unsigned int cpu)
        return cpum_cf_setup(cpu, PMC_RELEASE);
 }
 
+/* Return the maximum possible counter set size (in number of 8 byte counters)
+ * depending on type and model number.
+ */
+size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
+                          struct cpumf_ctr_info *info)
+{
+       size_t ctrset_size = 0;
+
+       switch (ctrset) {
+       case CPUMF_CTR_SET_BASIC:
+               if (info->cfvn >= 1)
+                       ctrset_size = 6;
+               break;
+       case CPUMF_CTR_SET_USER:
+               if (info->cfvn == 1)
+                       ctrset_size = 6;
+               else if (info->cfvn >= 3)
+                       ctrset_size = 2;
+               break;
+       case CPUMF_CTR_SET_CRYPTO:
+               if (info->csvn >= 1 && info->csvn <= 5)
+                       ctrset_size = 16;
+               else if (info->csvn == 6)
+                       ctrset_size = 20;
+               break;
+       case CPUMF_CTR_SET_EXT:
+               if (info->csvn == 1)
+                       ctrset_size = 32;
+               else if (info->csvn == 2)
+                       ctrset_size = 48;
+               else if (info->csvn >= 3 && info->csvn <= 5)
+                       ctrset_size = 128;
+               else if (info->csvn == 6)
+                       ctrset_size = 160;
+               break;
+       case CPUMF_CTR_SET_MT_DIAG:
+               if (info->csvn > 3)
+                       ctrset_size = 48;
+               break;
+       case CPUMF_CTR_SET_MAX:
+               break;
+       }
+
+       return ctrset_size;
+}
+
 static int __init cpum_cf_init(void)
 {
        int rc;
index 2e3e7ed..08c985c 100644 (file)
@@ -316,52 +316,6 @@ static void cf_diag_read(struct perf_event *event)
        debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
 }
 
-/* Return the maximum possible counter set size (in number of 8 byte counters)
- * depending on type and model number.
- */
-static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset,
-                                struct cpumf_ctr_info *info)
-{
-       size_t ctrset_size = 0;
-
-       switch (ctrset) {
-       case CPUMF_CTR_SET_BASIC:
-               if (info->cfvn >= 1)
-                       ctrset_size = 6;
-               break;
-       case CPUMF_CTR_SET_USER:
-               if (info->cfvn == 1)
-                       ctrset_size = 6;
-               else if (info->cfvn >= 3)
-                       ctrset_size = 2;
-               break;
-       case CPUMF_CTR_SET_CRYPTO:
-               if (info->csvn >= 1 && info->csvn <= 5)
-                       ctrset_size = 16;
-               else if (info->csvn == 6)
-                       ctrset_size = 20;
-               break;
-       case CPUMF_CTR_SET_EXT:
-               if (info->csvn == 1)
-                       ctrset_size = 32;
-               else if (info->csvn == 2)
-                       ctrset_size = 48;
-               else if (info->csvn >= 3 && info->csvn <= 5)
-                       ctrset_size = 128;
-               else if (info->csvn == 6)
-                       ctrset_size = 160;
-               break;
-       case CPUMF_CTR_SET_MT_DIAG:
-               if (info->csvn > 3)
-                       ctrset_size = 48;
-               break;
-       case CPUMF_CTR_SET_MAX:
-               break;
-       }
-
-       return ctrset_size;
-}
-
 /* Calculate memory needed to store all counter sets together with header and
  * trailer data. This is independend of the counter set authorization which
  * can vary depending on the configuration.
@@ -372,7 +326,7 @@ static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
        enum cpumf_ctr_set i;
 
        for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-               size_t size = cf_diag_ctrset_size(i, info);
+               size_t size = cpum_cf_ctrset_size(i, info);
 
                if (size)
                        max_size += size * sizeof(u64) +
@@ -405,7 +359,7 @@ static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
        ctrdata->def = CF_DIAG_CTRSET_DEF;
        ctrdata->set = ctrset;
        ctrdata->res1 = 0;
-       ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info);
+       ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
 
        if (ctrset_size) {                      /* Save data */
                need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
@@ -845,7 +799,7 @@ static void cf_diag_cpu_read(void *parm)
 
                if (!(p->sets & cpumf_ctr_ctl[set]))
                        continue;       /* Counter set not in list */
-               set_size = cf_diag_ctrset_size(set, &cpuhw->info);
+               set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
                space = sizeof(csd->data) - csd->used;
                space = cf_diag_cpuset_read(sp, set, set_size, space);
                if (space) {
@@ -975,7 +929,7 @@ static size_t cf_diag_needspace(unsigned int sets)
        for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
                if (!(sets & cpumf_ctr_ctl[i]))
                        continue;
-               bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+               bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
                         sizeof(((struct s390_ctrset_setdata *)0)->set) +
                         sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
        }
index 72134f9..5aab59a 100644 (file)
@@ -937,9 +937,9 @@ static int __init setup_hwcaps(void)
        if (MACHINE_HAS_VX) {
                elf_hwcap |= HWCAP_S390_VXRS;
                if (test_facility(134))
-                       elf_hwcap |= HWCAP_S390_VXRS_EXT;
-               if (test_facility(135))
                        elf_hwcap |= HWCAP_S390_VXRS_BCD;
+               if (test_facility(135))
+                       elf_hwcap |= HWCAP_S390_VXRS_EXT;
                if (test_facility(148))
                        elf_hwcap |= HWCAP_S390_VXRS_EXT2;
                if (test_facility(152))
index 58c8afa..2fec2b8 100644 (file)
@@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu)
        asm volatile("diag %0,0,0x9c"
                     : : "d" (pcpu_devices[cpu].address));
 }
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
 
 /*
  * Send cpus emergency shutdown signal. This gives the cpus the
index bc8e650..4e5cc7d 100644 (file)
@@ -142,6 +142,7 @@ void do_syscall(struct pt_regs *regs)
 
 void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 {
+       add_random_kstack_offset();
        enter_from_user_mode(regs);
 
        memcpy(&regs->gprs[8], S390_lowcore.save_area_sync, 8 * sizeof(unsigned long));
index a421905..7e4a2ab 100644 (file)
 441  common    epoll_pwait2            sys_epoll_pwait2                compat_sys_epoll_pwait2
 442  common    mount_setattr           sys_mount_setattr               sys_mount_setattr
 443  common    quotactl_path           sys_quotactl_path               sys_quotactl_path
+444  common    landlock_create_ruleset sys_landlock_create_ruleset     sys_landlock_create_ruleset
+445  common    landlock_add_rule       sys_landlock_add_rule           sys_landlock_add_rule
+446  common    landlock_restrict_self  sys_landlock_restrict_self      sys_landlock_restrict_self
index 63021d4..8dd23c7 100644 (file)
@@ -17,6 +17,7 @@
 #include "asm/ptrace.h"
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/randomize_kstack.h>
 #include <linux/extable.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -301,6 +302,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
        unsigned int trapnr, syscall_redirect = 0;
        irqentry_state_t state;
 
+       add_random_kstack_offset();
        regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc;
        regs->int_parm_long = S390_lowcore.trans_exc_code;
 
index 5b8ec1c..02c146f 100644 (file)
@@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+       /* Reset the count on a new slice */
+       if (time_after(jiffies, cur_slice)) {
+               cur_slice = jiffies;
+               forward_cnt = diag9c_forwarding_hz / HZ;
+       }
+       return forward_cnt-- <= 0 ? 1 : 0;
+}
+
 static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
 {
        struct kvm_vcpu *tcpu;
@@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
        if (!tcpu)
                goto no_yield;
 
-       /* target already running */
-       if (READ_ONCE(tcpu->cpu) >= 0)
-               goto no_yield;
+       /* target guest VCPU already running */
+       if (READ_ONCE(tcpu->cpu) >= 0) {
+               if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+                       goto no_yield;
+
+               /* target host CPU already running */
+               if (!vcpu_is_preempted(tcpu->cpu))
+                       goto no_yield;
+               smp_yield_cpu(tcpu->cpu);
+               VCPU_EVENT(vcpu, 5,
+                          "diag time slice end directed to %d: yield forwarded",
+                          tid);
+               vcpu->stat.diagnose_9c_forward++;
+               return 0;
+       }
 
        if (kvm_vcpu_yield_to(tcpu) <= 0)
                goto no_yield;
index 6d6b570..b9f85b2 100644 (file)
@@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
  * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ *      successful (return value 0), or to the first invalid DAT entry in
+ *      case of exceptions (return value > 0)
  * @fake: pgt references contiguous guest memory block, not a pgtable
  */
 static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
                        rfte.val = ptr;
                        goto shadow_r2t;
                }
+               *pgt = ptr + vaddr.rfx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
                if (rc)
                        return rc;
@@ -1060,6 +1063,7 @@ shadow_r2t:
                        rste.val = ptr;
                        goto shadow_r3t;
                }
+               *pgt = ptr + vaddr.rsx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
                if (rc)
                        return rc;
@@ -1087,6 +1091,7 @@ shadow_r3t:
                        rtte.val = ptr;
                        goto shadow_sgt;
                }
+               *pgt = ptr + vaddr.rtx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
                if (rc)
                        return rc;
@@ -1123,6 +1128,7 @@ shadow_sgt:
                        ste.val = ptr;
                        goto shadow_pgt;
                }
+               *pgt = ptr + vaddr.sx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
                if (rc)
                        return rc;
@@ -1157,6 +1163,8 @@ shadow_pgt:
  * @vcpu: virtual cpu
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ *         the valid leaf, plus some flags
  *
  * Returns: - 0 if the shadow fault was successfully resolved
  *         - > 0 (pgm exception code) on exceptions while faulting
@@ -1165,11 +1173,11 @@ shadow_pgt:
  *         - -ENOMEM if out of memory
  */
 int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
-                         unsigned long saddr)
+                         unsigned long saddr, unsigned long *datptr)
 {
        union vaddress vaddr;
        union page_table_entry pte;
-       unsigned long pgt;
+       unsigned long pgt = 0;
        int dat_protection, fake;
        int rc;
 
@@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
                pte.val = pgt + vaddr.px * PAGE_SIZE;
                goto shadow_page;
        }
-       if (!rc)
-               rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+       switch (rc) {
+       case PGM_SEGMENT_TRANSLATION:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_FIRST_TRANS:
+               pgt |= PEI_NOT_PTE;
+               break;
+       case 0:
+               pgt += vaddr.px * 8;
+               rc = gmap_read_table(sg->parent, pgt, &pte.val);
+       }
+       if (datptr)
+               *datptr = pgt | dat_protection * PEI_DAT_PROT;
        if (!rc && pte.i)
                rc = PGM_PAGE_TRANSLATION;
        if (!rc && pte.z)
index f4c5175..7c72a5e 100644 (file)
 
 /**
  * kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
  * @gra - guest real address
  *
  * Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra by applying the given prefix.
  */
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
-                                                unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
 {
-       unsigned long prefix  = kvm_s390_get_prefix(vcpu);
-
        if (gra < 2 * PAGE_SIZE)
                gra += prefix;
        else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -36,6 +33,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
        return gra;
 }
 
+/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+                                                unsigned long gra)
+{
+       return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+                                                          unsigned long ga)
+{
+       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+               return ga;
+       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+               return ga & ((1UL << 31) - 1);
+       return ga & ((1UL << 24) - 1);
+}
+
 /**
  * kvm_s390_logical_to_effective - convert guest logical to effective address
  * @vcpu: guest virtual cpu
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
 static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
                                                          unsigned long ga)
 {
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
-       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
-               return ga;
-       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
-               return ga & ((1UL << 31) - 1);
-       return ga & ((1UL << 24) - 1);
+       return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
 }
 
 /*
@@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
 int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
-                         unsigned long saddr);
+                         unsigned long saddr, unsigned long *datptr);
 
 #endif /* __KVM_S390_GACCESS_H */
index 2f09e9d..1296fc1 100644 (file)
@@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("instruction_diag_44", diagnose_44),
        VCPU_STAT("instruction_diag_9c", diagnose_9c),
        VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+       VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
        VCPU_STAT("instruction_diag_258", diagnose_258),
        VCPU_STAT("instruction_diag_308", diagnose_308),
        VCPU_STAT("instruction_diag_500", diagnose_500),
@@ -185,6 +186,11 @@ static bool use_gisa  = true;
 module_param(use_gisa, bool, 0644);
 MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
 
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
 /*
  * For now we handle at most 16 double words as this is what the s390 base
  * kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -544,6 +550,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_DIAG318:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               r = KVM_GUESTDBG_VALID_MASK;
+               break;
        case KVM_CAP_S390_HPAGE_1M:
                r = 0;
                if (hpage && !kvm_is_ucontrol(kvm))
@@ -4307,16 +4316,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu)
        kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
        kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
        if (MACHINE_HAS_GS) {
+               preempt_disable();
                __ctl_set_bit(2, 4);
                if (vcpu->arch.gs_enabled)
                        save_gs_cb(current->thread.gs_cb);
-               preempt_disable();
                current->thread.gs_cb = vcpu->arch.host_gscb;
                restore_gs_cb(vcpu->arch.host_gscb);
-               preempt_enable();
                if (!vcpu->arch.host_gscb)
                        __ctl_clear_bit(2, 4);
                vcpu->arch.host_gscb = NULL;
+               preempt_enable();
        }
        /* SIE will save etoken directly into SDNX and therefore kvm_run */
 }
@@ -4542,7 +4551,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
                /*
                 * As we are starting a second VCPU, we have to disable
                 * the IBS facility on all VCPUs to remove potentially
-                * oustanding ENABLE requests.
+                * outstanding ENABLE requests.
                 */
                __disable_ibs_on_all_vcpus(vcpu->kvm);
        }
index 79dcd64..9fad251 100644 (file)
@@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
  * @kvm: the KVM guest
  */
 void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Set the maximum number of diag9c forwarding per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
 #endif
index bd803e0..4002a24 100644 (file)
@@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                memcpy((void *)((u64)scb_o + 0xc0),
                       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
                break;
-       case ICPT_PARTEXEC:
-               /* MVPG only */
-               memcpy((void *)((u64)scb_o + 0xc0),
-                      (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
-               break;
        }
 
        if (scb_s->ihcpu != 0xffffU)
@@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        /* with mso/msl, the prefix lies at offset *mso* */
        prefix += scb_s->mso;
 
-       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
        if (!rc && (scb_s->ecb & ECB_TE))
                rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                          prefix + PAGE_SIZE);
+                                          prefix + PAGE_SIZE, NULL);
        /*
         * We don't have to mprotect, we will be called for all unshadows.
         * SIE will detect if protection applies and trigger a validity.
@@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                                    current->thread.gmap_addr, 1);
 
        rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                  current->thread.gmap_addr);
+                                  current->thread.gmap_addr, NULL);
        if (rc > 0) {
                rc = inject_fault(vcpu, rc,
                                  current->thread.gmap_addr,
@@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
 {
        if (vsie_page->fault_addr)
                kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                     vsie_page->fault_addr);
+                                     vsie_page->fault_addr, NULL);
        vsie_page->fault_addr = 0;
 }
 
@@ -983,6 +978,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        return 0;
 }
 
+/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+       /* no need to validate the parameter and/or perform error handling */
+       reg &= 0xf;
+       switch (reg) {
+       case 15:
+               return vsie_page->scb_s.gg15;
+       case 14:
+               return vsie_page->scb_s.gg14;
+       default:
+               return vcpu->run->s.regs.gprs[reg];
+       }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+       u64 *pei_block = &vsie_page->scb_o->mcic;
+       int edat, rc_dest, rc_src;
+       union ctlreg0 cr0;
+
+       cr0.val = vcpu->arch.sie_block->gcr[0];
+       edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+       mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+       prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+       dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+       dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+       src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+       src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+       rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+       rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+       /*
+        * Either everything went well, or something non-critical went wrong
+        * e.g. because of a race. In either case, simply retry.
+        */
+       if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+               retry_vsie_icpt(vsie_page);
+               return -EAGAIN;
+       }
+       /* Something more serious went wrong, propagate the error */
+       if (rc_dest < 0)
+               return rc_dest;
+       if (rc_src < 0)
+               return rc_src;
+
+       /* The only possible suppressing exception: just deliver it */
+       if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+               clear_vsie_icpt(vsie_page);
+               rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+               WARN_ON_ONCE(rc_dest);
+               return 1;
+       }
+
+       /*
+        * Forward the PEI intercept to the guest if it was a page fault, or
+        * also for segment and region table faults if EDAT applies.
+        */
+       if (edat) {
+               rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+               rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+       } else {
+               rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+               rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+       }
+       if (!rc_dest && !rc_src) {
+               pei_block[0] = pei_dest;
+               pei_block[1] = pei_src;
+               return 1;
+       }
+
+       retry_vsie_icpt(vsie_page);
+
+       /*
+        * The host has edat, and the guest does not, or it was an ASCE type
+        * exception. The host needs to inject the appropriate DAT interrupts
+        * into the guest.
+        */
+       if (rc_dest)
+               return inject_fault(vcpu, rc_dest, dest, 1);
+       return inject_fault(vcpu, rc_src, src, 0);
+}
+
 /*
  * Run the vsie on a shadow scb and a shadow gmap, without any further
  * sanity checks, handling SIE faults.
@@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                if ((scb_s->ipa & 0xf000) != 0xf000)
                        scb_s->ipa += 0x1000;
                break;
+       case ICPT_PARTEXEC:
+               if (scb_s->ipa == 0xb254)
+                       rc = vsie_handle_mvpg(vcpu, vsie_page);
+               break;
        }
        return rc;
 }
index 3b5a4d2..da36d13 100644 (file)
@@ -189,7 +189,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
        return pte;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgdp;
index c01b6db..b0993e0 100644 (file)
@@ -738,17 +738,19 @@ error:
 }
 
 /**
- * zpci_configure_device() - Configure a zpci_dev
+ * zpci_scan_configured_device() - Scan a freshly configured zpci_dev
  * @zdev: The zpci_dev to be configured
  * @fh: The general function handle supplied by the platform
  *
  * Given a device in the configuration state Configured, enables, scans and
- * adds it to the common code PCI subsystem. If any failure occurs, the
- * zpci_dev is left disabled.
+ * adds it to the common code PCI subsystem if possible. If the PCI device is
+ * parked because we can not yet create a PCI bus because we have not seen
+ * function 0, it is ignored but will be scanned once function 0 appears.
+ * If any failure occurs, the zpci_dev is left disabled.
  *
  * Return: 0 on success, or an error code otherwise
  */
-int zpci_configure_device(struct zpci_dev *zdev, u32 fh)
+int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh)
 {
        int rc;
 
index 1178b48..cd447b9 100644 (file)
@@ -76,8 +76,6 @@ void zpci_event_error(void *data)
 
 static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
 {
-       enum zpci_state state;
-
        zdev->fh = fh;
        /* Give the driver a hint that the function is
         * already unusable.
@@ -88,15 +86,12 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh)
         */
        zpci_disable_device(zdev);
        zdev->state = ZPCI_FN_STATE_STANDBY;
-       if (!clp_get_state(zdev->fid, &state) &&
-           state == ZPCI_FN_STATE_RESERVED) {
-               zpci_zdev_put(zdev);
-       }
 }
 
 static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 {
        struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
+       enum zpci_state state;
 
        zpci_err("avail CCDF:\n");
        zpci_err_hex(ccdf, sizeof(*ccdf));
@@ -113,7 +108,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
                                break;
                        zdev->state = ZPCI_FN_STATE_CONFIGURED;
                }
-               zpci_configure_device(zdev, ccdf->fh);
+               zpci_scan_configured_device(zdev, ccdf->fh);
                break;
        case 0x0302: /* Reserved -> Standby */
                if (!zdev)
@@ -123,13 +118,28 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
                break;
        case 0x0303: /* Deconfiguration requested */
                if (zdev) {
+                       /* The event may have been queued before we configured
+                        * the device.
+                        */
+                       if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+                               break;
                        zdev->fh = ccdf->fh;
                        zpci_deconfigure_device(zdev);
                }
                break;
        case 0x0304: /* Configured -> Standby|Reserved */
-               if (zdev)
-                       zpci_event_hard_deconfigured(zdev, ccdf->fh);
+               if (zdev) {
+                       /* The event may have been queued before we configured
+                        * the device.
+                        */
+                       if (zdev->state == ZPCI_FN_STATE_CONFIGURED)
+                               zpci_event_hard_deconfigured(zdev, ccdf->fh);
+                       /* The 0x0304 event may immediately reserve the device */
+                       if (!clp_get_state(zdev->fid, &state) &&
+                           state == ZPCI_FN_STATE_RESERVED) {
+                               zpci_zdev_put(zdev);
+                       }
+               }
                break;
        case 0x0306: /* 0x308 or 0x302 for multiple devices */
                zpci_remove_reserved_devices();
index e798e55..6812953 100644 (file)
@@ -2,6 +2,8 @@
 config SUPERH
        def_bool y
        select ARCH_32BIT_OFF_T
+       select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU
+       select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU
        select ARCH_HAVE_CUSTOM_GPIO_H
        select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
        select ARCH_HAS_BINFMT_FLAT if !MMU
@@ -101,9 +103,6 @@ config SYS_SUPPORTS_APM_EMULATION
        bool
        select ARCH_SUSPEND_POSSIBLE
 
-config SYS_SUPPORTS_HUGETLBFS
-       bool
-
 config SYS_SUPPORTS_SMP
        bool
 
@@ -175,12 +174,12 @@ config CPU_SH3
 
 config CPU_SH4
        bool
+       select ARCH_SUPPORTS_HUGETLBFS if MMU
        select CPU_HAS_INTEVT
        select CPU_HAS_SR_RB
        select CPU_HAS_FPU if !CPU_SH4AL_DSP
        select SH_INTC
        select SYS_SUPPORTS_SH_TMU
-       select SYS_SUPPORTS_HUGETLBFS if MMU
 
 config CPU_SH4A
        bool
index ef7cc31..9ee3526 100644 (file)
@@ -23,7 +23,6 @@ CONFIG_SH_PCLK_FREQ=31250000
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 # CONFIG_UNIX98_PTYS is not set
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
index 315b04a..601d062 100644 (file)
@@ -71,7 +71,6 @@ CONFIG_SMC91X=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=4
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 99975db..79f02f1 100644 (file)
@@ -75,7 +75,6 @@ CONFIG_INPUT_FF_MEMLESS=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
 # CONFIG_LEGACY_PTYS is not set
index 2c46c00..cbc9389 100644 (file)
@@ -18,7 +18,6 @@ CONFIG_CPU_IDLE=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 8819315..ee2357d 100644 (file)
@@ -20,7 +20,6 @@ CONFIG_CPU_IDLE=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 9b885c1..5c725c7 100644 (file)
@@ -66,7 +66,6 @@ CONFIG_INPUT_FF_MEMLESS=m
 CONFIG_INPUT_EVDEV=y
 CONFIG_INPUT_EVBUG=m
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
index 450b585..3b6c7b5 100644 (file)
@@ -58,15 +58,16 @@ static inline unsigned long __ffs(unsigned long word)
        return result;
 }
 
-#include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/ffs.h>
 #include <asm-generic/bitops/hweight.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
-#include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>
 #include <asm-generic/bitops/fls.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 
+#include <asm-generic/bitops/le.h>
+#include <asm-generic/bitops/find.h>
+
 #endif /* __ASM_SH_BITOPS_H */
index 6d5c646..cf9a3ec 100644 (file)
@@ -283,11 +283,6 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
 int valid_phys_addr_range(phys_addr_t addr, size_t size);
 int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
index 0646c59..295c433 100644 (file)
@@ -67,7 +67,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  * Modifying code must take extra care. On an SMP machine, if
  * the code being modified is also being executed on another CPU
  * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
+ * We use kstop_machine to stop other CPUS from executing code.
  * But this does not stop NMIs from happening. We still need
  * to protect against that. We separate out the modification of
  * the code to take care of this.
index 445e3ec..1d2507f 100644 (file)
@@ -57,24 +57,6 @@ static inline int sh_pmu_initialized(void)
        return !!sh_pmu;
 }
 
-const char *perf_pmu_name(void)
-{
-       if (!sh_pmu)
-               return NULL;
-
-       return sh_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       if (!sh_pmu)
-               return 0;
-
-       return sh_pmu->num_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
 /*
  * Release the PMU if this is the last perf_event.
  */
index f68517a..f47a0dc 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index 77aa2f8..d551a9c 100644 (file)
@@ -136,14 +136,6 @@ config ARCH_SPARSEMEM_DEFAULT
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-       depends on SPARSEMEM && MMU
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
-       def_bool y
-       depends on SPARSEMEM && MMU
-
 config ARCH_MEMORY_PROBE
        def_bool y
        depends on MEMORY_HOTPLUG
index 220d7bc..999ab59 100644 (file)
@@ -21,7 +21,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
index 12a4fb0..1809909 100644 (file)
@@ -122,7 +122,6 @@ CONFIG_INPUT_SPARCSPKR=y
 # CONFIG_SERIO_SERPORT is not set
 CONFIG_SERIO_PCIPS2=m
 CONFIG_SERIO_RAW=m
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SUNSU=y
 CONFIG_SERIAL_SUNSU_CONSOLE=y
 CONFIG_SERIAL_SUNSAB=y
index d3aa1a5..e284394 100644 (file)
@@ -17,7 +17,7 @@ void _mcount(void);
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-/* reloction of mcount call site is the same as the address */
+/* relocation of mcount call site is the same as the address */
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
        return addr;
index 9fbfc95..5ffa820 100644 (file)
@@ -454,11 +454,6 @@ void sbus_set_sbus64(struct device *, int);
  */
 #define xlate_dev_mem_ptr(p)   __va(p)
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)  p
-
 #endif
 
 #endif /* !(__SPARC64_IO_H) */
index 3ee8232..b9e1c0e 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index ad4b42f..04d8790 100644 (file)
@@ -279,7 +279,7 @@ unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&p
 unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
 unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
index c3030db..57cfd9a 100644 (file)
@@ -5,6 +5,7 @@ menu "UML-specific options"
 config UML
        bool
        default y
+       select ARCH_EPHEMERAL_INODES
        select ARCH_HAS_KCOV
        select ARCH_NO_PREEMPT
        select HAVE_ARCH_AUDITSYSCALL
index 315d368..1dfb295 100644 (file)
@@ -17,6 +17,7 @@ config GCOV
        bool "Enable gcov support"
        depends on DEBUG_INFO
        depends on !KCOV
+       depends on !MODULES
        help
          This option allows developers to retrieve coverage data from a UML
          session.
index 103adac..9a67c01 100644 (file)
@@ -24,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align,
                      int *data_offset_out);
 
 #endif
-
-/*
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
index d35d3f3..5b064d3 100644 (file)
@@ -122,13 +122,11 @@ static ssize_t hostaudio_write(struct file *file, const char __user *buffer,
 static __poll_t hostaudio_poll(struct file *file,
                                struct poll_table_struct *wait)
 {
-       __poll_t mask = 0;
-
 #ifdef DEBUG
        printk(KERN_DEBUG "hostaudio: poll called (unimplemented)\n");
 #endif
 
-       return mask;
+       return 0;
 }
 
 static long hostaudio_ioctl(struct file *file,
index 47a02e6..d27a2a9 100644 (file)
@@ -8,7 +8,6 @@
  * Copyright (C) 2001 by various other people who didn't put their name here.
  */
 
-#include <linux/version.h>
 #include <linux/memblock.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
index def3761..b9e20bb 100644 (file)
@@ -302,7 +302,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 struct mm_struct;
 extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
 
-#define update_mmu_cache(vma,address,ptep) do ; while (0)
+#define update_mmu_cache(vma,address,ptep) do {} while (0)
 
 /* Encode and de-code a swap entry */
 #define __swp_type(x)                  (((x).val >> 5) & 0x1f)
diff --git a/arch/um/include/uapi/asm/Kbuild b/arch/um/include/uapi/asm/Kbuild
new file mode 100644 (file)
index 0000000..f66554c
--- /dev/null
@@ -0,0 +1 @@
+# SPDX-License-Identifier: GPL-2.0
index 5aa8820..e698e0c 100644 (file)
@@ -21,7 +21,6 @@ obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \
 
 obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o
 obj-$(CONFIG_GPROF)    += gprof_syms.o
-obj-$(CONFIG_GCOV)     += gmon_syms.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 
index dacbfab..2f2a8ce 100644 (file)
@@ -6,6 +6,12 @@ OUTPUT_ARCH(ELF_ARCH)
 ENTRY(_start)
 jiffies = jiffies_64;
 
+VERSION {
+  {
+    local: *;
+  };
+}
+
 SECTIONS
 {
   PROVIDE (__executable_start = START);
diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c
deleted file mode 100644 (file)
index 9361a8e..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <linux/module.h>
-
-extern void __bb_init_func(void *)  __attribute__((weak));
-EXPORT_SYMBOL(__bb_init_func);
-
-extern void __gcov_init(void *)  __attribute__((weak));
-EXPORT_SYMBOL(__gcov_init);
-extern void __gcov_merge_add(void *, unsigned int)  __attribute__((weak));
-EXPORT_SYMBOL(__gcov_merge_add);
-extern void __gcov_exit(void)  __attribute__((weak));
-EXPORT_SYMBOL(__gcov_exit);
index 9019ff5..8e636ce 100644 (file)
@@ -72,8 +72,7 @@ static void __init one_page_table_init(pmd_t *pmd)
 
                set_pmd(pmd, __pmd(_KERNPG_TABLE +
                                           (unsigned long) __pa(pte)));
-               if (pte != pte_offset_kernel(pmd, 0))
-                       BUG();
+               BUG_ON(pte != pte_offset_kernel(pmd, 0));
        }
 }
 
index 45d957d..7a8e2b1 100644 (file)
@@ -7,6 +7,12 @@ OUTPUT_ARCH(ELF_ARCH)
 ENTRY(_start)
 jiffies = jiffies_64;
 
+VERSION {
+  {
+    local: *;
+  };
+}
+
 SECTIONS
 {
   /* This must contain the right address - not quite the default ELF one.*/
index dac15f6..0045e1b 100644 (file)
@@ -60,7 +60,13 @@ config X86
        select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
        select ARCH_32BIT_OFF_T                 if X86_32
        select ARCH_CLOCKSOURCE_INIT
+       select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
+       select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+       select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
+       select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE
+       select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
        select ARCH_HAS_ACPI_TABLE_UPGRADE      if ACPI
+       select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE        if !X86_PAE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
@@ -165,6 +171,7 @@ config X86
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
        select HAVE_ARCH_USERFAULTFD_WP         if X86_64 && USERFAULTFD
+       select HAVE_ARCH_USERFAULTFD_MINOR      if X86_64 && USERFAULTFD
        select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
        select HAVE_ARCH_WITHIN_STACK_FRAMES
@@ -315,9 +322,6 @@ config GENERIC_CALIBRATE_DELAY
 config ARCH_HAS_CPU_RELAX
        def_bool y
 
-config ARCH_HAS_CACHE_LINE_SIZE
-       def_bool y
-
 config ARCH_HAS_FILTER_PGPROT
        def_bool y
 
@@ -2428,30 +2432,13 @@ config ARCH_HAS_ADD_PAGES
        def_bool y
        depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_ENABLE_MEMORY_HOTPLUG
-       def_bool y
-       depends on X86_64 || (X86_32 && HIGHMEM)
-
-config ARCH_ENABLE_MEMORY_HOTREMOVE
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
        def_bool y
-       depends on MEMORY_HOTPLUG
 
 config USE_PERCPU_NUMA_NODE_ID
        def_bool y
        depends on NUMA
 
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
-       def_bool y
-       depends on X86_64 || X86_PAE
-
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
-       def_bool y
-       depends on X86_64 && HUGETLB_PAGE && MIGRATION
-
-config ARCH_ENABLE_THP_MIGRATION
-       def_bool y
-       depends on X86_64 && TRANSPARENT_HUGEPAGE
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
index f52a443..28a1423 100644 (file)
 441    i386    epoll_pwait2            sys_epoll_pwait2                compat_sys_epoll_pwait2
 442    i386    mount_setattr           sys_mount_setattr
 443    i386    quotactl_path           sys_quotactl_path
+444    i386    landlock_create_ruleset sys_landlock_create_ruleset
+445    i386    landlock_add_rule       sys_landlock_add_rule
+446    i386    landlock_restrict_self  sys_landlock_restrict_self
index 7eb007b..ecd551b 100644 (file)
 441    common  epoll_pwait2            sys_epoll_pwait2
 442    common  mount_setattr           sys_mount_setattr
 443    common  quotactl_path           sys_quotactl_path
+444    common  landlock_create_ruleset sys_landlock_create_ruleset
+445    common  landlock_add_rule       sys_landlock_add_rule
+446    common  landlock_restrict_self  sys_landlock_restrict_self
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
index 6a98a76..1c1a7e4 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/cpumask.h>
 #include <linux/slab.h>
+#include <linux/amd-iommu.h>
 
 #include "../perf_event.h"
 #include "iommu.h"
index e6493a6..e6310c6 100644 (file)
 #define PC_MAX_SPEC_BNKS                       64
 #define PC_MAX_SPEC_CNTRS                      16
 
-struct amd_iommu;
-
-/* amd_iommu_init.c external support functions */
-extern int amd_iommu_get_num_iommus(void);
-
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
-
-extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
-
-extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
-                               u8 fxn, u64 *value);
-
-extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
-                               u8 fxn, u64 *value);
-
-extern struct amd_iommu *get_amd_iommu(int idx);
-
 #endif /*_PERF_EVENT_AMD_IOMMU_H_*/
index 3c94316..ac37830 100644 (file)
 #define X86_FEATURE_AVIC               (15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD    (15*32+15) /* Virtual VMSAVE VMLOAD */
 #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL                (15*32+20) /* Virtual SPEC_CTRL */
 #define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
index 10eca9e..cbbcee0 100644 (file)
@@ -221,12 +221,22 @@ enum x86_intercept_stage;
 #define DR7_FIXED_1    0x00000400
 #define DR7_VOLATILE   0xffff2bff
 
+#define KVM_GUESTDBG_VALID_MASK \
+       (KVM_GUESTDBG_ENABLE | \
+       KVM_GUESTDBG_SINGLESTEP | \
+       KVM_GUESTDBG_USE_HW_BP | \
+       KVM_GUESTDBG_USE_SW_BP | \
+       KVM_GUESTDBG_INJECT_BP | \
+       KVM_GUESTDBG_INJECT_DB)
+
+
 #define PFERR_PRESENT_BIT 0
 #define PFERR_WRITE_BIT 1
 #define PFERR_USER_BIT 2
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 
@@ -236,6 +246,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 
@@ -1054,6 +1065,9 @@ struct kvm_arch {
        u32 user_space_msr_mask;
        struct kvm_x86_msr_filter __rcu *msr_filter;
 
+       /* Guest can access the SGX PROVISIONKEY. */
+       bool sgx_provisioning_allowed;
+
        struct kvm_pmu_event_filter __rcu *pmu_event_filter;
        struct task_struct *nx_lpage_recovery_thread;
 
@@ -1068,25 +1082,36 @@ struct kvm_arch {
        bool tdp_mmu_enabled;
 
        /*
-        * List of struct kvmp_mmu_pages being used as roots.
+        * List of struct kvm_mmu_pages being used as roots.
         * All struct kvm_mmu_pages in the list should have
         * tdp_mmu_page set.
-        * All struct kvm_mmu_pages in the list should have a positive
-        * root_count except when a thread holds the MMU lock and is removing
-        * an entry from the list.
+        *
+        * For reads, this list is protected by:
+        *      the MMU lock in read mode + RCU or
+        *      the MMU lock in write mode
+        *
+        * For writes, this list is protected by:
+        *      the MMU lock in read mode + the tdp_mmu_pages_lock or
+        *      the MMU lock in write mode
+        *
+        * Roots will remain in the list until their tdp_mmu_root_count
+        * drops to zero, at which point the thread that decremented the
+        * count to zero should remove the root from the list and clean
+        * it up, freeing the root after an RCU grace period.
         */
        struct list_head tdp_mmu_roots;
 
        /*
         * List of struct kvmp_mmu_pages not being used as roots.
         * All struct kvm_mmu_pages in the list should have
-        * tdp_mmu_page set and a root_count of 0.
+        * tdp_mmu_page set and a tdp_mmu_root_count of 0.
         */
        struct list_head tdp_mmu_pages;
 
        /*
         * Protects accesses to the following fields when the MMU lock
         * is held in read mode:
+        *  - tdp_mmu_roots (above)
         *  - tdp_mmu_pages (above)
         *  - the link field of struct kvm_mmu_pages used by the TDP MMU
         *  - lpage_disallowed_mmu_pages
@@ -1143,6 +1168,9 @@ struct kvm_vcpu_stat {
        u64 req_event;
        u64 halt_poll_success_ns;
        u64 halt_poll_fail_ns;
+       u64 nested_run;
+       u64 directed_yield_attempted;
+       u64 directed_yield_successful;
 };
 
 struct x86_instruction_info;
@@ -1269,8 +1297,8 @@ struct kvm_x86_ops {
        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
-       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level);
+       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level);
 
        bool (*has_wbinvd_exit)(void);
 
@@ -1339,6 +1367,7 @@ struct kvm_x86_ops {
        int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
        int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
        int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+       int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
@@ -1357,6 +1386,7 @@ struct kvm_x86_ops {
 struct kvm_x86_nested_ops {
        int (*check_events)(struct kvm_vcpu *vcpu);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
                         unsigned user_data_size);
@@ -1428,9 +1458,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1440,8 +1467,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1538,6 +1563,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
 
 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -1566,14 +1596,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1614,9 +1644,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -1735,11 +1762,7 @@ asmlinkage void kvm_spurious_fault(void);
        _ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
index 31c4df1..9c80c68 100644 (file)
@@ -20,7 +20,6 @@
 
 extern u64 sme_me_mask;
 extern u64 sev_status;
-extern bool sev_enabled;
 
 void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
                         unsigned long decrypted_kernel_vaddr,
index 1c56194..772e60e 100644 (file)
@@ -269,7 +269,9 @@ struct vmcb_save_area {
         * SEV-ES guests when referenced through the GHCB or for
         * saving to the host save area.
         */
-       u8 reserved_7[80];
+       u8 reserved_7[72];
+       u32 spec_ctrl;          /* Guest version of SPEC_CTRL at 0x2E0 */
+       u8 reserved_7b[4];
        u32 pkru;
        u8 reserved_7a[20];
        u64 reserved_8;         /* rax already available at 0x01f8 */
index 358707f..0ffaa31 100644 (file)
@@ -373,6 +373,7 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_MOV_SS                0x00000002
 #define GUEST_INTR_STATE_SMI           0x00000004
 #define GUEST_INTR_STATE_NMI           0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR  0x00000010
 
 /* GUEST_ACTIVITY_STATE flags */
 #define GUEST_ACTIVITY_ACTIVE          0
index b8e650a..946d761 100644 (file)
@@ -27,6 +27,7 @@
 
 
 #define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE      0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
index 5d32fa4..d307c22 100644 (file)
@@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void)
        }
 }
 
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
 static bool pv_tlb_flush_supported(void)
 {
        return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
@@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void)
                kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
-
-#ifdef CONFIG_SMP
-
 static bool pv_ipi_supported(void)
 {
        return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
@@ -574,6 +574,54 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
        }
 }
 
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
+                       const struct flush_tlb_info *info)
+{
+       u8 state;
+       int cpu;
+       struct kvm_steal_time *src;
+       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+       cpumask_copy(flushmask, cpumask);
+       /*
+        * We have to call flush only on online vCPUs. And
+        * queue flush_on_enter for pre-empted vCPUs
+        */
+       for_each_cpu(cpu, flushmask) {
+               /*
+                * The local vCPU is never preempted, so we do not explicitly
+                * skip check for local vCPU - it will never be cleared from
+                * flushmask.
+                */
+               src = &per_cpu(steal_time, cpu);
+               state = READ_ONCE(src->preempted);
+               if ((state & KVM_VCPU_PREEMPTED)) {
+                       if (try_cmpxchg(&src->preempted, &state,
+                                       state | KVM_VCPU_FLUSH_TLB))
+                               __cpumask_clear_cpu(cpu, flushmask);
+               }
+       }
+
+       native_flush_tlb_multi(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+       int cpu;
+
+       if (!kvm_para_available() || nopv)
+               return 0;
+
+       if (pv_tlb_flush_supported() || pv_ipi_supported())
+               for_each_possible_cpu(cpu) {
+                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+                               GFP_KERNEL, cpu_to_node(cpu));
+               }
+
+       return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
        /*
@@ -611,38 +659,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
        local_irq_enable();
        return 0;
 }
-#endif
-
-static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
-                       const struct flush_tlb_info *info)
-{
-       u8 state;
-       int cpu;
-       struct kvm_steal_time *src;
-       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
-
-       cpumask_copy(flushmask, cpumask);
-       /*
-        * We have to call flush only on online vCPUs. And
-        * queue flush_on_enter for pre-empted vCPUs
-        */
-       for_each_cpu(cpu, flushmask) {
-               /*
-                * The local vCPU is never preempted, so we do not explicitly
-                * skip check for local vCPU - it will never be cleared from
-                * flushmask.
-                */
-               src = &per_cpu(steal_time, cpu);
-               state = READ_ONCE(src->preempted);
-               if ((state & KVM_VCPU_PREEMPTED)) {
-                       if (try_cmpxchg(&src->preempted, &state,
-                                       state | KVM_VCPU_FLUSH_TLB))
-                               __cpumask_clear_cpu(cpu, flushmask);
-               }
-       }
 
-       native_flush_tlb_multi(flushmask, info);
-}
+#endif
 
 static void __init kvm_guest_init(void)
 {
@@ -658,12 +676,6 @@ static void __init kvm_guest_init(void)
                static_call_update(pv_steal_clock, kvm_steal_clock);
        }
 
-       if (pv_tlb_flush_supported()) {
-               pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
-               pv_ops.mmu.tlb_remove_table = tlb_remove_table;
-               pr_info("KVM setup pv remote TLB flush\n");
-       }
-
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
@@ -673,6 +685,12 @@ static void __init kvm_guest_init(void)
        }
 
 #ifdef CONFIG_SMP
+       if (pv_tlb_flush_supported()) {
+               pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
+               pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+               pr_info("KVM setup pv remote TLB flush\n");
+       }
+
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        if (pv_sched_yield_supported()) {
                smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
@@ -739,7 +757,7 @@ static uint32_t __init kvm_detect(void)
 
 static void __init kvm_apic_init(void)
 {
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
        if (pv_ipi_supported())
                kvm_setup_pv_ipi();
 #endif
@@ -799,32 +817,6 @@ static __init int activate_jump_labels(void)
 }
 arch_initcall(activate_jump_labels);
 
-static __init int kvm_alloc_cpumask(void)
-{
-       int cpu;
-       bool alloc = false;
-
-       if (!kvm_para_available() || nopv)
-               return 0;
-
-       if (pv_tlb_flush_supported())
-               alloc = true;
-
-#if defined(CONFIG_SMP)
-       if (pv_ipi_supported())
-               alloc = true;
-#endif
-
-       if (alloc)
-               for_each_possible_cpu(cpu) {
-                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
-                               GFP_KERNEL, cpu_to_node(cpu));
-               }
-
-       return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
index 43cbfc8..5e1f381 100644 (file)
@@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
 #endif
 
        /* Kernel thread ? */
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(p->flags & PF_KTHREAD)) {
                memset(childregs, 0, sizeof(struct pt_regs));
                kthread_frame_init(frame, sp, arg);
                return 0;
@@ -172,6 +172,23 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
        task_user_gs(p) = get_user_gs(current_pt_regs());
 #endif
 
+       if (unlikely(p->flags & PF_IO_WORKER)) {
+               /*
+                * An IO thread is a user space thread, but it doesn't
+                * return to ret_after_fork().
+                *
+                * In order to indicate that to tools like gdb,
+                * we reset the stack and instruction pointers.
+                *
+                * It does the same kernel frame setup to return to a kernel
+                * function that a kernel thread does.
+                */
+               childregs->sp = 0;
+               childregs->ip = 0;
+               kthread_frame_init(frame, sp, arg);
+               return 0;
+       }
+
        /* Set a new TLS for the child thread? */
        if (clone_flags & CLONE_SETTLS)
                ret = set_new_tls(p, tls);
index eafc4d6..c589db5 100644 (file)
@@ -23,6 +23,8 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
 
 kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
                           vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM)        += vmx/sgx.o
+
 kvm-amd-y              += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)      += kvm.o
index c02466a..19606a3 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -28,7 +29,7 @@
  * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
  * aligned to sizeof(unsigned long) because it's not accessed via bitops.
  */
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 }
 
 #define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
 
 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
        struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -170,6 +172,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                vcpu->arch.guest_supported_xcr0 =
                        (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
 
+       /*
+        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+        * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+        * '1' even on CPUs that don't support XSAVE.
+        */
+       best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+       if (best) {
+               best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+               best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+               best->ecx |= XFEATURE_MASK_FPSSE;
+       }
+
        kvm_update_pv_runtime(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -347,13 +364,13 @@ out:
        return r;
 }
 
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
 {
        const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
        struct kvm_cpuid_entry2 entry;
 
        reverse_cpuid_check(leaf);
-       kvm_cpu_caps[leaf] &= mask;
 
        cpuid_count(cpuid.function, cpuid.index,
                    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -361,6 +378,27 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
        kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
+static __always_inline
+void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+{
+       /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+       BUILD_BUG_ON(leaf < NCAPINTS);
+
+       kvm_cpu_caps[leaf] = mask;
+
+       __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+       /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+       BUILD_BUG_ON(leaf >= NCAPINTS);
+
+       kvm_cpu_caps[leaf] &= mask;
+
+       __kvm_cpu_cap_mask(leaf);
+}
+
 void kvm_set_cpu_caps(void)
 {
        unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -371,12 +409,13 @@ void kvm_set_cpu_caps(void)
        unsigned int f_gbpages = 0;
        unsigned int f_lm = 0;
 #endif
+       memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
 
-       BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+       BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
                     sizeof(boot_cpu_data.x86_capability));
 
        memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
-              sizeof(kvm_cpu_caps));
+              sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
 
        kvm_cpu_cap_mask(CPUID_1_ECX,
                /*
@@ -407,7 +446,7 @@ void kvm_set_cpu_caps(void)
        );
 
        kvm_cpu_cap_mask(CPUID_7_0_EBX,
-               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+               F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
                F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
                F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
                F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -418,7 +457,8 @@ void kvm_set_cpu_caps(void)
                F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
                F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
                F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+               F(SGX_LC)
        );
        /* Set LA57 based on hardware capability. */
        if (cpuid_ecx(7) & F(LA57))
@@ -457,6 +497,10 @@ void kvm_set_cpu_caps(void)
                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
        );
 
+       kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+               SF(SGX1) | SF(SGX2)
+       );
+
        kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
                F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
                F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -514,6 +558,10 @@ void kvm_set_cpu_caps(void)
         */
        kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
 
+       kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
+               0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+               F(SME_COHERENT));
+
        kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
                F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
                F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
@@ -778,6 +826,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                        entry->edx = 0;
                }
                break;
+       case 0x12:
+               /* Intel SGX */
+               if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+                       entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                       break;
+               }
+
+               /*
+                * Index 0: Sub-features, MISCSELECT (a.k.a. extended features)
+                * and max enclave sizes.   The SGX sub-features and MISCSELECT
+                * are restricted by kernel and KVM capabilities (like most
+                * feature flags), while enclave size is unrestricted.
+                */
+               cpuid_entry_override(entry, CPUID_12_EAX);
+               entry->ebx &= SGX_MISC_EXINFO;
+
+               entry = do_host_cpuid(array, function, 1);
+               if (!entry)
+                       goto out;
+
+               /*
+                * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
+                * feature flags.  Advertise all supported flags, including
+                * privileged attributes that require explicit opt-in from
+                * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
+                * expected to derive it from supported XCR0.
+                */
+               entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+                             SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+                             SGX_ATTR_KSS;
+               entry->ebx &= 0;
+               break;
        /* Intel PT */
        case 0x14:
                if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
@@ -869,8 +949,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                break;
        /* Support memory encryption cpuid if host supports it */
        case 0x8000001F:
-               if (!boot_cpu_has(X86_FEATURE_SEV))
+               if (!kvm_cpu_cap_has(X86_FEATURE_SEV))
                        entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+               else
+                       cpuid_entry_override(entry, CPUID_8000_001F_EAX);
                break;
        /*Add support for Centaur's CPUID instruction*/
        case 0xC0000000:
index 2a0c506..c99edff 100644 (file)
@@ -3,11 +3,12 @@
 #define ARCH_X86_KVM_CPUID_H
 
 #include "x86.h"
+#include "reverse_cpuid.h"
 #include <asm/cpu.h>
 #include <asm/processor.h>
 #include <uapi/asm/kvm_para.h>
 
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -58,144 +59,8 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
        return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
 }
 
-struct cpuid_reg {
-       u32 function;
-       u32 index;
-       int reg;
-};
-
-static const struct cpuid_reg reverse_cpuid[] = {
-       [CPUID_1_EDX]         = {         1, 0, CPUID_EDX},
-       [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
-       [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
-       [CPUID_1_ECX]         = {         1, 0, CPUID_ECX},
-       [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
-       [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
-       [CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},
-       [CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},
-       [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
-       [CPUID_6_EAX]         = {         6, 0, CPUID_EAX},
-       [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
-       [CPUID_7_ECX]         = {         7, 0, CPUID_ECX},
-       [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
-       [CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
-       [CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
-};
-
-/*
- * Reverse CPUID and its derivatives can only be used for hardware-defined
- * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
- * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
- * is nonsensical as the bit number/mask is an arbitrary software-defined value
- * and can't be used by KVM to query/control guest capabilities.  And obviously
- * the leaf being queried must have an entry in the lookup table.
- */
-static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
-{
-       BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
-       BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
-       BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
-       BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
-       BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
-       BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
-}
-
-/*
- * Retrieve the bit mask from an X86_FEATURE_* definition.  Features contain
- * the hardware defined bit number (stored in bits 4:0) and a software defined
- * "word" (stored in bits 31:5).  The word is used to index into arrays of
- * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
- */
-static __always_inline u32 __feature_bit(int x86_feature)
-{
-       reverse_cpuid_check(x86_feature / 32);
-       return 1 << (x86_feature & 31);
-}
-
-#define feature_bit(name)  __feature_bit(X86_FEATURE_##name)
-
-static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
-{
-       unsigned int x86_leaf = x86_feature / 32;
-
-       reverse_cpuid_check(x86_leaf);
-       return reverse_cpuid[x86_leaf];
-}
-
-static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
-                                                 u32 reg)
-{
-       switch (reg) {
-       case CPUID_EAX:
-               return &entry->eax;
-       case CPUID_EBX:
-               return &entry->ebx;
-       case CPUID_ECX:
-               return &entry->ecx;
-       case CPUID_EDX:
-               return &entry->edx;
-       default:
-               BUILD_BUG();
-               return NULL;
-       }
-}
-
-static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
-                                               unsigned int x86_feature)
-{
-       const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
-
-       return __cpuid_entry_get_reg(entry, cpuid.reg);
-}
-
-static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
-                                          unsigned int x86_feature)
-{
-       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
-       return *reg & __feature_bit(x86_feature);
-}
-
-static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
-                                           unsigned int x86_feature)
-{
-       return cpuid_entry_get(entry, x86_feature);
-}
-
-static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
-                                             unsigned int x86_feature)
-{
-       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
-       *reg &= ~__feature_bit(x86_feature);
-}
-
-static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
-                                           unsigned int x86_feature)
-{
-       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
-       *reg |= __feature_bit(x86_feature);
-}
-
-static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
-                                              unsigned int x86_feature,
-                                              bool set)
-{
-       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
-       /*
-        * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
-        * compiler into using CMOV instead of Jcc when possible.
-        */
-       if (set)
-               *reg |= __feature_bit(x86_feature);
-       else
-               *reg &= ~__feature_bit(x86_feature);
-}
-
 static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
-                                                enum cpuid_leafs leaf)
+                                                unsigned int leaf)
 {
        u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
 
@@ -248,6 +113,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
                is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
 }
 
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 0, 0);
+       return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
 static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -308,7 +181,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
 
 static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -316,7 +189,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 
 static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -324,7 +197,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 
 static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
index cdd2a2b..77e1c89 100644 (file)
@@ -4220,7 +4220,7 @@ static bool valid_cr(int nr)
        }
 }
 
-static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+static int check_cr_access(struct x86_emulate_ctxt *ctxt)
 {
        if (!valid_cr(ctxt->modrm_reg))
                return emulate_ud(ctxt);
@@ -4228,80 +4228,6 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int check_cr_write(struct x86_emulate_ctxt *ctxt)
-{
-       u64 new_val = ctxt->src.val64;
-       int cr = ctxt->modrm_reg;
-       u64 efer = 0;
-
-       static u64 cr_reserved_bits[] = {
-               0xffffffff00000000ULL,
-               0, 0, 0, /* CR3 checked later */
-               CR4_RESERVED_BITS,
-               0, 0, 0,
-               CR8_RESERVED_BITS,
-       };
-
-       if (!valid_cr(cr))
-               return emulate_ud(ctxt);
-
-       if (new_val & cr_reserved_bits[cr])
-               return emulate_gp(ctxt, 0);
-
-       switch (cr) {
-       case 0: {
-               u64 cr4;
-               if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
-                   ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
-                       return emulate_gp(ctxt, 0);
-
-               cr4 = ctxt->ops->get_cr(ctxt, 4);
-               ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
-               if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
-                   !(cr4 & X86_CR4_PAE))
-                       return emulate_gp(ctxt, 0);
-
-               break;
-               }
-       case 3: {
-               u64 rsvd = 0;
-
-               ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-               if (efer & EFER_LMA) {
-                       u64 maxphyaddr;
-                       u32 eax, ebx, ecx, edx;
-
-                       eax = 0x80000008;
-                       ecx = 0;
-                       if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
-                                                &edx, true))
-                               maxphyaddr = eax & 0xff;
-                       else
-                               maxphyaddr = 36;
-                       rsvd = rsvd_bits(maxphyaddr, 63);
-                       if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
-                               rsvd &= ~X86_CR3_PCID_NOFLUSH;
-               }
-
-               if (new_val & rsvd)
-                       return emulate_gp(ctxt, 0);
-
-               break;
-               }
-       case 4: {
-               ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
-               if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
-                       return emulate_gp(ctxt, 0);
-
-               break;
-               }
-       }
-
-       return X86EMUL_CONTINUE;
-}
-
 static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
 {
        unsigned long dr7;
@@ -4841,10 +4767,10 @@ static const struct opcode twobyte_table[256] = {
        D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
        D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
        /* 0x20 - 0x2F */
-       DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+       DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access),
        DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
        IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
-                                               check_cr_write),
+                                               check_cr_access),
        IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
                                                check_dr_write),
        N, N, N, N,
index 2e11da2..3db5c42 100644 (file)
@@ -62,7 +62,12 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
        __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+/*
+ * The "raw" register helpers are only for cases where the full 64 bits of a
+ * register are read/written irrespective of current vCPU mode.  In other words,
+ * odds are good you shouldn't be using the raw variants.
+ */
+static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
 {
        if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
                return 0;
@@ -73,8 +78,8 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
        return vcpu->arch.regs[reg];
 }
 
-static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
-                                     unsigned long val)
+static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
+                                         unsigned long val)
 {
        if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
                return;
@@ -85,22 +90,22 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
 
 static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
 {
-       return kvm_register_read(vcpu, VCPU_REGS_RIP);
+       return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
 }
 
 static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
 {
-       kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+       kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
 }
 
 static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
 {
-       return kvm_register_read(vcpu, VCPU_REGS_RSP);
+       return kvm_register_read_raw(vcpu, VCPU_REGS_RSP);
 }
 
 static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
 {
-       kvm_register_write(vcpu, VCPU_REGS_RSP, val);
+       kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val);
 }
 
 static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
index cc369b9..152591f 100644 (file)
@@ -296,6 +296,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 
                atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
        }
+
+       /* Check if there are APF page ready requests pending */
+       if (enabled)
+               kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
 }
 
 static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -2261,6 +2265,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                if (value & MSR_IA32_APICBASE_ENABLE) {
                        kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
                        static_branch_slow_dec_deferred(&apic_hw_disabled);
+                       /* Check if there are APF page ready requests pending */
+                       kvm_make_request(KVM_REQ_APF_READY, vcpu);
                } else {
                        static_branch_inc(&apic_hw_disabled.key);
                        atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
@@ -2869,7 +2875,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
                return;
 
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        return;
                /*
index c68bfc3..88d0ed5 100644 (file)
@@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e)
        return ((2ULL << (e - s)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len);
 
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
        if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(root_hpa))
                return;
 
-       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
-                                vcpu->arch.mmu->shadow_root_level);
+       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+                                         vcpu->arch.mmu->shadow_root_level);
 }
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * write-protects guest page to sync the guest modification, b) another one is
  * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
  * between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
  * 2) the first case requires flushing tlb immediately avoiding corrupting
  *    shadow page table between all vcpus so it should be in the protection of
  *    mmu-lock. And the another case does not need to flush tlb until returning
@@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * So, there is the problem: the first case can meet the corrupted tlb caused
  * by another case which write-protects pages but without flush tlb
  * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set, it works since another case never touches MMU-writable bit.
  *
  * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
  * readonly, if that happens, we need to flush tlb. Fortunately,
  * mmu_spte_update() has already handled it perfectly.
  *
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
  * - if we want to see if it has writable tlb entry or if the spte can be
- *   writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ *   writable on the mmu mapping, check MMU-writable, this is the most
  *   case, otherwise
  * - if we fix page fault on the spte or do write-protection by dirty logging,
  *   check PT_WRITABLE_MASK.
index 62b1729..4b3ee24 100644 (file)
@@ -48,6 +48,7 @@
 #include <asm/memtype.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/set_memory.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
 #include "trace.h"
@@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void)
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
                           unsigned int access)
 {
-       u64 mask = make_mmio_spte(vcpu, gfn, access);
+       u64 spte = make_mmio_spte(vcpu, gfn, access);
 
-       trace_mark_mmio_spte(sptep, gfn, mask);
-       mmu_spte_set(sptep, mask);
+       trace_mark_mmio_spte(sptep, gfn, spte);
+       mmu_spte_set(sptep, spte);
 }
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte)
        return spte & shadow_mmio_access_mask;
 }
 
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
-                         kvm_pfn_t pfn, unsigned int access)
-{
-       if (unlikely(is_noslot_pfn(pfn))) {
-               mark_mmio_spte(vcpu, sptep, gfn, access);
-               return true;
-       }
-
-       return false;
-}
-
 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
        u64 kvm_gen, spte_gen, gen;
@@ -725,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
  * handling slots that are not large page aligned.
  */
 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
-                                             struct kvm_memory_slot *slot,
-                                             int level)
+               const struct kvm_memory_slot *slot, int level)
 {
        unsigned long idx;
 
@@ -1118,7 +1107,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
        rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        if (pt_protect)
-               spte &= ~SPTE_MMU_WRITEABLE;
+               spte &= ~shadow_mmu_writable_mask;
        spte = spte & ~PT_WRITABLE_MASK;
 
        return mmu_spte_update(sptep, spte);
@@ -1308,26 +1297,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        return flush;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                          unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                           pte_t unused)
 {
        return kvm_zap_rmapp(kvm, rmap_head, slot);
 }
 
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                            unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                             pte_t pte)
 {
        u64 *sptep;
        struct rmap_iterator iter;
        int need_flush = 0;
        u64 new_spte;
-       pte_t *ptep = (pte_t *)data;
        kvm_pfn_t new_pfn;
 
-       WARN_ON(pte_huge(*ptep));
-       new_pfn = pte_pfn(*ptep);
+       WARN_ON(pte_huge(pte));
+       new_pfn = pte_pfn(pte);
 
 restart:
        for_each_rmap_spte(rmap_head, &iter, sptep) {
@@ -1336,7 +1324,7 @@ restart:
 
                need_flush = 1;
 
-               if (pte_write(*ptep)) {
+               if (pte_write(pte)) {
                        pte_list_remove(rmap_head, sptep);
                        goto restart;
                } else {
@@ -1424,93 +1412,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
 
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
-                    unsigned long start,
-                    unsigned long end,
-                    unsigned long data,
-                    int (*handler)(struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head,
-                                   struct kvm_memory_slot *slot,
-                                   gfn_t gfn,
-                                   int level,
-                                   unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+                                                struct kvm_gfn_range *range,
+                                                rmap_handler_t handler)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        struct slot_rmap_walk_iterator iterator;
-       int ret = 0;
-       int i;
-
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-               slots = __kvm_memslots(kvm, i);
-               kvm_for_each_memslot(memslot, slots) {
-                       unsigned long hva_start, hva_end;
-                       gfn_t gfn_start, gfn_end;
+       bool ret = false;
 
-                       hva_start = max(start, memslot->userspace_addr);
-                       hva_end = min(end, memslot->userspace_addr +
-                                     (memslot->npages << PAGE_SHIFT));
-                       if (hva_start >= hva_end)
-                               continue;
-                       /*
-                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                        */
-                       gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-                       gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-                       for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
-                                                KVM_MAX_HUGEPAGE_LEVEL,
-                                                gfn_start, gfn_end - 1,
-                                                &iterator)
-                               ret |= handler(kvm, iterator.rmap, memslot,
-                                              iterator.gfn, iterator.level, data);
-               }
-       }
+       for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+                                range->start, range->end - 1, &iterator)
+               ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+                              iterator.level, range->pte);
 
        return ret;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         unsigned long data,
-                         int (*handler)(struct kvm *kvm,
-                                        struct kvm_rmap_head *rmap_head,
-                                        struct kvm_memory_slot *slot,
-                                        gfn_t gfn, int level,
-                                        unsigned long data))
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
-{
-       int r;
+       bool flush;
 
-       r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+       flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
        if (is_tdp_mmu_enabled(kvm))
-               r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+               flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
 
-       return r;
+       return flush;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int r;
+       bool flush;
 
-       r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+       flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
 
        if (is_tdp_mmu_enabled(kvm))
-               r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+               flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
 
-       return r;
+       return flush;
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                        struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                        unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                         pte_t unused)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1519,13 +1466,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        for_each_rmap_spte(rmap_head, &iter, sptep)
                young |= mmu_spte_age(sptep);
 
-       trace_kvm_age_page(gfn, level, slot, young);
        return young;
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                             struct kvm_memory_slot *slot, gfn_t gfn,
-                             int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level, pte_t unused)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1547,29 +1493,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
 
-       kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+       kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
        kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
                        KVM_PAGES_PER_HPAGE(sp->role.level));
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int young = false;
+       bool young;
+
+       young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
 
-       young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
        if (is_tdp_mmu_enabled(kvm))
-               young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+               young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
 
        return young;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int young = false;
+       bool young;
+
+       young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
 
-       young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
        if (is_tdp_mmu_enabled(kvm))
-               young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+               young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
 
        return young;
 }
@@ -2421,6 +2369,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 
        kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
 
+       /*
+        * Note, this check is intentionally soft, it only guarantees that one
+        * page is available, while the caller may end up allocating as many as
+        * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
+        * exceeding the (arbitrary by default) limit will not harm the host,
+        * being too aggressive may unnecessarily kill the guest, and getting an
+        * exact count is far more trouble than it's worth, especially in the
+        * page fault paths.
+        */
        if (!kvm_mmu_available_pages(vcpu->kvm))
                return -ENOSPC;
        return 0;
@@ -2561,9 +2518,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        struct kvm_mmu_page *sp;
        int ret;
 
-       if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
-               return 0;
-
        sp = sptep_to_sp(sptep);
 
        ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
@@ -2593,6 +2547,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
                 *sptep, write_fault, gfn);
 
+       if (unlikely(is_noslot_pfn(pfn))) {
+               mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+               return RET_PF_EMULATE;
+       }
+
        if (is_shadow_present_pte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2626,9 +2585,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
                                KVM_PAGES_PER_HPAGE(level));
 
-       if (unlikely(is_mmio_spte(*sptep)))
-               ret = RET_PF_EMULATE;
-
        /*
         * The fault is fully spurious if and only if the new SPTE and old SPTE
         * are identical, and emulation is not required.
@@ -2745,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-                                 struct kvm_memory_slot *slot)
+                                 const struct kvm_memory_slot *slot)
 {
        unsigned long hva;
        pte_t *pte;
@@ -2771,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
        return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-                             gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             kvm_pfn_t pfn, int max_level)
 {
        struct kvm_lpage_info *linfo;
 
@@ -2946,9 +2903,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                return true;
        }
 
-       if (unlikely(is_noslot_pfn(pfn)))
+       if (unlikely(is_noslot_pfn(pfn))) {
                vcpu_cache_mmio_info(vcpu, gva, gfn,
                                     access & shadow_mmio_access_mask);
+               /*
+                * If MMIO caching is disabled, emulate immediately without
+                * touching the shadow page tables as attempting to install an
+                * MMIO SPTE will just be an expensive nop.
+                */
+               if (unlikely(!shadow_mmio_value)) {
+                       *ret_val = RET_PF_EMULATE;
+                       return true;
+               }
+       }
 
        return false;
 }
@@ -3061,6 +3028,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                        if (!is_shadow_present_pte(spte))
                                break;
 
+               if (!is_shadow_present_pte(spte))
+                       break;
+
                sp = sptep_to_sp(iterator.sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;
@@ -3150,12 +3120,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 
        sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-       if (kvm_mmu_put_root(kvm, sp)) {
-               if (is_tdp_mmu_page(sp))
-                       kvm_tdp_mmu_free_root(kvm, sp);
-               else if (sp->role.invalid)
-                       kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
-       }
+       if (is_tdp_mmu_page(sp))
+               kvm_tdp_mmu_put_root(kvm, sp, false);
+       else if (!--sp->root_count && sp->role.invalid)
+               kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
        *root_hpa = INVALID_PAGE;
 }
@@ -3193,14 +3161,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
                    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
                        mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
-               } else {
-                       for (i = 0; i < 4; ++i)
-                               if (mmu->pae_root[i] != 0)
-                                       mmu_free_root_page(kvm,
-                                                          &mmu->pae_root[i],
-                                                          &invalid_list);
-                       mmu->root_hpa = INVALID_PAGE;
+               } else if (mmu->pae_root) {
+                       for (i = 0; i < 4; ++i) {
+                               if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+                                       continue;
+
+                               mmu_free_root_page(kvm, &mmu->pae_root[i],
+                                                  &invalid_list);
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
+                       }
                }
+               mmu->root_hpa = INVALID_PAGE;
                mmu->root_pgd = 0;
        }
 
@@ -3226,155 +3197,208 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 {
        struct kvm_mmu_page *sp;
 
-       write_lock(&vcpu->kvm->mmu_lock);
-
-       if (make_mmu_pages_available(vcpu)) {
-               write_unlock(&vcpu->kvm->mmu_lock);
-               return INVALID_PAGE;
-       }
        sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
        ++sp->root_count;
 
-       write_unlock(&vcpu->kvm->mmu_lock);
        return __pa(sp->spt);
 }
 
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
-       u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u8 shadow_root_level = mmu->shadow_root_level;
        hpa_t root;
        unsigned i;
+       int r;
+
+       write_lock(&vcpu->kvm->mmu_lock);
+       r = make_mmu_pages_available(vcpu);
+       if (r < 0)
+               goto out_unlock;
 
        if (is_tdp_mmu_enabled(vcpu->kvm)) {
                root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               mmu->root_hpa = root;
        } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
-               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
-                                     true);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+               mmu->root_hpa = root;
        } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+               if (WARN_ON_ONCE(!mmu->pae_root)) {
+                       r = -EIO;
+                       goto out_unlock;
+               }
+
                for (i = 0; i < 4; ++i) {
-                       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+                       WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
                        root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
                                              i << 30, PT32_ROOT_LEVEL, true);
-                       if (!VALID_PAGE(root))
-                               return -ENOSPC;
-                       vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+                       mmu->pae_root[i] = root | PT_PRESENT_MASK |
+                                          shadow_me_mask;
                }
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
-       } else
-               BUG();
+               mmu->root_hpa = __pa(mmu->pae_root);
+       } else {
+               WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+               r = -EIO;
+               goto out_unlock;
+       }
 
        /* root_pgd is ignored for direct MMUs. */
-       vcpu->arch.mmu->root_pgd = 0;
-
-       return 0;
+       mmu->root_pgd = 0;
+out_unlock:
+       write_unlock(&vcpu->kvm->mmu_lock);
+       return r;
 }
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-       u64 pdptr, pm_mask;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 pdptrs[4], pm_mask;
        gfn_t root_gfn, root_pgd;
        hpa_t root;
-       int i;
+       unsigned i;
+       int r;
 
-       root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+       root_pgd = mmu->get_guest_pgd(vcpu);
        root_gfn = root_pgd >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
 
+       /*
+        * On SVM, reading PDPTRs might access guest memory, which might fault
+        * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
+        */
+       if (mmu->root_level == PT32E_ROOT_LEVEL) {
+               for (i = 0; i < 4; ++i) {
+                       pdptrs[i] = mmu->get_pdptr(vcpu, i);
+                       if (!(pdptrs[i] & PT_PRESENT_MASK))
+                               continue;
+
+                       if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+                               return 1;
+               }
+       }
+
+       write_lock(&vcpu->kvm->mmu_lock);
+       r = make_mmu_pages_available(vcpu);
+       if (r < 0)
+               goto out_unlock;
+
        /*
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+       if (mmu->root_level >= PT64_ROOT_4LEVEL) {
                root = mmu_alloc_root(vcpu, root_gfn, 0,
-                                     vcpu->arch.mmu->shadow_root_level, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+                                     mmu->shadow_root_level, false);
+               mmu->root_hpa = root;
                goto set_root_pgd;
        }
 
+       if (WARN_ON_ONCE(!mmu->pae_root)) {
+               r = -EIO;
+               goto out_unlock;
+       }
+
        /*
         * We shadow a 32 bit page table. This may be a legacy 2-level
         * or a PAE 3-level page table. In either case we need to be aware that
         * the shadow page table may be a PAE or a long mode page table.
         */
-       pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+       pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
+               if (WARN_ON_ONCE(!mmu->lm_root)) {
+                       r = -EIO;
+                       goto out_unlock;
+               }
+
+               mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+       }
+
        for (i = 0; i < 4; ++i) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
-               if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
-                       if (!(pdptr & PT_PRESENT_MASK)) {
-                               vcpu->arch.mmu->pae_root[i] = 0;
+               WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+               if (mmu->root_level == PT32E_ROOT_LEVEL) {
+                       if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
                                continue;
                        }
-                       root_gfn = pdptr >> PAGE_SHIFT;
-                       if (mmu_check_root(vcpu, root_gfn))
-                               return 1;
+                       root_gfn = pdptrs[i] >> PAGE_SHIFT;
                }
 
                root = mmu_alloc_root(vcpu, root_gfn, i << 30,
                                      PT32_ROOT_LEVEL, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+               mmu->pae_root[i] = root | pm_mask;
        }
-       vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+               mmu->root_hpa = __pa(mmu->lm_root);
+       else
+               mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+       mmu->root_pgd = root_pgd;
+out_unlock:
+       write_unlock(&vcpu->kvm->mmu_lock);
+
+       return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 *lm_root, *pae_root;
 
        /*
-        * If we shadow a 32 bit page table with a long mode page
-        * table we enter this path.
+        * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+        * tables are allocated and initialized at root creation as there is no
+        * equivalent level in the guest's NPT to shadow.  Allocate the tables
+        * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
         */
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
-               if (vcpu->arch.mmu->lm_root == NULL) {
-                       /*
-                        * The additional page necessary for this is only
-                        * allocated on demand.
-                        */
+       if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+           mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+               return 0;
 
-                       u64 *lm_root;
+       /*
+        * This mess only works with 4-level paging and needs to be updated to
+        * work with 5-level paging.
+        */
+       if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+               return -EIO;
 
-                       lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-                       if (lm_root == NULL)
-                               return 1;
+       if (mmu->pae_root && mmu->lm_root)
+               return 0;
 
-                       lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+       /*
+        * The special roots should always be allocated in concert.  Yell and
+        * bail if KVM ends up in a state where only one of the roots is valid.
+        */
+       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+               return -EIO;
 
-                       vcpu->arch.mmu->lm_root = lm_root;
-               }
+       /*
+        * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+        * doesn't need to be decrypted.
+        */
+       pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!pae_root)
+               return -ENOMEM;
 
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+       lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!lm_root) {
+               free_page((unsigned long)pae_root);
+               return -ENOMEM;
        }
 
-set_root_pgd:
-       vcpu->arch.mmu->root_pgd = root_pgd;
+       mmu->pae_root = pae_root;
+       mmu->lm_root = lm_root;
 
        return 0;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.mmu->direct_map)
-               return mmu_alloc_direct_roots(vcpu);
-       else
-               return mmu_alloc_shadow_roots(vcpu);
-}
-
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
        int i;
@@ -3422,7 +3446,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        mmu_sync_children(vcpu, sp);
@@ -3554,11 +3578,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                            __is_rsvd_bits_set(rsvd_check, sptes[level], level);
 
        if (reserved) {
-               pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+               pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
                       __func__, addr);
                for (level = root; level >= leaf; level--)
-                       pr_err("------ spte 0x%llx level %d.\n",
-                              sptes[level], level);
+                       pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+                              sptes[level], level,
+                              rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
        }
 
        return reserved;
@@ -3653,6 +3678,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
 
+       /*
+        * Retry the page fault if the gfn hit a memslot that is being deleted
+        * or moved.  This ensures any existing SPTEs for the old memslot will
+        * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+        */
+       if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+               return true;
+
        /* Don't expose private memslots to L2. */
        if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
                *pfn = KVM_PFN_NOSLOT;
@@ -4615,12 +4648,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
        struct kvm_mmu *context = &vcpu->arch.guest_mmu;
        union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
 
-       context->shadow_root_level = new_role.base.level;
-
        __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
 
-       if (new_role.as_u64 != context->mmu_role.as_u64)
+       if (new_role.as_u64 != context->mmu_role.as_u64) {
                shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+               /*
+                * Override the level set by the common init helper, nested TDP
+                * always uses the host's TDP configuration.
+                */
+               context->shadow_root_level = new_role.base.level;
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
@@ -4802,16 +4840,23 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
        if (r)
                goto out;
-       r = mmu_alloc_roots(vcpu);
-       kvm_mmu_sync_roots(vcpu);
+       r = mmu_alloc_special_roots(vcpu);
+       if (r)
+               goto out;
+       if (vcpu->arch.mmu->direct_map)
+               r = mmu_alloc_direct_roots(vcpu);
+       else
+               r = mmu_alloc_shadow_roots(vcpu);
        if (r)
                goto out;
+
+       kvm_mmu_sync_roots(vcpu);
+
        kvm_mmu_load_pgd(vcpu);
        static_call(kvm_x86_tlb_flush_current)(vcpu);
 out:
        return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
@@ -4820,7 +4865,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
        WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static bool need_remote_flush(u64 old, u64 new)
 {
@@ -5169,10 +5213,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        slot_level_handler fn, int start_level, int end_level,
-                       gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+                       gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+                       bool flush)
 {
        struct slot_rmap_walk_iterator iterator;
-       bool flush = false;
 
        for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
                        end_gfn, &iterator) {
@@ -5180,7 +5224,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        flush |= fn(kvm, iterator.rmap, memslot);
 
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-                       if (flush && lock_flush_tlb) {
+                       if (flush && flush_on_yield) {
                                kvm_flush_remote_tlbs_with_address(kvm,
                                                start_gfn,
                                                iterator.gfn - start_gfn + 1);
@@ -5190,36 +5234,32 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                }
        }
 
-       if (flush && lock_flush_tlb) {
-               kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
-                                                  end_gfn - start_gfn + 1);
-               flush = false;
-       }
-
        return flush;
 }
 
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                  slot_level_handler fn, int start_level, int end_level,
-                 bool lock_flush_tlb)
+                 bool flush_on_yield)
 {
        return slot_handle_level_range(kvm, memslot, fn, start_level,
                        end_level, memslot->base_gfn,
                        memslot->base_gfn + memslot->npages - 1,
-                       lock_flush_tlb);
+                       flush_on_yield, false);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                slot_level_handler fn, bool lock_flush_tlb)
+                slot_level_handler fn, bool flush_on_yield)
 {
        return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
-                                PG_LEVEL_4K, lock_flush_tlb);
+                                PG_LEVEL_4K, flush_on_yield);
 }
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
 {
+       if (!tdp_enabled && mmu->pae_root)
+               set_memory_encrypted((unsigned long)mmu->pae_root, 1);
        free_page((unsigned long)mmu->pae_root);
        free_page((unsigned long)mmu->lm_root);
 }
@@ -5240,9 +5280,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
         * while the PDP table is a per-vCPU construct that's allocated at MMU
         * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
         * x86_64.  Therefore we need to allocate the PDP table in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.  Except for
-        * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
-        * skip allocating the PDP table.
+        * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
+        * generally doesn't use PAE paging and can skip allocating the PDP
+        * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
+        * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+        * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
         */
        if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
                return 0;
@@ -5252,8 +5294,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
                return -ENOMEM;
 
        mmu->pae_root = page_address(page);
+
+       /*
+        * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+        * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
+        * that KVM's writes and the CPU's reads get along.  Note, this is
+        * only necessary when using shadow paging, as 64-bit NPT can get at
+        * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+        * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+        */
+       if (!tdp_enabled)
+               set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+       else
+               WARN_ON_ONCE(shadow_me_mask);
+
        for (i = 0; i < 4; ++i)
-               mmu->pae_root[i] = INVALID_PAGE;
+               mmu->pae_root[i] = INVALID_PAE_ROOT;
 
        return 0;
 }
@@ -5365,6 +5421,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
         */
        kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
+       /* In order to ensure all threads see this change when
+        * handling the MMU reload signal, this must happen in the
+        * same critical section as kvm_reload_remote_mmus, and
+        * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
+        * could drop the MMU lock and yield.
+        */
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_invalidate_all_roots(kvm);
+
        /*
         * Notify all vcpus to reload its shadow page table and flush TLB.
         * Then all vcpus will switch to new shadow page table with the new
@@ -5377,10 +5442,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
        kvm_zap_obsolete_pages(kvm);
 
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_all(kvm);
-
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               kvm_tdp_mmu_zap_invalidated_roots(kvm);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5420,7 +5488,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int i;
-       bool flush;
+       bool flush = false;
 
        write_lock(&kvm->mmu_lock);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -5433,20 +5501,31 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                        if (start >= end)
                                continue;
 
-                       slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
-                                               PG_LEVEL_4K,
-                                               KVM_MAX_HUGEPAGE_LEVEL,
-                                               start, end - 1, true);
+                       flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+                                                       PG_LEVEL_4K,
+                                                       KVM_MAX_HUGEPAGE_LEVEL,
+                                                       start, end - 1, true, flush);
                }
        }
 
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+       write_unlock(&kvm->mmu_lock);
+
        if (is_tdp_mmu_enabled(kvm)) {
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+                                                         gfn_end, flush, true);
                if (flush)
-                       kvm_flush_remote_tlbs(kvm);
-       }
+                       kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+                                                          gfn_end);
 
-       write_unlock(&kvm->mmu_lock);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5465,10 +5544,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
        write_lock(&kvm->mmu_lock);
        flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
                                start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
        write_unlock(&kvm->mmu_lock);
 
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+               read_unlock(&kvm->mmu_lock);
+       }
+
        /*
         * We can flush all the TLBs out of the mmu lock without TLB
         * corruption since we just change the spte from writable to
@@ -5476,9 +5559,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
         * spte from present to present (changing the spte from present
         * to nonpresent will flush all the TLBs immediately), in other
         * words, the only case we care is mmu_spte_update() where we
-        * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
-        * instead of PT_WRITABLE_MASK, that means it does not depend
-        * on PT_WRITABLE_MASK anymore.
+        * have checked Host-writable | MMU-writable instead of
+        * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+        * anymore.
         */
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5529,21 +5612,32 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 {
        /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
        struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+       bool flush;
 
        write_lock(&kvm->mmu_lock);
-       slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+       flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+       if (flush)
+               kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+               if (flush)
+                       kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
        /*
         * All current use cases for flushing the TLBs for a specific memslot
-        * are related to dirty logging, and do the TLB flush out of mmu_lock.
+        * are related to dirty logging, and many do the TLB flush out of mmu_lock.
         * The interaction between the various operations on memslot must be
         * serialized by slots_locks to ensure the TLB flush from one operation
         * is observed by any other operation on the same memslot.
@@ -5560,10 +5654,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 
        write_lock(&kvm->mmu_lock);
        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
        write_unlock(&kvm->mmu_lock);
 
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+               read_unlock(&kvm->mmu_lock);
+       }
+
        /*
         * It's also safe to flush TLBs out of mmu lock here as currently this
         * function is only used for dirty logging, in which case flushing TLB
@@ -5701,25 +5799,6 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
-static void kvm_set_mmio_spte_mask(void)
-{
-       u64 mask;
-
-       /*
-        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
-        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
-        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
-        * 52-bit physical addresses then there are no reserved PA bits in the
-        * PTEs and so the reserved PA approach must be disabled.
-        */
-       if (shadow_phys_bits < 52)
-               mask = BIT_ULL(51) | PT_PRESENT_MASK;
-       else
-               mask = 0;
-
-       kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
 static bool get_nx_auto_mode(void)
 {
        /* Return true when CPU has the bug, and mitigations are ON */
@@ -5785,8 +5864,6 @@ int kvm_mmu_module_init(void)
 
        kvm_mmu_reset_all_pte_masks();
 
-       kvm_set_mmio_spte_mask();
-
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
                                            0, SLAB_ACCOUNT, NULL);
index ced15fd..cedc17b 100644 (file)
@@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        __mmu_spte_walk(vcpu, sp, fn, 2);
index 3609838..d64ccb4 100644 (file)
@@ -20,6 +20,16 @@ extern bool dbg;
 #define MMU_WARN_ON(x) do { } while (0)
 #endif
 
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set.  Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT       0
+#define IS_VALID_PAE_ROOT(x)   (!!(x))
+
 struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
@@ -40,7 +50,11 @@ struct kvm_mmu_page {
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-       int root_count;          /* Currently serving as active root */
+       /* Currently serving as active root */
+       union {
+               int root_count;
+               refcount_t tdp_mmu_root_count;
+       };
        unsigned int unsync_children;
        struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
        DECLARE_BITMAP(unsync_child_bitmap, 512);
@@ -78,9 +92,14 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
        return to_shadow_page(__pa(sptep));
 }
 
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+       return role.smm ? 1 : 0;
+}
+
 static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 {
-       return sp->role.smm ? 1 : 0;
+       return kvm_mmu_role_as_id(sp->role);
 }
 
 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
@@ -108,22 +127,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                                        u64 start_gfn, u64 pages);
 
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       BUG_ON(!sp->root_count);
-       lockdep_assert_held(&kvm->mmu_lock);
-
-       ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       lockdep_assert_held(&kvm->mmu_lock);
-       --sp->root_count;
-
-       return !sp->root_count;
-}
-
 /*
  * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
  *
@@ -146,8 +149,9 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
 #define SET_SPTE_SPURIOUS              BIT(2)
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-                             gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
                            int max_level, kvm_pfn_t *pfnp,
                            bool huge_page_disallowed, int *req_level);
index 55d7b47..70b7e44 100644 (file)
@@ -503,6 +503,7 @@ error:
 #endif
        walker->fault.address = addr;
        walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+       walker->fault.async_page_fault = false;
 
        trace_kvm_mmu_walker_error(walker->fault.error_code);
        return 0;
@@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                nr_present++;
 
-               host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+               host_writable = sp->spt[i] & shadow_host_writable_mask;
 
                set_spte_ret |= set_spte(vcpu, &sp->spt[i],
                                         pte_access, PG_LEVEL_4K,
index ef55f0b..66d43ce 100644 (file)
 #include "spte.h"
 
 #include <asm/e820/api.h>
+#include <asm/vmx.h>
 
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
 u64 __read_mostly shadow_nx_mask;
 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
 u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
@@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
        u64 mask;
 
        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
-       BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
        mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
 {
        u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
-       u64 mask = generation_mmio_spte_mask(gen);
+       u64 spte = generation_mmio_spte_mask(gen);
        u64 gpa = gfn << PAGE_SHIFT;
 
+       WARN_ON_ONCE(!shadow_mmio_value);
+
        access &= shadow_mmio_access_mask;
-       mask |= shadow_mmio_value | access;
-       mask |= gpa | shadow_nonpresent_or_rsvd_mask;
-       mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+       spte |= shadow_mmio_value | access;
+       spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+       spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
                << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
 
-       return mask;
+       return spte;
 }
 
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                     bool can_unsync, bool host_writable, bool ad_disabled,
                     u64 *new_spte)
 {
-       u64 spte = 0;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
        int ret = 0;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else if (kvm_vcpu_ad_need_write_protect(vcpu))
-               spte |= SPTE_AD_WRPROT_ONLY_MASK;
+               spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+       /*
+        * Bits 62:52 of PAE SPTEs are reserved.  WARN if said bits are set
+        * if PAE paging may be employed (shadow paging or any 32-bit KVM).
+        */
+       WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+                    (spte & SPTE_TDP_AD_MASK));
 
        /*
         * For the EPT case, shadow_present_mask is 0 if hardware
@@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                        kvm_is_mmio_pfn(pfn));
 
        if (host_writable)
-               spte |= SPTE_HOST_WRITEABLE;
+               spte |= shadow_host_writable_mask;
        else
                pte_access &= ~ACC_WRITE_MASK;
 
@@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
        spte |= (u64)pfn << PAGE_SHIFT;
 
        if (pte_access & ACC_WRITE_MASK) {
-               spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+               spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
 
                /*
                 * Optimization: for pte sync, if spte was writable the hash
@@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                                 __func__, gfn);
                        ret |= SET_SPTE_WRITE_PROTECTED_PT;
                        pte_access &= ~ACC_WRITE_MASK;
-                       spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
                }
        }
 
@@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                spte = mark_spte_for_access_track(spte);
 
 out:
+       WARN_ON(is_mmio_spte(spte));
        *new_spte = spte;
        return ret;
 }
 
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 {
-       u64 spte;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
 
-       spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask | shadow_me_mask;
+       spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+               shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else
                spte |= shadow_accessed_mask;
 
@@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
        new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
        new_spte &= ~PT_WRITABLE_MASK;
-       new_spte &= ~SPTE_HOST_WRITEABLE;
+       new_spte &= ~shadow_host_writable_mask;
 
        new_spte = mark_spte_for_access_track(new_spte);
 
@@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte)
        return spte;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
 {
        BUG_ON((u64)(unsigned)access_mask != access_mask);
-       WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
        WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
-       shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+       if (!enable_mmio_caching)
+               mmio_value = 0;
+
+       /*
+        * Disable MMIO caching if the MMIO value collides with the bits that
+        * are used to hold the relocated GFN when the L1TF mitigation is
+        * enabled.  This should never fire as there is no known hardware that
+        * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+        * MMIO value are not susceptible to L1TF.
+        */
+       if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+                                 SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+               mmio_value = 0;
+
+       /*
+        * The masked MMIO value must obviously match itself and a removed SPTE
+        * must not get a false positive.  Removed SPTEs and MMIO SPTEs should
+        * never collide as MMIO must set some RWX bits, and removed SPTEs must
+        * not set any RWX bits.
+        */
+       if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+           WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+               mmio_value = 0;
+
+       shadow_mmio_value = mmio_value;
+       shadow_mmio_mask  = mmio_mask;
        shadow_mmio_access_mask = access_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- *  - Setting either @accessed_mask or @dirty_mask requires setting both
- *  - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 {
-       BUG_ON(!dirty_mask != !accessed_mask);
-       BUG_ON(!accessed_mask && !acc_track_mask);
-       BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
-       shadow_user_mask = user_mask;
-       shadow_accessed_mask = accessed_mask;
-       shadow_dirty_mask = dirty_mask;
-       shadow_nx_mask = nx_mask;
-       shadow_x_mask = x_mask;
-       shadow_present_mask = p_mask;
-       shadow_acc_track_mask = acc_track_mask;
-       shadow_me_mask = me_mask;
+       shadow_user_mask        = VMX_EPT_READABLE_MASK;
+       shadow_accessed_mask    = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+       shadow_dirty_mask       = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+       shadow_nx_mask          = 0ull;
+       shadow_x_mask           = VMX_EPT_EXECUTABLE_MASK;
+       shadow_present_mask     = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+       shadow_acc_track_mask   = VMX_EPT_RWX_MASK;
+       shadow_me_mask          = 0ull;
+
+       shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+       shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
+
+       /*
+        * EPT Misconfigurations are generated if the value of bits 2:0
+        * of an EPT paging-structure entry is 110b (write/execute).
+        */
+       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+                                  VMX_EPT_RWX_MASK, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
 
 void kvm_mmu_reset_all_pte_masks(void)
 {
        u8 low_phys_bits;
-
-       shadow_user_mask = 0;
-       shadow_accessed_mask = 0;
-       shadow_dirty_mask = 0;
-       shadow_nx_mask = 0;
-       shadow_x_mask = 0;
-       shadow_present_mask = 0;
-       shadow_acc_track_mask = 0;
+       u64 mask;
 
        shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void)
 
        shadow_nonpresent_or_rsvd_lower_gfn_mask =
                GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+       shadow_user_mask        = PT_USER_MASK;
+       shadow_accessed_mask    = PT_ACCESSED_MASK;
+       shadow_dirty_mask       = PT_DIRTY_MASK;
+       shadow_nx_mask          = PT64_NX_MASK;
+       shadow_x_mask           = 0;
+       shadow_present_mask     = PT_PRESENT_MASK;
+       shadow_acc_track_mask   = 0;
+       shadow_me_mask          = sme_me_mask;
+
+       shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+       shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
+
+       /*
+        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
+        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+        * 52-bit physical addresses then there are no reserved PA bits in the
+        * PTEs and so the reserved PA approach must be disabled.
+        */
+       if (shadow_phys_bits < 52)
+               mask = BIT_ULL(51) | PT_PRESENT_MASK;
+       else
+               mask = 0;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
index 6de3950..bca0ba1 100644 (file)
@@ -5,18 +5,33 @@
 
 #include "mmu_internal.h"
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+.  MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK          BIT_ULL(11)
 
 /*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE.  This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
  */
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT              52
+#define SPTE_TDP_AD_MASK               (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK       (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK      (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK   (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 
 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.  This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+                                         PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK    (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+                                        SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE         BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE          BIT_ULL(58)
 
-#define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
 
 /*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
  * the memslots generation and is derived as follows:
  *
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
  *
  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
  * the MMIO generation number, as doing so would require stealing a bit from
  */
 
 #define MMIO_SPTE_GEN_LOW_START                3
-#define MMIO_SPTE_GEN_LOW_END          11
+#define MMIO_SPTE_GEN_LOW_END          10
 
-#define MMIO_SPTE_GEN_HIGH_START       PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START       52
 #define MMIO_SPTE_GEN_HIGH_END         62
 
 #define MMIO_SPTE_GEN_LOW_MASK         GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
                                                    MMIO_SPTE_GEN_LOW_START)
 #define MMIO_SPTE_GEN_HIGH_MASK                GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
                                                    MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+               (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
 
 #define MMIO_SPTE_GEN_LOW_BITS         (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
 #define MMIO_SPTE_GEN_HIGH_BITS                (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
 
 /* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
 
 #define MMIO_SPTE_GEN_LOW_SHIFT                (MMIO_SPTE_GEN_LOW_START - 0)
 #define MMIO_SPTE_GEN_HIGH_SHIFT       (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
 
 #define MMIO_SPTE_GEN_MASK             GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
 
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
 extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
 extern u64 __read_mostly shadow_me_mask;
 
 /*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
  * pages.
  */
@@ -120,29 +170,22 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
  */
 #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
 
-/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
-                                         PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
 /*
  * If a thread running without exclusive control of the MMU lock must perform a
  * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
  * non-present intermediate value. Other threads which encounter this value
  * should not modify the SPTE.
  *
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability.  Use only low bits to avoid 64-bit immediates.
  *
  * Only used by the TDP MMU.
  */
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE   0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
 
 static inline bool is_removed_spte(u64 spte)
 {
@@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits;
 
 static inline bool is_mmio_spte(u64 spte)
 {
-       return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+       return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+              likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+       return !!(pte & SPTE_MMU_PRESENT_MASK);
 }
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 
 static inline bool spte_ad_enabled(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
 }
 
 static inline bool spte_ad_need_write_protect(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       /*
+        * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+        * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
+        * TDP and do the A/D type check unconditionally.
+        */
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
 }
 
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 }
 
 static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 }
 
@@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte)
        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
-static inline bool is_shadow_present_pte(u64 pte)
-{
-       return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
 static inline bool is_large_pte(u64 pte)
 {
        return pte & PT_PAGE_SIZE_MASK;
@@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte)
 
 static inline bool spte_can_locklessly_be_made_writable(u64 spte)
 {
-       return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
-               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+       return (spte & shadow_host_writable_mask) &&
+              (spte & shadow_mmu_writable_mask);
 }
 
 static inline u64 get_mmio_spte_generation(u64 spte)
index 34207b8..88f69a6 100644 (file)
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+                                                            bool shared)
+{
+       if (shared)
+               lockdep_assert_held_read(&kvm->mmu_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
        if (!kvm->arch.tdp_mmu_enabled)
@@ -41,32 +50,85 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
        rcu_barrier();
 }
 
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-       if (kvm_mmu_put_root(kvm, root))
-               kvm_tdp_mmu_free_root(kvm, root);
+       free_page((unsigned long)sp->spt);
+       kmem_cache_free(mmu_page_header_cache, sp);
 }
 
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
-                                          struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 {
-       lockdep_assert_held_write(&kvm->mmu_lock);
+       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+                                              rcu_head);
 
-       if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
-               return false;
+       tdp_mmu_free_sp(sp);
+}
 
-       kvm_mmu_get_root(kvm, root);
-       return true;
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared)
+{
+       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
+       if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+               return;
+
+       WARN_ON(!root->tdp_mmu_page);
+
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       list_del_rcu(&root->link);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+       zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+
+       call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
 
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-                                                    struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+                                             struct kvm_mmu_page *prev_root,
+                                             bool shared)
 {
        struct kvm_mmu_page *next_root;
 
-       next_root = list_next_entry(root, link);
-       tdp_mmu_put_root(kvm, root);
+       rcu_read_lock();
+
+       if (prev_root)
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &prev_root->link,
+                                                 typeof(*prev_root), link);
+       else
+               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                  typeof(*next_root), link);
+
+       while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                               &next_root->link, typeof(*next_root), link);
+
+       rcu_read_unlock();
+
+       if (prev_root)
+               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
        return next_root;
 }
 
@@ -75,35 +137,24 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * This makes it safe to release the MMU lock and yield within the loop, but
  * if exiting the loop early, the caller must drop the reference to the most
  * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
  */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                          \
-       for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
-                                     typeof(*_root), link);            \
-            tdp_mmu_next_root_valid(_kvm, _root);                      \
-            _root = tdp_mmu_next_root(_kvm, _root))
-
-#define for_each_tdp_mmu_root(_kvm, _root)                             \
-       list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield, bool flush);
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
-       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
-       lockdep_assert_held_write(&kvm->mmu_lock);
-
-       WARN_ON(root->root_count);
-       WARN_ON(!root->tdp_mmu_page);
-
-       list_del(&root->link);
-
-       zap_gfn_range(kvm, root, 0, max_gfn, false, false);
-
-       free_page((unsigned long)root->spt);
-       kmem_cache_free(mmu_page_header_cache, root);
-}
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
+            _root;                                                     \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+               } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                             \
+       list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
+                               lockdep_is_held_type(&kvm->mmu_lock, 0) ||      \
+                               lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+               } else
 
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
@@ -137,81 +188,46 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
        return sp;
 }
 
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;
 
-       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
-       write_lock(&kvm->mmu_lock);
+       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
        /* Check for an existing root before allocating a new one. */
-       for_each_tdp_mmu_root(kvm, root) {
-               if (root->role.word == role.word) {
-                       kvm_mmu_get_root(kvm, root);
-                       write_unlock(&kvm->mmu_lock);
-                       return root;
-               }
+       for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+               if (root->role.word == role.word &&
+                   kvm_tdp_mmu_get_root(kvm, root))
+                       goto out;
        }
 
        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
-       root->root_count = 1;
-
-       list_add(&root->link, &kvm->arch.tdp_mmu_roots);
-
-       write_unlock(&kvm->mmu_lock);
-
-       return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *root;
+       refcount_set(&root->tdp_mmu_root_count, 1);
 
-       root = get_tdp_mmu_vcpu_root(vcpu);
-       if (!root)
-               return INVALID_PAGE;
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
+out:
        return __pa(root->spt);
 }
 
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-       free_page((unsigned long)sp->spt);
-       kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
-       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
-                                              rcu_head);
-
-       tdp_mmu_free_sp(sp);
-}
-
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);
 
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 {
-       bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;
 
        if (is_accessed_spte(old_spte) &&
-           (!is_accessed_spte(new_spte) || pfn_changed))
+           (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+            spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
 
@@ -455,7 +471,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
 
        if (was_leaf && is_dirty_spte(old_spte) &&
-           (!is_dirty_spte(new_spte) || pfn_changed))
+           (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
        /*
@@ -479,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 }
 
 /*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -488,9 +505,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
  * Returns: true if the SPTE was set, false if it was not. If false is returned,
  *         this function will have no side-effects.
  */
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-                                          struct tdp_iter *iter,
-                                          u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+                                                       struct tdp_iter *iter,
+                                                       u64 new_spte)
 {
        lockdep_assert_held_read(&kvm->mmu_lock);
 
@@ -498,19 +515,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
-       if (iter->old_spte == REMOVED_SPTE)
+       if (is_removed_spte(iter->old_spte))
                return false;
 
        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
                      new_spte) != iter->old_spte)
                return false;
 
-       handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-                           new_spte, iter->level, true);
+       __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+                             new_spte, iter->level, true);
+       handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
 
        return true;
 }
 
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+                                          struct tdp_iter *iter,
+                                          u64 new_spte)
+{
+       if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+               return false;
+
+       handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+                                     iter->old_spte, new_spte, iter->level);
+       return true;
+}
+
 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
                                           struct tdp_iter *iter)
 {
@@ -569,7 +599,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
         * should be used. If operating under the MMU lock in write mode, the
         * use of the removed SPTE should not be necessary.
         */
-       WARN_ON(iter->old_spte == REMOVED_SPTE);
+       WARN_ON(is_removed_spte(iter->old_spte));
 
        WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
@@ -634,7 +664,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
  * Return false if a yield was not needed.
  */
 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-                                            struct tdp_iter *iter, bool flush)
+                                            struct tdp_iter *iter, bool flush,
+                                            bool shared)
 {
        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -646,7 +677,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
 
-               cond_resched_rwlock_write(&kvm->mmu_lock);
+               if (shared)
+                       cond_resched_rwlock_read(&kvm->mmu_lock);
+               else
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
+
                rcu_read_lock();
 
                WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -664,24 +699,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
  * function cannot yield, it will not release the MMU lock or reschedule and
  * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.  Note, in some use cases a flush may be
- * required by prior actions.  Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield, bool flush)
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared)
 {
        struct tdp_iter iter;
 
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
+retry:
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
                        flush = false;
                        continue;
                }
@@ -699,8 +742,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-               flush = true;
+               if (!shared) {
+                       tdp_mmu_set_spte(kvm, &iter, 0);
+                       flush = true;
+               } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
        }
 
        rcu_read_unlock();
@@ -712,15 +764,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
-                                bool can_yield)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared)
 {
        struct kvm_mmu_page *root;
-       bool flush = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root)
-               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+                                     shared);
 
        return flush;
 }
@@ -728,13 +786,115 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 {
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-       bool flush;
+       bool flush = false;
+       int i;
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+                                                 flush, false);
+
+       if (flush)
+               kvm_flush_remote_tlbs(kvm);
+}
+
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+                                                 struct kvm_mmu_page *prev_root)
+{
+       struct kvm_mmu_page *next_root;
+
+       if (prev_root)
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &prev_root->link,
+                                                 typeof(*prev_root), link);
+       else
+               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                  typeof(*next_root), link);
+
+       while (next_root && !(next_root->role.invalid &&
+                             refcount_read(&next_root->tdp_mmu_root_count)))
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &next_root->link,
+                                                 typeof(*next_root), link);
+
+       return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+       struct kvm_mmu_page *next_root;
+       struct kvm_mmu_page *root;
+       bool flush = false;
+
+       lockdep_assert_held_read(&kvm->mmu_lock);
+
+       rcu_read_lock();
+
+       root = next_invalidated_root(kvm, NULL);
+
+       while (root) {
+               next_root = next_invalidated_root(kvm, root);
+
+               rcu_read_unlock();
+
+               flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+                                     true);
+
+               /*
+                * Put the reference acquired in
+                * kvm_tdp_mmu_invalidate_all_roots
+                */
+               kvm_tdp_mmu_put_root(kvm, root, true);
+
+               root = next_root;
+
+               rcu_read_lock();
+       }
+
+       rcu_read_unlock();
 
-       flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
 }
 
+/*
+ * Mark each TDP MMU root as invalid so that other threads
+ * will drop their references and allow the root count to
+ * go to 0.
+ *
+ * Also take a reference on all roots so that this thread
+ * can do the bulk of the work required to free the roots
+ * once they are invalidated. Without this reference, a
+ * vCPU thread might drop the last reference to a root and
+ * get stuck with tearing down the entire paging structure.
+ *
+ * Roots which have a zero refcount should be skipped as
+ * they're already being torn down.
+ * Already invalid roots should be referenced again so that
+ * they aren't freed before kvm_tdp_mmu_zap_all_fast is
+ * done with them.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+       struct kvm_mmu_page *root;
+
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+               if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+                       root->role.invalid = true;
+}
+
 /*
  * Installs a last-level SPTE to handle a TDP page fault.
  * (NPT/EPT violation/misconfiguration)
@@ -777,12 +937,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
-       } else
+       } else {
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));
+       }
 
-       trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-                              rcu_dereference(iter->sptep));
        if (!prefault)
                vcpu->stat.pf_fixed++;
 
@@ -882,199 +1041,139 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        return ret;
 }
 
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            unsigned long data,
-                            int (*handler)(struct kvm *kvm,
-                                           struct kvm_memory_slot *slot,
-                                           struct kvm_mmu_page *root,
-                                           gfn_t start,
-                                           gfn_t end,
-                                           unsigned long data))
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
-       int ret = 0;
-       int as_id;
-
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               as_id = kvm_mmu_page_as_id(root);
-               slots = __kvm_memslots(kvm, as_id);
-               kvm_for_each_memslot(memslot, slots) {
-                       unsigned long hva_start, hva_end;
-                       gfn_t gfn_start, gfn_end;
-
-                       hva_start = max(start, memslot->userspace_addr);
-                       hva_end = min(end, memslot->userspace_addr +
-                                     (memslot->npages << PAGE_SHIFT));
-                       if (hva_start >= hva_end)
-                               continue;
-                       /*
-                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                        */
-                       gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-                       gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-                       ret |= handler(kvm, memslot, root, gfn_start,
-                                      gfn_end, data);
-               }
-       }
+       for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+               flush |= zap_gfn_range(kvm, root, range->start, range->end,
+                                      range->may_block, flush, false);
 
-       return ret;
+       return flush;
 }
 
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
-                                    struct kvm_memory_slot *slot,
-                                    struct kvm_mmu_page *root, gfn_t start,
-                                    gfn_t end, unsigned long unused)
-{
-       return zap_gfn_range(kvm, root, start, end, false, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+                             struct kvm_gfn_range *range);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+                                                  struct kvm_gfn_range *range,
+                                                  tdp_handler_t handler)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-                                           zap_gfn_range_hva_wrapper);
+       struct kvm_mmu_page *root;
+       struct tdp_iter iter;
+       bool ret = false;
+
+       rcu_read_lock();
+
+       /*
+        * Don't support rescheduling, none of the MMU notifiers that funnel
+        * into this helper allow blocking; it'd be dead, wasteful code.
+        */
+       for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+               tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+                       ret |= handler(kvm, &iter, range);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
 }
 
 /*
  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
  * if any of the GFNs in the range have been accessed.
  */
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
-                        struct kvm_mmu_page *root, gfn_t start, gfn_t end,
-                        unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+                         struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-       int young = 0;
        u64 new_spte = 0;
 
-       rcu_read_lock();
+       /* If we have a non-accessed entry we don't need to change the pte. */
+       if (!is_accessed_spte(iter->old_spte))
+               return false;
 
-       tdp_root_for_each_leaf_pte(iter, root, start, end) {
+       new_spte = iter->old_spte;
+
+       if (spte_ad_enabled(new_spte)) {
+               new_spte &= ~shadow_accessed_mask;
+       } else {
                /*
-                * If we have a non-accessed entry we don't need to change the
-                * pte.
+                * Capture the dirty status of the page, so that it doesn't get
+                * lost when the SPTE is marked for access tracking.
                 */
-               if (!is_accessed_spte(iter.old_spte))
-                       continue;
-
-               new_spte = iter.old_spte;
-
-               if (spte_ad_enabled(new_spte)) {
-                       clear_bit((ffs(shadow_accessed_mask) - 1),
-                                 (unsigned long *)&new_spte);
-               } else {
-                       /*
-                        * Capture the dirty status of the page, so that it doesn't get
-                        * lost when the SPTE is marked for access tracking.
-                        */
-                       if (is_writable_pte(new_spte))
-                               kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+               if (is_writable_pte(new_spte))
+                       kvm_set_pfn_dirty(spte_to_pfn(new_spte));
 
-                       new_spte = mark_spte_for_access_track(new_spte);
-               }
-               new_spte &= ~shadow_dirty_mask;
-
-               tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
-               young = 1;
-
-               trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+               new_spte = mark_spte_for_access_track(new_spte);
        }
 
-       rcu_read_unlock();
+       tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
 
-       return young;
+       return true;
 }
 
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-                                           age_gfn_range);
+       return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 }
 
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+                        struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-
-       tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
-               if (is_accessed_spte(iter.old_spte))
-                       return 1;
-
-       return 0;
+       return is_accessed_spte(iter->old_spte);
 }
 
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
-                                           test_age_gfn);
+       return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+                        struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-       pte_t *ptep = (pte_t *)data;
-       kvm_pfn_t new_pfn;
        u64 new_spte;
-       int need_flush = 0;
-
-       rcu_read_lock();
 
-       WARN_ON(pte_huge(*ptep));
+       /* Huge pages aren't expected to be modified without first being zapped. */
+       WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
 
-       new_pfn = pte_pfn(*ptep);
-
-       tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
-               if (iter.level != PG_LEVEL_4K)
-                       continue;
-
-               if (!is_shadow_present_pte(iter.old_spte))
-                       break;
-
-               tdp_mmu_set_spte(kvm, &iter, 0);
-
-               kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+       if (iter->level != PG_LEVEL_4K ||
+           !is_shadow_present_pte(iter->old_spte))
+               return false;
 
-               if (!pte_write(*ptep)) {
-                       new_spte = kvm_mmu_changed_pte_notifier_make_spte(
-                                       iter.old_spte, new_pfn);
+       /*
+        * Note, when changing a read-only SPTE, it's not strictly necessary to
+        * zero the SPTE before setting the new PFN, but doing so preserves the
+        * invariant that the PFN of a present leaf SPTE can never change.
+        * See __handle_changed_spte().
+        */
+       tdp_mmu_set_spte(kvm, iter, 0);
 
-                       tdp_mmu_set_spte(kvm, &iter, new_spte);
-               }
+       if (!pte_write(range->pte)) {
+               new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+                                                                 pte_pfn(range->pte));
 
-               need_flush = 1;
+               tdp_mmu_set_spte(kvm, iter, new_spte);
        }
 
-       if (need_flush)
-               kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
-       rcu_read_unlock();
-
-       return 0;
+       return true;
 }
 
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-                            pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * range->pte is the new pte_t mapping the HVA specified by the MMU notifier.
+ * Any required remote TLB flush is performed here rather than by the caller,
+ * so this always returns false.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
-                                           (unsigned long)host_ptep,
-                                           set_tdp_spte);
+       bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+       /* FIXME: return 'flush' instead of flushing here. */
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+       return false;
 }
 
 /*
@@ -1095,7 +1194,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1105,7 +1205,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 
-               tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+               if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+                                                         new_spte)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
                spte_set = true;
        }
 
@@ -1122,17 +1230,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);
-       }
 
        return spte_set;
 }
@@ -1154,7 +1258,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        rcu_read_lock();
 
        tdp_root_for_each_leaf_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
                if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1169,7 +1274,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                continue;
                }
 
-               tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+               if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+                                                         new_spte)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
                spte_set = true;
        }
 
@@ -1187,17 +1300,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);
-       }
 
        return spte_set;
 }
@@ -1259,37 +1368,32 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       bool wrprot)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
 
        lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_tdp_mmu_root(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
+       for_each_tdp_mmu_root(kvm, root, slot->as_id)
                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
-       }
 }
 
 /*
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
  */
-static void zap_collapsible_spte_range(struct kvm *kvm,
+static bool zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
-                                      struct kvm_memory_slot *slot)
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush)
 {
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        kvm_pfn_t pfn;
-       bool spte_set = false;
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
-                       spte_set = false;
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+                       flush = false;
                        continue;
                }
 
@@ -1303,38 +1407,43 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
                                                            pfn, PG_LEVEL_NUM))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-
-               spte_set = true;
+               if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
+               flush = true;
        }
 
        rcu_read_unlock();
-       if (spte_set)
-               kvm_flush_remote_tlbs(kvm);
+
+       return flush;
 }
 
 /*
  * Clear non-leaf entries (and free associated page tables) which could
  * be replaced by large mappings, for GFNs within the slot.
  */
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
-               zap_collapsible_spte_range(kvm, root, slot);
-       }
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+               flush = zap_collapsible_spte_range(kvm, root, slot, flush);
+
+       return flush;
 }
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1351,7 +1460,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                        break;
 
                new_spte = iter.old_spte &
-                       ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
@@ -1364,24 +1473,19 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
        lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_tdp_mmu_root(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
+       for_each_tdp_mmu_root(kvm, root, slot->as_id)
                spte_set |= write_protect_gfn(kvm, root, gfn);
-       }
+
        return spte_set;
 }
 
index 31096ec..5fdf630 100644 (file)
@@ -6,14 +6,28 @@
 #include <linux/kvm_host.h>
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
-                                bool can_yield);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
-                                            gfn_t end)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+                                                    struct kvm_mmu_page *root)
 {
-       return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+       if (root->role.invalid)
+               return false;
+
+       return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
+
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+                                            gfn_t start, gfn_t end, bool flush,
+                                            bool shared)
+{
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+                                          shared);
 }
 static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
@@ -29,23 +43,23 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
         * of the shadow page's gfn range and stop iterating before yielding.
         */
        lockdep_assert_held_write(&kvm->mmu_lock);
-       return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
+                                          sp->gfn, end, false, false, false);
 }
+
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
 
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-                            pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 
 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level);
@@ -55,8 +69,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot);
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
new file mode 100644 (file)
index 0000000..a19d473
--- /dev/null
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_REVERSE_CPUID_H
+#define ARCH_X86_KVM_REVERSE_CPUID_H
+
+#include <uapi/asm/kvm.h>
+#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM.  Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+       CPUID_12_EAX     = NCAPINTS,
+       NR_KVM_CPU_CAPS,
+
+       NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f)          ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1           KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2           KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+struct cpuid_reg {
+       u32 function;
+       u32 index;
+       int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+       [CPUID_1_EDX]         = {         1, 0, CPUID_EDX},
+       [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+       [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+       [CPUID_1_ECX]         = {         1, 0, CPUID_ECX},
+       [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+       [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+       [CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},
+       [CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},
+       [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+       [CPUID_6_EAX]         = {         6, 0, CPUID_EAX},
+       [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+       [CPUID_7_ECX]         = {         7, 0, CPUID_ECX},
+       [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+       [CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
+       [CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
+       [CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
+       [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+};
+
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities.  And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
+{
+       BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+       BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+       BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+       BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+       BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+       BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+       if (x86_feature == X86_FEATURE_SGX1)
+               return KVM_X86_FEATURE_SGX1;
+       else if (x86_feature == X86_FEATURE_SGX2)
+               return KVM_X86_FEATURE_SGX2;
+
+       return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+       return __feature_translate(x86_feature) / 32;
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition.  Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5).  The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+       x86_feature = __feature_translate(x86_feature);
+
+       reverse_cpuid_check(x86_feature / 32);
+       return 1 << (x86_feature & 31);
+}
+
+#define feature_bit(name)  __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
+{
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+       reverse_cpuid_check(x86_leaf);
+       return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+                                                 u32 reg)
+{
+       switch (reg) {
+       case CPUID_EAX:
+               return &entry->eax;
+       case CPUID_EBX:
+               return &entry->ebx;
+       case CPUID_ECX:
+               return &entry->ecx;
+       case CPUID_EDX:
+               return &entry->edx;
+       default:
+               BUILD_BUG();
+               return NULL;
+       }
+}
+
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+                                               unsigned int x86_feature)
+{
+       const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+       return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+                                          unsigned int x86_feature)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+                                           unsigned int x86_feature)
+{
+       return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+                                             unsigned int x86_feature)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+                                           unsigned int x86_feature)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+                                              unsigned int x86_feature,
+                                              bool set)
+{
+       u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+       /*
+        * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+        * compiler into using CMOV instead of Jcc when possible.
+        */
+       if (set)
+               *reg |= __feature_bit(x86_feature);
+       else
+               *reg &= ~__feature_bit(x86_feature);
+}
+
+#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */
index 3e55674..712b4e0 100644 (file)
@@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
                return -EINVAL;
 
-       if (!svm->vcpu.arch.apic->regs)
+       if (!vcpu->arch.apic->regs)
                return -EINVAL;
 
        if (kvm_apicv_activated(vcpu->kvm)) {
@@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
                        return ret;
        }
 
-       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+       svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
 
        /* Setting AVIC backing page address in the phy APIC ID table */
        entry = avic_get_physical_id_entry(vcpu, id);
@@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
        }
 }
 
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
        u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
-       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+       trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
 
        switch (id) {
        case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
@@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
                 * set the appropriate IRR bits on the valid target
                 * vcpus. So, we just need to kick the appropriate vcpu.
                 */
-               avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+               avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
                break;
        case AVIC_IPI_FAILURE_INVALID_TARGET:
                WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
-                         index, svm->vcpu.vcpu_id, icrh, icrl);
+                         index, vcpu->vcpu_id, icrh, icrl);
                break;
        case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
                WARN_ONCE(1, "Invalid backing page\n");
@@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
        return ret;
 }
 
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret = 0;
        u32 offset = svm->vmcb->control.exit_info_1 &
                     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                     AVIC_UNACCEL_ACCESS_WRITE_MASK;
        bool trap = is_avic_unaccelerated_access_trap(offset);
 
-       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+       trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
                                            trap, write, vector);
        if (trap) {
                /* Handling Trap */
@@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                ret = avic_unaccel_trap_write(svm);
        } else {
                /* Handling Fault */
-               ret = kvm_emulate_instruction(&svm->vcpu, 0);
+               ret = kvm_emulate_instruction(vcpu, 0);
        }
 
        return ret;
@@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
        if (!avic || !irqchip_in_kernel(vcpu->kvm))
                return 0;
 
-       ret = avic_init_backing_page(&svm->vcpu);
+       ret = avic_init_backing_page(vcpu);
        if (ret)
                return ret;
 
index fb204ea..540d43b 100644 (file)
@@ -29,6 +29,8 @@
 #include "lapic.h"
 #include "svm.h"
 
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
 
        WARN_ON(mmu_is_nested(vcpu));
 
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+                               svm->vmcb01.ptr->save.efer,
                                svm->nested.ctl.nested_cr3);
        vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
                return;
 
        c = &svm->vmcb->control;
-       h = &svm->nested.hsave->control;
+       h = &svm->vmcb01.ptr->control;
        g = &svm->nested.ctl;
 
        for (i = 0; i < MAX_INTERCEPT; i++)
@@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
        return true;
 }
 
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 addr = PAGE_ALIGN(pa);      /* NOTE(review): PAGE_ALIGN() rounds up; ignoring bits 11:0 is a round-down - confirm */
 
-       if (WARN_ON(!is_guest_mode(vcpu)))
-               return true;
-
-       if (!nested_svm_vmrun_msrpm(svm)) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror =
-                       KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
-               return false;
-       }
-
-       return true;
+       return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+           kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);       /* both the first and the last byte must be legal GPAs */
 }
 
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+                                      struct vmcb_control_area *control)
 {
-       if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+       if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))   /* fail unless the VMRUN intercept is set */
                return false;
 
-       if (control->asid == 0)
+       if (CC(control->asid == 0))     /* an ASID of 0 fails the check */
                return false;
 
-       if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-           !npt_enabled)
+       if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))       /* NP_ENABLE requested while npt_enabled is off */
+               return false;
+
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+                                          MSRPM_SIZE)))        /* MSRPM_SIZE bytes at msrpm_base_pa must pass the GPA check */
+               return false;
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+                                          IOPM_SIZE))) /* IOPM_SIZE bytes at iopm_base_pa must pass the GPA check */
                return false;
 
        return true;
 }
 
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+                                     struct vmcb_save_area *save)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       bool vmcb12_lma;
+       /*
+        * These checks are also performed by KVM_SET_SREGS,
+        * except that EFER.LMA is not checked by SVM against
+        * CR0.PG && EFER.LME.
+        */
+       if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+               if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+                   CC(!(save->cr0 & X86_CR0_PE)) ||
+                   CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+                       return false;
+       }
+
+       if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+               return false;
+
+       return true;
+}
 
+/* Common checks that apply to both L1 and L2 state.  */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+                                   struct vmcb_save_area *save)
+{
        /*
         * FIXME: these should be done after copying the fields,
         * to avoid TOC/TOU races.  For these save area checks
@@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
         * kvm_set_cr4 handle failure; EFER_SVME is an exception
         * so it is force-set later in nested_prepare_vmcb_save.
         */
-       if ((vmcb12->save.efer & EFER_SVME) == 0)
+       if (CC(!(save->efer & EFER_SVME)))
                return false;
 
-       if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+       if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+           CC(save->cr0 & ~0xffffffffULL))
                return false;
 
-       if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+       if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
                return false;
 
-       vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+       if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+               return false;
 
-       if (vmcb12_lma) {
-               if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
-                   !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                   kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
-                       return false;
-       }
-       if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+       if (CC(!kvm_valid_efer(vcpu, save->efer)))
                return false;
 
        return true;
 }
 
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
-                                    struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                           struct vmcb_control_area *control)
 {
        copy_vmcb_control_area(&svm->nested.ctl, control);
 
@@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 
 /*
  * Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
  */
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 {
        u32 mask;
        svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
@@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
  * Transfer any event that L0 or L1 wanted to inject into L2 to
  * EXIT_INT_INFO.
  */
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
-                                          struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+                                               struct vmcb *vmcb12)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
        u32 exit_int_info = 0;
@@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
                               bool nested_npt)
 {
-       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
                return -EINVAL;
 
        if (!nested_npt && is_pae_paging(vcpu) &&
            (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
-               if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
                        return -EINVAL;
        }
 
@@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
        return 0;
 }
 
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 {
+       if (!svm->nested.vmcb02.ptr)
+               return;
+
+       /* FIXME: merge g_pat from vmcb01 and vmcb12.  */
+       svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+       bool new_vmcb12 = false;
+
+       nested_vmcb02_compute_g_pat(svm);
+
        /* Load the nested guest state */
-       svm->vmcb->save.es = vmcb12->save.es;
-       svm->vmcb->save.cs = vmcb12->save.cs;
-       svm->vmcb->save.ss = vmcb12->save.ss;
-       svm->vmcb->save.ds = vmcb12->save.ds;
-       svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-       svm->vmcb->save.idtr = vmcb12->save.idtr;
+       if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+               new_vmcb12 = true;
+               svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+               svm->vmcb->save.es = vmcb12->save.es;
+               svm->vmcb->save.cs = vmcb12->save.cs;
+               svm->vmcb->save.ss = vmcb12->save.ss;
+               svm->vmcb->save.ds = vmcb12->save.ds;
+               svm->vmcb->save.cpl = vmcb12->save.cpl;
+               vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+               svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+               svm->vmcb->save.idtr = vmcb12->save.idtr;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+       }
+
        kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
 
        /*
@@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
 
        svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
        svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+       svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
        kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
        kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
        kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
        svm->vmcb->save.rax = vmcb12->save.rax;
        svm->vmcb->save.rsp = vmcb12->save.rsp;
        svm->vmcb->save.rip = vmcb12->save.rip;
-       svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-       svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+       /* These bits will be set properly on the first execution when new_vmcb12 is true */
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+               svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+               svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+       }
 }
 
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
        const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 
+       /*
+        * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+        * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+        */
+
+       /*
+        * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+        * avic_physical_id.
+        */
+       WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+       /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
+       svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+       svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+       svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+       /* Done at vmrun: asid.  */
+
+       /* Also overwritten later if necessary.  */
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+       /* nested_cr3.  */
        if (nested_npt_enabled(svm))
                nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 
        svm->vmcb->control.int_ctl             =
                (svm->nested.ctl.int_ctl & ~mask) |
-               (svm->nested.hsave->control.int_ctl & mask);
+               (svm->vmcb01.ptr->control.int_ctl & mask);
 
        svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
        svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
@@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
        enter_guest_mode(&svm->vcpu);
 
        /*
-        * Merge guest and host intercepts - must be called  with vcpu in
-        * guest-mode to take affect here
+        * Merge guest and host intercepts - must be called with vcpu in
+        * guest-mode to take effect.
         */
        recalc_intercepts(svm);
+}
 
-       vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+       /*
+        * Some VMCB state is shared between L1 and L2 and thus has to be
+        * moved at the time of nested vmrun and vmexit.
+        *
+        * VMLOAD/VMSAVE state would also belong in this category, but KVM
+        * always performs VMLOAD and VMSAVE from the VMCB01.
+        */
+       to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
                         struct vmcb *vmcb12)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
 
        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
 
 
        svm->nested.vmcb12_gpa = vmcb12_gpa;
-       nested_prepare_vmcb_control(svm);
-       nested_prepare_vmcb_save(svm, vmcb12);
+
+       WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+       nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+       nested_vmcb02_prepare_control(svm);
+       nested_vmcb02_prepare_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
@@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
                return ret;
 
        if (!npt_enabled)
-               svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+               vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
 
        svm_set_gif(svm, true);
 
        return 0;
 }
 
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
        u64 vmcb12_gpa;
 
-       if (is_smm(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       ++vcpu->stat.nested_run;
+
+       if (is_smm(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
        vmcb12_gpa = svm->vmcb->save.rax;
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
        if (ret == -EINVAL) {
-               kvm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        } else if (ret) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
        vmcb12 = map.hva;
 
        if (WARN_ON_ONCE(!svm->nested.initialized))
                return -EINVAL;
 
-       load_nested_vmcb_control(svm, &vmcb12->control);
+       nested_load_control_from_vmcb12(svm, &vmcb12->control);
 
-       if (!nested_vmcb_check_save(svm, vmcb12) ||
-           !nested_vmcb_check_controls(&svm->nested.ctl)) {
+       if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+           !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
                vmcb12->control.exit_code    = SVM_EXIT_ERR;
                vmcb12->control.exit_code_hi = 0;
                vmcb12->control.exit_info_1  = 0;
@@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
 
        /* Clear internal status */
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        /*
-        * Save the old vmcb, so we don't need to pick what we save, but can
-        * restore everything when a VMEXIT occurs
+        * Since vmcb01 is not in use, we can use it to store some of the L1
+        * state.
         */
-       hsave->save.es     = vmcb->save.es;
-       hsave->save.cs     = vmcb->save.cs;
-       hsave->save.ss     = vmcb->save.ss;
-       hsave->save.ds     = vmcb->save.ds;
-       hsave->save.gdtr   = vmcb->save.gdtr;
-       hsave->save.idtr   = vmcb->save.idtr;
-       hsave->save.efer   = svm->vcpu.arch.efer;
-       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       hsave->save.cr4    = svm->vcpu.arch.cr4;
-       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
-       hsave->save.rsp    = vmcb->save.rsp;
-       hsave->save.rax    = vmcb->save.rax;
-       if (npt_enabled)
-               hsave->save.cr3    = vmcb->save.cr3;
-       else
-               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
-
-       copy_vmcb_control_area(&hsave->control, &vmcb->control);
+       svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
+       svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
+       svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
+       svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+       svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);
+
+       if (!npt_enabled)
+               svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
 
        svm->nested.nested_run_pending = 1;
 
-       if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+       if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
                goto out_exit_err;
 
        if (nested_svm_vmrun_msrpm(svm))
@@ -587,7 +667,7 @@ out_exit_err:
        nested_svm_vmexit(svm);
 
 out:
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
@@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-       int rc;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
+       int rc;
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+       /* Triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
+       rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        vmcb12 = map.hva;
 
        /* Exit Guest-Mode */
-       leave_guest_mode(&svm->vcpu);
+       leave_guest_mode(vcpu);
        svm->nested.vmcb12_gpa = 0;
        WARN_ON_ONCE(svm->nested.nested_run_pending);
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* in case we halted in L2 */
        svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->save.gdtr   = vmcb->save.gdtr;
        vmcb12->save.idtr   = vmcb->save.idtr;
        vmcb12->save.efer   = svm->vcpu.arch.efer;
-       vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       vmcb12->save.cr0    = kvm_read_cr0(vcpu);
+       vmcb12->save.cr3    = kvm_read_cr3(vcpu);
        vmcb12->save.cr2    = vmcb->save.cr2;
        vmcb12->save.cr4    = svm->vcpu.arch.cr4;
-       vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
-       vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
-       vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
-       vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
+       vmcb12->save.rflags = kvm_get_rflags(vcpu);
+       vmcb12->save.rip    = kvm_rip_read(vcpu);
+       vmcb12->save.rsp    = kvm_rsp_read(vcpu);
+       vmcb12->save.rax    = kvm_rax_read(vcpu);
        vmcb12->save.dr7    = vmcb->save.dr7;
        vmcb12->save.dr6    = svm->vcpu.arch.dr6;
        vmcb12->save.cpl    = vmcb->save.cpl;
@@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
 
        if (vmcb12->control.exit_code != SVM_EXIT_ERR)
-               nested_vmcb_save_pending_event(svm, vmcb12);
+               nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
        if (svm->nrips_enabled)
                vmcb12->control.next_rip  = vmcb->control.next_rip;
@@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.pause_filter_thresh =
                svm->vmcb->control.pause_filter_thresh;
 
-       /* Restore the original control entries */
-       copy_vmcb_control_area(&vmcb->control, &hsave->control);
+       nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
 
-       /* On vmexit the  GIF is set to false */
+       /*
+        * On vmexit the GIF is set to false and
+        * no event can be injected in L1.
+        */
        svm_set_gif(svm, false);
+       svm->vmcb->control.exit_int_info = 0;
 
-       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
-               svm->vcpu.arch.l1_tsc_offset;
+       svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+       if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+               svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       }
 
        svm->nested.ctl.nested_cr3 = 0;
 
-       /* Restore selected save entries */
-       svm->vmcb->save.es = hsave->save.es;
-       svm->vmcb->save.cs = hsave->save.cs;
-       svm->vmcb->save.ss = hsave->save.ss;
-       svm->vmcb->save.ds = hsave->save.ds;
-       svm->vmcb->save.gdtr = hsave->save.gdtr;
-       svm->vmcb->save.idtr = hsave->save.idtr;
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, hsave->save.efer);
-       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
-       kvm_rax_write(&svm->vcpu, hsave->save.rax);
-       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
-       kvm_rip_write(&svm->vcpu, hsave->save.rip);
-       svm->vmcb->save.dr7 = DR7_FIXED_1;
-       svm->vmcb->save.cpl = 0;
-       svm->vmcb->control.exit_int_info = 0;
+       /*
+        * Restore processor state that had been saved in vmcb01
+        */
+       kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+       svm_set_efer(vcpu, svm->vmcb->save.efer);
+       svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+       kvm_rax_write(vcpu, svm->vmcb->save.rax);
+       kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+       kvm_rip_write(vcpu, svm->vmcb->save.rip);
 
-       vmcb_mark_all_dirty(svm->vmcb);
+       svm->vcpu.arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(&svm->vcpu);
 
        trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
                                       vmcb12->control.exit_info_1,
@@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb12->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
-       nested_svm_uninit_mmu_context(&svm->vcpu);
+       nested_svm_uninit_mmu_context(vcpu);
 
-       rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+       rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
        if (rc)
                return 1;
 
-       if (npt_enabled)
-               svm->vmcb->save.cr3 = hsave->save.cr3;
-
        /*
         * Drop what we picked up for L2 via svm_complete_interrupts() so it
         * doesn't end up in L1.
         */
        svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
+
+       /*
+        * If we are here following the completion of a VMRUN that
+        * is being single-stepped, queue the pending #DB intercept
+        * right now so that it can be accounted for before we execute
+        * L1's next instruction.
+        */
+       if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+               kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
 
        return 0;
 }
 
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);      /* presumably a nested exit with code SVM_EXIT_SHUTDOWN; helper is defined elsewhere */
+}
+
 int svm_allocate_nested(struct vcpu_svm *svm)
 {
-       struct page *hsave_page;
+       struct page *vmcb02_page;
 
        if (svm->nested.initialized)
                return 0;
 
-       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!hsave_page)
+       vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);      /* one page, requested with __GFP_ZERO, backing vmcb02 */
+       if (!vmcb02_page)
                return -ENOMEM;
-       svm->nested.hsave = page_address(hsave_page);
+       svm->nested.vmcb02.ptr = page_address(vmcb02_page);     /* kernel virtual address of that page */
+       svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);      /* page's physical address, passed through __sme_set() */
 
        svm->nested.msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->nested.msrpm)
-               goto err_free_hsave;
+               goto err_free_vmcb02;
        svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
        svm->nested.initialized = true;
        return 0;
 
-err_free_hsave:
-       __free_page(hsave_page);
+err_free_vmcb02:       /* reached only when the msrpm allocation failed */
+       __free_page(vmcb02_page);       /* NOTE(review): vmcb02.ptr/.pa still reference the freed page; confirm nothing tests vmcb02.ptr after a failure */
        return -ENOMEM;
 }
 
@@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm)
        svm_vcpu_free_msrpm(svm->nested.msrpm);
        svm->nested.msrpm = NULL;
 
-       __free_page(virt_to_page(svm->nested.hsave));
-       svm->nested.hsave = NULL;
+       __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+       svm->nested.vmcb02.ptr = NULL;
 
        svm->nested.initialized = false;
 }
@@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm)
  */
 void svm_leave_nested(struct vcpu_svm *svm)
 {
-       if (is_guest_mode(&svm->vcpu)) {
-               struct vmcb *hsave = svm->nested.hsave;
-               struct vmcb *vmcb = svm->vmcb;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
 
+       if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
-               leave_guest_mode(&svm->vcpu);
-               copy_vmcb_control_area(&vmcb->control, &hsave->control);
-               nested_svm_uninit_mmu_context(&svm->vcpu);
+               leave_guest_mode(vcpu);
+
+               svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+               nested_svm_uninit_mmu_context(vcpu);
                vmcb_mark_all_dirty(svm->vmcb);
        }
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 }
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
        return vmexit;
 }
 
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 {
-       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
-           !is_paging(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
-       if (svm->vmcb->save.cpl) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       if (to_svm(vcpu)->vmcb->save.cpl) {
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
        nested_svm_vmexit(svm);
 }
 
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
 }
 
-static void nested_svm_init(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_init(svm))
                        return 0;
-               nested_svm_init(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
                return 0;
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               /*
+                * Only a pending nested run can block a pending exception.
+                * Otherwise an injected NMI/interrupt should either be
+                * lost or delivered to the nested hypervisor in the EXITINTINFO
+                * vmcb field, while delivering the pending exception.
+                */
+               if (svm->nested.nested_run_pending)
                         return -EBUSY;
                if (!nested_exit_on_exception(svm))
                        return 0;
@@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_smi(svm))
                        return 0;
-               nested_svm_smi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
                return 0;
        }
 
@@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_nmi(svm))
                        return 0;
-               nested_svm_nmi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
                return 0;
        }
 
@@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_intr(svm))
                        return 0;
-               nested_svm_intr(svm);
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
                return 0;
        }
 
@@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 
-               if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
-                               excp_bits)
+               if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+                   excp_bits)
                        return NESTED_EXIT_HOST;
                else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
                         svm->vcpu.arch.apf.host_apf_flags)
@@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
        if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
                         sizeof(user_vmcb->control)))
                return -EFAULT;
-       if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+       if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
                         sizeof(user_vmcb->save)))
                return -EFAULT;
-
 out:
        return kvm_state.size;
 }
@@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state *kvm_state)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb __user *user_vmcb = (struct vmcb __user *)
                &user_kvm_nested_state->data.svm[0];
        struct vmcb_control_area *ctl;
@@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        ret  = -ENOMEM;
-       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
-       save = kzalloc(sizeof(*save), GFP_KERNEL);
+       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL_ACCOUNT);
+       save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
        if (!ctl || !save)
                goto out_free;
 
@@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                goto out_free;
 
        ret = -EINVAL;
-       if (!nested_vmcb_check_controls(ctl))
+       if (!nested_vmcb_check_controls(vcpu, ctl))
                goto out_free;
 
        /*
         * Processor state contains L2 state.  Check that it is
-        * valid for guest mode (see nested_vmcb_checks).
+        * valid for guest mode (see nested_vmcb_check_save).
         */
        cr0 = kvm_read_cr0(vcpu);
         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        /*
         * Validate host state saved from before VMRUN (see
         * nested_svm_check_permissions).
-        * TODO: validate reserved bits for all saved state.
         */
-       if (!(save->cr0 & X86_CR0_PG))
-               goto out_free;
-       if (!(save->efer & EFER_SVME))
+       if (!(save->cr0 & X86_CR0_PG) ||
+           !(save->cr0 & X86_CR0_PE) ||
+           (save->rflags & X86_EFLAGS_VM) ||
+           !nested_vmcb_valid_sregs(vcpu, save))
                goto out_free;
 
        /*
-        * All checks done, we can enter guest mode.  L1 control fields
-        * come from the nested save state.  Guest state is already
-        * in the registers, the save area of the nested state instead
-        * contains saved L1 state.
+        * All checks done, we can enter guest mode. Userspace provides
+        * vmcb12.control, which will be combined with L1 and stored into
+        * vmcb02, and the L1 save state which we store in vmcb01.
+        * L2 registers if needed are moved from the current VMCB to VMCB02.
         */
 
        svm->nested.nested_run_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
-       copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
-       hsave->save = *save;
-
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
-       load_nested_vmcb_control(svm, ctl);
-       nested_prepare_vmcb_control(svm);
+       if (svm->current_vmcb == &svm->vmcb01)
+               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+       svm->vmcb01.ptr->save.es = save->es;
+       svm->vmcb01.ptr->save.cs = save->cs;
+       svm->vmcb01.ptr->save.ss = save->ss;
+       svm->vmcb01.ptr->save.ds = save->ds;
+       svm->vmcb01.ptr->save.gdtr = save->gdtr;
+       svm->vmcb01.ptr->save.idtr = save->idtr;
+       svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+       svm->vmcb01.ptr->save.efer = save->efer;
+       svm->vmcb01.ptr->save.cr0 = save->cr0;
+       svm->vmcb01.ptr->save.cr3 = save->cr3;
+       svm->vmcb01.ptr->save.cr4 = save->cr4;
+       svm->vmcb01.ptr->save.rax = save->rax;
+       svm->vmcb01.ptr->save.rsp = save->rsp;
+       svm->vmcb01.ptr->save.rip = save->rip;
+       svm->vmcb01.ptr->save.cpl = 0;
+
+       nested_load_control_from_vmcb12(svm, ctl);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+       nested_vmcb02_prepare_control(svm);
 
        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        ret = 0;
@@ -1254,8 +1336,31 @@ out_free:
        return ret;
 }
 
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (WARN_ON(!is_guest_mode(vcpu)))
+               return true;
+
+       if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+                               nested_npt_enabled(svm)))
+               return false;
+
+       if (!nested_svm_vmrun_msrpm(svm)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return false;
+       }
+
+       return true;
+}
+
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,
index 415a49b..1356ee0 100644 (file)
 #define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
 #endif
 
+#ifdef CONFIG_KVM_AMD_SEV
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+#else
+#define sev_enabled false
+#define sev_es_enabled false
+#endif /* CONFIG_KVM_AMD_SEV */
+
 static u8 sev_enc_bit;
-static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
 static DEFINE_MUTEX(sev_bitmap_lock);
 unsigned int max_sev_asid;
 static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
 static unsigned long *sev_asid_bitmap;
 static unsigned long *sev_reclaim_asid_bitmap;
 
@@ -61,9 +74,15 @@ struct enc_region {
        unsigned long size;
 };
 
-static int sev_flush_asids(void)
+/* Called with the sev_bitmap_lock held, or on shutdown  */
+static int sev_flush_asids(int min_asid, int max_asid)
 {
-       int ret, error = 0;
+       int ret, pos, error = 0;
+
+       /* Check if there are any ASIDs to reclaim before performing a flush */
+       pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid);
+       if (pos >= max_asid)
+               return -EBUSY;
 
        /*
         * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
@@ -82,17 +101,15 @@ static int sev_flush_asids(void)
        return ret;
 }
 
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+       return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
 /* Must be called with the sev_bitmap_lock held */
 static bool __sev_recycle_asids(int min_asid, int max_asid)
 {
-       int pos;
-
-       /* Check if there are any ASIDs to reclaim before performing a flush */
-       pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid);
-       if (pos >= max_asid)
-               return false;
-
-       if (sev_flush_asids())
+       if (sev_flush_asids(min_asid, max_asid))
                return false;
 
        /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
@@ -184,49 +201,41 @@ static void sev_asid_free(struct kvm_sev_info *sev)
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
 {
-       struct sev_data_decommission *decommission;
-       struct sev_data_deactivate *data;
+       struct sev_data_decommission decommission;
+       struct sev_data_deactivate deactivate;
 
        if (!handle)
                return;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return;
-
-       /* deactivate handle */
-       data->handle = handle;
+       deactivate.handle = handle;
 
        /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
        down_read(&sev_deactivate_lock);
-       sev_guest_deactivate(data, NULL);
+       sev_guest_deactivate(&deactivate, NULL);
        up_read(&sev_deactivate_lock);
 
-       kfree(data);
-
-       decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
-       if (!decommission)
-               return;
-
        /* decommission handle */
-       decommission->handle = handle;
-       sev_guest_decommission(decommission, NULL);
-
-       kfree(decommission);
+       decommission.handle = handle;
+       sev_guest_decommission(&decommission, NULL);
 }
 
 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       bool es_active = argp->id == KVM_SEV_ES_INIT;
        int asid, ret;
 
+       if (kvm->created_vcpus)
+               return -EINVAL;
+
        ret = -EBUSY;
        if (unlikely(sev->active))
                return ret;
 
+       sev->es_active = es_active;
        asid = sev_asid_new(sev);
        if (asid < 0)
-               return ret;
+               goto e_no_asid;
        sev->asid = asid;
 
        ret = sev_platform_init(&argp->error);
@@ -234,6 +243,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
                goto e_free;
 
        sev->active = true;
+       sev->asid = asid;
        INIT_LIST_HEAD(&sev->regions_list);
 
        return 0;
@@ -241,34 +251,21 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 e_free:
        sev_asid_free(sev);
        sev->asid = 0;
+e_no_asid:
+       sev->es_active = false;
        return ret;
 }
 
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       if (!sev_es)
-               return -ENOTTY;
-
-       to_kvm_svm(kvm)->sev_info.es_active = true;
-
-       return sev_guest_init(kvm, argp);
-}
-
 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
 {
-       struct sev_data_activate *data;
+       struct sev_data_activate activate;
        int asid = sev_get_asid(kvm);
        int ret;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
        /* activate ASID on the given handle */
-       data->handle = handle;
-       data->asid   = asid;
-       ret = sev_guest_activate(data, error);
-       kfree(data);
+       activate.handle = handle;
+       activate.asid   = asid;
+       ret = sev_guest_activate(&activate, error);
 
        return ret;
 }
@@ -298,7 +295,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_start *start;
+       struct sev_data_launch_start start;
        struct kvm_sev_launch_start params;
        void *dh_blob, *session_blob;
        int *error = &argp->error;
@@ -310,20 +307,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
-       if (!start)
-               return -ENOMEM;
+       memset(&start, 0, sizeof(start));
 
        dh_blob = NULL;
        if (params.dh_uaddr) {
                dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
-               if (IS_ERR(dh_blob)) {
-                       ret = PTR_ERR(dh_blob);
-                       goto e_free;
-               }
+               if (IS_ERR(dh_blob))
+                       return PTR_ERR(dh_blob);
 
-               start->dh_cert_address = __sme_set(__pa(dh_blob));
-               start->dh_cert_len = params.dh_len;
+               start.dh_cert_address = __sme_set(__pa(dh_blob));
+               start.dh_cert_len = params.dh_len;
        }
 
        session_blob = NULL;
@@ -334,40 +327,38 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
                        goto e_free_dh;
                }
 
-               start->session_address = __sme_set(__pa(session_blob));
-               start->session_len = params.session_len;
+               start.session_address = __sme_set(__pa(session_blob));
+               start.session_len = params.session_len;
        }
 
-       start->handle = params.handle;
-       start->policy = params.policy;
+       start.handle = params.handle;
+       start.policy = params.policy;
 
        /* create memory encryption context */
-       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
        if (ret)
                goto e_free_session;
 
        /* Bind ASID to this guest */
-       ret = sev_bind_asid(kvm, start->handle, error);
+       ret = sev_bind_asid(kvm, start.handle, error);
        if (ret)
                goto e_free_session;
 
        /* return handle to userspace */
-       params.handle = start->handle;
+       params.handle = start.handle;
        if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
-               sev_unbind_asid(kvm, start->handle);
+               sev_unbind_asid(kvm, start.handle);
                ret = -EFAULT;
                goto e_free_session;
        }
 
-       sev->handle = start->handle;
+       sev->handle = start.handle;
        sev->fd = argp->sev_fd;
 
 e_free_session:
        kfree(session_blob);
 e_free_dh:
        kfree(dh_blob);
-e_free:
-       kfree(start);
        return ret;
 }
 
@@ -486,7 +477,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_launch_update_data params;
-       struct sev_data_launch_update_data *data;
+       struct sev_data_launch_update_data data;
        struct page **inpages;
        int ret;
 
@@ -496,20 +487,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
        vaddr = params.uaddr;
        size = params.len;
        vaddr_end = vaddr + size;
 
        /* Lock the user memory. */
        inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
-       if (IS_ERR(inpages)) {
-               ret = PTR_ERR(inpages);
-               goto e_free;
-       }
+       if (IS_ERR(inpages))
+               return PTR_ERR(inpages);
 
        /*
         * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
@@ -517,6 +502,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
         */
        sev_clflush_pages(inpages, npages);
 
+       data.reserved = 0;
+       data.handle = sev->handle;
+
        for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
                int offset, len;
 
@@ -531,10 +519,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
                len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
 
-               data->handle = sev->handle;
-               data->len = len;
-               data->address = __sme_page_pa(inpages[i]) + offset;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+               data.len = len;
+               data.address = __sme_page_pa(inpages[i]) + offset;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
                if (ret)
                        goto e_unpin;
 
@@ -550,8 +537,6 @@ e_unpin:
        }
        /* unlock the user pages */
        sev_unpin_memory(kvm, inpages, npages);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -603,23 +588,22 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_update_vmsa *vmsa;
+       struct sev_data_launch_update_vmsa vmsa;
+       struct kvm_vcpu *vcpu;
        int i, ret;
 
        if (!sev_es_guest(kvm))
                return -ENOTTY;
 
-       vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
-       if (!vmsa)
-               return -ENOMEM;
+       vmsa.reserved = 0;
 
-       for (i = 0; i < kvm->created_vcpus; i++) {
-               struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               struct vcpu_svm *svm = to_svm(vcpu);
 
                /* Perform some pre-encryption checks against the VMSA */
                ret = sev_es_sync_vmsa(svm);
                if (ret)
-                       goto e_free;
+                       return ret;
 
                /*
                 * The LAUNCH_UPDATE_VMSA command will perform in-place
@@ -629,27 +613,25 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
                 */
                clflush_cache_range(svm->vmsa, PAGE_SIZE);
 
-               vmsa->handle = sev->handle;
-               vmsa->address = __sme_pa(svm->vmsa);
-               vmsa->len = PAGE_SIZE;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+               vmsa.handle = sev->handle;
+               vmsa.address = __sme_pa(svm->vmsa);
+               vmsa.len = PAGE_SIZE;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
                                    &argp->error);
                if (ret)
-                       goto e_free;
+                       return ret;
 
                svm->vcpu.arch.guest_state_protected = true;
        }
 
-e_free:
-       kfree(vmsa);
-       return ret;
+       return 0;
 }
 
 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        void __user *measure = (void __user *)(uintptr_t)argp->data;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_measure *data;
+       struct sev_data_launch_measure data;
        struct kvm_sev_launch_measure params;
        void __user *p = NULL;
        void *blob = NULL;
@@ -661,9 +643,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, measure, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* User wants to query the blob length */
        if (!params.len)
@@ -671,23 +651,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        p = (void __user *)(uintptr_t)params.uaddr;
        if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
+               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
-                       goto e_free;
+                       return -ENOMEM;
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
+               data.address = __psp_pa(blob);
+               data.len = params.len;
        }
 
 cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
 
        /*
         * If we query the session length, FW responded with expected data.
@@ -704,63 +681,50 @@ cmd:
        }
 
 done:
-       params.len = data->len;
+       params.len = data.len;
        if (copy_to_user(measure, &params, sizeof(params)))
                ret = -EFAULT;
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_finish *data;
-       int ret;
+       struct sev_data_launch_finish data;
 
        if (!sev_guest(kvm))
                return -ENOTTY;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
-       kfree(data);
-       return ret;
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
 }
 
 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_guest_status params;
-       struct sev_data_guest_status *data;
+       struct sev_data_guest_status data;
        int ret;
 
        if (!sev_guest(kvm))
                return -ENOTTY;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
        if (ret)
-               goto e_free;
+               return ret;
 
-       params.policy = data->policy;
-       params.state = data->state;
-       params.handle = data->handle;
+       params.policy = data.policy;
+       params.state = data.state;
+       params.handle = data.handle;
 
        if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
                ret = -EFAULT;
-e_free:
-       kfree(data);
+
        return ret;
 }
 
@@ -769,23 +733,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
                               int *error, bool enc)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_dbg *data;
-       int ret;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       struct sev_data_dbg data;
 
-       data->handle = sev->handle;
-       data->dst_addr = dst;
-       data->src_addr = src;
-       data->len = size;
+       data.reserved = 0;
+       data.handle = sev->handle;
+       data.dst_addr = dst;
+       data.src_addr = src;
+       data.len = size;
 
-       ret = sev_issue_cmd(kvm,
-                           enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
-                           data, error);
-       kfree(data);
-       return ret;
+       return sev_issue_cmd(kvm,
+                            enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+                            &data, error);
 }
 
 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
@@ -1005,7 +963,7 @@ err:
 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_secret *data;
+       struct sev_data_launch_secret data;
        struct kvm_sev_launch_secret params;
        struct page **pages;
        void *blob, *hdr;
@@ -1037,41 +995,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
                goto e_unpin_memory;
        }
 
-       ret = -ENOMEM;
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               goto e_unpin_memory;
+       memset(&data, 0, sizeof(data));
 
        offset = params.guest_uaddr & (PAGE_SIZE - 1);
-       data->guest_address = __sme_page_pa(pages[0]) + offset;
-       data->guest_len = params.guest_len;
+       data.guest_address = __sme_page_pa(pages[0]) + offset;
+       data.guest_len = params.guest_len;
 
        blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
        if (IS_ERR(blob)) {
                ret = PTR_ERR(blob);
-               goto e_free;
+               goto e_unpin_memory;
        }
 
-       data->trans_address = __psp_pa(blob);
-       data->trans_len = params.trans_len;
+       data.trans_address = __psp_pa(blob);
+       data.trans_len = params.trans_len;
 
        hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
        if (IS_ERR(hdr)) {
                ret = PTR_ERR(hdr);
                goto e_free_blob;
        }
-       data->hdr_address = __psp_pa(hdr);
-       data->hdr_len = params.hdr_len;
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
 
        kfree(hdr);
 
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
 e_unpin_memory:
        /* content of memory is updated, mark pages dirty */
        for (i = 0; i < n; i++) {
@@ -1086,7 +1039,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        void __user *report = (void __user *)(uintptr_t)argp->data;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_attestation_report *data;
+       struct sev_data_attestation_report data;
        struct kvm_sev_attestation_report params;
        void __user *p;
        void *blob = NULL;
@@ -1098,9 +1051,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* User wants to query the blob length */
        if (!params.len)
@@ -1108,23 +1059,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        p = (void __user *)(uintptr_t)params.uaddr;
        if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
+               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
-                       goto e_free;
+                       return -ENOMEM;
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
-               memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+               data.address = __psp_pa(blob);
+               data.len = params.len;
+               memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
        }
 cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
        /*
         * If we query the session length, FW responded with expected data.
         */
@@ -1140,22 +1088,417 @@ cmd:
        }
 
 done:
-       params.len = data->len;
+       params.len = data.len;
        if (copy_to_user(report, &params, sizeof(params)))
                ret = -EFAULT;
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
+/*
+ * Userspace wants to query session length.
+ *
+ * Issues SEV_CMD_SEND_START with only the guest handle set, stores the
+ * session length reported in the command buffer into params->session_len
+ * and writes *params back to argp->data.
+ *
+ * Returns a negative value from sev_issue_cmd() on failure, -EFAULT if the
+ * write-back to userspace fails, otherwise the sev_issue_cmd() result.
+ */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+                                     struct kvm_sev_send_start *params)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_start data;
+       int ret;
+
+       /*
+        * Zero the whole command buffer, as sev_send_start() does, so that no
+        * uninitialised stack bytes are handed to the firmware as addresses
+        * or lengths.
+        */
+       memset(&data, 0, sizeof(data));
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+       /*
+        * NOTE(review): if the firmware reports the required length via an
+        * error status, this early return skips the write-back below --
+        * confirm against the SEV API spec.
+        */
+       if (ret < 0)
+               return ret;
+
+       params->session_len = data.session_len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+                               sizeof(struct kvm_sev_send_start)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+/*
+ * KVM_SEV_SEND_START: begin an outgoing session for this guest.
+ *
+ * Copies the PDH, platform and AMD certificate blobs in from userspace,
+ * issues SEV_CMD_SEND_START, copies the session blob out to
+ * params.session_uaddr on success and writes the updated params (policy,
+ * session_len) back to argp->data.  A zero params.session_len is treated as
+ * a length query and handed to __sev_send_start_query_session_length().
+ *
+ * Returns -ENOTTY for a non-SEV guest, -EFAULT on a failed user copy,
+ * -EINVAL on bad parameters, -ENOMEM on allocation failure, the error from
+ * psp_copy_user_blob(), otherwise the result of sev_issue_cmd().
+ */
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_start data;
+       struct kvm_sev_send_start params;
+       void *amd_certs, *session_data;
+       void *pdh_cert, *plat_certs;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                               sizeof(struct kvm_sev_send_start)))
+               return -EFAULT;
+
+       /* if session_len is zero, userspace wants to query the session length */
+       if (!params.session_len)
+               return __sev_send_start_query_session_length(kvm, argp,
+                               &params);
+
+       /* some sanity checks */
+       if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+           !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EINVAL;
+
+       /*
+        * Allocate the memory to hold the session data blob.  It is zeroed
+        * because params.session_len bytes are copied to userspace below
+        * regardless of how much the firmware actually wrote; an unzeroed
+        * buffer could expose stale heap contents.
+        */
+       session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+       if (!session_data)
+               return -ENOMEM;
+
+       /* copy the certificate blobs from userspace */
+       pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+                               params.pdh_cert_len);
+       if (IS_ERR(pdh_cert)) {
+               ret = PTR_ERR(pdh_cert);
+               goto e_free_session;
+       }
+
+       plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+                               params.plat_certs_len);
+       if (IS_ERR(plat_certs)) {
+               ret = PTR_ERR(plat_certs);
+               goto e_free_pdh;
+       }
+
+       amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+                               params.amd_certs_len);
+       if (IS_ERR(amd_certs)) {
+               ret = PTR_ERR(amd_certs);
+               goto e_free_plat_cert;
+       }
+
+       /* populate the FW SEND_START field with system physical address */
+       memset(&data, 0, sizeof(data));
+       data.pdh_cert_address = __psp_pa(pdh_cert);
+       data.pdh_cert_len = params.pdh_cert_len;
+       data.plat_certs_address = __psp_pa(plat_certs);
+       data.plat_certs_len = params.plat_certs_len;
+       data.amd_certs_address = __psp_pa(amd_certs);
+       data.amd_certs_len = params.amd_certs_len;
+       data.session_address = __psp_pa(session_data);
+       data.session_len = params.session_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+       /* The session blob is only copied out when the command succeeded. */
+       if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+                       session_data, params.session_len)) {
+               ret = -EFAULT;
+               goto e_free_amd_cert;
+       }
+
+       /* params is written back even when the command itself failed. */
+       params.policy = data.policy;
+       params.session_len = data.session_len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+                               sizeof(struct kvm_sev_send_start)))
+               ret = -EFAULT;
+
+e_free_amd_cert:
+       kfree(amd_certs);
+e_free_plat_cert:
+       kfree(plat_certs);
+e_free_pdh:
+       kfree(pdh_cert);
+e_free_session:
+       kfree(session_data);
+       return ret;
+}
+
+/*
+ * Userspace wants to query either header or trans length.
+ *
+ * Issues SEV_CMD_SEND_UPDATE_DATA with only the guest handle set, stores the
+ * header and transport lengths reported in the command buffer into *params
+ * and writes *params back to argp->data.
+ *
+ * Returns a negative value from sev_issue_cmd() on failure, -EFAULT if the
+ * write-back to userspace fails, otherwise the sev_issue_cmd() result.
+ */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+                                    struct kvm_sev_send_update_data *params)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_update_data data;
+       int ret;
+
+       /*
+        * Zero the whole command buffer, as sev_send_update_data() does, so
+        * that no uninitialised stack bytes are handed to the firmware as
+        * addresses or lengths.
+        */
+       memset(&data, 0, sizeof(data));
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+       /*
+        * NOTE(review): if the firmware reports the required lengths via an
+        * error status, this early return skips the write-back below --
+        * confirm against the SEV API spec.
+        */
+       if (ret < 0)
+               return ret;
+
+       params->hdr_len = data.hdr_len;
+       params->trans_len = data.trans_len;
+
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+                        sizeof(struct kvm_sev_send_update_data)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+/*
+ * KVM_SEV_SEND_UPDATE_DATA: have the firmware process one guest region
+ * (which must not cross a page boundary) and return the resulting packet
+ * header and transport buffer to userspace.
+ *
+ * A zero params.trans_len or params.hdr_len is treated as a length query and
+ * handed to __sev_send_update_data_query_lengths().
+ *
+ * Returns -ENOTTY for a non-SEV guest, -EFAULT on a failed user copy or pin,
+ * -EINVAL on bad parameters, -ENOMEM on allocation failure, otherwise the
+ * result of sev_issue_cmd().
+ */
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_update_data data;
+       struct kvm_sev_send_update_data params;
+       void *hdr, *trans_data;
+       struct page **guest_page;
+       unsigned long n;
+       int ret, offset;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_send_update_data)))
+               return -EFAULT;
+
+       /* userspace wants to query either header or trans length */
+       if (!params.trans_len || !params.hdr_len)
+               return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+       if (!params.trans_uaddr || !params.guest_uaddr ||
+           !params.guest_len || !params.hdr_uaddr)
+               return -EINVAL;
+
+       /*
+        * Check if we are crossing the page boundary.  guest_len is checked
+        * on its own first so that a huge value cannot wrap the sum below
+        * and slip past the test.
+        */
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       if (params.guest_len > PAGE_SIZE ||
+           (params.guest_len + offset > PAGE_SIZE))
+               return -EINVAL;
+
+       /*
+        * Pin guest memory.
+        * NOTE(review): confirm whether sev_pin_memory() signals failure with
+        * NULL or with an ERR_PTR(); only NULL is handled here.
+        */
+       guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+                                   PAGE_SIZE, &n, 0);
+       if (!guest_page)
+               return -EFAULT;
+
+       /*
+        * Allocate memory for header and transport buffer.  Both are zeroed
+        * because their full user-requested length is copied back to
+        * userspace below, regardless of how much the firmware wrote.
+        */
+       ret = -ENOMEM;
+       hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+       if (!hdr)
+               goto e_unpin;
+
+       trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+       if (!trans_data)
+               goto e_free_hdr;
+
+       memset(&data, 0, sizeof(data));
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
+       data.trans_address = __psp_pa(trans_data);
+       data.trans_len = params.trans_len;
+
+       /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+       data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+       data.guest_address |= sev_me_mask;
+       data.guest_len = params.guest_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+       if (ret)
+               goto e_free_trans_data;
+
+       /* copy transport buffer to user space */
+       if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+                        trans_data, params.trans_len)) {
+               ret = -EFAULT;
+               goto e_free_trans_data;
+       }
+
+       /*
+        * Copy packet header to userspace.  copy_to_user() returns the number
+        * of bytes it could not copy, not an errno, so translate a failure
+        * into -EFAULT instead of returning that positive count.
+        */
+       if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+                        params.hdr_len))
+               ret = -EFAULT;
+
+e_free_trans_data:
+       kfree(trans_data);
+e_free_hdr:
+       kfree(hdr);
+e_unpin:
+       sev_unpin_memory(kvm, guest_page, n);
+
+       return ret;
+}
+
+/*
+ * KVM_SEV_SEND_FINISH: issue SEV_CMD_SEND_FINISH for this guest's handle.
+ *
+ * Returns -ENOTTY if kvm is not an SEV guest, otherwise the result of
+ * sev_issue_cmd(), which is also given &argp->error.
+ */
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_finish data;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       /* Only the handle field of the command buffer is filled in. */
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+/*
+ * KVM_SEV_SEND_CANCEL: issue SEV_CMD_SEND_CANCEL for this guest's handle.
+ *
+ * Returns -ENOTTY if kvm is not an SEV guest, otherwise the result of
+ * sev_issue_cmd(), which is also given &argp->error.
+ */
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_cancel data;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       /* Only the handle field of the command buffer is filled in. */
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+/*
+ * KVM_SEV_RECEIVE_START: start an incoming session.
+ *
+ * Copies the PDH and session blobs in from userspace, issues
+ * SEV_CMD_RECEIVE_START on argp->sev_fd, binds the returned handle to this
+ * guest's ASID, writes the handle back to userspace and, on success, records
+ * the handle and sev_fd in the guest's sev_info.
+ *
+ * Returns -ENOTTY for a non-SEV guest, -EFAULT on a failed user copy,
+ * -EINVAL on missing parameters, the error from psp_copy_user_blob(), or the
+ * result of the firmware command / ASID bind.
+ */
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_receive_start start;
+       struct kvm_sev_receive_start params;
+       int *error = &argp->error;
+       void *session_data;
+       void *pdh_data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       /* Get parameter from the userspace */
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_receive_start)))
+               return -EFAULT;
+
+       /* some sanity checks */
+       if (!params.pdh_uaddr || !params.pdh_len ||
+           !params.session_uaddr || !params.session_len)
+               return -EINVAL;
+
+       pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+       if (IS_ERR(pdh_data))
+               return PTR_ERR(pdh_data);
+
+       session_data = psp_copy_user_blob(params.session_uaddr,
+                       params.session_len);
+       if (IS_ERR(session_data)) {
+               ret = PTR_ERR(session_data);
+               goto e_free_pdh;
+       }
+
+       memset(&start, 0, sizeof(start));
+       start.handle = params.handle;
+       start.policy = params.policy;
+       start.pdh_cert_address = __psp_pa(pdh_data);
+       start.pdh_cert_len = params.pdh_len;
+       start.session_address = __psp_pa(session_data);
+       start.session_len = params.session_len;
+
+       /* create memory encryption context */
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+                               error);
+       if (ret)
+               goto e_free_session;
+
+       /* Bind ASID to this guest */
+       ret = sev_bind_asid(kvm, start.handle, error);
+       if (ret)
+               goto e_free_session;
+
+       /* Report the handle to userspace; undo the ASID bind if that fails. */
+       params.handle = start.handle;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data,
+                        &params, sizeof(struct kvm_sev_receive_start))) {
+               ret = -EFAULT;
+               sev_unbind_asid(kvm, start.handle);
+               goto e_free_session;
+       }
+
+       sev->handle = start.handle;
+       sev->fd = argp->sev_fd;
+
+       /* The success path falls through: the blobs are freed either way. */
+e_free_session:
+       kfree(session_data);
+e_free_pdh:
+       kfree(pdh_data);
+
+       return ret;
+}
+
+/*
+ * KVM_SEV_RECEIVE_UPDATE_DATA: hand one packet header and transport buffer
+ * from userspace to the firmware, targeting a guest region that must not
+ * cross a page boundary.
+ *
+ * Returns -EINVAL for a non-SEV guest or bad parameters, -EFAULT on a failed
+ * user copy or pin, the error from psp_copy_user_blob(), otherwise the
+ * result of sev_issue_cmd().
+ */
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct kvm_sev_receive_update_data params;
+       struct sev_data_receive_update_data data;
+       void *hdr = NULL, *trans = NULL;
+       struct page **guest_page;
+       unsigned long n;
+       int ret, offset;
+
+       /*
+        * NOTE(review): the sibling SEV commands return -ENOTTY here; the
+        * value is left unchanged because userspace can observe it.
+        */
+       if (!sev_guest(kvm))
+               return -EINVAL;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_receive_update_data)))
+               return -EFAULT;
+
+       if (!params.hdr_uaddr || !params.hdr_len ||
+           !params.guest_uaddr || !params.guest_len ||
+           !params.trans_uaddr || !params.trans_len)
+               return -EINVAL;
+
+       /*
+        * Check if we are crossing the page boundary.  guest_len is checked
+        * on its own first so that a huge value cannot wrap the sum below
+        * and slip past the test.
+        */
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       if (params.guest_len > PAGE_SIZE ||
+           (params.guest_len + offset > PAGE_SIZE))
+               return -EINVAL;
+
+       hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+       if (IS_ERR(hdr))
+               return PTR_ERR(hdr);
+
+       trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto e_free_hdr;
+       }
+
+       memset(&data, 0, sizeof(data));
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
+       data.trans_address = __psp_pa(trans);
+       data.trans_len = params.trans_len;
+
+       /*
+        * Pin guest memory.
+        * NOTE(review): confirm whether sev_pin_memory() signals failure with
+        * NULL or with an ERR_PTR(); only NULL is handled here.
+        */
+       ret = -EFAULT;
+       guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+                                   PAGE_SIZE, &n, 0);
+       if (!guest_page)
+               goto e_free_trans;
+
+       /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+       data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+       data.guest_address |= sev_me_mask;
+       data.guest_len = params.guest_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+                               &argp->error);
+
+       sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+       kfree(trans);
+e_free_hdr:
+       kfree(hdr);
+
+       return ret;
+}
+
+/*
+ * KVM_SEV_RECEIVE_FINISH: issue SEV_CMD_RECEIVE_FINISH for this guest's
+ * handle.
+ *
+ * Returns -ENOTTY if kvm is not an SEV guest, otherwise the result of
+ * sev_issue_cmd(), which is also given &argp->error.
+ */
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_receive_finish data;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       /* Only the handle field of the command buffer is filled in. */
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
        struct kvm_sev_cmd sev_cmd;
        int r;
 
-       if (!svm_sev_enabled() || !sev)
+       if (!sev_enabled)
                return -ENOTTY;
 
        if (!argp)
@@ -1166,13 +1509,22 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 
        mutex_lock(&kvm->lock);
 
+       /* enc_context_owner handles all memory enc operations */
+       if (is_mirroring_enc_context(kvm)) {
+               r = -EINVAL;
+               goto out;
+       }
+
        switch (sev_cmd.id) {
+       case KVM_SEV_ES_INIT:
+               if (!sev_es_enabled) {
+                       r = -ENOTTY;
+                       goto out;
+               }
+               fallthrough;
        case KVM_SEV_INIT:
                r = sev_guest_init(kvm, &sev_cmd);
                break;
-       case KVM_SEV_ES_INIT:
-               r = sev_es_guest_init(kvm, &sev_cmd);
-               break;
        case KVM_SEV_LAUNCH_START:
                r = sev_launch_start(kvm, &sev_cmd);
                break;
@@ -1203,6 +1555,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
        case KVM_SEV_GET_ATTESTATION_REPORT:
                r = sev_get_attestation_report(kvm, &sev_cmd);
                break;
+       case KVM_SEV_SEND_START:
+               r = sev_send_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_UPDATE_DATA:
+               r = sev_send_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_FINISH:
+               r = sev_send_finish(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_CANCEL:
+               r = sev_send_cancel(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_START:
+               r = sev_receive_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_UPDATE_DATA:
+               r = sev_receive_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_FINISH:
+               r = sev_receive_finish(kvm, &sev_cmd);
+               break;
        default:
                r = -EINVAL;
                goto out;
@@ -1226,6 +1599,10 @@ int svm_register_enc_region(struct kvm *kvm,
        if (!sev_guest(kvm))
                return -ENOTTY;
 
+       /* If kvm is mirroring encryption context it isn't responsible for it */
+       if (is_mirroring_enc_context(kvm))
+               return -EINVAL;
+
        if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
                return -EINVAL;
 
@@ -1292,6 +1669,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
        struct enc_region *region;
        int ret;
 
+       /* If kvm is mirroring encryption context it isn't responsible for it */
+       if (is_mirroring_enc_context(kvm))
+               return -EINVAL;
+
        mutex_lock(&kvm->lock);
 
        if (!sev_guest(kvm)) {
@@ -1322,6 +1703,71 @@ failed:
        return ret;
 }
 
+/*
+ * Make @kvm a mirror of the SEV encryption context owned by the VM behind
+ * @source_fd: @kvm takes a reference on the source VM, copies its ASID and
+ * is marked SEV-active.
+ *
+ * Returns 0 on success, -EBADF if @source_fd is not an open KVM VM file, and
+ * -EINVAL if the source is not an SEV guest, is itself a mirror, is @kvm
+ * itself, or if @kvm is already an SEV guest.
+ */
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+       struct file *source_kvm_file;
+       struct kvm *source_kvm;
+       struct kvm_sev_info *mirror_sev;
+       unsigned int asid;
+       int ret;
+
+       /*
+        * fget() returns NULL for a bad descriptor.  Bail out before the
+        * shared error path, which would otherwise call fput(NULL).
+        */
+       source_kvm_file = fget(source_fd);
+       if (!source_kvm_file)
+               return -EBADF;
+
+       if (!file_is_kvm(source_kvm_file)) {
+               ret = -EBADF;
+               goto e_source_put;
+       }
+
+       source_kvm = source_kvm_file->private_data;
+       mutex_lock(&source_kvm->lock);
+
+       if (!sev_guest(source_kvm)) {
+               ret = -EINVAL;
+               goto e_source_unlock;
+       }
+
+       /* Mirrors of mirrors should work, but let's not get silly */
+       if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+               ret = -EINVAL;
+               goto e_source_unlock;
+       }
+
+       asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+       /*
+        * The mirror kvm holds an enc_context_owner ref so its asid can't
+        * disappear until we're done with it
+        */
+       kvm_get_kvm(source_kvm);
+
+       /* The VM reference taken above replaces the file reference. */
+       fput(source_kvm_file);
+       mutex_unlock(&source_kvm->lock);
+       mutex_lock(&kvm->lock);
+
+       if (sev_guest(kvm)) {
+               ret = -EINVAL;
+               goto e_mirror_unlock;
+       }
+
+       /* Set enc_context_owner and copy its encryption context over */
+       mirror_sev = &to_kvm_svm(kvm)->sev_info;
+       mirror_sev->enc_context_owner = source_kvm;
+       mirror_sev->asid = asid;
+       mirror_sev->active = true;
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+
+e_mirror_unlock:
+       mutex_unlock(&kvm->lock);
+       kvm_put_kvm(source_kvm);
+       return ret;
+e_source_unlock:
+       mutex_unlock(&source_kvm->lock);
+e_source_put:
+       fput(source_kvm_file);
+       return ret;
+}
+
 void sev_vm_destroy(struct kvm *kvm)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1331,6 +1777,12 @@ void sev_vm_destroy(struct kvm *kvm)
        if (!sev_guest(kvm))
                return;
 
+       /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+       if (is_mirroring_enc_context(kvm)) {
+               kvm_put_kvm(sev->enc_context_owner);
+               return;
+       }
+
        mutex_lock(&kvm->lock);
 
        /*
@@ -1358,12 +1810,24 @@ void sev_vm_destroy(struct kvm *kvm)
        sev_asid_free(sev);
 }
 
+/*
+ * Clear the SEV and SEV-ES feature bits from KVM's CPU capabilities when
+ * the corresponding sev_enabled / sev_es_enabled flag is off.
+ */
+void __init sev_set_cpu_caps(void)
+{
+       if (!sev_enabled)
+               kvm_cpu_cap_clear(X86_FEATURE_SEV);
+       if (!sev_es_enabled)
+               kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+}
+
 void __init sev_hardware_setup(void)
 {
+#ifdef CONFIG_KVM_AMD_SEV
        unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
        bool sev_es_supported = false;
        bool sev_supported = false;
 
+       if (!sev_enabled || !npt_enabled)
+               goto out;
+
        /* Does the CPU support SEV? */
        if (!boot_cpu_has(X86_FEATURE_SEV))
                goto out;
@@ -1376,12 +1840,12 @@ void __init sev_hardware_setup(void)
 
        /* Maximum number of encrypted guests supported simultaneously */
        max_sev_asid = ecx;
-
-       if (!svm_sev_enabled())
+       if (!max_sev_asid)
                goto out;
 
        /* Minimum ASID value that should be used for SEV guest */
        min_sev_asid = edx;
+       sev_me_mask = 1UL << (ebx & 0x3f);
 
        /* Initialize SEV ASID bitmaps */
        sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
@@ -1389,8 +1853,11 @@ void __init sev_hardware_setup(void)
                goto out;
 
        sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
-       if (!sev_reclaim_asid_bitmap)
+       if (!sev_reclaim_asid_bitmap) {
+               bitmap_free(sev_asid_bitmap);
+               sev_asid_bitmap = NULL;
                goto out;
+       }
 
        sev_asid_count = max_sev_asid - min_sev_asid + 1;
        if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
@@ -1400,7 +1867,7 @@ void __init sev_hardware_setup(void)
        sev_supported = true;
 
        /* SEV-ES support requested? */
-       if (!sev_es)
+       if (!sev_es_enabled)
                goto out;
 
        /* Does the CPU support SEV-ES? */
@@ -1419,21 +1886,36 @@ void __init sev_hardware_setup(void)
        sev_es_supported = true;
 
 out:
-       sev = sev_supported;
-       sev_es = sev_es_supported;
+       sev_enabled = sev_supported;
+       sev_es_enabled = sev_es_supported;
+#endif
 }
 
 void sev_hardware_teardown(void)
 {
-       if (!svm_sev_enabled())
+       if (!sev_enabled)
                return;
 
+       /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+       sev_flush_asids(0, max_sev_asid);
+
        bitmap_free(sev_asid_bitmap);
        bitmap_free(sev_reclaim_asid_bitmap);
+
        misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
        misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+}
 
-       sev_flush_asids();
+/*
+ * Allocate sd->sev_vmcbs, a zero-initialised array of max_sev_asid + 1
+ * pointers.  Does nothing when SEV is disabled.
+ *
+ * Returns 0 on success (or when SEV is disabled), -ENOMEM on allocation
+ * failure.
+ */
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+       if (!sev_enabled)
+               return 0;
+
+       /* One slot per ASID value 0..max_sev_asid inclusive. */
+       sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL);
+       if (!sd->sev_vmcbs)
+               return -ENOMEM;
+
+       return 0;
 }
 
 /*
@@ -1825,7 +2307,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
                               len, GHCB_SCRATCH_AREA_LIMIT);
                        return false;
                }
-               scratch_va = kzalloc(len, GFP_KERNEL);
+               scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
                if (!scratch_va)
                        return false;
 
@@ -1899,7 +2381,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
                vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
                vcpu->arch.regs[VCPU_REGS_RCX] = 0;
 
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
                if (!ret) {
                        ret = -EINVAL;
                        break;
@@ -1949,8 +2431,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
        return ret;
 }
 
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        u64 ghcb_gpa, exit_code;
        struct ghcb *ghcb;
@@ -1962,13 +2445,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                return sev_handle_vmgexit_msr_protocol(svm);
 
        if (!ghcb_gpa) {
-               vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+               vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
                return -EINVAL;
        }
 
-       if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+       if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
                /* Unable to map GHCB from guest */
-               vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+               vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
                            ghcb_gpa);
                return -EINVAL;
        }
@@ -1976,7 +2459,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
        svm->ghcb = svm->ghcb_map.hva;
        ghcb = svm->ghcb_map.hva;
 
-       trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+       trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
 
        exit_code = ghcb_get_sw_exit_code(ghcb);
 
@@ -1994,7 +2477,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_read(&svm->vcpu,
+               ret = kvm_sev_es_mmio_read(vcpu,
                                           control->exit_info_1,
                                           control->exit_info_2,
                                           svm->ghcb_sa);
@@ -2003,19 +2486,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_write(&svm->vcpu,
+               ret = kvm_sev_es_mmio_write(vcpu,
                                            control->exit_info_1,
                                            control->exit_info_2,
                                            svm->ghcb_sa);
                break;
        case SVM_VMGEXIT_NMI_COMPLETE:
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
                break;
        case SVM_VMGEXIT_AP_HLT_LOOP:
-               ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+               ret = kvm_emulate_ap_reset_hold(vcpu);
                break;
        case SVM_VMGEXIT_AP_JUMP_TABLE: {
-               struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+               struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
 
                switch (control->exit_info_1) {
                case 0:
@@ -2040,12 +2523,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                break;
        }
        case SVM_VMGEXIT_UNSUPPORTED_EVENT:
-               vcpu_unimpl(&svm->vcpu,
+               vcpu_unimpl(vcpu,
                            "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
                            control->exit_info_1, control->exit_info_2);
                break;
        default:
-               ret = svm_invoke_exit_handler(svm, exit_code);
+               ret = svm_invoke_exit_handler(vcpu, exit_code);
        }
 
        return ret;
@@ -2154,5 +2637,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
         * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
         * non-zero value.
         */
+       if (!svm->ghcb)
+               return;
+
        ghcb_set_sw_exit_info_2(svm->ghcb, 1);
 }
index 6dad892..9790c73 100644 (file)
@@ -56,9 +56,6 @@ static const struct x86_cpu_id svm_cpu_id[] = {
 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #endif
 
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
@@ -95,6 +92,8 @@ static const struct svm_direct_access_msrs {
 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
        { .index = MSR_STAR,                            .always = true  },
        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
+       { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
+       { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
 #ifdef CONFIG_X86_64
        { .index = MSR_GS_BASE,                         .always = true  },
        { .index = MSR_FS_BASE,                         .always = true  },
@@ -186,14 +185,6 @@ module_param(vls, int, 0444);
 static int vgif = true;
 module_param(vgif, int, 0444);
 
-/* enable/disable SEV support */
-int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
-module_param(sev, int, 0444);
-
-/* enable/disable SEV-ES support */
-int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
-module_param(sev_es, int, 0444);
-
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
@@ -214,6 +205,15 @@ struct kvm_ldttss_desc {
 
 DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+#define TSC_AUX_URET_SLOT      0
+
 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 
 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
@@ -279,7 +279,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                         * In this case we will return to the nested guest
                         * as soon as we leave SMM.
                         */
-                       if (!is_smm(&svm->vcpu))
+                       if (!is_smm(vcpu))
                                svm_free_nested(svm);
 
                } else {
@@ -363,10 +363,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
        bool has_error_code = vcpu->arch.exception.has_error_code;
        u32 error_code = vcpu->arch.exception.error_code;
 
-       kvm_deliver_exception_payload(&svm->vcpu);
+       kvm_deliver_exception_payload(vcpu);
 
        if (nr == BP_VECTOR && !nrips) {
-               unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+               unsigned long rip, old_rip = kvm_rip_read(vcpu);
 
                /*
                 * For guest debugging where we have to reinject #BP if some
@@ -375,8 +375,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
                 * raises a fault that is not intercepted. Still better than
                 * failing in all cases.
                 */
-               (void)skip_emulated_instruction(&svm->vcpu);
-               rip = kvm_rip_read(&svm->vcpu);
+               (void)skip_emulated_instruction(vcpu);
+               rip = kvm_rip_read(vcpu);
                svm->int3_rip = rip + svm->vmcb->save.cs.base;
                svm->int3_injected = rip - old_rip;
        }
@@ -553,23 +553,21 @@ static void svm_cpu_uninit(int cpu)
 static int svm_cpu_init(int cpu)
 {
        struct svm_cpu_data *sd;
+       int ret = -ENOMEM;
 
        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
        if (!sd)
-               return -ENOMEM;
+               return ret;
        sd->cpu = cpu;
        sd->save_area = alloc_page(GFP_KERNEL);
        if (!sd->save_area)
                goto free_cpu_data;
+
        clear_page(page_address(sd->save_area));
 
-       if (svm_sev_enabled()) {
-               sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
-                                             sizeof(void *),
-                                             GFP_KERNEL);
-               if (!sd->sev_vmcbs)
-                       goto free_save_area;
-       }
+       ret = sev_cpu_init(sd);
+       if (ret)
+               goto free_save_area;
 
        per_cpu(svm_data, cpu) = sd;
 
@@ -579,7 +577,7 @@ free_save_area:
        __free_page(sd->save_area);
 free_cpu_data:
        kfree(sd);
-       return -ENOMEM;
+       return ret;
 
 }
 
@@ -681,14 +679,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 
 u32 *svm_vcpu_alloc_msrpm(void)
 {
-       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+       unsigned int order = get_order(MSRPM_SIZE);
+       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
        u32 *msrpm;
 
        if (!pages)
                return NULL;
 
        msrpm = page_address(pages);
-       memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+       memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 
        return msrpm;
 }
@@ -707,7 +706,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 
 void svm_vcpu_free_msrpm(u32 *msrpm)
 {
-       __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+       __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 }
 
 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
@@ -881,20 +880,20 @@ static __init void svm_adjust_mmio_mask(void)
         */
        mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
 
-       kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
 }
 
 static void svm_hardware_teardown(void)
 {
        int cpu;
 
-       if (svm_sev_enabled())
-               sev_hardware_teardown();
+       sev_hardware_teardown();
 
        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
 
-       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+       get_order(IOPM_SIZE));
        iopm_base = 0;
 }
 
@@ -922,6 +921,9 @@ static __init void svm_set_cpu_caps(void)
        if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
            boot_cpu_has(X86_FEATURE_AMD_SSBD))
                kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+       /* CPUID 0x8000001F (SME/SEV features) */
+       sev_set_cpu_caps();
 }
 
 static __init int svm_hardware_setup(void)
@@ -930,14 +932,15 @@ static __init int svm_hardware_setup(void)
        struct page *iopm_pages;
        void *iopm_va;
        int r;
+       unsigned int order = get_order(IOPM_SIZE);
 
-       iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+       iopm_pages = alloc_pages(GFP_KERNEL, order);
 
        if (!iopm_pages)
                return -ENOMEM;
 
        iopm_va = page_address(iopm_pages);
-       memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
        init_msrpm_offsets();
@@ -956,6 +959,9 @@ static __init int svm_hardware_setup(void)
                kvm_tsc_scaling_ratio_frac_bits = 32;
        }
 
+       if (boot_cpu_has(X86_FEATURE_RDTSCP))
+               kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+
        /* Check for pause filtering support */
        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
                pause_filter_count = 0;
@@ -969,21 +975,6 @@ static __init int svm_hardware_setup(void)
                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
        }
 
-       if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) {
-               sev_hardware_setup();
-       } else {
-               sev = false;
-               sev_es = false;
-       }
-
-       svm_adjust_mmio_mask();
-
-       for_each_possible_cpu(cpu) {
-               r = svm_cpu_init(cpu);
-               if (r)
-                       goto err;
-       }
-
        /*
         * KVM's MMU doesn't support using 2-level paging for itself, and thus
         * NPT isn't supported if the host is using 2-level paging since host
@@ -998,6 +989,17 @@ static __init int svm_hardware_setup(void)
        kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
+       /* Note, SEV setup consumes npt_enabled. */
+       sev_hardware_setup();
+
+       svm_adjust_mmio_mask();
+
+       for_each_possible_cpu(cpu) {
+               r = svm_cpu_init(cpu);
+               if (r)
+                       goto err;
+       }
+
        if (nrips) {
                if (!boot_cpu_has(X86_FEATURE_NRIPS))
                        nrips = false;
@@ -1084,8 +1086,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        if (is_guest_mode(vcpu)) {
                /* Write L1's TSC offset.  */
                g_tsc_offset = svm->vmcb->control.tsc_offset -
-                              svm->nested.hsave->control.tsc_offset;
-               svm->nested.hsave->control.tsc_offset = offset;
+                              svm->vmcb01.ptr->control.tsc_offset;
+               svm->vmcb01.ptr->control.tsc_offset = offset;
        }
 
        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1113,12 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
        }
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       svm->vcpu.arch.hflags = 0;
+       vcpu->arch.hflags = 0;
 
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1129,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+       if (!kvm_vcpu_apicv_active(vcpu))
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 
        set_dr_intercepts(svm);
@@ -1170,12 +1173,12 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);
 
-       if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_mwait_in_guest(vcpu->kvm)) {
                svm_set_intercept(svm, INTERCEPT_MONITOR);
                svm_set_intercept(svm, INTERCEPT_MWAIT);
        }
 
-       if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+       if (!kvm_hlt_in_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_HLT);
 
        control->iopm_base_pa = __sme_set(iopm_base);
@@ -1201,19 +1204,19 @@ static void init_vmcb(struct vcpu_svm *svm)
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_cr4(&svm->vcpu, 0);
-       svm_set_efer(&svm->vcpu, 0);
+       svm_set_cr4(vcpu, 0);
+       svm_set_efer(vcpu, 0);
        save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        save->rip = 0x0000fff0;
-       svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
 
        /*
         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         * It also updates the guest-visible cr0 value.
         */
-       svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(&svm->vcpu);
+       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+       kvm_mmu_reset_context(vcpu);
 
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
@@ -1225,17 +1228,18 @@ static void init_vmcb(struct vcpu_svm *svm)
                clr_exception_intercept(svm, PF_VECTOR);
                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
-               save->g_pat = svm->vcpu.arch.pat;
+               save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
-       svm->asid_generation = 0;
+       svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
        svm->nested.vmcb12_gpa = 0;
-       svm->vcpu.arch.hflags = 0;
+       svm->nested.last_vmcb12_gpa = 0;
+       vcpu->arch.hflags = 0;
 
-       if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
@@ -1246,18 +1250,15 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm_check_invpcid(svm);
 
-       if (kvm_vcpu_apicv_active(&svm->vcpu))
-               avic_init_vmcb(svm);
-
        /*
-        * If hardware supports Virtual VMLOAD VMSAVE then enable it
-        * in VMCB and clear intercepts to avoid #VMEXIT.
+        * If the host supports V_SPEC_CTRL then disable the interception
+        * of MSR_IA32_SPEC_CTRL.
         */
-       if (vls) {
-               svm_clr_intercept(svm, INTERCEPT_VMLOAD);
-               svm_clr_intercept(svm, INTERCEPT_VMSAVE);
-               svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-       }
+       if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_init_vmcb(svm);
 
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1265,11 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm)
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }
 
-       if (sev_guest(svm->vcpu.kvm)) {
+       if (sev_guest(vcpu->kvm)) {
                svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
                clr_exception_intercept(svm, UD_VECTOR);
 
-               if (sev_es_guest(svm->vcpu.kvm)) {
+               if (sev_es_guest(vcpu->kvm)) {
                        /* Perform SEV-ES specific VMCB updates */
                        sev_es_init_vmcb(svm);
                }
@@ -1291,12 +1292,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        svm->virt_spec_ctrl = 0;
 
        if (!init_event) {
-               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                          MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                      MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(vcpu))
+                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        }
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
@@ -1305,10 +1306,16 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+       svm->current_vmcb = target_vmcb;
+       svm->vmcb = target_vmcb->ptr;
+}
+
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm;
-       struct page *vmcb_page;
+       struct page *vmcb01_page;
        struct page *vmsa_page = NULL;
        int err;
 
@@ -1316,11 +1323,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
        svm = to_svm(vcpu);
 
        err = -ENOMEM;
-       vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!vmcb_page)
+       vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb01_page)
                goto out;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests require a separate VMSA page used to contain
                 * the encrypted register state of the guest.
@@ -1356,20 +1363,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
        svm_vcpu_init_msrpm(vcpu, svm->msrpm);
 
-       svm->vmcb = page_address(vmcb_page);
-       svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+       svm->vmcb01.ptr = page_address(vmcb01_page);
+       svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
 
        if (vmsa_page)
                svm->vmsa = page_address(vmsa_page);
 
-       svm->asid_generation = 0;
        svm->guest_state_loaded = false;
-       init_vmcb(svm);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       init_vmcb(vcpu);
 
        svm_init_osvw(vcpu);
        vcpu->arch.microcode_version = 0x01000065;
 
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                /* Perform SEV-ES specific VMCB creation updates */
                sev_es_create_vcpu(svm);
 
@@ -1379,7 +1387,7 @@ error_free_vmsa_page:
        if (vmsa_page)
                __free_page(vmsa_page);
 error_free_vmcb_page:
-       __free_page(vmcb_page);
+       __free_page(vmcb01_page);
 out:
        return err;
 }
@@ -1407,32 +1415,23 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
        sev_free_vcpu(vcpu);
 
-       __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
-       __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+       __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+       __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
-       unsigned int i;
 
        if (svm->guest_state_loaded)
                return;
 
-       /*
-        * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
-        * area (non-sev-es). Save ones that aren't so we can restore them
-        * individually later.
-        */
-       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
        /*
         * Save additional host state that will be restored on VMEXIT (sev-es)
         * or subsequent vmload of host save area.
         */
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                sev_es_prepare_guest_switch(svm, vcpu->cpu);
        } else {
                vmsave(__sme_page_pa(sd->save_area));
@@ -1446,29 +1445,15 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
                }
        }
 
-       /* This assumes that the kernel never uses MSR_TSC_AUX */
        if (static_cpu_has(X86_FEATURE_RDTSCP))
-               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+               kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
 
        svm->guest_state_loaded = true;
 }
 
 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       unsigned int i;
-
-       if (!svm->guest_state_loaded)
-               return;
-
-       /*
-        * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
-        * area (non-sev-es). Restore the ones that weren't.
-        */
-       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-               wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
-       svm->guest_state_loaded = false;
+       to_svm(vcpu)->guest_state_loaded = false;
 }
 
 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1476,11 +1461,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-       if (unlikely(cpu != vcpu->cpu)) {
-               svm->asid_generation = 0;
-               vmcb_mark_all_dirty(svm->vmcb);
-       }
-
        if (sd->current_vmcb != svm->vmcb) {
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
@@ -1564,7 +1544,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= mask;
        if (is_guest_mode(&svm->vcpu)) {
-               svm->nested.hsave->control.int_ctl &= mask;
+               svm->vmcb01.ptr->control.int_ctl &= mask;
 
                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -1577,16 +1557,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+       struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
 
        switch (seg) {
        case VCPU_SREG_CS: return &save->cs;
        case VCPU_SREG_DS: return &save->ds;
        case VCPU_SREG_ES: return &save->es;
-       case VCPU_SREG_FS: return &save->fs;
-       case VCPU_SREG_GS: return &save->gs;
+       case VCPU_SREG_FS: return &save01->fs;
+       case VCPU_SREG_GS: return &save01->gs;
        case VCPU_SREG_SS: return &save->ss;
-       case VCPU_SREG_TR: return &save->tr;
-       case VCPU_SREG_LDTR: return &save->ldtr;
+       case VCPU_SREG_TR: return &save01->tr;
+       case VCPU_SREG_LDTR: return &save01->ldtr;
        }
        BUG();
        return NULL;
@@ -1709,37 +1690,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
-       ulong gcr0;
-       u64 *hcr0;
-
-       /*
-        * SEV-ES guests must always keep the CR intercepts cleared. CR
-        * tracking is done using the CR write traps.
-        */
-       if (sev_es_guest(svm->vcpu.kvm))
-               return;
-
-       gcr0 = svm->vcpu.arch.cr0;
-       hcr0 = &svm->vmcb->save.cr0;
-       *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-               | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
-       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
-       if (gcr0 == *hcr0) {
-               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
-               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
-       } else {
-               svm_set_intercept(svm, INTERCEPT_CR0_READ);
-               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
-       }
-}
-
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       u64 hcr0 = cr0;
 
 #ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1757,7 +1711,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vcpu->arch.cr0 = cr0;
 
        if (!npt_enabled)
-               cr0 |= X86_CR0_PG | X86_CR0_WP;
+               hcr0 |= X86_CR0_PG | X86_CR0_WP;
 
        /*
         * re-enable caching here because the QEMU bios
@@ -1765,10 +1719,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         * reboot
         */
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-               cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-       svm->vmcb->save.cr0 = cr0;
+               hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+       svm->vmcb->save.cr0 = hcr0;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-       update_cr0_intercept(svm);
+
+       /*
+        * SEV-ES guests must always keep the CR intercepts cleared. CR
+        * tracking is done using the CR write traps.
+        */
+       if (sev_es_guest(vcpu->kvm))
+               return;
+
+       if (hcr0 == cr0) {
+               /* Selective CR0 write remains on.  */
+               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+       } else {
+               svm_set_intercept(svm, INTERCEPT_CR0_READ);
+               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+       }
 }
 
 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1847,7 +1817,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }
 
-       svm->asid_generation = sd->asid_generation;
+       svm->current_vmcb->asid_generation = sd->asid_generation;
        svm->asid = sd->next_asid++;
 }
 
@@ -1896,39 +1866,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
 }
 
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
 {
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
-       return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+       return kvm_handle_page_fault(vcpu, error_code, fault_address,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+
        u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
-       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+       return kvm_mmu_page_fault(vcpu, fault_address, error_code,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!(svm->vcpu.guest_debug &
+       if (!(vcpu->guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
-               kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+               kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
                return 1;
        }
 
@@ -1938,7 +1912,7 @@ static int db_interception(struct vcpu_svm *svm)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       if (svm->vcpu.guest_debug &
+       if (vcpu->guest_debug &
            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1926,10 @@ static int db_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
 
        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1937,14 @@ static int bp_interception(struct vcpu_svm *svm)
        return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
 {
-       return handle_ud(&svm->vcpu);
+       return handle_ud(vcpu);
 }
 
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+       kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
        return 1;
 }
 
@@ -2012,7 +1987,7 @@ static bool is_erratum_383(void)
        return true;
 }
 
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
 {
        if (is_erratum_383()) {
                /*
@@ -2021,7 +1996,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
                 */
                pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 
                return;
        }
@@ -2033,20 +2008,21 @@ static void svm_handle_mce(struct vcpu_svm *svm)
        kvm_machine_check();
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        /*
         * The VM save area has already been encrypted so it
         * cannot be reinitialized - just terminate.
         */
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                return -EINVAL;
 
        /*
@@ -2054,20 +2030,20 @@ static int shutdown_interception(struct vcpu_svm *svm)
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
        unsigned port;
 
-       ++svm->vcpu.stat.io_exits;
+       ++vcpu->stat.io_exits;
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        port = io_info >> 16;
@@ -2082,93 +2058,69 @@ static int io_interception(struct vcpu_svm *svm)
 
        svm->next_rip = svm->vmcb->control.exit_info_2;
 
-       return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
-       return 1;
+       return kvm_fast_pio(vcpu, size, port, in);
 }
 
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
 {
-       ++svm->vcpu.stat.irq_exits;
        return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
 {
+       ++vcpu->stat.irq_exits;
        return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
 {
-       return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
-       struct vmcb *nested_vmcb;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb12;
        struct kvm_host_map map;
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
        if (ret) {
                if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
-       nested_vmcb = map.hva;
+       vmcb12 = map.hva;
+
+       ret = kvm_skip_emulated_instruction(vcpu);
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       if (vmload) {
+               nested_svm_vmloadsave(vmcb12, svm->vmcb);
+               svm->sysenter_eip_hi = 0;
+               svm->sysenter_esp_hi = 0;
+       } else
+               nested_svm_vmloadsave(svm->vmcb, vmcb12);
 
-       nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
 {
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       int ret;
-
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-       if (ret) {
-               if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
-
-       nested_vmcb = map.hva;
-
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-       nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       return vmload_vmsave_interception(vcpu, true);
+}
 
-       return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+       return vmload_vmsave_interception(vcpu, false);
 }
 
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
 {
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       return nested_svm_vmrun(svm);
+       return nested_svm_vmrun(vcpu);
 }
 
 enum {
@@ -2207,7 +2159,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
        };
-       int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+       int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
                [SVM_INSTR_VMRUN] = vmrun_interception,
                [SVM_INSTR_VMLOAD] = vmload_interception,
                [SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2168,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
        int ret;
 
        if (is_guest_mode(vcpu)) {
-               svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
-               svm->vmcb->control.exit_info_1 = 0;
-               svm->vmcb->control.exit_info_2 = 0;
-
                /* Returns '1' or -errno on failure, '0' on success. */
-               ret = nested_svm_vmexit(svm);
+               ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
                if (ret)
                        return ret;
                return 1;
        }
-       return svm_instr_handlers[opcode](svm);
+       return svm_instr_handlers[opcode](vcpu);
 }
 
 /*
@@ -2237,9 +2185,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
  *      regions (e.g. SMM memory on host).
  *   2) VMware backdoor
  */
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 error_code = svm->vmcb->control.exit_info_1;
        int opcode;
 
@@ -2304,73 +2252,58 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
        }
 }
 
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, true);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), true);
        return ret;
 }
 
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, false);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), false);
        return ret;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-
-       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
-                         kvm_rax_read(&svm->vcpu));
-
-       /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-       kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
+       gva_t gva = kvm_rax_read(vcpu);
+       u32 asid = kvm_rcx_read(vcpu);
 
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+       /* FIXME: Handle an address size prefix. */
+       if (!is_long_mode(vcpu))
+               gva = (u32)gva;
 
-static int skinit_interception(struct vcpu_svm *svm)
-{
-       trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
+       trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
 
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
+       /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+       kvm_mmu_invlpg(vcpu, gva);
 
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wbinvd(&svm->vcpu);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
 {
-       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
-       u32 index = kvm_rcx_read(&svm->vcpu);
+       trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
 
-       int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
 
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u16 tss_selector;
        int reason;
        int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2332,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
        if (reason == TASK_SWITCH_GATE) {
                switch (type) {
                case SVM_EXITINTINFO_TYPE_NMI:
-                       svm->vcpu.arch.nmi_injected = false;
+                       vcpu->arch.nmi_injected = false;
                        break;
                case SVM_EXITINTINFO_TYPE_EXEPT:
                        if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2341,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
                                error_code =
                                        (u32)svm->vmcb->control.exit_info_2;
                        }
-                       kvm_clear_exception_queue(&svm->vcpu);
+                       kvm_clear_exception_queue(vcpu);
                        break;
                case SVM_EXITINTINFO_TYPE_INTR:
-                       kvm_clear_interrupt_queue(&svm->vcpu);
+                       kvm_clear_interrupt_queue(vcpu);
                        break;
                default:
                        break;
@@ -2422,77 +2355,58 @@ static int task_switch_interception(struct vcpu_svm *svm)
            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-               if (!skip_emulated_instruction(&svm->vcpu))
+               if (!skip_emulated_instruction(vcpu))
                        return 0;
        }
 
        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
                int_vec = -1;
 
-       return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+       return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
                               has_error_code, error_code);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_cpuid(&svm->vcpu);
-}
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-static int iret_interception(struct vcpu_svm *svm)
-{
-       ++svm->vcpu.stat.nmi_window_exits;
-       svm->vcpu.arch.hflags |= HF_IRET_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       ++vcpu->stat.nmi_window_exits;
+       vcpu->arch.hflags |= HF_IRET_MASK;
+       if (!sev_es_guest(vcpu->kvm)) {
                svm_clr_intercept(svm, INTERCEPT_IRET);
-               svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+               svm->nmi_iret_rip = kvm_rip_read(vcpu);
        }
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
 }
 
-static int invd_interception(struct vcpu_svm *svm)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
 {
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return kvm_emulate_instruction(&svm->vcpu, 0);
+               return kvm_emulate_instruction(vcpu, 0);
 
-       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
-       return kvm_skip_emulated_instruction(&svm->vcpu);
+       kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int emulate_on_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction(&svm->vcpu, 0);
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
-static int rsm_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+       return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
 }
 
-static int rdpmc_interception(struct vcpu_svm *svm)
-{
-       int err;
-
-       if (!nrips)
-               return emulate_on_interception(svm);
-
-       err = kvm_rdpmc(&svm->vcpu);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
                                            unsigned long val)
 {
-       unsigned long cr0 = svm->vcpu.arch.cr0;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long cr0 = vcpu->arch.cr0;
        bool ret = false;
 
-       if (!is_guest_mode(&svm->vcpu) ||
+       if (!is_guest_mode(vcpu) ||
            (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;
 
@@ -2509,17 +2423,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
 
 #define CR_VALID (1ULL << 63)
 
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, cr;
        unsigned long val;
        int err;
 
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2445,61 @@ static int cr_interception(struct vcpu_svm *svm)
        err = 0;
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       if (!check_selective_cr0_intercepted(svm, val))
-                               err = kvm_set_cr0(&svm->vcpu, val);
+                       if (!check_selective_cr0_intercepted(vcpu, val))
+                               err = kvm_set_cr0(vcpu, val);
                        else
                                return 1;
 
                        break;
                case 3:
-                       err = kvm_set_cr3(&svm->vcpu, val);
+                       err = kvm_set_cr3(vcpu, val);
                        break;
                case 4:
-                       err = kvm_set_cr4(&svm->vcpu, val);
+                       err = kvm_set_cr4(vcpu, val);
                        break;
                case 8:
-                       err = kvm_set_cr8(&svm->vcpu, val);
+                       err = kvm_set_cr8(vcpu, val);
                        break;
                default:
                        WARN(1, "unhandled write to CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
        } else { /* mov from cr */
                switch (cr) {
                case 0:
-                       val = kvm_read_cr0(&svm->vcpu);
+                       val = kvm_read_cr0(vcpu);
                        break;
                case 2:
-                       val = svm->vcpu.arch.cr2;
+                       val = vcpu->arch.cr2;
                        break;
                case 3:
-                       val = kvm_read_cr3(&svm->vcpu);
+                       val = kvm_read_cr3(vcpu);
                        break;
                case 4:
-                       val = kvm_read_cr4(&svm->vcpu);
+                       val = kvm_read_cr4(vcpu);
                        break;
                case 8:
-                       val = kvm_get_cr8(&svm->vcpu);
+                       val = kvm_get_cr8(vcpu);
                        break;
                default:
                        WARN(1, "unhandled read from CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_register_write(vcpu, reg, val);
                trace_kvm_cr_read(cr, val);
        }
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_value, new_value;
        unsigned int cr;
        int ret = 0;
@@ -2606,7 +2521,7 @@ static int cr_trap(struct vcpu_svm *svm)
                kvm_post_set_cr4(vcpu, old_value, new_value);
                break;
        case 8:
-               ret = kvm_set_cr8(&svm->vcpu, new_value);
+               ret = kvm_set_cr8(vcpu, new_value);
                break;
        default:
                WARN(1, "unhandled CR%d write trap", cr);
@@ -2617,57 +2532,57 @@ static int cr_trap(struct vcpu_svm *svm)
        return kvm_complete_insn_gp(vcpu, ret);
 }
 
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, dr;
        unsigned long val;
        int err = 0;
 
-       if (svm->vcpu.guest_debug == 0) {
+       if (vcpu->guest_debug == 0) {
                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                clr_dr_intercepts(svm);
-               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }
 
        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
        if (dr >= 16) { /* mov to DRn  */
                dr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
-               err = kvm_set_dr(&svm->vcpu, dr, val);
+               val = kvm_register_read(vcpu, reg);
+               err = kvm_set_dr(vcpu, dr, val);
        } else {
-               kvm_get_dr(&svm->vcpu, dr, &val);
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_get_dr(vcpu, dr, &val);
+               kvm_register_write(vcpu, reg, val);
        }
 
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
        int r;
 
-       u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+       u8 cr8_prev = kvm_get_cr8(vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       r = cr_interception(svm);
-       if (lapic_in_kernel(&svm->vcpu))
+       r = cr_interception(vcpu);
+       if (lapic_in_kernel(vcpu))
                return r;
-       if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+       if (cr8_prev <= kvm_get_cr8(vcpu))
                return r;
-       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
 }
 
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
 {
        struct msr_data msr_info;
        int ret;
@@ -2680,10 +2595,10 @@ static int efer_trap(struct vcpu_svm *svm)
         */
        msr_info.host_initiated = false;
        msr_info.index = MSR_EFER;
-       msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
-       ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+       msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+       ret = kvm_set_msr_common(vcpu, &msr_info);
 
-       return kvm_complete_insn_gp(&svm->vcpu, ret);
+       return kvm_complete_insn_gp(vcpu, ret);
 }
 
 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2710,34 +2625,41 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
        switch (msr_info->index) {
        case MSR_STAR:
-               msr_info->data = svm->vmcb->save.star;
+               msr_info->data = svm->vmcb01.ptr->save.star;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               msr_info->data = svm->vmcb->save.lstar;
+               msr_info->data = svm->vmcb01.ptr->save.lstar;
                break;
        case MSR_CSTAR:
-               msr_info->data = svm->vmcb->save.cstar;
+               msr_info->data = svm->vmcb01.ptr->save.cstar;
                break;
        case MSR_KERNEL_GS_BASE:
-               msr_info->data = svm->vmcb->save.kernel_gs_base;
+               msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
                break;
        case MSR_SYSCALL_MASK:
-               msr_info->data = svm->vmcb->save.sfmask;
+               msr_info->data = svm->vmcb01.ptr->save.sfmask;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               msr_info->data = svm->vmcb->save.sysenter_cs;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
                break;
        case MSR_IA32_SYSENTER_EIP:
-               msr_info->data = svm->sysenter_eip;
+               msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+               if (guest_cpuid_is_intel(vcpu))
+                       msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
                break;
        case MSR_IA32_SYSENTER_ESP:
-               msr_info->data = svm->sysenter_esp;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+               if (guest_cpuid_is_intel(vcpu))
+                       msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
                        return 1;
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+                       return 1;
                msr_info->data = svm->tsc_aux;
                break;
        /*
@@ -2771,7 +2693,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;
 
-               msr_info->data = svm->spec_ctrl;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       msr_info->data = svm->vmcb->save.spec_ctrl;
+               else
+                       msr_info->data = svm->spec_ctrl;
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr_info->host_initiated &&
@@ -2809,8 +2734,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       if (!sev_es_guest(svm->vcpu.kvm) || !err)
-               return kvm_complete_insn_gp(&svm->vcpu, err);
+       if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
+               return kvm_complete_insn_gp(vcpu, err);
 
        ghcb_set_sw_exit_info_1(svm->ghcb, 1);
        ghcb_set_sw_exit_info_2(svm->ghcb,
@@ -2820,11 +2745,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
        return 1;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2853,6 +2773,7 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       int r;
 
        u32 ecx = msr->index;
        u64 data = msr->data;
@@ -2861,7 +2782,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
                        return 1;
                vcpu->arch.pat = data;
-               svm->vmcb->save.g_pat = data;
+               svm->vmcb01.ptr->save.g_pat = data;
+               if (is_guest_mode(vcpu))
+                       nested_vmcb02_compute_g_pat(svm);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
@@ -2872,7 +2795,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (kvm_spec_ctrl_test_value(data))
                        return 1;
 
-               svm->spec_ctrl = data;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       svm->vmcb->save.spec_ctrl = data;
+               else
+                       svm->spec_ctrl = data;
                if (!data)
                        break;
 
@@ -2915,44 +2841,70 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->virt_spec_ctrl = data;
                break;
        case MSR_STAR:
-               svm->vmcb->save.star = data;
+               svm->vmcb01.ptr->save.star = data;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               svm->vmcb->save.lstar = data;
+               svm->vmcb01.ptr->save.lstar = data;
                break;
        case MSR_CSTAR:
-               svm->vmcb->save.cstar = data;
+               svm->vmcb01.ptr->save.cstar = data;
                break;
        case MSR_KERNEL_GS_BASE:
-               svm->vmcb->save.kernel_gs_base = data;
+               svm->vmcb01.ptr->save.kernel_gs_base = data;
                break;
        case MSR_SYSCALL_MASK:
-               svm->vmcb->save.sfmask = data;
+               svm->vmcb01.ptr->save.sfmask = data;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               svm->vmcb->save.sysenter_cs = data;
+               svm->vmcb01.ptr->save.sysenter_cs = data;
                break;
        case MSR_IA32_SYSENTER_EIP:
-               svm->sysenter_eip = data;
-               svm->vmcb->save.sysenter_eip = data;
+               svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+               /*
+                * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+                * when we spoof an Intel vendor ID (for cross vendor migration).
+                * In this case we use this intercept to track the high
+                * 32 bit part of these msrs to support Intel's
+                * implementation of SYSENTER/SYSEXIT.
+                */
+               svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_IA32_SYSENTER_ESP:
-               svm->sysenter_esp = data;
-               svm->vmcb->save.sysenter_esp = data;
+               svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+               svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
                        return 1;
 
+               if (!msr->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+                       return 1;
+
+               /*
+                * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+                * incomplete and conflicting architectural behavior.  Current
+                * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+                * reserved and always read as zeros.  Emulate AMD CPU behavior
+                * to avoid explosions if the vCPU is migrated from an AMD host
+                * to an Intel host.
+                */
+               data = (u32)data;
+
                /*
-                * This is rare, so we update the MSR here instead of using
-                * direct_access_msrs.  Doing that would require a rdmsr in
-                * svm_vcpu_put.
+                * TSC_AUX is usually changed only during boot and never read
+                * directly.  Intercept TSC_AUX instead of exposing it to the
+                * guest via direct_access_msrs, and switch it via user return.
                 */
+               preempt_disable();
+               r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+               preempt_enable();
+               if (r)
+                       return 1;
+
                svm->tsc_aux = data;
-               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
@@ -3006,38 +2958,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
        return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
 {
-       if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm);
+       if (to_svm(vcpu)->vmcb->control.exit_info_1)
+               return kvm_emulate_wrmsr(vcpu);
        else
-               return rdmsr_interception(svm);
+               return kvm_emulate_rdmsr(vcpu);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       svm_clear_vintr(svm);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       svm_clear_vintr(to_svm(vcpu));
 
        /*
         * For AVIC, the only reason to end up here is ExtINTs.
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+       svm_toggle_avic_for_irq_window(vcpu, true);
 
-       ++svm->vcpu.stat.irq_window_exits;
+       ++vcpu->stat.irq_window_exits;
        return 1;
 }
 
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
        bool in_kernel;
 
        /*
@@ -3045,35 +2991,18 @@ static int pause_interception(struct vcpu_svm *svm)
         * vcpu->arch.preempted_in_kernel can never be true.  Just
         * set in_kernel to false as well.
         */
-       in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+       in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);
 
        kvm_vcpu_on_spin(vcpu, in_kernel);
-       return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
-       return kvm_skip_emulated_instruction(&(svm->vcpu));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
 {
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long type;
        gva_t gva;
 
@@ -3098,7 +3027,7 @@ static int invpcid_interception(struct vcpu_svm *svm)
        return kvm_handle_invpcid(vcpu, type, gva);
 }
 
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
@@ -3133,15 +3062,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = nop_on_interception,
-       [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
+       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
-       [SVM_EXIT_RDPMC]                        = rdpmc_interception,
-       [SVM_EXIT_CPUID]                        = cpuid_interception,
+       [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
+       [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
        [SVM_EXIT_IRET]                         = iret_interception,
-       [SVM_EXIT_INVD]                         = invd_interception,
+       [SVM_EXIT_INVD]                         = kvm_emulate_invd,
        [SVM_EXIT_PAUSE]                        = pause_interception,
-       [SVM_EXIT_HLT]                          = halt_interception,
+       [SVM_EXIT_HLT]                          = kvm_emulate_halt,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
@@ -3149,17 +3078,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
        [SVM_EXIT_VMRUN]                        = vmrun_interception,
-       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
+       [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
        [SVM_EXIT_VMLOAD]                       = vmload_interception,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
-       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
-       [SVM_EXIT_MONITOR]                      = monitor_interception,
-       [SVM_EXIT_MWAIT]                        = mwait_interception,
-       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
-       [SVM_EXIT_RDPRU]                        = rdpru_interception,
+       [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
+       [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
+       [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
+       [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
+       [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
@@ -3177,6 +3106,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
+       struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
 
        if (!dump_invalid_vmcb) {
                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
@@ -3239,28 +3169,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               save->ds.limit, save->ds.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "fs:",
-              save->fs.selector, save->fs.attrib,
-              save->fs.limit, save->fs.base);
+              save01->fs.selector, save01->fs.attrib,
+              save01->fs.limit, save01->fs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gs:",
-              save->gs.selector, save->gs.attrib,
-              save->gs.limit, save->gs.base);
+              save01->gs.selector, save01->gs.attrib,
+              save01->gs.limit, save01->gs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gdtr:",
               save->gdtr.selector, save->gdtr.attrib,
               save->gdtr.limit, save->gdtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ldtr:",
-              save->ldtr.selector, save->ldtr.attrib,
-              save->ldtr.limit, save->ldtr.base);
+              save01->ldtr.selector, save01->ldtr.attrib,
+              save01->ldtr.limit, save01->ldtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "idtr:",
               save->idtr.selector, save->idtr.attrib,
               save->idtr.limit, save->idtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "tr:",
-              save->tr.selector, save->tr.attrib,
-              save->tr.limit, save->tr.base);
+              save01->tr.selector, save01->tr.attrib,
+              save01->tr.limit, save01->tr.base);
        pr_err("cpl:            %d                efer:         %016llx\n",
                save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3274,15 +3204,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rsp:", save->rsp, "rax:", save->rax);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "star:", save->star, "lstar:", save->lstar);
+              "star:", save01->star, "lstar:", save01->lstar);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "cstar:", save->cstar, "sfmask:", save->sfmask);
+              "cstar:", save01->cstar, "sfmask:", save01->sfmask);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "kernel_gs_base:", save->kernel_gs_base,
-              "sysenter_cs:", save->sysenter_cs);
+              "kernel_gs_base:", save01->kernel_gs_base,
+              "sysenter_cs:", save01->sysenter_cs);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "sysenter_esp:", save->sysenter_esp,
-              "sysenter_eip:", save->sysenter_eip);
+              "sysenter_esp:", save01->sysenter_esp,
+              "sysenter_eip:", save01->sysenter_eip);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3309,24 +3239,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
        return -EINVAL;
 }
 
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+       if (svm_handle_invalid_exit(vcpu, exit_code))
                return 0;
 
 #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
-               return msr_interception(svm);
+               return msr_interception(vcpu);
        else if (exit_code == SVM_EXIT_VINTR)
-               return interrupt_window_interception(svm);
+               return interrupt_window_interception(vcpu);
        else if (exit_code == SVM_EXIT_INTR)
-               return intr_interception(svm);
+               return intr_interception(vcpu);
        else if (exit_code == SVM_EXIT_HLT)
-               return halt_interception(svm);
+               return kvm_emulate_halt(vcpu);
        else if (exit_code == SVM_EXIT_NPF)
-               return npf_interception(svm);
+               return npf_interception(vcpu);
 #endif
-       return svm_exit_handlers[exit_code](svm);
+       return svm_exit_handlers[exit_code](vcpu);
 }
 
 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
@@ -3395,7 +3325,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
 
-       return svm_invoke_exit_handler(svm, exit_code);
+       return svm_invoke_exit_handler(vcpu, exit_code);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3406,15 +3336,27 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        load_TR_desc();
 }
 
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
 {
-       struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (sev_guest(svm->vcpu.kvm))
-               return pre_sev_run(svm, svm->vcpu.cpu);
+       /*
+        * If the previous vmrun of the vmcb occurred on a different physical
+        * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
+        * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+        */
+       if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+               svm->current_vmcb->asid_generation = 0;
+               vmcb_mark_all_dirty(svm->vmcb);
+               svm->current_vmcb->cpu = vcpu->cpu;
+        }
+
+       if (sev_guest(vcpu->kvm))
+               return pre_sev_run(svm, vcpu->cpu);
 
        /* FIXME: handle wraparound of asid_generation */
-       if (svm->asid_generation != sd->asid_generation)
+       if (svm->current_vmcb->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
 }
 
@@ -3424,7 +3366,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
        vcpu->arch.hflags |= HF_NMI_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_IRET);
        ++vcpu->stat.nmi_injections;
 }
@@ -3478,7 +3420,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
                return false;
 
        ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-             (svm->vcpu.arch.hflags & HF_NMI_MASK);
+             (vcpu->arch.hflags & HF_NMI_MASK);
 
        return ret;
 }
@@ -3498,9 +3440,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+       return !!(vcpu->arch.hflags & HF_NMI_MASK);
 }
 
 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3508,12 +3448,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (masked) {
-               svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags |= HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_set_intercept(svm, INTERCEPT_IRET);
        } else {
-               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags &= ~HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_clr_intercept(svm, INTERCEPT_IRET);
        }
 }
@@ -3526,7 +3466,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        if (!gif_set(svm))
                return true;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests do not expose RFLAGS. Use the VMCB interrupt mask
                 * bit to determine the state of the IF flag.
@@ -3536,7 +3476,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        } else if (is_guest_mode(vcpu)) {
                /* As long as interrupts are being delivered...  */
                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
-                   ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+                   ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;
 
@@ -3595,8 +3535,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
-           == HF_NMI_MASK)
+       if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
                return; /* IRET will cause a vm exit */
 
        if (!gif_set(svm)) {
@@ -3638,7 +3577,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
-               svm->asid_generation--;
+               svm->current_vmcb->asid_generation--;
 }
 
 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3675,8 +3614,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u8 vector;
        int type;
        u32 exitintinfo = svm->vmcb->control.exit_int_info;
@@ -3688,28 +3628,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
         * If we've made progress since setting HF_IRET_MASK, we've
         * executed an IRET and can allow NMI injection.
         */
-       if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
-           (sev_es_guest(svm->vcpu.kvm) ||
-            kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
-               svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+           (sev_es_guest(vcpu->kvm) ||
+            kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+               vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
                return;
 
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
        switch (type) {
        case SVM_EXITINTINFO_TYPE_NMI:
-               svm->vcpu.arch.nmi_injected = true;
+               vcpu->arch.nmi_injected = true;
                break;
        case SVM_EXITINTINFO_TYPE_EXEPT:
                /*
@@ -3725,21 +3665,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
                 */
                if (kvm_exception_is_soft(vector)) {
                        if (vector == BP_VECTOR && int3_injected &&
-                           kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
-                               kvm_rip_write(&svm->vcpu,
-                                             kvm_rip_read(&svm->vcpu) -
-                                             int3_injected);
+                           kvm_is_linear_rip(vcpu, svm->int3_rip))
+                               kvm_rip_write(vcpu,
+                                             kvm_rip_read(vcpu) - int3_injected);
                        break;
                }
                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
                        u32 err = svm->vmcb->control.exit_int_info_err;
-                       kvm_requeue_exception_e(&svm->vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
 
                } else
-                       kvm_requeue_exception(&svm->vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case SVM_EXITINTINFO_TYPE_INTR:
-               kvm_queue_interrupt(&svm->vcpu, vector, false);
+               kvm_queue_interrupt(vcpu, vector, false);
                break;
        default:
                break;
@@ -3754,7 +3693,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        control->exit_int_info = control->event_inj;
        control->exit_int_info_err = control->event_inj_err;
        control->event_inj = 0;
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 }
 
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
@@ -3766,9 +3705,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
        return EXIT_FASTPATH_NONE;
 }
 
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
-                                       struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long vmcb_pa = svm->current_vmcb->pa;
+
        /*
         * VMENTER enables interrupts (host state), but the kernel state is
         * interrupts disabled when this is invoked. Also tell RCU about
@@ -3789,12 +3730,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
        guest_enter_irqoff();
        lockdep_hardirqs_on(CALLER_ADDR0);
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
-               __svm_sev_es_vcpu_run(svm->vmcb_pa);
+       if (sev_es_guest(vcpu->kvm)) {
+               __svm_sev_es_vcpu_run(vmcb_pa);
        } else {
                struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
 
-               __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+               /*
+                * Use a single vmcb (vmcb01 because it's always valid) for
+                * context switching guest state via VMLOAD/VMSAVE, that way
+                * the state doesn't need to be copied between vmcb01 and
+                * vmcb02 when switching vmcbs for nested virtualization.
+                */
+               vmload(svm->vmcb01.pa);
+               __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+               vmsave(svm->vmcb01.pa);
 
                vmload(__sme_page_pa(sd->save_area));
        }
@@ -3845,7 +3794,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                smp_send_reschedule(vcpu->cpu);
        }
 
-       pre_svm_run(svm);
+       pre_svm_run(vcpu);
 
        sync_lapic_to_cr8(vcpu);
 
@@ -3859,7 +3808,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
         */
-       if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
                svm_set_dr6(svm, vcpu->arch.dr6);
        else
                svm_set_dr6(svm, DR6_ACTIVE_LOW);
@@ -3875,9 +3824,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
-       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       svm_vcpu_enter_exit(vcpu, svm);
+       svm_vcpu_enter_exit(vcpu);
 
        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
@@ -3894,15 +3844,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
-       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+           unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                reload_tss(vcpu);
 
-       x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       if (!sev_es_guest(vcpu->kvm)) {
                vcpu->arch.cr2 = svm->vmcb->save.cr2;
                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
@@ -3910,7 +3862,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_before_interrupt(&svm->vcpu);
+               kvm_before_interrupt(vcpu);
 
        kvm_load_host_xsave_state(vcpu);
        stgi();
@@ -3918,13 +3870,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        /* Any pending NMI will happen here */
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_after_interrupt(&svm->vcpu);
+               kvm_after_interrupt(vcpu);
 
        sync_cr8_to_lapic(vcpu);
 
        svm->next_rip = 0;
-       if (is_guest_mode(&svm->vcpu)) {
-               sync_nested_vmcb_control(svm);
+       if (is_guest_mode(vcpu)) {
+               nested_sync_control_from_vmcb02(svm);
                svm->nested.nested_run_pending = 0;
        }
 
@@ -3933,7 +3885,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-               svm->vcpu.arch.apf.host_apf_flags =
+               vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();
 
        if (npt_enabled) {
@@ -3947,9 +3899,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         */
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
-               svm_handle_mce(svm);
+               svm_handle_mce(vcpu);
 
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;
@@ -3957,21 +3909,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        return svm_exit_handlers_fastpath(vcpu);
 }
 
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                             int root_level)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;
 
-       cr3 = __sme_set(root);
        if (npt_enabled) {
-               svm->vmcb->control.nested_cr3 = cr3;
+               svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
 
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
                cr3 = vcpu->arch.cr3;
+       } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+               cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+       } else {
+               /* PCID in the guest should be impossible with a 32-bit MMU. */
+               WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+               cr3 = root_hpa;
        }
 
        svm->vmcb->save.cr3 = cr3;
@@ -4048,7 +4005,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        /* Update nrips enabled cache */
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
-                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+                            guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
        /* Check again if INVPCID interception if required */
        svm_check_invpcid(svm);
@@ -4060,24 +4017,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                        vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
        }
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
+       if (kvm_vcpu_apicv_active(vcpu)) {
+               /*
+                * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+                * is exposed to the guest, disable AVIC.
+                */
+               if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+                       kvm_request_apicv_update(vcpu->kvm, false,
+                                                APICV_INHIBIT_REASON_X2APIC);
 
-       /*
-        * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
-        * is exposed to the guest, disable AVIC.
-        */
-       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_X2APIC);
+               /*
+                * Currently, AVIC does not work with nested virtualization.
+                * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+                */
+               if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+                       kvm_request_apicv_update(vcpu->kvm, false,
+                                                APICV_INHIBIT_REASON_NESTED);
+       }
 
-       /*
-        * Currently, AVIC does not work with nested virtualization.
-        * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
-        */
-       if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_NESTED);
+       if (guest_cpuid_is_intel(vcpu)) {
+               /*
+                * We must intercept SYSENTER_EIP and SYSENTER_ESP
+                * accesses because the processor only stores 32 bits.
+                * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+                */
+               svm_set_intercept(svm, INTERCEPT_VMLOAD);
+               svm_set_intercept(svm, INTERCEPT_VMSAVE);
+               svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+       } else {
+               /*
+                * If hardware supports Virtual VMLOAD VMSAVE then enable it
+                * in VMCB and clear intercepts to avoid #VMEXIT.
+                */
+               if (vls) {
+                       svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+                       svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+                       svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+               }
+               /* No need to intercept these MSRs */
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+       }
 }
 
 static bool svm_has_wbinvd_exit(void)
@@ -4349,15 +4332,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (!(saved_efer & EFER_SVME))
                                return 1;
 
-                       if (kvm_vcpu_map(&svm->vcpu,
+                       if (kvm_vcpu_map(vcpu,
                                         gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
                                return 1;
 
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
-                       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       kvm_vcpu_unmap(vcpu, &map, true);
                }
        }
 
@@ -4612,6 +4595,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
 
+       .vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
        .can_emulate_instruction = svm_can_emulate_instruction,
 
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
index 9806aae..84b3133 100644 (file)
 
 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
 
-static const u32 host_save_user_msrs[] = {
-       MSR_TSC_AUX,
-};
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+#define        IOPM_SIZE PAGE_SIZE * 3
+#define        MSRPM_SIZE PAGE_SIZE * 2
 
-#define MAX_DIRECT_ACCESS_MSRS 18
+#define MAX_DIRECT_ACCESS_MSRS 20
 #define MSRPM_OFFSETS  16
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
@@ -65,6 +63,7 @@ struct kvm_sev_info {
        unsigned long pages_locked; /* Number of pages locked */
        struct list_head regions_list;  /* List of registered regions */
        u64 ap_jump_table;      /* SEV-ES AP Jump Table address */
+       struct kvm *enc_context_owner; /* Owner of copied encryption context */
        struct misc_cg *misc_cg; /* For misc cgroup accounting */
 };
 
@@ -82,11 +81,19 @@ struct kvm_svm {
 
 struct kvm_vcpu;
 
+struct kvm_vmcb_info {
+       struct vmcb *ptr;
+       unsigned long pa;
+       int cpu;
+       uint64_t asid_generation;
+};
+
 struct svm_nested_state {
-       struct vmcb *hsave;
+       struct kvm_vmcb_info vmcb02;
        u64 hsave_msr;
        u64 vm_cr_msr;
        u64 vmcb12_gpa;
+       u64 last_vmcb12_gpa;
 
        /* These are the merged vectors */
        u32 *msrpm;
@@ -103,21 +110,20 @@ struct svm_nested_state {
 
 struct vcpu_svm {
        struct kvm_vcpu vcpu;
+       /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
        struct vmcb *vmcb;
-       unsigned long vmcb_pa;
+       struct kvm_vmcb_info vmcb01;
+       struct kvm_vmcb_info *current_vmcb;
        struct svm_cpu_data *svm_data;
        u32 asid;
-       uint64_t asid_generation;
-       uint64_t sysenter_esp;
-       uint64_t sysenter_eip;
+       u32 sysenter_esp_hi;
+       u32 sysenter_eip_hi;
        uint64_t tsc_aux;
 
        u64 msr_decfg;
 
        u64 next_rip;
 
-       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-
        u64 spec_ctrl;
        /*
         * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
@@ -240,17 +246,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
        vmcb->control.clean &= ~(1 << bit);
 }
 
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
 {
-       return container_of(vcpu, struct vcpu_svm, vcpu);
+        return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
 }
 
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(&svm->vcpu))
-               return svm->nested.hsave;
-       else
-               return svm->vmcb;
+       return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
 static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
@@ -273,7 +276,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
 
 static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        if (!sev_es_guest(svm->vcpu.kvm)) {
                vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
@@ -300,7 +303,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb->control.intercepts[INTERCEPT_DR] = 0;
 
@@ -315,7 +318,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -325,7 +328,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -335,7 +338,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_set_intercept(&vmcb->control, bit);
 
@@ -344,7 +347,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 
 static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_clr_intercept(&vmcb->control, bit);
 
@@ -388,8 +391,6 @@ static inline bool gif_set(struct vcpu_svm *svm)
 /* svm.c */
 #define MSR_INVALID                            0xffffffffU
 
-extern int sev;
-extern int sev_es;
 extern bool dump_invalid_vmcb;
 
 u32 svm_msrpm_offset(u32 msr);
@@ -406,7 +407,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu);
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
 void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
                          int read, int write);
 
@@ -438,20 +439,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-                        struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
 void svm_leave_nested(struct vcpu_svm *svm);
 void svm_free_nested(struct vcpu_svm *svm);
 int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+       svm->vmcb->control.exit_code   = exit_code;
+       svm->vmcb->control.exit_info_1 = 0;
+       svm->vmcb->control.exit_info_2 = 0;
+       return nested_svm_vmexit(svm);
+}
+
 int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
 
 extern struct kvm_x86_nested_ops svm_nested_ops;
 
@@ -492,8 +503,8 @@ void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm);
 void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
@@ -551,22 +562,20 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
 
 extern unsigned int max_sev_asid;
 
-static inline bool svm_sev_enabled(void)
-{
-       return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
-}
-
 void sev_vm_destroy(struct kvm *kvm);
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
 int svm_register_enc_region(struct kvm *kvm,
                            struct kvm_enc_region *range);
 int svm_unregister_enc_region(struct kvm *kvm,
                              struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void __init sev_set_cpu_caps(void);
 void __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
+int sev_cpu_init(struct svm_cpu_data *sd);
 void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
 void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_create_vcpu(struct vcpu_svm *svm);
index 6feb8c0..4fa17df 100644 (file)
@@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run)
 
        /* Enter guest mode */
        sti
-1:     vmload %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     vmrun %_ASM_AX
-       jmp 5f
-4:     cmpb $0, kvm_rebooting
-       jne 5f
-       ud2
-       _ASM_EXTABLE(3b, 4b)
+1:     vmrun %_ASM_AX
 
-5:     vmsave %_ASM_AX
-       jmp 7f
-6:     cmpb $0, kvm_rebooting
-       jne 7f
-       ud2
-       _ASM_EXTABLE(5b, 6b)
-7:
-       cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_vcpu_run)
 
 /**
@@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        push %_ASM_BX
 
-       /* Enter guest mode */
+       /* Move @vmcb to RAX. */
        mov %_ASM_ARG1, %_ASM_AX
+
+       /* Enter guest mode */
        sti
 
 1:     vmrun %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_sev_es_vcpu_run)
index 1e069aa..bced766 100644 (file)
@@ -11,6 +11,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmx.h"
 #include "x86.h"
@@ -21,13 +22,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested_early_check = 0;
 module_param(nested_early_check, bool, S_IRUGO);
 
-#define CC(consistency_check)                                          \
-({                                                                     \
-       bool failed = (consistency_check);                              \
-       if (failed)                                                     \
-               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
-       failed;                                                         \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 
 /*
  * Hyper-V requires all of these, so mark them as supported even though
@@ -619,6 +614,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
        }
 
        /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
+#ifdef CONFIG_X86_64
        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
                                             MSR_FS_BASE, MSR_TYPE_RW);
 
@@ -627,6 +623,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 
        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
                                             MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
 
        /*
         * Checking the L0->L1 bitmap is trying to verify two things:
@@ -2306,6 +2303,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
                    exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+               if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+                       vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
                secondary_exec_controls_set(vmx, exec_control);
        }
 
@@ -3453,6 +3453,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
        enum nested_evmptrld_status evmptrld_status;
 
+       ++vcpu->stat.nested_run;
+
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
@@ -3810,9 +3812,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 
        /*
         * Process any exceptions that are not debug traps before MTF.
+        *
+        * Note that only a pending nested run can block a pending exception.
+        * Otherwise an injected NMI/interrupt should either be
+        * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
+        * while delivering the pending exception.
         */
+
        if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-               if (block_nested_events)
+               if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -3829,7 +3837,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -4105,6 +4113,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
        /* update exit information fields: */
        vmcs12->vm_exit_reason = vm_exit_reason;
+       if (to_vmx(vcpu)->exit_reason.enclave_mode)
+               vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
        vmcs12->exit_qualification = exit_qualification;
        vmcs12->vm_exit_intr_info = exit_intr_info;
 
@@ -4422,6 +4432,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+       /* Similarly, triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
        kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* Service the TLB flush request for L2 before switching to L1. */
@@ -4558,6 +4571,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        vmx->fail = 0;
 }
 
+/*
+ * Triple-fault hook for nested VMX: synthesize a nested VM-Exit with
+ * EXIT_REASON_TRIPLE_FAULT, passing zero for the two trailing arguments.
+ */
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
 /*
  * Decode the memory-address operand of a vmx instruction, as recorded on an
  * exit caused by such an instruction (run by a guest hypervisor).
@@ -5005,7 +5023,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                return nested_vmx_failInvalid(vcpu);
 
        /* Decode instruction info and find the field to read */
-       field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
+       field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
 
        offset = vmcs_field_to_offset(field);
        if (offset < 0)
@@ -5023,7 +5041,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
         * on the guest's mode (32 or 64 bit), not on the given field's length.
         */
        if (instr_info & BIT(10)) {
-               kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
+               kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
        } else {
                len = is_64_bit_mode(vcpu) ? 8 : 4;
                if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -5097,7 +5115,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                return nested_vmx_failInvalid(vcpu);
 
        if (instr_info & BIT(10))
-               value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
+               value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
        else {
                len = is_64_bit_mode(vcpu) ? 8 : 4;
                if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -5108,7 +5126,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                        return kvm_handle_memory_failure(vcpu, r, &e);
        }
 
-       field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
+       field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
 
        offset = vmcs_field_to_offset(field);
        if (offset < 0)
@@ -5305,7 +5323,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
                return 1;
 
        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-       type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
@@ -5385,7 +5403,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                return 1;
 
        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-       type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
        types = (vmx->nested.msrs.vpid_caps &
                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
@@ -5479,16 +5497,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
                if (!nested_vmx_check_eptp(vcpu, new_eptp))
                        return 1;
 
-               kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = new_eptp;
-               /*
-                * TODO: Check what's the correct approach in case
-                * mmu reload fails. Currently, we just let the next
-                * reload potentially fail
-                */
-               kvm_mmu_reload(vcpu);
+
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
        }
 
        return 0;
@@ -5646,7 +5659,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
                reg = (exit_qualification >> 8) & 15;
-               val = kvm_register_readl(vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                switch (cr) {
                case 0:
                        if (vmcs12->cr0_guest_host_mask &
@@ -5705,6 +5718,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
        return false;
 }
 
+/*
+ * Report whether vmcs12 asks for a VM-Exit on the ENCLS leaf currently in RAX.
+ * That requires SGX in the guest's CPUID, ENCLS exiting enabled in vmcs12, and
+ * the leaf's bit set in vmcs12's ENCLS-exiting bitmap.
+ */
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       u32 leaf;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               return false;
+
+       if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+               return false;
+
+       /* Every leaf above 62 is looked up through bit 63 of the bitmap. */
+       leaf = kvm_rax_read(vcpu);
+       leaf = (leaf > 62) ? 63 : leaf;
+
+       return vmcs12->encls_exiting_bitmap & BIT_ULL(leaf);
+}
+
 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
        struct vmcs12 *vmcs12, gpa_t bitmap)
 {
@@ -5801,9 +5829,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_VMFUNC:
                /* VM functions are emulated through L2->L0 vmexits. */
                return true;
-       case EXIT_REASON_ENCLS:
-               /* SGX is never exposed to L1 */
-               return true;
        default:
                break;
        }
@@ -5927,6 +5952,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_TPAUSE:
                return nested_cpu_has2(vmcs12,
                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+       case EXIT_REASON_ENCLS:
+               return nested_vmx_exit_handled_encls(vcpu, vmcs12);
        default:
                return true;
        }
@@ -6502,6 +6529,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
                msrs->secondary_ctls_high |=
                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 
+       if (enable_sgx)
+               msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC,
                msrs->misc_low,
@@ -6599,6 +6629,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 struct kvm_x86_nested_ops vmx_nested_ops = {
        .check_events = vmx_check_nested_events,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
        .get_nested_state_pages = vmx_get_nested_state_pages,
index 197148d..184418b 100644 (file)
@@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
                PIN_BASED_EXT_INTR_MASK;
 }
 
+/*
+ * Convenience wrapper: true if nested_cpu_has2() reports that vmcs12 has
+ * SECONDARY_EXEC_ENCLS_EXITING enabled.
+ */
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
 /*
  * if fixed0[i] == 1: val[i] must be 1
  * if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644 (file)
index 0000000..6693ebd
--- /dev/null
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+/* Backing variable for the read-only (0444) "sgx" module parameter; on by default. */
+bool __read_mostly enable_sgx = true;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * Compute and validate the guest virtual address of an ENCLS memory operand.
+ *
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode.  Related prefixes are ignored.
+ *
+ * @offset is the operand offset, @size its length in bytes and @alignment the
+ * alignment the resulting address must satisfy.  The computed address is
+ * stored in @gva.  Returns 0 if the address is acceptable; otherwise injects
+ * #GP(0) into the guest and returns -EINVAL.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+                            int size, int alignment, gva_t *gva)
+{
+       /*
+        * Key off 64-bit mode, not long mode: a vCPU in long mode may be
+        * executing compatibility-mode (32-bit) code, in which case the DS
+        * base and the segment checks below still apply.
+        */
+       const bool is_64bit = is_64_bit_mode(vcpu);
+       struct kvm_segment s;
+       bool fault;
+
+       /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
+       *gva = offset;
+       if (!is_64bit) {
+               vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+               *gva += s.base;
+       }
+
+       if (!IS_ALIGNED(*gva, alignment)) {
+               fault = true;
+       } else if (likely(is_64bit)) {
+               fault = is_noncanonical_address(*gva, vcpu);
+       } else {
+               /* Outside 64-bit mode the address wraps at 4GiB. */
+               *gva &= 0xffffffff;
+               fault = (s.unusable) ||
+                       (s.type != 2 && s.type != 3) ||
+                       (*gva > s.limit) ||
+                       ((s.base != 0 || s.limit != 0xffffffff) &&
+                       (((u64)*gva + size - 1) > s.limit + 1));
+       }
+       if (fault)
+               kvm_inject_gp(vcpu, 0);
+       return fault ? -EINVAL : 0;
+}
+
+/*
+ * Prepare a KVM_EXIT_INTERNAL_ERROR / KVM_INTERNAL_ERROR_EMULATION exit in
+ * vcpu->run, reporting @addr and @size as the two data words.
+ */
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+                                        unsigned int size)
+{
+       struct kvm_run *run = vcpu->run;
+
+       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       run->internal.ndata = 2;
+       run->internal.data[0] = addr;
+       run->internal.data[1] = size;
+}
+
+/*
+ * Copy @size bytes from the host userspace address @hva into @data.  Returns 0
+ * on success.  If the copy fails, an emulation-failure exit is prepared and
+ * -EFAULT is returned.
+ */
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+                       unsigned int size)
+{
+       const void __user *src = (void __user *)hva;
+
+       if (!__copy_from_user(data, src, size))
+               return 0;
+
+       sgx_handle_emulation_failure(vcpu, hva, size);
+       return -EFAULT;
+}
+
+/*
+ * Translate @gva to a guest physical address, as a write access if @write is
+ * set and as a read access otherwise.  The result is stored in @gpa.  Returns
+ * 0 on success; if the address is unmapped, injects the resulting page fault
+ * into the guest and returns -EFAULT.
+ */
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+                         gpa_t *gpa)
+{
+       struct x86_exception ex;
+
+       *gpa = write ? kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex) :
+                      kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);
+       if (*gpa != UNMAPPED_GVA)
+               return 0;
+
+       kvm_inject_emulated_page_fault(vcpu, &ex);
+       return -EFAULT;
+}
+
+/*
+ * Convert @gpa to a host virtual address stored in @hva.  Returns 0 on
+ * success; if the page lookup yields an error HVA, prepares an
+ * emulation-failure exit and returns -EFAULT.
+ */
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+       unsigned long page_hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+
+       *hva = page_hva;
+       if (kvm_is_error_hva(page_hva)) {
+               sgx_handle_emulation_failure(vcpu, gpa, 1);
+               return -EFAULT;
+       }
+
+       /* Add back the offset within the page. */
+       *hva = page_hva | (gpa & ~PAGE_MASK);
+       return 0;
+}
+
+/*
+ * Handle a fault (@trapnr) hit while executing an ENCLS leaf on the guest's
+ * behalf.  Returns 0 after preparing an internal-error exit to userspace, or
+ * 1 after injecting an SGX #PF at @gva or a #GP(0) into the guest.
+ */
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+       const bool host_has_sgx2 = boot_cpu_has(X86_FEATURE_SGX2);
+       const bool is_pf = (trapnr == PF_VECTOR);
+       struct x86_exception ex;
+
+       /*
+        * A non-EPCM #PF indicates a bad userspace HVA.  This *should* check
+        * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+        * but the error code isn't (yet) plumbed through the ENCLS helpers.
+        */
+       if (is_pf && !host_has_sgx2) {
+               struct kvm_run *run = vcpu->run;
+
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               run->internal.ndata = 0;
+               return 0;
+       }
+
+       /*
+        * The fault gets converted to an SGX #PF only if it matches an EPCM
+        * fault signature (#GP on SGX1 hardware, #PF on SGX2 hardware) and the
+        * guest thinks it's running on SGX2 hardware.  Everything else is
+        * reflected as #GP(0).  The assumption is that EPCM faults are much
+        * more likely than a bad userspace address.
+        */
+       if ((!is_pf && host_has_sgx2) ||
+           !guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       memset(&ex, 0, sizeof(ex));
+       ex.vector = PF_VECTOR;
+       ex.error_code_valid = true;
+       ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+                       PFERR_SGX_MASK;
+       ex.address = gva;
+       ex.nested_page_fault = false;
+       kvm_inject_page_fault(vcpu, &ex);
+       return 1;
+}
+
+/*
+ * Enforce KVM's restrictions on the SECS supplied to ECREATE and, if they are
+ * satisfied, execute ECREATE on the guest's behalf.
+ *
+ * @pageinfo->contents is dereferenced directly as a struct sgx_secs, so it
+ * must point at memory the kernel can read.  @secs_hva is the host address
+ * handed to sgx_virt_ecreate() and @secs_gva is the guest address reported if
+ * a fault has to be injected.
+ *
+ * Returns 1 if the guest should be resumed (including after injecting an
+ * exception), 0 if an exit to userspace has been prepared, or a negative
+ * error propagated from sgx_virt_ecreate().
+ */
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+                                 struct sgx_pageinfo *pageinfo,
+                                 unsigned long secs_hva,
+                                 gva_t secs_gva)
+{
+       struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+       struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+       u64 attributes, xfrm, size;
+       u32 miscselect;
+       u8 max_size_log2;
+       int trapnr, ret;
+
+       /* The checks below need the guest's CPUID.0x12 sub-leafs 0 and 1. */
+       sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+       sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+       if (!sgx_12_0 || !sgx_12_1) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return 0;
+       }
+
+       miscselect = contents->miscselect;
+       attributes = contents->attributes;
+       xfrm = contents->xfrm;
+       size = contents->size;
+
+       /* Enforce restriction of access to the PROVISIONKEY. */
+       if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+           (attributes & SGX_ATTR_PROVISIONKEY)) {
+               if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+                       pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+       if ((u32)miscselect & ~sgx_12_0->ebx ||
+           (u32)attributes & ~sgx_12_1->eax ||
+           (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+           (u32)xfrm & ~sgx_12_1->ecx ||
+           (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /*
+        * Enforce CPUID restriction on max enclave size.  Resume the guest
+        * right after injecting the #GP; ECREATE must not be executed for an
+        * oversized enclave, and no second exception may be queued.
+        */
+       max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+                                                           sgx_12_0->edx;
+       if (size >= BIT_ULL(max_size_log2)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /*
+        * sgx_virt_ecreate() returns:
+        *  1) 0:       ECREATE was successful
+        *  2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+        *              exception number.
+        *  3) -EINVAL: access_ok() on @secs_hva failed. This should never
+        *              happen as KVM checks host addresses at memslot creation.
+        *              sgx_virt_ecreate() has already warned in this case.
+        */
+       ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+       if (!ret)
+               return kvm_skip_emulated_instruction(vcpu);
+       if (ret == -EFAULT)
+               return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+       return ret;
+}
+
+/*
+ * Emulate ENCLS[ECREATE]: validate and translate the PAGEINFO (RBX) and SECS
+ * (RCX) operands, snapshot the source SECS into kernel memory, and hand off
+ * to __handle_encls_ecreate().
+ *
+ * Returns 1 to resume the guest, 0 to exit to userspace, or a negative error.
+ */
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+       unsigned long metadata_hva, contents_hva, secs_hva;
+       gpa_t metadata_gpa, contents_gpa, secs_gpa;
+       gva_t metadata_gva, contents_gva;
+       gva_t pageinfo_gva, secs_gva;
+       struct sgx_pageinfo pageinfo;
+       struct sgx_secs *secs_copy;
+       struct x86_exception ex;
+       int r;
+
+       if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva))
+               return 1;
+       if (sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+               return 1;
+
+       /*
+        * PAGEINFO embeds pointers that must themselves be translated, so
+        * pull it into local memory first (deep copy/translate).
+        */
+       r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+                               sizeof(pageinfo), &ex);
+       if (r == X86EMUL_PROPAGATE_FAULT) {
+               kvm_inject_emulated_page_fault(vcpu, &ex);
+               return 1;
+       }
+       if (r != X86EMUL_CONTINUE) {
+               sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+                                            sizeof(pageinfo));
+               return 0;
+       }
+
+       if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva))
+               return 1;
+       if (sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+                             &contents_gva))
+               return 1;
+
+       /*
+        * Translate the metadata, contents and SECS pointers from GVA to GPA.
+        * On failure a #PF has been injected, so resume the guest.
+        */
+       if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa))
+               return 1;
+       if (sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa))
+               return 1;
+       if (sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+               return 1;
+
+       /*
+        * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+        * KVM doesn't have to fully process one address at a time.  Exit to
+        * userspace if a GPA is invalid.
+        */
+       if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva))
+               return 0;
+       if (sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva))
+               return 0;
+       if (sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+               return 0;
+
+       /*
+        * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+        * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+        * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+        * enforce restriction of access to the PROVISIONKEY.
+        */
+       secs_copy = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+       if (!secs_copy)
+               return -ENOMEM;
+
+       /* Exit to userspace if copying from a host userspace address fails. */
+       r = 0;
+       if (sgx_read_hva(vcpu, contents_hva, secs_copy, PAGE_SIZE))
+               goto out_free;
+
+       pageinfo.metadata = metadata_hva;
+       pageinfo.contents = (u64)secs_copy;
+
+       r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+out_free:
+       free_page((unsigned long)secs_copy);
+       return r;
+}
+
+/*
+ * Emulate ENCLS[EINIT]: validate and translate the SIGSTRUCT (RBX), SECS
+ * (RCX) and TOKEN (RDX) operands, run EINIT with the vCPU's
+ * SGXLEPUBKEYHASH values, then reflect the result in RAX and RFLAGS.
+ *
+ * Returns 1 to resume the guest, 0 to exit to userspace, or a negative error.
+ */
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+       unsigned long sig_hva, secs_hva, token_hva, rflags;
+       gva_t sig_gva, secs_gva, token_gva;
+       gpa_t sig_gpa, secs_gpa, token_gpa;
+       int ret, trapnr;
+
+       if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva))
+               return 1;
+       if (sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+               return 1;
+       if (sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+               return 1;
+
+       /*
+        * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+        * On failure a #PF has been injected, so resume the guest.
+        */
+       if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa))
+               return 1;
+       if (sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+               return 1;
+       if (sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+               return 1;
+
+       /*
+        * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+        * KVM doesn't have to fully process one address at a time.  Exit to
+        * userspace if a GPA is invalid.  Note, all structures are aligned and
+        * cannot split pages.
+        */
+       if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva))
+               return 0;
+       if (sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+               return 0;
+       if (sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+               return 0;
+
+       ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+                            (void __user *)secs_hva,
+                            to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+       if (ret == -EFAULT)
+               return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+       /*
+        * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+        * @token_hva or @secs_hva. This should never happen as KVM checks host
+        * addresses at memslot creation. sgx_virt_einit() has already warned
+        * in this case, so just return.
+        */
+       if (ret < 0)
+               return ret;
+
+       /* Clear the arithmetic flags; ZF is set only for a non-zero result. */
+       rflags = vmx_get_rflags(vcpu);
+       rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                   X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF);
+       if (ret)
+               rflags |= X86_EFLAGS_ZF;
+       vmx_set_rflags(vcpu, rflags);
+
+       kvm_rax_write(vcpu, ret);
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
+/*
+ * Report whether ENCLS @leaf is available to the guest: SGX must be enabled
+ * in KVM and present in guest CPUID, and the leaf must fall in the SGX1 range
+ * (ECREATE..ETRACK) or SGX2 range (EAUG..EMODT) with the matching CPUID
+ * feature exposed to the guest.
+ */
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+       const bool is_sgx1_leaf = leaf >= ECREATE && leaf <= ETRACK;
+       const bool is_sgx2_leaf = leaf >= EAUG && leaf <= EMODT;
+
+       if (!enable_sgx)
+               return false;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               return false;
+
+       if (is_sgx1_leaf)
+               return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+       return is_sgx2_leaf && guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+}
+
+/*
+ * True if the vCPU's IA32_FEATURE_CONTROL value has both FEAT_CTL_SGX_ENABLED
+ * and FEAT_CTL_LOCKED set.
+ */
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+       const u64 required = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+       u64 feat_ctl = to_vmx(vcpu)->msr_ia32_feature_control;
+
+       return (feat_ctl & required) == required;
+}
+
+/*
+ * ENCLS exit handler.  Queues #UD if the leaf in RAX isn't available to the
+ * guest, injects #GP(0) if SGX isn't enabled and locked in the guest's
+ * feature control MSR, and otherwise emulates ECREATE or EINIT.  An exit on
+ * any other leaf is unexpected and is reported to userspace.
+ *
+ * Returns 1 to resume the guest, 0 to exit to userspace, or a negative error.
+ */
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+       u32 leaf = (u32)kvm_rax_read(vcpu);
+
+       if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (!sgx_enabled_in_guest_bios(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       switch (leaf) {
+       case ECREATE:
+               return handle_encls_ecreate(vcpu);
+       case EINIT:
+               return handle_encls_einit(vcpu);
+       default:
+               break;
+       }
+
+       WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+       vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+       vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+       return 0;
+}
+
+/*
+ * Populate sgx_pubkey_hash[], the initial value of every vCPU's virtual
+ * SGXLEPUBKEYHASHn MSRs.
+ */
+void setup_default_sgx_lepubkeyhash(void)
+{
+       /*
+        * If Launch Control is supported but disabled, the hash MSRs exist
+        * but are read-only (locked and not writable), so use their current
+        * values.  The first MSR is read as part of the condition.
+        */
+       if (enable_sgx && !boot_cpu_has(X86_FEATURE_SGX_LC) &&
+           !rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+               return;
+       }
+
+       /*
+        * Otherwise use Intel's default value for Skylake hardware: either
+        * Launch Control is not supported, i.e. Intel's hash is hardcoded into
+        * silicon, or Launch Control is supported and enabled, i.e. mimic the
+        * reset value and let the guest write the MSRs at will.
+        */
+       sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+       sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+       sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+       sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+}
+
+/* Seed the vCPU's virtual SGXLEPUBKEYHASHn MSR values from sgx_pubkey_hash[]. */
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+       memcpy(to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+              sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ * It is also intercepted whenever provisioning isn't allowed for the VM or a
+ * CPUID.0x12 sub-leaf is missing from the guest's CPUID.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *entry;
+       u32 eax, ebx, ecx, edx;
+
+       if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+               return true;
+
+       /* Sub-leaf 0: only EBX and EDX are compared against hardware. */
+       entry = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+       if (!entry)
+               return true;
+
+       cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+       if (!(entry->ebx == ebx && entry->edx == edx))
+               return true;
+
+       /* Sub-leaf 1: all four registers must match hardware. */
+       entry = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+       if (!entry)
+               return true;
+
+       cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+       return entry->eax != eax || entry->ebx != ebx ||
+              entry->ecx != ecx || entry->edx != edx;
+}
+
+/*
+ * Compute and write the ENCLS-exiting bitmap for the current VMCS.  @vmcs12
+ * may be NULL, in which case it is looked up if the vCPU is in guest mode.
+ */
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       /*
+        * There is no software enable bit for SGX that is virtualized by
+        * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+        * guest (either by the host or by the guest's BIOS) but enabled in the
+        * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+        * the expected system behavior for ENCLS.
+        */
+       u64 bitmap = -1ull;
+
+       /* Nothing to do if hardware doesn't support SGX */
+       if (!cpu_has_vmx_encls_vmexit())
+               return;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+           !sgx_enabled_in_guest_bios(vcpu))
+               goto write_bitmap;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+               bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+               if (sgx_intercept_encls_ecreate(vcpu))
+                       bitmap |= BIT_ULL(ECREATE);
+       }
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+               bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+       /*
+        * Trap and execute EINIT if launch control is enabled in the
+        * host using the guest's values for launch control MSRs, even
+        * if the guest's values are fixed to hardware default values.
+        * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+        * the MSRs is extraordinarily expensive.
+        */
+       if (boot_cpu_has(X86_FEATURE_SGX_LC))
+               bitmap |= BIT_ULL(EINIT);
+
+       /* Also trap whatever leafs vmcs12 asks to intercept. */
+       if (!vmcs12 && is_guest_mode(vcpu))
+               vmcs12 = get_vmcs12(vcpu);
+       if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+               bitmap |= vmcs12->encls_exiting_bitmap;
+
+write_bitmap:
+       vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644 (file)
index 0000000..a400888
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+/*
+ * Variant used when CONFIG_X86_SGX_KVM is not set: set every bit in the
+ * ENCLS-exiting bitmap.  @vcpu and @vmcs12 are unused.
+ */
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       /* Nothing to do if hardware doesn't support SGX */
+       if (!cpu_has_vmx_encls_vmexit())
+               return;
+
+       vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
index c8e51c0..034adb6 100644 (file)
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VMREAD_BITMAP, vmread_bitmap),
        FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+       FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
index 80232da..1349495 100644 (file)
@@ -69,7 +69,8 @@ struct __packed vmcs12 {
        u64 vm_function_control;
        u64 eptp_list_address;
        u64 pml_address;
-       u64 padding64[3]; /* room for future expansion */
+       u64 encls_exiting_bitmap;
+       u64 padding64[2]; /* room for future expansion */
        /*
         * To allow migration of L1 (complete with its L2 guests) between
         * machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void)
        CHECK_OFFSET(vm_function_control, 296);
        CHECK_OFFSET(eptp_list_address, 304);
        CHECK_OFFSET(pml_address, 312);
+       CHECK_OFFSET(encls_exiting_bitmap, 320);
        CHECK_OFFSET(cr0_guest_host_mask, 344);
        CHECK_OFFSET(cr4_guest_host_mask, 352);
        CHECK_OFFSET(cr0_read_shadow, 360);
index bcbf0d2..cbe0cda 100644 (file)
@@ -57,6 +57,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -156,9 +157,11 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
        MSR_IA32_SPEC_CTRL,
        MSR_IA32_PRED_CMD,
        MSR_IA32_TSC,
+#ifdef CONFIG_X86_64
        MSR_FS_BASE,
        MSR_GS_BASE,
        MSR_KERNEL_GS_BASE,
+#endif
        MSR_IA32_SYSENTER_CS,
        MSR_IA32_SYSENTER_ESP,
        MSR_IA32_SYSENTER_EIP,
@@ -361,8 +364,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                         u32 msr, int type);
 
 void vmx_vmexit(void);
 
@@ -472,26 +473,6 @@ static const u32 vmx_uret_msrs_list[] = {
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       u64 tmp_eptp = INVALID_PAGE;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!VALID_PAGE(tmp_eptp)) {
-                       tmp_eptp = to_vmx(vcpu)->ept_pointer;
-               } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_MISMATCH;
-                       return;
-               }
-       }
-
-       to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
                void *data)
 {
@@ -501,47 +482,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
                        range->pages);
 }
 
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
-               struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+                                          struct kvm_tlb_range *range)
 {
-       u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
-       /*
-        * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-        * of the base of EPT PML4 table, strip off EPT configuration
-        * information.
-        */
        if (range)
-               return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+               return hyperv_flush_guest_mapping_range(root_ept,
                                kvm_fill_hv_flush_list_func, (void *)range);
        else
-               return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+               return hyperv_flush_guest_mapping(root_ept);
 }
 
 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
                struct kvm_tlb_range *range)
 {
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
        struct kvm_vcpu *vcpu;
-       int ret = 0, i;
+       int ret = 0, i, nr_unique_valid_roots;
+       hpa_t root;
 
-       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_lock(&kvm_vmx->hv_root_ept_lock);
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
-               check_ept_pointer_match(kvm);
+       if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+               nr_unique_valid_roots = 0;
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               /*
+                * Flush all valid roots, and see if all vCPUs have converged
+                * on a common root, in which case future flushes can skip the
+                * loop and flush the common root.
+                */
                kvm_for_each_vcpu(i, vcpu, kvm) {
-                       /* If ept_pointer is invalid pointer, bypass flush request. */
-                       if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
-                               ret |= __hv_remote_flush_tlb_with_range(
-                                       kvm, vcpu, range);
+                       root = to_vmx(vcpu)->hv_root_ept;
+                       if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+                               continue;
+
+                       /*
+                        * Set the tracked root to the first valid root.  Keep
+                        * this root for the entirety of the loop even if more
+                        * roots are encountered as a low effort optimization
+                        * to avoid flushing the same (first) root again.
+                        */
+                       if (++nr_unique_valid_roots == 1)
+                               kvm_vmx->hv_root_ept = root;
+
+                       if (!ret)
+                               ret = hv_remote_flush_root_ept(root, range);
+
+                       /*
+                        * Stop processing roots if a failure occurred and
+                        * multiple valid roots have already been detected.
+                        */
+                       if (ret && nr_unique_valid_roots > 1)
+                               break;
                }
+
+               /*
+                * The optimized flush of a single root can't be used if there
+                * are multiple valid roots (obviously).
+                */
+               if (nr_unique_valid_roots > 1)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
        } else {
-               ret = __hv_remote_flush_tlb_with_range(kvm,
-                               kvm_get_vcpu(kvm, 0), range);
+               ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
        }
 
-       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_unlock(&kvm_vmx->hv_root_ept_lock);
        return ret;
 }
 static int hv_remote_flush_tlb(struct kvm *kvm)
@@ -559,7 +563,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
         * evmcs in singe VM shares same assist page.
         */
        if (!*p_hv_pa_pg)
-               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 
        if (!*p_hv_pa_pg)
                return -ENOMEM;
@@ -576,6 +580,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+       if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+               spin_lock(&kvm_vmx->hv_root_ept_lock);
+               to_vmx(vcpu)->hv_root_ept = root_ept;
+               if (root_ept != kvm_vmx->hv_root_ept)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
+               spin_unlock(&kvm_vmx->hv_root_ept_lock);
+       }
+#endif
+}
+
 /*
  * Comment's format: document - errata name - stepping - processor name.
  * Refer from
@@ -1570,12 +1589,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+       /*
+        * Emulation of instructions in SGX enclaves is impossible as RIP does
+        * not point at the failing instruction, and even if it did, the code
+        * stream is inaccessible.  Inject #UD instead of exiting to userspace
+        * so that guest userspace can't DoS the guest simply by triggering
+        * emulation (enclaves are CPL3 only).
+        */
+       if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return false;
+       }
        return true;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+       union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
        unsigned long rip, orig_rip;
+       u32 instr_len;
 
        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1586,9 +1618,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
         * i.e. we end up advancing IP with some random value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-           to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+           exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+               instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+               /*
+                * Emulating an enclave's instructions isn't supported as KVM
+                * cannot access the enclave's memory or its true RIP, e.g. the
+                * vmcs.GUEST_RIP points at the exit point of the enclave, not
+                * the RIP that actually triggered the VM-Exit.  But, because
+                * most instructions that cause VM-Exit will #UD in an enclave,
+                * most instruction-based VM-Exits simply do not occur.
+                *
+                * There are a few exceptions, notably the debug instructions
+                * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+                * and generate #DB/#BP as expected, which KVM might intercept.
+                * But again, the CPU does the dirty work and saves an instr
+                * length of zero so VMMs don't shoot themselves in the foot.
+                * WARN if KVM tries to skip a non-zero length instruction on
+                * a VM-Exit from an enclave.
+                */
+               if (!instr_len)
+                       goto rip_updated;
+
+               WARN(exit_reason.enclave_mode,
+                    "KVM: skipping instruction after SGX enclave VM-Exit");
+
                orig_rip = kvm_rip_read(vcpu);
-               rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               rip = orig_rip + instr_len;
 #ifdef CONFIG_X86_64
                /*
                 * We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1604,6 +1660,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
                        return 0;
        }
 
+rip_updated:
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);
 
@@ -1865,6 +1922,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_FEAT_CTL:
                msr_info->data = vmx->msr_ia32_feature_control;
                break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+                       return 1;
+               msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+                       [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+               break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2158,6 +2222,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
+
+               /* SGX may be enabled/disabled by guest's firmware */
+               vmx_write_encls_bitmap(vcpu, NULL);
+               break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               /*
+                * On real hardware, the LE hash MSRs are writable before
+                * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+                * at which point SGX related bits in IA32_FEATURE_CONTROL
+                * become writable.
+                *
+                * KVM does not emulate SGX activation for simplicity, so
+                * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+                * is unlocked.  This is technically not architectural
+                * behavior, but it's close enough.
+                */
+               if (!msr_info->host_initiated &&
+                   (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+                   ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+                   !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+                       return 1;
+               vmx->msr_ia32_sgxlepubkeyhash
+                       [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!msr_info->host_initiated)
@@ -3088,8 +3175,7 @@ static int vmx_get_max_tdp_level(void)
        return 4;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
        u64 eptp = VMX_EPTP_MT_WB;
 
@@ -3098,13 +3184,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
        if (enable_ept_ad_bits &&
            (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
                eptp |= VMX_EPTP_AD_ENABLE_BIT;
-       eptp |= (root_hpa & PAGE_MASK);
+       eptp |= root_hpa;
 
        return eptp;
 }
 
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -3112,16 +3198,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
        u64 eptp;
 
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, pgd, pgd_level);
+               eptp = construct_eptp(vcpu, root_hpa, root_level);
                vmcs_write64(EPT_POINTER, eptp);
 
-               if (kvm_x86_ops.tlb_remote_flush) {
-                       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-                       to_vmx(vcpu)->ept_pointer = eptp;
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_CHECK;
-                       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-               }
+               hv_track_root_ept(vcpu, root_hpa);
 
                if (!enable_unrestricted_guest && !is_paging(vcpu))
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3131,7 +3211,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
-               guest_cr3 = pgd;
+               guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
        }
 
        if (update_guest_cr3)
@@ -3738,8 +3818,7 @@ static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
                __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
 }
 
-static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                         u32 msr, int type)
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3784,8 +3863,7 @@ static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
                vmx_clear_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                        u32 msr, int type)
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3818,15 +3896,6 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
                vmx_set_msr_bitmap_write(msr_bitmap, msr);
 }
 
-void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
-                                                     u32 msr, int type, bool value)
-{
-       if (value)
-               vmx_enable_intercept_for_msr(vcpu, msr, type);
-       else
-               vmx_disable_intercept_for_msr(vcpu, msr, type);
-}
-
 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 {
        u8 mode = 0;
@@ -4314,15 +4383,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        vmx->secondary_exec_control = exec_control;
 }
 
-static void ept_set_mmio_spte_mask(void)
-{
-       /*
-        * EPT Misconfigurations can be generated if the value of bits 2:0
-        * of an EPT paging-structure entry is 110b (write/execute).
-        */
-       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
 #define VMX_XSS_EXIT_BITMAP 0
 
 /*
@@ -4410,8 +4470,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 
-       if (cpu_has_vmx_encls_vmexit())
-               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+       vmx_write_encls_bitmap(&vmx->vcpu, NULL);
 
        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -5020,7 +5079,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
-               val = kvm_register_readl(vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
@@ -5143,7 +5202,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                kvm_register_write(vcpu, reg, val);
                err = 0;
        } else {
-               err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+               err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
        }
 
 out:
@@ -5184,17 +5243,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5251,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
-       int err;
-
-       err = kvm_rdpmc(vcpu);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
-       u64 new_bv = kvm_read_edx_eax(vcpu);
-       u32 index = kvm_rcx_read(vcpu);
-
-       int err = kvm_set_xcr(vcpu, index, new_bv);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5361,7 +5387,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
                        EPT_VIOLATION_EXECUTABLE))
                      ? PFERR_PRESENT_MASK : 0;
 
-       error_code |= (exit_qualification & 0x100) != 0 ?
+       error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
               PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
        vcpu->arch.exit_qualification = exit_qualification;
@@ -5384,6 +5410,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
 
+       if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+               return 1;
+
        /*
         * A nested guest cannot optimize MMIO vmexits, because we have an
         * nGPA here instead of the required GPA.
@@ -5485,18 +5514,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_enable_tdp(void)
-{
-       kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-               enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-               enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-               0ull, VMX_EPT_EXECUTABLE_MASK,
-               cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK, 0ull);
-
-       ept_set_mmio_spte_mask();
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5533,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
-       kvm_queue_exception(vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
 static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
@@ -5560,7 +5554,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
        }
 
        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-       type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
        if (type > 3) {
                kvm_inject_gp(vcpu, 0);
@@ -5632,16 +5626,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+#ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
        /*
-        * SGX virtualization is not yet supported.  There is no software
-        * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-        * to prevent the guest from executing ENCLS.
+        * SGX virtualization is disabled.  There is no software enable bit for
+        * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+        * the guest from executing ENCLS (when SGX is supported by hardware).
         */
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
+#endif /* CONFIG_X86_SGX_KVM */
 
 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
 {
@@ -5668,10 +5664,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
-       [EXIT_REASON_INVD]                    = handle_invd,
+       [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
-       [EXIT_REASON_RDPMC]                   = handle_rdpmc,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
+       [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
@@ -5685,8 +5681,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
+       [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
+       [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
@@ -5694,13 +5690,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
-       [EXIT_REASON_RDRAND]                  = handle_invalid_op,
-       [EXIT_REASON_RDSEED]                  = handle_invalid_op,
+       [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
+       [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
@@ -5787,12 +5783,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
               vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
 }
 
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+       unsigned int i;
+       struct vmx_msr_entry *e;
+
+       pr_err("MSR %s:\n", name);
+       for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+               pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmentry_ctl, vmexit_ctl;
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
        unsigned long cr4;
-       u64 efer;
+       int efer_slot;
 
        if (!dump_invalid_vmcs) {
                pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5804,7 +5811,6 @@ void dump_vmcs(void)
        cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        cr4 = vmcs_readl(GUEST_CR4);
-       efer = vmcs_read64(GUEST_IA32_EFER);
        secondary_exec_control = 0;
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5816,9 +5822,7 @@ void dump_vmcs(void)
        pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
               cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
        pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
-       if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
-           (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
-       {
+       if (cpu_has_vmx_ept()) {
                pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
                       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
                pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
@@ -5841,10 +5845,20 @@ void dump_vmcs(void)
        vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
        vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
        vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
-       if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
-           (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
-               pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
-                      efer, vmcs_read64(GUEST_IA32_PAT));
+       efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+       else if (efer_slot >= 0)
+               pr_err("EFER= 0x%016llx (autoload)\n",
+                      vmx->msr_autoload.guest.val[efer_slot].value);
+       else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer | (EFER_LMA | EFER_LME));
+       else
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
        pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
               vmcs_read64(GUEST_IA32_DEBUGCTL),
               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5860,6 +5874,10 @@ void dump_vmcs(void)
        if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                pr_err("InterruptStatus = %04x\n",
                       vmcs_read16(GUEST_INTR_STATUS));
+       if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+       if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+               vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
 
        pr_err("*** Host State ***\n");
        pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
@@ -5881,14 +5899,16 @@ void dump_vmcs(void)
               vmcs_readl(HOST_IA32_SYSENTER_ESP),
               vmcs_read32(HOST_IA32_SYSENTER_CS),
               vmcs_readl(HOST_IA32_SYSENTER_EIP));
-       if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
-               pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
-                      vmcs_read64(HOST_IA32_EFER),
-                      vmcs_read64(HOST_IA32_PAT));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
        if (cpu_has_load_perf_global_ctrl() &&
            vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                pr_err("PerfGlobCtl = 0x%016llx\n",
                       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+       if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
 
        pr_err("*** Control State ***\n");
        pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -5997,7 +6017,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (exit_reason.failed_vmentry) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason.full;
@@ -6006,7 +6026,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (unlikely(vmx->fail)) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6092,7 +6112,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 unexpected_vmexit:
        vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
                    exit_reason.full);
-       dump_vmcs();
+       dump_vmcs(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
@@ -6938,9 +6958,11 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
        vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
@@ -6976,6 +6998,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+       vcpu_setup_sgx_lepubkeyhash(vcpu);
+
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
 
@@ -6989,8 +7013,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;
 
-       vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+       vmx->hv_root_ept = INVALID_PAGE;
+#endif
        return 0;
 
 free_vmcs:
@@ -7007,7 +7032,9 @@ free_vpid:
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-       spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
 
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
@@ -7302,6 +7329,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        set_cr4_guest_host_mask(vmx);
 
+       vmx_write_encls_bitmap(vcpu, NULL);
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+               vmx->msr_ia32_feature_control_valid_bits |=
+                       FEAT_CTL_SGX_LC_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &=
+                       ~FEAT_CTL_SGX_LC_ENABLED;
+
        /* Refresh #PF interception to account for MAXPHYADDR changes. */
        vmx_update_exception_bitmap(vcpu);
 }
@@ -7322,6 +7362,13 @@ static __init void vmx_set_cpu_caps(void)
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+       if (!enable_sgx) {
+               kvm_cpu_cap_clear(X86_FEATURE_SGX);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+       }
+
        if (vmx_umip_emulated())
                kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
@@ -7848,7 +7895,8 @@ static __init int hardware_setup(void)
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        if (enable_ept)
-               vmx_enable_tdp();
+               kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+                                     cpu_has_vmx_ept_execute_only());
 
        if (!enable_ept)
                ept_lpage_level = 0;
@@ -7909,6 +7957,8 @@ static __init int hardware_setup(void)
        if (!enable_ept || !cpu_has_vmx_intel_pt())
                pt_mode = PT_MODE_SYSTEM;
 
+       setup_default_sgx_lepubkeyhash();
+
        if (nested) {
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
                                           vmx_capability.ept);
index 89da5e1..008cb87 100644 (file)
@@ -325,7 +325,12 @@ struct vcpu_vmx {
         */
        u64 msr_ia32_feature_control;
        u64 msr_ia32_feature_control_valid_bits;
-       u64 ept_pointer;
+       /* SGX Launch Control public key hash */
+       u64 msr_ia32_sgxlepubkeyhash[4];
+
+#if IS_ENABLED(CONFIG_HYPERV)
+       u64 hv_root_ept;
+#endif
 
        struct pt_desc pt_desc;
        struct lbr_desc lbr_desc;
@@ -338,12 +343,6 @@ struct vcpu_vmx {
        } shadow_msr_intercept;
 };
 
-enum ept_pointers_status {
-       EPT_POINTERS_CHECK = 0,
-       EPT_POINTERS_MATCH = 1,
-       EPT_POINTERS_MISMATCH = 2
-};
-
 struct kvm_vmx {
        struct kvm kvm;
 
@@ -351,8 +350,10 @@ struct kvm_vmx {
        bool ept_identity_pagetable_done;
        gpa_t ept_identity_map_addr;
 
-       enum ept_pointers_status ept_pointers_match;
-       spinlock_t ept_pointer_lock;
+#if IS_ENABLED(CONFIG_HYPERV)
+       hpa_t hv_root_ept;
+       spinlock_t hv_root_ept_lock;
+#endif
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -376,8 +377,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
@@ -392,8 +392,19 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
-void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
-       u32 msr, int type, bool value);
+
+void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+
+static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
+                                            int type, bool value)
+{
+       if (value)
+               vmx_enable_intercept_for_msr(vcpu, msr, type);
+       else
+               vmx_disable_intercept_for_msr(vcpu, msr, type);
+}
+
 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
 
 static inline u8 vmx_get_rvi(void)
@@ -543,6 +554,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
        return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
 }
 
-void dump_vmcs(void);
+void dump_vmcs(struct kvm_vcpu *vcpu);
 
 #endif /* __KVM_X86_VMX_H */
index 692b0c3..164b64f 100644 (file)
@@ -37,6 +37,10 @@ static __always_inline void vmcs_check32(unsigned long field)
 {
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
                         "32-bit accessor invalid for 16-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+                        "32-bit accessor invalid for 64-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+                        "32-bit accessor invalid for 64-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
                         "32-bit accessor invalid for natural width field");
 }
index efc7a82..cebdaa1 100644 (file)
@@ -75,6 +75,7 @@
 #include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -245,6 +246,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("l1d_flush", l1d_flush),
        VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("nested_run", nested_run),
+       VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
+       VCPU_STAT("directed_yield_successful", directed_yield_successful),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -543,8 +547,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
-               if (has_error && !is_protmode(vcpu))
-                       has_error = false;
                if (reinject) {
                        /*
                         * On vmentry, vcpu->arch.exception.pending is only
@@ -983,14 +985,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        return 0;
 }
 
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 {
-       if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
-               return __kvm_set_xcr(vcpu, index, xcr);
+       if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+           __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
 
-       return 1;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
 
 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
@@ -1072,10 +1077,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                return 0;
        }
 
-       if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+       /*
+        * Do not condition the GPA check on long mode, this helper is used to
+        * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
+        * the current vCPU mode is accurate.
+        */
+       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
                return 1;
-       else if (is_pae_paging(vcpu) &&
-                !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+
+       if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
                return 1;
 
        kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
@@ -1191,20 +1201,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;
-       int err;
 
-       err = kvm_pmu_rdpmc(vcpu, ecx, &data);
-       if (err)
-               return err;
+       if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        kvm_rax_write(vcpu, (u32)data);
        kvm_rdx_write(vcpu, data >> 32);
-       return err;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1791,6 +1802,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+       return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
        xfer_to_guest_mode_prepare();
@@ -3382,6 +3427,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = 0;
                break;
        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+                       return kvm_pmu_get_msr(vcpu, msr_info);
+               if (!msr_info->host_initiated)
+                       return 1;
+               msr_info->data = 0;
+               break;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -3771,8 +3822,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_X86_USER_SPACE_MSR:
        case KVM_CAP_X86_MSR_FILTER:
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+       case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+       case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               return KVM_GUESTDBG_VALID_MASK;
 #ifdef CONFIG_KVM_XEN
        case KVM_CAP_XEN_HVM:
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
@@ -4673,7 +4730,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                        kvm_update_pv_runtime(vcpu);
 
                return 0;
-
        default:
                return -EINVAL;
        }
@@ -5355,6 +5411,28 @@ split_irqchip_unlock:
                        kvm->arch.bus_lock_detection_enabled = true;
                r = 0;
                break;
+#ifdef CONFIG_X86_SGX_KVM
+       case KVM_CAP_SGX_ATTRIBUTE: {
+               unsigned long allowed_attributes = 0;
+
+               r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+               if (r)
+                       break;
+
+               /* KVM only supports the PROVISIONKEY privileged attribute. */
+               if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+                   !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+                       kvm->arch.sgx_provisioning_allowed = true;
+               else
+                       r = -EINVAL;
+               break;
+       }
+#endif
+       case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+               r = -EINVAL;
+               if (kvm_x86_ops.vm_copy_enc_context_from)
+                       r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+               return r;
        default:
                r = -EINVAL;
                break;
@@ -5999,6 +6077,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception)
@@ -6015,6 +6094,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
        access |= PFERR_WRITE_MASK;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -6934,12 +7014,12 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
 
 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
 {
-       return kvm_register_read(emul_to_vcpu(ctxt), reg);
+       return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
 }
 
 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
 {
-       kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+       kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
 }
 
 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
@@ -8043,9 +8123,6 @@ int kvm_arch_init(void *opaque)
        if (r)
                goto out_free_percpu;
 
-       kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
-                       PT_PRESENT_MASK, 0, sme_me_mask);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -8205,21 +8282,35 @@ void kvm_apicv_init(struct kvm *kvm, bool enable)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_init);
 
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 {
        struct kvm_vcpu *target = NULL;
        struct kvm_apic_map *map;
 
+       vcpu->stat.directed_yield_attempted++;
+
        rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
+       map = rcu_dereference(vcpu->kvm->arch.apic_map);
 
        if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
                target = map->phys_map[dest_id]->vcpu;
 
        rcu_read_unlock();
 
-       if (target && READ_ONCE(target->ready))
-               kvm_vcpu_yield_to(target);
+       if (!target || !READ_ONCE(target->ready))
+               goto no_yield;
+
+       /* Ignore requests to yield to self */
+       if (vcpu == target)
+               goto no_yield;
+
+       if (kvm_vcpu_yield_to(target) <= 0)
+               goto no_yield;
+
+       vcpu->stat.directed_yield_successful++;
+
+no_yield:
+       return;
 }
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -8266,7 +8357,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                        break;
 
                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
-               kvm_sched_yield(vcpu->kvm, a1);
+               kvm_sched_yield(vcpu, a1);
                ret = 0;
                break;
 #ifdef CONFIG_X86_64
@@ -8284,7 +8375,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
                        break;
 
-               kvm_sched_yield(vcpu->kvm, a0);
+               kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
        default:
@@ -8367,6 +8458,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
 }
 
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+               return -EIO;
+
+       if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+               return 1;
+       }
+
+       return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+       /* Real mode delivers no error code: drop the flag, keep the value. */
+       vcpu->arch.exception.has_error_code &= !!is_protmode(vcpu);
+       static_call(kvm_x86_queue_exception)(vcpu);
+}
+
 static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
        int r;
@@ -8375,7 +8487,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        /* try to reinject previous events if any */
 
        if (vcpu->arch.exception.injected) {
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
        /*
@@ -8412,7 +8524,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        goto busy;
        }
@@ -8438,7 +8550,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                        }
                }
 
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
 
@@ -8587,7 +8699,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
 
        for (i = 0; i < 8; i++)
-               put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+               put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
 
        kvm_get_dr(vcpu, 6, &val);
        put_smstate(u32, buf, 0x7fcc, (u32)val);
@@ -8633,7 +8745,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
        int i;
 
        for (i = 0; i < 16; i++)
-               put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+               put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
 
        put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
        put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
@@ -8975,10 +9087,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
-                       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
-                       vcpu->mmio_needed = 0;
-                       r = 0;
-                       goto out;
+                       if (is_guest_mode(vcpu)) {
+                               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+                       } else {
+                               vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+                               vcpu->mmio_needed = 0;
+                               r = 0;
+                               goto out;
+                       }
                }
                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
                        /* Page is swapped out. Do synthetic halt */
@@ -9276,7 +9392,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu))
-               kvm_x86_ops.nested_ops->check_events(vcpu);
+               kvm_check_nested_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -11002,6 +11118,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
        return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 }
 
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+               return true;
+
+       return false;
+}
+
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 {
        if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
@@ -11012,14 +11136,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
                 kvm_test_request(KVM_REQ_EVENT, vcpu))
                return true;
 
-       if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
-               return true;
-
-       return false;
+       return kvm_arch_dy_has_pending_interrupt(vcpu);
 }
 
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
+       if (vcpu->arch.guest_state_protected)
+               return true;
+
        return vcpu->arch.preempted_in_kernel;
 }
 
@@ -11290,7 +11414,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
        if (!kvm_pv_async_pf_enabled(vcpu))
                return true;
        else
-               return apf_pageready_slot_free(vcpu);
+               return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
 }
 
 void kvm_arch_start_assignment(struct kvm *kvm)
@@ -11539,7 +11663,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_mmu_unload(vcpu);
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default:
index 9035e34..8ddd381 100644 (file)
@@ -8,6 +8,14 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
+({                                                                     \
+       bool failed = (consistency_check);                              \
+       if (failed)                                                     \
+               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+       failed;                                                         \
+})
+
 #define KVM_DEFAULT_PLE_GAP            128
 #define KVM_VMX_DEFAULT_PLE_WINDOW     4096
 #define KVM_DEFAULT_PLE_WINDOW_GROW    2
@@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
 
 #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
 
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.exception.pending = false;
@@ -222,19 +232,19 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
        return false;
 }
 
-static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg)
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
 {
-       unsigned long val = kvm_register_read(vcpu, reg);
+       unsigned long val = kvm_register_read_raw(vcpu, reg);
 
        return is_64_bit_mode(vcpu) ? val : (u32)val;
 }
 
-static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
                                       int reg, unsigned long val)
 {
        if (!is_64_bit_mode(vcpu))
                val = (u32)val;
-       return kvm_register_write(vcpu, reg, val);
+       return kvm_register_write_raw(vcpu, reg, val);
 }
 
 static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
index f633f9e..ff08dc4 100644 (file)
@@ -45,8 +45,6 @@ EXPORT_SYMBOL(sme_me_mask);
 DEFINE_STATIC_KEY_FALSE(sev_enable_key);
 EXPORT_SYMBOL_GPL(sev_enable_key);
 
-bool sev_enabled __section(".data");
-
 /* Buffer used for early in-place encryption by BSP, no locking needed */
 static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
 
@@ -374,14 +372,14 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
  * up under SME the trampoline area cannot be encrypted, whereas under SEV
  * the trampoline area must be encrypted.
  */
-bool sme_active(void)
+bool sev_active(void)
 {
-       return sme_me_mask && !sev_enabled;
+       return sev_status & MSR_AMD64_SEV_ENABLED;
 }
 
-bool sev_active(void)
+bool sme_active(void)
 {
-       return sev_status & MSR_AMD64_SEV_ENABLED;
+       return sme_me_mask && !sev_active();
 }
 EXPORT_SYMBOL_GPL(sev_active);
 
index a19374d..04aba7e 100644 (file)
@@ -548,7 +548,6 @@ void __init sme_enable(struct boot_params *bp)
        } else {
                /* SEV state cannot be controlled by a command line option */
                sme_me_mask = me_mask;
-               sev_enabled = true;
                physical_mask &= ~sme_me_mask;
                return;
        }
index 4279806..156cd23 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
@@ -91,6 +93,12 @@ static void split_page_count(int level)
                return;
 
        direct_pages_count[level]--;
+       if (system_state == SYSTEM_RUNNING) {
+               if (level == PG_LEVEL_2M)
+                       count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+               else if (level == PG_LEVEL_1G)
+                       count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+       }
        direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
index bfa50e6..ae744b6 100644 (file)
@@ -126,7 +126,7 @@ static int __init early_root_info_init(void)
                node = (reg >> 4) & 0x07;
                link = (reg >> 8) & 0x03;
 
-               info = alloc_pci_root_info(min_bus, max_bus, node, link);
+               alloc_pci_root_info(min_bus, max_bus, node, link);
        }
 
        /*
index 77f70b9..5ccb182 100644 (file)
@@ -21,6 +21,7 @@ obj-y += checksum_32.o syscalls_32.o
 obj-$(CONFIG_ELF_CORE) += elfcore.o
 
 subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
+subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
 subarch-y += ../kernel/sys_ia32.o
 
 else
index c907b20..dcaf3b3 100644 (file)
@@ -212,6 +212,6 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
 extern long elf_aux_hwcap;
 #define ELF_HWCAP (elf_aux_hwcap)
 
-#define SET_PERSONALITY(ex) do ; while(0)
+#define SET_PERSONALITY(ex) do {} while(0)
 
 #endif
index c3891c1..b95db9d 100644 (file)
@@ -77,7 +77,7 @@ static inline void trap_myself(void)
        __asm("int3");
 }
 
-static void inline remap_stack_and_trap(void)
+static inline void remap_stack_and_trap(void)
 {
        __asm__ volatile (
                "movl %%esp,%%ebx ;"
index 19ae3e4..54f9aa7 100644 (file)
@@ -59,7 +59,7 @@ int __init pci_xen_swiotlb_detect(void)
 void __init pci_xen_swiotlb_init(void)
 {
        if (xen_swiotlb) {
-               xen_swiotlb_init(1, true /* early */);
+               xen_swiotlb_init_early();
                dma_ops = &xen_swiotlb_dma_ops;
 
 #ifdef CONFIG_PCI
@@ -76,7 +76,7 @@ int pci_xen_swiotlb_init_late(void)
        if (xen_swiotlb)
                return 0;
 
-       rc = xen_swiotlb_init(1, false /* late */);
+       rc = xen_swiotlb_init();
        if (rc)
                return rc;
 
index 4f1ff95..062148e 100644 (file)
@@ -72,7 +72,6 @@ CONFIG_MARVELL_PHY=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
-CONFIG_DEVKMEM=y
 CONFIG_SERIAL_8250=y
 # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
 CONFIG_SERIAL_8250_CONSOLE=y
index c71cc45..9d76d43 100644 (file)
 441    common  epoll_pwait2                    sys_epoll_pwait2
 442    common  mount_setattr                   sys_mount_setattr
 443    common  quotactl_path                   sys_quotactl_path
+444    common  landlock_create_ruleset         sys_landlock_create_ruleset
+445    common  landlock_add_rule               sys_landlock_add_rule
+446    common  landlock_restrict_self          sys_landlock_restrict_self
index 44205df..221dc56 100644 (file)
@@ -255,6 +255,13 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 }
 EXPORT_SYMBOL(bio_init);
 
+unsigned int bio_max_size(struct bio *bio)
+{
+       struct block_device *bdev = bio->bi_bdev;
+
+       return bdev ? bdev->bd_disk->queue->limits.bio_max_bytes : UINT_MAX;
+}
+
 /**
  * bio_reset - reinitialize a bio
  * @bio:       bio to reset
@@ -866,7 +873,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
                struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
                if (page_is_mergeable(bv, page, len, off, same_page)) {
-                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
+                       if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
                                *same_page = false;
                                return false;
                        }
@@ -995,6 +1002,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+       unsigned int bytes_left = bio_max_size(bio) - bio->bi_iter.bi_size;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
        bool same_page = false;
@@ -1010,7 +1018,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
        pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-       size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+       size = iov_iter_get_pages(iter, pages, bytes_left, nr_pages,
+                                 &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;
 
index 9c00909..c6f80e3 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/gcd.h>
 #include <linux/lcm.h>
 #include <linux/jiffies.h>
@@ -31,6 +32,7 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
+       lim->bio_max_bytes = UINT_MAX;
        lim->max_segments = BLK_MAX_SEGMENTS;
        lim->max_discard_segments = 1;
        lim->max_integrity_segments = 0;
@@ -139,6 +141,10 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
                                 limits->logical_block_size >> SECTOR_SHIFT);
        limits->max_sectors = max_sectors;
 
+       if (check_shl_overflow(max_sectors, SECTOR_SHIFT,
+                               &limits->bio_max_bytes))
+               limits->bio_max_bytes = UINT_MAX;
+
        q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
index ab88d2a..f4e6111 100644 (file)
@@ -4,7 +4,7 @@ menu "Certificates for signature checking"
 config MODULE_SIG_KEY
        string "File name or PKCS#11 URI of module signing key"
        default "certs/signing_key.pem"
-       depends on MODULE_SIG
+       depends on MODULE_SIG || (IMA_APPRAISE_MODSIG && MODULES)
        help
          Provide the file name of a private key/certificate in PEM format,
          or a PKCS#11 URI according to RFC7512. The file should contain, or
index b6db52e..359239a 100644 (file)
@@ -33,6 +33,16 @@ endif # CONFIG_SYSTEM_TRUSTED_KEYRING
 clean-files := x509_certificate_list .x509.list x509_revocation_list
 
 ifeq ($(CONFIG_MODULE_SIG),y)
+       SIGN_KEY = y
+endif
+
+ifeq ($(CONFIG_IMA_APPRAISE_MODSIG),y)
+ifeq ($(CONFIG_MODULES),y)
+       SIGN_KEY = y
+endif
+endif
+
+ifdef SIGN_KEY
 ###############################################################################
 #
 # If module signing is requested, say by allyesconfig, but a key has not been
index 8f29058..e1645e6 100644 (file)
@@ -8,9 +8,12 @@
        .globl system_certificate_list
 system_certificate_list:
 __cert_list_start:
-#ifdef CONFIG_MODULE_SIG
+__module_cert_start:
+#if defined(CONFIG_MODULE_SIG) || (defined(CONFIG_IMA_APPRAISE_MODSIG) \
+                              && defined(CONFIG_MODULES))
        .incbin "certs/signing_key.x509"
 #endif
+__module_cert_end:
        .incbin "certs/x509_certificate_list"
 __cert_list_end:
 
@@ -35,3 +38,12 @@ system_certificate_list_size:
 #else
        .long __cert_list_end - __cert_list_start
 #endif
+
+       .align 8
+       .globl module_cert_size
+module_cert_size:
+#ifdef CONFIG_64BIT
+       .quad __module_cert_end - __module_cert_start
+#else
+       .long __module_cert_end - __module_cert_start
+#endif
index 0c9a479..692365d 100644 (file)
@@ -28,6 +28,7 @@ static struct key *platform_trusted_keys;
 
 extern __initconst const u8 system_certificate_list[];
 extern __initconst const unsigned long system_certificate_list_size;
+extern __initconst const unsigned long module_cert_size;
 
 /**
  * restrict_link_to_builtin_trusted - Restrict keyring addition by built in CA
@@ -133,15 +134,35 @@ static __init int system_trusted_keyring_init(void)
  */
 device_initcall(system_trusted_keyring_init);
 
+__init int load_module_cert(struct key *keyring)
+{
+       if (!IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG))
+               return 0;
+
+       pr_notice("Loading compiled-in module X.509 certificates\n");
+
+       return load_certificate_list(system_certificate_list, module_cert_size, keyring);
+}
+
 /*
  * Load the compiled-in list of X.509 certificates.
  */
 static __init int load_system_certificate_list(void)
 {
+       const u8 *p;
+       unsigned long size;
+
        pr_notice("Loading compiled-in X.509 certificates\n");
 
-       return load_certificate_list(system_certificate_list, system_certificate_list_size,
-                                    builtin_trusted_keys);
+#ifdef CONFIG_MODULE_SIG
+       p = system_certificate_list;
+       size = system_certificate_list_size;
+#else
+       p = system_certificate_list + module_cert_size;
+       size = system_certificate_list_size - module_cert_size;
+#endif
+
+       return load_certificate_list(p, size, builtin_trusted_keys);
 }
 late_initcall(load_system_certificate_list);
 
index 8f3fee8..5a6d613 100644 (file)
@@ -42,6 +42,7 @@ obj-$(CONFIG_DMADEVICES)      += dma/
 obj-y                          += soc/
 
 obj-$(CONFIG_VIRTIO)           += virtio/
+obj-$(CONFIG_VIRTIO_PCI_LIB)   += virtio/
 obj-$(CONFIG_VDPA)             += vdpa/
 obj-$(CONFIG_XEN)              += xen/
 
index b02fd51..8cc195c 100644 (file)
@@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
        acpi_handle handle = mem_device->device->handle;
        int result, num_enabled = 0;
        struct acpi_memory_info *info;
+       mhp_t mhp_flags = MHP_NONE;
        int node;
 
        node = acpi_get_node(handle);
@@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
                if (node < 0)
                        node = memory_add_physaddr_to_nid(info->start_addr);
 
+               if (mhp_supports_memmap_on_memory(info->length))
+                       mhp_flags |= MHP_MEMMAP_ON_MEMORY;
                result = __add_memory(node, info->start_addr, info->length,
-                                     MHP_NONE);
+                                     mhp_flags);
 
                /*
                 * If the memory block has been used by the kernel, add_memory()
index f2d0e59..0a0a982 100644 (file)
@@ -329,7 +329,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
                                        int index)
 {
        struct platform_device *pdev;
-       int irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
+       int irq;
 
        /*
         * According to SBSA specification the size of refresh and control
@@ -338,7 +338,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
        struct resource res[] = {
                DEFINE_RES_MEM(wd->control_frame_address, SZ_4K),
                DEFINE_RES_MEM(wd->refresh_frame_address, SZ_4K),
-               DEFINE_RES_IRQ(irq),
+               {},
        };
        int nr_res = ARRAY_SIZE(res);
 
@@ -348,10 +348,11 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
 
        if (!(wd->refresh_frame_address && wd->control_frame_address)) {
                pr_err(FW_BUG "failed to get the Watchdog base address.\n");
-               acpi_unregister_gsi(wd->timer_interrupt);
                return -EINVAL;
        }
 
+       irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
+       res[2] = (struct resource)DEFINE_RES_IRQ(irq);
        if (irq <= 0) {
                pr_warn("failed to map the Watchdog interrupt.\n");
                nr_res--;
@@ -364,7 +365,8 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
         */
        pdev = platform_device_register_simple("sbsa-gwdt", index, res, nr_res);
        if (IS_ERR(pdev)) {
-               acpi_unregister_gsi(wd->timer_interrupt);
+               if (irq > 0)
+                       acpi_unregister_gsi(wd->timer_interrupt);
                return PTR_ERR(pdev);
        }
 
index 2494138..3912a1f 100644 (file)
@@ -968,15 +968,16 @@ static int iort_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
 static void iort_named_component_init(struct device *dev,
                                      struct acpi_iort_node *node)
 {
+       struct property_entry props[2] = {};
        struct acpi_iort_named_component *nc;
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-       if (!fwspec)
-               return;
 
        nc = (struct acpi_iort_named_component *)node->node_data;
-       fwspec->num_pasid_bits = FIELD_GET(ACPI_IORT_NC_PASID_BITS,
-                                          nc->node_flags);
+       props[0] = PROPERTY_ENTRY_U32("pasid-num-bits",
+                                     FIELD_GET(ACPI_IORT_NC_PASID_BITS,
+                                               nc->node_flags));
+
+       if (device_add_properties(dev, props))
+               dev_warn(dev, "Could not add device properties\n");
 }
 
 static int iort_nc_iommu_map(struct device *dev, struct acpi_iort_node *node)
index 443fdf6..d39a9b4 100644 (file)
@@ -42,6 +42,8 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf,
                                   sizeof(struct acpi_table_header)))
                        return -EFAULT;
                uncopied_bytes = max_size = table.length;
+               /* make sure the buf is not allocated */
+               kfree(buf);
                buf = kzalloc(max_size, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;
@@ -55,6 +57,7 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf,
            (*ppos + count < count) ||
            (count > uncopied_bytes)) {
                kfree(buf);
+               buf = NULL;
                return -EINVAL;
        }
 
@@ -76,7 +79,6 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf,
                add_taint(TAINT_OVERRIDDEN_ACPI_TABLE, LOCKDEP_NOW_UNRELIABLE);
        }
 
-       kfree(buf);
        return count;
 }
 
index f973bbe..b852cff 100644 (file)
@@ -142,7 +142,6 @@ int acpi_device_sleep_wake(struct acpi_device *dev,
 int acpi_power_get_inferred_state(struct acpi_device *device, int *state);
 int acpi_power_on_resources(struct acpi_device *device, int state);
 int acpi_power_transition(struct acpi_device *device, int state);
-void acpi_turn_off_unused_power_resources(void);
 
 /* --------------------------------------------------------------------------
                               Device Power Management
index e209081..c68e694 100644 (file)
@@ -75,8 +75,12 @@ void acpi_unregister_gsi(u32 gsi)
 {
        struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
                                                        DOMAIN_BUS_ANY);
-       int irq = irq_find_mapping(d, gsi);
+       int irq;
 
+       if (WARN_ON(acpi_irq_model == ACPI_IRQ_MODEL_GIC && gsi < 16))
+               return;
+
+       irq = irq_find_mapping(d, gsi);
        irq_dispose_mapping(irq);
 }
 EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
index 95f23ac..53cab97 100644 (file)
@@ -116,6 +116,13 @@ static struct mcfg_fixup mcfg_quirks[] = {
        THUNDER_ECAM_QUIRK(2, 12),
        THUNDER_ECAM_QUIRK(2, 13),
 
+       { "NVIDIA", "TEGRA194", 1, 0, MCFG_BUS_ANY, &tegra194_pcie_ops},
+       { "NVIDIA", "TEGRA194", 1, 1, MCFG_BUS_ANY, &tegra194_pcie_ops},
+       { "NVIDIA", "TEGRA194", 1, 2, MCFG_BUS_ANY, &tegra194_pcie_ops},
+       { "NVIDIA", "TEGRA194", 1, 3, MCFG_BUS_ANY, &tegra194_pcie_ops},
+       { "NVIDIA", "TEGRA194", 1, 4, MCFG_BUS_ANY, &tegra194_pcie_ops},
+       { "NVIDIA", "TEGRA194", 1, 5, MCFG_BUS_ANY, &tegra194_pcie_ops},
+
 #define XGENE_V1_ECAM_MCFG(rev, seg) \
        {"APM   ", "XGENE   ", rev, seg, MCFG_BUS_ANY, \
                &xgene_v1_pcie_ecam_ops }
index 56102ea..32974b5 100644 (file)
@@ -995,7 +995,6 @@ void acpi_resume_power_resources(void)
 
        mutex_unlock(&power_resource_list_lock);
 }
-#endif
 
 void acpi_turn_off_unused_power_resources(void)
 {
@@ -1016,3 +1015,4 @@ void acpi_turn_off_unused_power_resources(void)
 
        mutex_unlock(&power_resource_list_lock);
 }
+#endif
index bc973fb..a22778e 100644 (file)
@@ -2359,8 +2359,6 @@ int __init acpi_scan_init(void)
                }
        }
 
-       acpi_turn_off_unused_power_resources();
-
        acpi_scan_initialized = true;
 
  out:
index 7fe41ee..1856f76 100644 (file)
@@ -8,6 +8,7 @@ extern struct list_head acpi_wakeup_device_list;
 extern struct mutex acpi_device_lock;
 
 extern void acpi_resume_power_resources(void);
+extern void acpi_turn_off_unused_power_resources(void);
 
 static inline acpi_status acpi_set_waking_vector(u32 wakeup_address)
 {
index 5b32df5..6e9c5ad 100644 (file)
@@ -86,7 +86,8 @@ struct brcm_ahci_priv {
        u32 port_mask;
        u32 quirks;
        enum brcm_ahci_version version;
-       struct reset_control *rcdev;
+       struct reset_control *rcdev_rescal;
+       struct reset_control *rcdev_ahci;
 };
 
 static inline u32 brcm_sata_readreg(void __iomem *addr)
@@ -352,8 +353,8 @@ static int brcm_ahci_suspend(struct device *dev)
        else
                ret = 0;
 
-       if (priv->version != BRCM_SATA_BCM7216)
-               reset_control_assert(priv->rcdev);
+       reset_control_assert(priv->rcdev_ahci);
+       reset_control_rearm(priv->rcdev_rescal);
 
        return ret;
 }
@@ -365,10 +366,10 @@ static int __maybe_unused brcm_ahci_resume(struct device *dev)
        struct brcm_ahci_priv *priv = hpriv->plat_data;
        int ret = 0;
 
-       if (priv->version == BRCM_SATA_BCM7216)
-               ret = reset_control_reset(priv->rcdev);
-       else
-               ret = reset_control_deassert(priv->rcdev);
+       ret = reset_control_deassert(priv->rcdev_ahci);
+       if (ret)
+               return ret;
+       ret = reset_control_reset(priv->rcdev_rescal);
        if (ret)
                return ret;
 
@@ -434,7 +435,6 @@ static int brcm_ahci_probe(struct platform_device *pdev)
 {
        const struct of_device_id *of_id;
        struct device *dev = &pdev->dev;
-       const char *reset_name = NULL;
        struct brcm_ahci_priv *priv;
        struct ahci_host_priv *hpriv;
        struct resource *res;
@@ -456,15 +456,15 @@ static int brcm_ahci_probe(struct platform_device *pdev)
        if (IS_ERR(priv->top_ctrl))
                return PTR_ERR(priv->top_ctrl);
 
-       /* Reset is optional depending on platform and named differently */
-       if (priv->version == BRCM_SATA_BCM7216)
-               reset_name = "rescal";
-       else
-               reset_name = "ahci";
-
-       priv->rcdev = devm_reset_control_get_optional(&pdev->dev, reset_name);
-       if (IS_ERR(priv->rcdev))
-               return PTR_ERR(priv->rcdev);
+       if (priv->version == BRCM_SATA_BCM7216) {
+               priv->rcdev_rescal = devm_reset_control_get_optional_shared(
+                       &pdev->dev, "rescal");
+               if (IS_ERR(priv->rcdev_rescal))
+                       return PTR_ERR(priv->rcdev_rescal);
+       }
+       priv->rcdev_ahci = devm_reset_control_get_optional(&pdev->dev, "ahci");
+       if (IS_ERR(priv->rcdev_ahci))
+               return PTR_ERR(priv->rcdev_ahci);
 
        hpriv = ahci_platform_get_resources(pdev, 0);
        if (IS_ERR(hpriv))
@@ -485,10 +485,10 @@ static int brcm_ahci_probe(struct platform_device *pdev)
                break;
        }
 
-       if (priv->version == BRCM_SATA_BCM7216)
-               ret = reset_control_reset(priv->rcdev);
-       else
-               ret = reset_control_deassert(priv->rcdev);
+       ret = reset_control_reset(priv->rcdev_rescal);
+       if (ret)
+               return ret;
+       ret = reset_control_deassert(priv->rcdev_ahci);
        if (ret)
                return ret;
 
@@ -539,8 +539,8 @@ out_disable_regulators:
 out_disable_clks:
        ahci_platform_disable_clks(hpriv);
 out_reset:
-       if (priv->version != BRCM_SATA_BCM7216)
-               reset_control_assert(priv->rcdev);
+       reset_control_assert(priv->rcdev_ahci);
+       reset_control_rearm(priv->rcdev_rescal);
        return ret;
 }
 
index 0ddd611..3bc3c31 100644 (file)
@@ -795,6 +795,7 @@ static void process_incoming (struct fs_dev *dev, struct queue *q)
                switch (STATUS_CODE (qe)) {
                case 0x1:
                        /* Fall through for streaming mode */
+                       fallthrough;
                case 0x2:/* Packet received OK.... */
                        if (atm_vcc) {
                                skb = pe->skb;
index ff5755e..eba04c0 100644 (file)
@@ -1737,10 +1737,3 @@ module_init(panel_init_module);
 module_exit(panel_cleanup_module);
 MODULE_AUTHOR("Willy Tarreau");
 MODULE_LICENSE("GPL");
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
index 7835509..4fdb821 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/kernel_read_file.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/initrd.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
 #include <linux/interrupt.h>
@@ -504,6 +505,7 @@ fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv,
        if (!path)
                return -ENOMEM;
 
+       wait_for_initramfs();
        for (i = 0; i < ARRAY_SIZE(fw_path); i++) {
                size_t file_size = 0;
                size_t *file_size_ptr = NULL;
index f352984..b31b3af 100644 (file)
@@ -169,30 +169,98 @@ int memory_notify(unsigned long val, void *v)
        return blocking_notifier_call_chain(&memory_chain, val, v);
 }
 
+static int memory_block_online(struct memory_block *mem)
+{
+       unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+       unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+       unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+       struct zone *zone;
+       int ret;
+
+       zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);
+
+       /*
+        * Although vmemmap pages have a different lifecycle than the pages
+        * they describe (they remain until the memory is unplugged), doing
+        * their initialization and accounting at memory onlining/offlining
+        * stage helps to keep accounting easier to follow - e.g vmemmaps
+        * belong to the same zone as the memory they backed.
+        */
+       if (nr_vmemmap_pages) {
+               ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+               if (ret)
+                       return ret;
+       }
+
+       ret = online_pages(start_pfn + nr_vmemmap_pages,
+                          nr_pages - nr_vmemmap_pages, zone);
+       if (ret) {
+               if (nr_vmemmap_pages)
+                       mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+               return ret;
+       }
+
+       /*
+        * Account once onlining succeeded. If the zone was unpopulated, it is
+        * now already properly populated.
+        */
+       if (nr_vmemmap_pages)
+               adjust_present_page_count(zone, nr_vmemmap_pages);
+
+       return ret;
+}
+
+static int memory_block_offline(struct memory_block *mem)
+{
+       unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+       unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+       unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+       struct zone *zone;
+       int ret;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+
+       /*
+        * Unaccount before offlining, such that unpopulated zone and kthreads
+        * can properly be torn down in offline_pages().
+        */
+       if (nr_vmemmap_pages)
+               adjust_present_page_count(zone, -nr_vmemmap_pages);
+
+       ret = offline_pages(start_pfn + nr_vmemmap_pages,
+                           nr_pages - nr_vmemmap_pages);
+       if (ret) {
+               /* offline_pages() failed. Account back. */
+               if (nr_vmemmap_pages)
+                       adjust_present_page_count(zone, nr_vmemmap_pages);
+               return ret;
+       }
+
+       if (nr_vmemmap_pages)
+               mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+
+       return ret;
+}
+
 /*
  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(unsigned long start_section_nr, unsigned long action,
-                   int online_type, int nid)
+memory_block_action(struct memory_block *mem, unsigned long action)
 {
-       unsigned long start_pfn;
-       unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        int ret;
 
-       start_pfn = section_nr_to_pfn(start_section_nr);
-
        switch (action) {
        case MEM_ONLINE:
-               ret = online_pages(start_pfn, nr_pages, online_type, nid);
+               ret = memory_block_online(mem);
                break;
        case MEM_OFFLINE:
-               ret = offline_pages(start_pfn, nr_pages);
+               ret = memory_block_offline(mem);
                break;
        default:
                WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
-                    "%ld\n", __func__, start_section_nr, action, action);
+                    "%ld\n", __func__, mem->start_section_nr, action, action);
                ret = -EINVAL;
        }
 
@@ -210,9 +278,7 @@ static int memory_block_change_state(struct memory_block *mem,
        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;
 
-       ret = memory_block_action(mem->start_section_nr, to_state,
-                                 mem->online_type, mem->nid);
-
+       ret = memory_block_action(mem, to_state);
        mem->state = ret ? from_state_req : to_state;
 
        return ret;
@@ -567,7 +633,8 @@ int register_memory(struct memory_block *memory)
        return ret;
 }
 
-static int init_memory_block(unsigned long block_id, unsigned long state)
+static int init_memory_block(unsigned long block_id, unsigned long state,
+                            unsigned long nr_vmemmap_pages)
 {
        struct memory_block *mem;
        int ret = 0;
@@ -584,6 +651,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
        mem->start_section_nr = block_id * sections_per_block;
        mem->state = state;
        mem->nid = NUMA_NO_NODE;
+       mem->nr_vmemmap_pages = nr_vmemmap_pages;
 
        ret = register_memory(mem);
 
@@ -603,7 +671,7 @@ static int add_memory_block(unsigned long base_section_nr)
        if (section_count == 0)
                return 0;
        return init_memory_block(memory_block_id(base_section_nr),
-                                MEM_ONLINE);
+                                MEM_ONLINE, 0);
 }
 
 static void unregister_memory(struct memory_block *memory)
@@ -625,7 +693,8 @@ static void unregister_memory(struct memory_block *memory)
  *
  * Called under device_hotplug_lock.
  */
-int create_memory_block_devices(unsigned long start, unsigned long size)
+int create_memory_block_devices(unsigned long start, unsigned long size,
+                               unsigned long vmemmap_pages)
 {
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
@@ -638,7 +707,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
                return -EINVAL;
 
        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-               ret = init_memory_block(block_id, MEM_OFFLINE);
+               ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
                if (ret)
                        break;
        }
index 6e622c1..7562cf3 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
+#include <linux/pagemap.h>
 #include <linux/radix-tree.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
index a370cde..d58d68f 100644 (file)
@@ -53,6 +53,7 @@
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
index c01786a..c604a40 100644 (file)
@@ -88,7 +88,7 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
        dev->discard_alignment      = le32_to_cpu(rsp->discard_alignment);
        dev->secure_discard         = le16_to_cpu(rsp->secure_discard);
        dev->rotational             = rsp->rotational;
-       dev->wc                     = !!(rsp->cache_policy & RNBD_WRITEBACK);
+       dev->wc                     = !!(rsp->cache_policy & RNBD_WRITEBACK);
        dev->fua                    = !!(rsp->cache_policy & RNBD_FUA);
 
        dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
@@ -241,7 +241,7 @@ static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)
             cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
                if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
                        continue;
-               if (unlikely(!test_bit(cpu_q->cpu, sess->cpu_queues_bm)))
+               if (!test_bit(cpu_q->cpu, sess->cpu_queues_bm))
                        goto unlock;
                q = list_first_entry_or_null(&cpu_q->requeue_list,
                                             typeof(*q), requeue_list);
@@ -320,7 +320,7 @@ static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess,
        struct rtrs_permit *permit;
 
        permit = rtrs_clt_get_permit(sess->rtrs, con_type, wait);
-       if (likely(permit))
+       if (permit)
                /* We have a subtle rare case here, when all permits can be
                 * consumed before busy counter increased.  This is safe,
                 * because loser will get NULL as a permit, observe 0 busy
@@ -351,12 +351,11 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
        struct rtrs_permit *permit;
 
        iu = kzalloc(sizeof(*iu), GFP_KERNEL);
-       if (!iu) {
+       if (!iu)
                return NULL;
-       }
 
        permit = rnbd_get_permit(sess, con_type, wait);
-       if (unlikely(!permit)) {
+       if (!permit) {
                kfree(iu);
                return NULL;
        }
@@ -692,7 +691,11 @@ static void remap_devs(struct rnbd_clt_session *sess)
                return;
        }
 
-       rtrs_clt_query(sess->rtrs, &attrs);
+       err = rtrs_clt_query(sess->rtrs, &attrs);
+       if (err) {
+               pr_err("rtrs_clt_query(\"%s\"): %d\n", sess->sessname, err);
+               return;
+       }
        mutex_lock(&sess->lock);
        sess->max_io_size = attrs.max_io_size;
 
@@ -805,7 +808,7 @@ static struct rnbd_clt_session *alloc_sess(const char *sessname)
        mutex_init(&sess->lock);
        INIT_LIST_HEAD(&sess->devs_list);
        INIT_LIST_HEAD(&sess->list);
-       bitmap_zero(sess->cpu_queues_bm, NR_CPUS);
+       bitmap_zero(sess->cpu_queues_bm, num_possible_cpus());
        init_waitqueue_head(&sess->rtrs_waitq);
        refcount_set(&sess->refcount, 1);
 
@@ -1047,7 +1050,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
        };
        err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit,
                               &vec, 1, size, iu->sgt.sgl, sg_cnt);
-       if (unlikely(err)) {
+       if (err) {
                rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n",
                                 err);
                return err;
@@ -1078,7 +1081,7 @@ static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
        cpu_q = get_cpu_ptr(sess->cpu_queues);
        spin_lock_irqsave(&cpu_q->requeue_lock, flags);
 
-       if (likely(!test_and_set_bit_lock(0, &q->in_list))) {
+       if (!test_and_set_bit_lock(0, &q->in_list)) {
                if (WARN_ON(!list_empty(&q->requeue_list)))
                        goto unlock;
 
@@ -1090,7 +1093,7 @@ static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
                         */
                        smp_mb__before_atomic();
                }
-               if (likely(atomic_read(&sess->busy))) {
+               if (atomic_read(&sess->busy)) {
                        list_add_tail(&q->requeue_list, &cpu_q->requeue_list);
                } else {
                        /* Very unlikely, but possible: busy counter was
@@ -1118,7 +1121,7 @@ static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev,
 
        if (delay != RNBD_DELAY_IFBUSY)
                blk_mq_delay_run_hw_queue(hctx, delay);
-       else if (unlikely(!rnbd_clt_dev_add_to_requeue(dev, q)))
+       else if (!rnbd_clt_dev_add_to_requeue(dev, q))
                /*
                 * If session is not busy we have to restart
                 * the queue ourselves.
@@ -1135,12 +1138,12 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        int err;
        blk_status_t ret = BLK_STS_IOERR;
 
-       if (unlikely(dev->dev_state != DEV_STATE_MAPPED))
+       if (dev->dev_state != DEV_STATE_MAPPED)
                return BLK_STS_IOERR;
 
        iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON,
                                      RTRS_PERMIT_NOWAIT);
-       if (unlikely(!iu->permit)) {
+       if (!iu->permit) {
                rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY);
                return BLK_STS_RESOURCE;
        }
@@ -1148,7 +1151,8 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        iu->sgt.sgl = iu->first_sgl;
        err = sg_alloc_table_chained(&iu->sgt,
                                     /* Even-if the request has no segment,
-                                     * sglist must have one entry at least */
+                                     * sglist must have one entry at least.
+                                     */
                                     blk_rq_nr_phys_segments(rq) ? : 1,
                                     iu->sgt.sgl,
                                     RNBD_INLINE_SG_CNT);
@@ -1161,9 +1165,9 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 
        blk_mq_start_request(rq);
        err = rnbd_client_xfer_request(dev, rq, iu);
-       if (likely(err == 0))
+       if (err == 0)
                return BLK_STS_OK;
-       if (unlikely(err == -EAGAIN || err == -ENOMEM)) {
+       if (err == -EAGAIN || err == -ENOMEM) {
                rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
                ret = BLK_STS_RESOURCE;
        }
@@ -1294,7 +1298,11 @@ find_and_get_or_create_sess(const char *sessname,
                err = PTR_ERR(sess->rtrs);
                goto wake_up_and_put;
        }
-       rtrs_clt_query(sess->rtrs, &attrs);
+
+       err = rtrs_clt_query(sess->rtrs, &attrs);
+       if (err)
+               goto close_rtrs;
+
        sess->max_io_size = attrs.max_io_size;
        sess->queue_depth = attrs.queue_depth;
        sess->nr_poll_queues = nr_poll_queues;
@@ -1576,7 +1584,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
        struct rnbd_clt_dev *dev;
        int ret;
 
-       if (unlikely(exists_devpath(pathname, sessname)))
+       if (exists_devpath(pathname, sessname))
                return ERR_PTR(-EEXIST);
 
        sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr, nr_poll_queues);
index 451e738..b5322c5 100644 (file)
@@ -87,7 +87,7 @@ struct rnbd_clt_session {
        DECLARE_BITMAP(cpu_queues_bm, NR_CPUS);
        int     __percpu        *cpu_rr; /* per-cpu var for CPU round-robin */
        atomic_t                busy;
-       int                     queue_depth;
+       size_t                  queue_depth;
        u32                     max_io_size;
        struct blk_mq_tag_set   tag_set;
        u32                     nr_poll_queues;
index 899dd9d..aafecfe 100644 (file)
@@ -104,7 +104,7 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess)
 
        rcu_read_lock();
        sess_dev = xa_load(&srv_sess->index_idr, dev_id);
-       if (likely(sess_dev))
+       if (sess_dev)
                ret = kref_get_unless_zero(&sess_dev->kref);
        rcu_read_unlock();
 
index d229a2d..b151e0f 100644 (file)
@@ -334,16 +334,6 @@ config DEVMEM
          memory.
          When in doubt, say "Y".
 
-config DEVKMEM
-       bool "/dev/kmem virtual device support"
-       # On arm64, VMALLOC_START < PAGE_OFFSET, which confuses kmem read/write
-       depends on !ARM64
-       help
-         Say Y here if you want to support the /dev/kmem device. The
-         /dev/kmem device is rarely used, but can be used for certain
-         kind of kernel debugging operations.
-         When in doubt, say "N".
-
 config NVRAM
        tristate "/dev/nvram support"
        depends on X86 || HAVE_ARCH_NVRAM_OPS
index 869b9f5..15dc54f 100644 (file)
@@ -403,221 +403,6 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
 
-static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
-{
-       unsigned long pfn;
-
-       /* Turn a kernel-virtual address into a physical page frame */
-       pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;
-
-       /*
-        * RED-PEN: on some architectures there is more mapped memory than
-        * available in mem_map which pfn_valid checks for. Perhaps should add a
-        * new macro here.
-        *
-        * RED-PEN: vmalloc is not supported right now.
-        */
-       if (!pfn_valid(pfn))
-               return -EIO;
-
-       vma->vm_pgoff = pfn;
-       return mmap_mem(file, vma);
-}
-
-/*
- * This function reads the *virtual* memory as seen by the kernel.
- */
-static ssize_t read_kmem(struct file *file, char __user *buf,
-                        size_t count, loff_t *ppos)
-{
-       unsigned long p = *ppos;
-       ssize_t low_count, read, sz;
-       char *kbuf; /* k-addr because vread() takes vmlist_lock rwlock */
-       int err = 0;
-
-       read = 0;
-       if (p < (unsigned long) high_memory) {
-               low_count = count;
-               if (count > (unsigned long)high_memory - p)
-                       low_count = (unsigned long)high_memory - p;
-
-#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
-               /* we don't have page 0 mapped on sparc and m68k.. */
-               if (p < PAGE_SIZE && low_count > 0) {
-                       sz = size_inside_page(p, low_count);
-                       if (clear_user(buf, sz))
-                               return -EFAULT;
-                       buf += sz;
-                       p += sz;
-                       read += sz;
-                       low_count -= sz;
-                       count -= sz;
-               }
-#endif
-               while (low_count > 0) {
-                       sz = size_inside_page(p, low_count);
-
-                       /*
-                        * On ia64 if a page has been mapped somewhere as
-                        * uncached, then it must also be accessed uncached
-                        * by the kernel or data corruption may occur
-                        */
-                       kbuf = xlate_dev_kmem_ptr((void *)p);
-                       if (!virt_addr_valid(kbuf))
-                               return -ENXIO;
-
-                       if (copy_to_user(buf, kbuf, sz))
-                               return -EFAULT;
-                       buf += sz;
-                       p += sz;
-                       read += sz;
-                       low_count -= sz;
-                       count -= sz;
-                       if (should_stop_iteration()) {
-                               count = 0;
-                               break;
-                       }
-               }
-       }
-
-       if (count > 0) {
-               kbuf = (char *)__get_free_page(GFP_KERNEL);
-               if (!kbuf)
-                       return -ENOMEM;
-               while (count > 0) {
-                       sz = size_inside_page(p, count);
-                       if (!is_vmalloc_or_module_addr((void *)p)) {
-                               err = -ENXIO;
-                               break;
-                       }
-                       sz = vread(kbuf, (char *)p, sz);
-                       if (!sz)
-                               break;
-                       if (copy_to_user(buf, kbuf, sz)) {
-                               err = -EFAULT;
-                               break;
-                       }
-                       count -= sz;
-                       buf += sz;
-                       read += sz;
-                       p += sz;
-                       if (should_stop_iteration())
-                               break;
-               }
-               free_page((unsigned long)kbuf);
-       }
-       *ppos = p;
-       return read ? read : err;
-}
-
-
-static ssize_t do_write_kmem(unsigned long p, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       ssize_t written, sz;
-       unsigned long copied;
-
-       written = 0;
-#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
-       /* we don't have page 0 mapped on sparc and m68k.. */
-       if (p < PAGE_SIZE) {
-               sz = size_inside_page(p, count);
-               /* Hmm. Do something? */
-               buf += sz;
-               p += sz;
-               count -= sz;
-               written += sz;
-       }
-#endif
-
-       while (count > 0) {
-               void *ptr;
-
-               sz = size_inside_page(p, count);
-
-               /*
-                * On ia64 if a page has been mapped somewhere as uncached, then
-                * it must also be accessed uncached by the kernel or data
-                * corruption may occur.
-                */
-               ptr = xlate_dev_kmem_ptr((void *)p);
-               if (!virt_addr_valid(ptr))
-                       return -ENXIO;
-
-               copied = copy_from_user(ptr, buf, sz);
-               if (copied) {
-                       written += sz - copied;
-                       if (written)
-                               break;
-                       return -EFAULT;
-               }
-               buf += sz;
-               p += sz;
-               count -= sz;
-               written += sz;
-               if (should_stop_iteration())
-                       break;
-       }
-
-       *ppos += written;
-       return written;
-}
-
-/*
- * This function writes to the *virtual* memory as seen by the kernel.
- */
-static ssize_t write_kmem(struct file *file, const char __user *buf,
-                         size_t count, loff_t *ppos)
-{
-       unsigned long p = *ppos;
-       ssize_t wrote = 0;
-       ssize_t virtr = 0;
-       char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */
-       int err = 0;
-
-       if (p < (unsigned long) high_memory) {
-               unsigned long to_write = min_t(unsigned long, count,
-                                              (unsigned long)high_memory - p);
-               wrote = do_write_kmem(p, buf, to_write, ppos);
-               if (wrote != to_write)
-                       return wrote;
-               p += wrote;
-               buf += wrote;
-               count -= wrote;
-       }
-
-       if (count > 0) {
-               kbuf = (char *)__get_free_page(GFP_KERNEL);
-               if (!kbuf)
-                       return wrote ? wrote : -ENOMEM;
-               while (count > 0) {
-                       unsigned long sz = size_inside_page(p, count);
-                       unsigned long n;
-
-                       if (!is_vmalloc_or_module_addr((void *)p)) {
-                               err = -ENXIO;
-                               break;
-                       }
-                       n = copy_from_user(kbuf, buf, sz);
-                       if (n) {
-                               err = -EFAULT;
-                               break;
-                       }
-                       vwrite(kbuf, (char *)p, sz);
-                       count -= sz;
-                       buf += sz;
-                       virtr += sz;
-                       p += sz;
-                       if (should_stop_iteration())
-                               break;
-               }
-               free_page((unsigned long)kbuf);
-       }
-
-       *ppos = p;
-       return virtr + wrote ? : err;
-}
-
 static ssize_t read_port(struct file *file, char __user *buf,
                         size_t count, loff_t *ppos)
 {
@@ -855,7 +640,6 @@ static int open_port(struct inode *inode, struct file *filp)
 #define write_zero     write_null
 #define write_iter_zero        write_iter_null
 #define open_mem       open_port
-#define open_kmem      open_mem
 
 static const struct file_operations __maybe_unused mem_fops = {
        .llseek         = memory_lseek,
@@ -869,18 +653,6 @@ static const struct file_operations __maybe_unused mem_fops = {
 #endif
 };
 
-static const struct file_operations __maybe_unused kmem_fops = {
-       .llseek         = memory_lseek,
-       .read           = read_kmem,
-       .write          = write_kmem,
-       .mmap           = mmap_kmem,
-       .open           = open_kmem,
-#ifndef CONFIG_MMU
-       .get_unmapped_area = get_unmapped_area_mem,
-       .mmap_capabilities = memory_mmap_capabilities,
-#endif
-};
-
 static const struct file_operations null_fops = {
        .llseek         = null_lseek,
        .read           = read_null,
@@ -924,9 +696,6 @@ static const struct memdev {
 } devlist[] = {
 #ifdef CONFIG_DEVMEM
         [DEVMEM_MINOR] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET },
-#endif
-#ifdef CONFIG_DEVKMEM
-        [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET },
 #endif
         [3] = { "null", 0666, &null_fops, 0 },
 #ifdef CONFIG_DEVPORT
index 1c14eb2..9132c3c 100644 (file)
@@ -10,6 +10,8 @@ if CLK_SIFIVE
 
 config CLK_SIFIVE_PRCI
        bool "PRCI driver for SiFive SoCs"
+       select RESET_CONTROLLER
+       select RESET_SIMPLE
        select CLK_ANALOGBITS_WRPLL_CLN28HPC
        help
          Supports the Power Reset Clock interface (PRCI) IP block found in
index 764d109..53f6e00 100644 (file)
@@ -72,6 +72,12 @@ static const struct clk_ops sifive_fu740_prci_hfpclkplldiv_clk_ops = {
        .recalc_rate = sifive_prci_hfpclkplldiv_recalc_rate,
 };
 
+static const struct clk_ops sifive_fu740_prci_pcie_aux_clk_ops = {
+       .enable = sifive_prci_pcie_aux_clock_enable,
+       .disable = sifive_prci_pcie_aux_clock_disable,
+       .is_enabled = sifive_prci_pcie_aux_clock_is_enabled,
+};
+
 /* List of clock controls provided by the PRCI */
 struct __prci_clock __prci_init_clocks_fu740[] = {
        [PRCI_CLK_COREPLL] = {
@@ -120,4 +126,9 @@ struct __prci_clock __prci_init_clocks_fu740[] = {
                .parent_name = "hfpclkpll",
                .ops = &sifive_fu740_prci_hfpclkplldiv_clk_ops,
        },
+       [PRCI_CLK_PCIE_AUX] = {
+               .name = "pcie_aux",
+               .parent_name = "hfclk",
+               .ops = &sifive_fu740_prci_pcie_aux_clk_ops,
+       },
 };
index 13ef971..511a0bf 100644 (file)
@@ -9,7 +9,7 @@
 
 #include "sifive-prci.h"
 
-#define NUM_CLOCK_FU740        8
+#define NUM_CLOCK_FU740        9
 
 extern struct __prci_clock __prci_init_clocks_fu740[NUM_CLOCK_FU740];
 
index 1490b01..0d79ba3 100644 (file)
@@ -453,6 +453,47 @@ void sifive_prci_hfpclkpllsel_use_hfpclkpll(struct __prci_data *pd)
        r = __prci_readl(pd, PRCI_HFPCLKPLLSEL_OFFSET); /* barrier */
 }
 
+/* PCIE AUX clock APIs for enable, disable. */
+int sifive_prci_pcie_aux_clock_is_enabled(struct clk_hw *hw)
+{
+       struct __prci_clock *pc = clk_hw_to_prci_clock(hw);
+       struct __prci_data *pd = pc->pd;
+       u32 r;
+
+       r = __prci_readl(pd, PRCI_PCIE_AUX_OFFSET);
+
+       if (r & PRCI_PCIE_AUX_EN_MASK)
+               return 1;
+       else
+               return 0;
+}
+
+int sifive_prci_pcie_aux_clock_enable(struct clk_hw *hw)
+{
+       struct __prci_clock *pc = clk_hw_to_prci_clock(hw);
+       struct __prci_data *pd = pc->pd;
+       u32 r __maybe_unused;
+
+       if (sifive_prci_pcie_aux_clock_is_enabled(hw))
+               return 0;
+
+       __prci_writel(1, PRCI_PCIE_AUX_OFFSET, pd);
+       r = __prci_readl(pd, PRCI_PCIE_AUX_OFFSET);     /* barrier */
+
+       return 0;
+}
+
+void sifive_prci_pcie_aux_clock_disable(struct clk_hw *hw)
+{
+       struct __prci_clock *pc = clk_hw_to_prci_clock(hw);
+       struct __prci_data *pd = pc->pd;
+       u32 r __maybe_unused;
+
+       __prci_writel(0, PRCI_PCIE_AUX_OFFSET, pd);
+       r = __prci_readl(pd, PRCI_PCIE_AUX_OFFSET);     /* barrier */
+
+}
+
 /**
  * __prci_register_clocks() - register clock controls in the PRCI
  * @dev: Linux struct device
@@ -547,6 +588,19 @@ static int sifive_prci_probe(struct platform_device *pdev)
        if (IS_ERR(pd->va))
                return PTR_ERR(pd->va);
 
+       pd->reset.rcdev.owner = THIS_MODULE;
+       pd->reset.rcdev.nr_resets = PRCI_RST_NR;
+       pd->reset.rcdev.ops = &reset_simple_ops;
+       pd->reset.rcdev.of_node = pdev->dev.of_node;
+       pd->reset.active_low = true;
+       pd->reset.membase = pd->va + PRCI_DEVICESRESETREG_OFFSET;
+       spin_lock_init(&pd->reset.lock);
+
+       r = devm_reset_controller_register(&pdev->dev, &pd->reset.rcdev);
+       if (r) {
+               dev_err(dev, "could not register reset controller: %d\n", r);
+               return r;
+       }
        r = __prci_register_clocks(dev, pd, desc);
        if (r) {
                dev_err(dev, "could not register clocks: %d\n", r);
index dbdbd17..91658a8 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/clk/analogbits-wrpll-cln28hpc.h>
 #include <linux/clk-provider.h>
+#include <linux/reset/reset-simple.h>
 #include <linux/platform_device.h>
 
 /*
 #define PRCI_DDRPLLCFG1_CKE_SHIFT      31
 #define PRCI_DDRPLLCFG1_CKE_MASK       (0x1 << PRCI_DDRPLLCFG1_CKE_SHIFT)
 
+/* PCIEAUX */
+#define PRCI_PCIE_AUX_OFFSET           0x14
+#define PRCI_PCIE_AUX_EN_SHIFT         0
+#define PRCI_PCIE_AUX_EN_MASK          (0x1 << PRCI_PCIE_AUX_EN_SHIFT)
+
 /* GEMGXLPLLCFG0 */
 #define PRCI_GEMGXLPLLCFG0_OFFSET      0x1c
 #define PRCI_GEMGXLPLLCFG0_DIVR_SHIFT  0
 #define PRCI_DEVICESRESETREG_CHIPLINK_RST_N_MASK                       \
                (0x1 << PRCI_DEVICESRESETREG_CHIPLINK_RST_N_SHIFT)
 
+#define PRCI_RST_NR                                            7
+
 /* CLKMUXSTATUSREG */
 #define PRCI_CLKMUXSTATUSREG_OFFSET                            0x2c
 #define PRCI_CLKMUXSTATUSREG_TLCLKSEL_STATUS_SHIFT             1
  */
 struct __prci_data {
        void __iomem *va;
+       struct reset_simple_data reset;
        struct clk_hw_onecell_data hw_clks;
 };
 
@@ -296,4 +305,8 @@ unsigned long sifive_prci_tlclksel_recalc_rate(struct clk_hw *hw,
 unsigned long sifive_prci_hfpclkplldiv_recalc_rate(struct clk_hw *hw,
                                                   unsigned long parent_rate);
 
+int sifive_prci_pcie_aux_clock_is_enabled(struct clk_hw *hw);
+int sifive_prci_pcie_aux_clock_enable(struct clk_hw *hw);
+void sifive_prci_pcie_aux_clock_disable(struct clk_hw *hw);
+
 #endif /* __SIFIVE_CLK_SIFIVE_PRCI_H */
index 4fb1f4d..fe1a826 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/cpu_pm.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
+#include <linux/clocksource_ids.h>
 #include <linux/interrupt.h>
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
@@ -24,6 +25,8 @@
 #include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/acpi.h>
+#include <linux/arm-smccc.h>
+#include <linux/ptp_kvm.h>
 
 #include <asm/arch_timer.h>
 #include <asm/virt.h>
@@ -200,6 +203,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter *cc)
 
 static struct clocksource clocksource_counter = {
        .name   = "arch_sys_counter",
+       .id     = CSID_ARM_ARCH_COUNTER,
        .rating = 400,
        .read   = arch_counter_read,
        .mask   = CLOCKSOURCE_MASK(56),
@@ -1676,3 +1680,35 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 }
 TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
 #endif
+
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts,
+                                struct clocksource **cs)
+{
+       struct arm_smccc_res hvc_res;
+       u32 ptp_counter;
+       ktime_t ktime;
+
+       if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY))
+               return -EOPNOTSUPP;
+
+       if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
+               ptp_counter = KVM_PTP_VIRT_COUNTER;
+       else
+               ptp_counter = KVM_PTP_PHYS_COUNTER;
+
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+                            ptp_counter, &hvc_res);
+
+       if ((int)(hvc_res.a0) < 0)
+               return -EOPNOTSUPP;
+
+       ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1;
+       *ts = ktime_to_timespec64(ktime);
+       if (cycle)
+               *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3;
+       if (cs)
+               *cs = &clocksource_counter;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);
index da3872c..3506b20 100644 (file)
@@ -130,6 +130,7 @@ static int sev_cmd_buffer_len(int cmd)
        case SEV_CMD_DOWNLOAD_FIRMWARE:         return sizeof(struct sev_data_download_firmware);
        case SEV_CMD_GET_ID:                    return sizeof(struct sev_data_get_id);
        case SEV_CMD_ATTESTATION_REPORT:        return sizeof(struct sev_data_attestation_report);
+       case SEV_CMD_SEND_CANCEL:                       return sizeof(struct sev_data_send_cancel);
        default:                                return 0;
        }
 
@@ -142,6 +143,7 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        struct sev_device *sev;
        unsigned int phys_lsb, phys_msb;
        unsigned int reg, ret = 0;
+       int buf_len;
 
        if (!psp || !psp->sev_data)
                return -ENODEV;
@@ -151,15 +153,27 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 
        sev = psp->sev_data;
 
+       buf_len = sev_cmd_buffer_len(cmd);
+       if (WARN_ON_ONCE(!data != !buf_len))
+               return -EINVAL;
+
+       /*
+        * Copy the incoming data to driver's scratch buffer as __pa() will not
+        * work for some memory, e.g. vmalloc'd addresses, and @data may not be
+        * physically contiguous.
+        */
+       if (data)
+               memcpy(sev->cmd_buf, data, buf_len);
+
        /* Get the physical address of the command buffer */
-       phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
-       phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
+       phys_lsb = data ? lower_32_bits(__psp_pa(sev->cmd_buf)) : 0;
+       phys_msb = data ? upper_32_bits(__psp_pa(sev->cmd_buf)) : 0;
 
        dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n",
                cmd, phys_msb, phys_lsb, psp_timeout);
 
        print_hex_dump_debug("(in):  ", DUMP_PREFIX_OFFSET, 16, 2, data,
-                            sev_cmd_buffer_len(cmd), false);
+                            buf_len, false);
 
        iowrite32(phys_lsb, sev->io_regs + sev->vdata->cmdbuff_addr_lo_reg);
        iowrite32(phys_msb, sev->io_regs + sev->vdata->cmdbuff_addr_hi_reg);
@@ -195,7 +209,14 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        }
 
        print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
-                            sev_cmd_buffer_len(cmd), false);
+                            buf_len, false);
+
+       /*
+        * Copy potential output from the PSP back to data.  Do this even on
+        * failure in case the caller wants to glean something from the error.
+        */
+       if (data)
+               memcpy(data, sev->cmd_buf, buf_len);
 
        return ret;
 }
@@ -214,6 +235,7 @@ static int sev_do_cmd(int cmd, void *data, int *psp_ret)
 static int __sev_platform_init_locked(int *error)
 {
        struct psp_device *psp = psp_master;
+       struct sev_data_init data;
        struct sev_device *sev;
        int rc = 0;
 
@@ -225,6 +247,7 @@ static int __sev_platform_init_locked(int *error)
        if (sev->state == SEV_STATE_INIT)
                return 0;
 
+       memset(&data, 0, sizeof(data));
        if (sev_es_tmr) {
                u64 tmr_pa;
 
@@ -234,12 +257,12 @@ static int __sev_platform_init_locked(int *error)
                 */
                tmr_pa = __pa(sev_es_tmr);
 
-               sev->init_cmd_buf.flags |= SEV_INIT_FLAGS_SEV_ES;
-               sev->init_cmd_buf.tmr_address = tmr_pa;
-               sev->init_cmd_buf.tmr_len = SEV_ES_TMR_SIZE;
+               data.flags |= SEV_INIT_FLAGS_SEV_ES;
+               data.tmr_address = tmr_pa;
+               data.tmr_len = SEV_ES_TMR_SIZE;
        }
 
-       rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error);
+       rc = __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
        if (rc)
                return rc;
 
@@ -296,15 +319,14 @@ static int sev_platform_shutdown(int *error)
 
 static int sev_get_platform_state(int *state, int *error)
 {
-       struct sev_device *sev = psp_master->sev_data;
+       struct sev_user_data_status data;
        int rc;
 
-       rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS,
-                                &sev->status_cmd_buf, error);
+       rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, error);
        if (rc)
                return rc;
 
-       *state = sev->status_cmd_buf.state;
+       *state = data.state;
        return rc;
 }
 
@@ -342,15 +364,14 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp, bool writable)
 
 static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
 {
-       struct sev_device *sev = psp_master->sev_data;
-       struct sev_user_data_status *data = &sev->status_cmd_buf;
+       struct sev_user_data_status data;
        int ret;
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, &argp->error);
        if (ret)
                return ret;
 
-       if (copy_to_user((void __user *)argp->data, data, sizeof(*data)))
+       if (copy_to_user((void __user *)argp->data, &data, sizeof(data)))
                ret = -EFAULT;
 
        return ret;
@@ -377,7 +398,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
 {
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pek_csr input;
-       struct sev_data_pek_csr *data;
+       struct sev_data_pek_csr data;
        void __user *input_address;
        void *blob = NULL;
        int ret;
@@ -388,9 +409,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* userspace wants to query CSR length */
        if (!input.address || !input.length)
@@ -398,19 +417,15 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
 
        /* allocate a physically contiguous buffer to store the CSR blob */
        input_address = (void __user *)input.address;
-       if (input.length > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.length > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        blob = kmalloc(input.length, GFP_KERNEL);
-       if (!blob) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (!blob)
+               return -ENOMEM;
 
-       data->address = __psp_pa(blob);
-       data->len = input.length;
+       data.address = __psp_pa(blob);
+       data.len = input.length;
 
 cmd:
        if (sev->state == SEV_STATE_UNINIT) {
@@ -419,10 +434,10 @@ cmd:
                        goto e_free_blob;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, &data, &argp->error);
 
         /* If we query the CSR length, FW responded with expected data. */
-       input.length = data->len;
+       input.length = data.len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -436,8 +451,6 @@ cmd:
 
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -457,21 +470,20 @@ EXPORT_SYMBOL_GPL(psp_copy_user_blob);
 static int sev_get_api_version(void)
 {
        struct sev_device *sev = psp_master->sev_data;
-       struct sev_user_data_status *status;
+       struct sev_user_data_status status;
        int error = 0, ret;
 
-       status = &sev->status_cmd_buf;
-       ret = sev_platform_status(status, &error);
+       ret = sev_platform_status(&status, &error);
        if (ret) {
                dev_err(sev->dev,
                        "SEV: failed to get status. Error: %#x\n", error);
                return 1;
        }
 
-       sev->api_major = status->api_major;
-       sev->api_minor = status->api_minor;
-       sev->build = status->build;
-       sev->state = status->state;
+       sev->api_major = status.api_major;
+       sev->api_minor = status.api_minor;
+       sev->build = status.build;
+       sev->state = status.state;
 
        return 0;
 }
@@ -569,7 +581,7 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
 {
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pek_cert_import input;
-       struct sev_data_pek_cert_import *data;
+       struct sev_data_pek_cert_import data;
        void *pek_blob, *oca_blob;
        int ret;
 
@@ -579,19 +591,14 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
        /* copy PEK certificate blobs from userspace */
        pek_blob = psp_copy_user_blob(input.pek_cert_address, input.pek_cert_len);
-       if (IS_ERR(pek_blob)) {
-               ret = PTR_ERR(pek_blob);
-               goto e_free;
-       }
+       if (IS_ERR(pek_blob))
+               return PTR_ERR(pek_blob);
 
-       data->pek_cert_address = __psp_pa(pek_blob);
-       data->pek_cert_len = input.pek_cert_len;
+       data.reserved = 0;
+       data.pek_cert_address = __psp_pa(pek_blob);
+       data.pek_cert_len = input.pek_cert_len;
 
        /* copy PEK certificate blobs from userspace */
        oca_blob = psp_copy_user_blob(input.oca_cert_address, input.oca_cert_len);
@@ -600,8 +607,8 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
                goto e_free_pek;
        }
 
-       data->oca_cert_address = __psp_pa(oca_blob);
-       data->oca_cert_len = input.oca_cert_len;
+       data.oca_cert_address = __psp_pa(oca_blob);
+       data.oca_cert_len = input.oca_cert_len;
 
        /* If platform is not in INIT state then transition it to INIT */
        if (sev->state != SEV_STATE_INIT) {
@@ -610,21 +617,19 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
                        goto e_free_oca;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, &data, &argp->error);
 
 e_free_oca:
        kfree(oca_blob);
 e_free_pek:
        kfree(pek_blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
 static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 {
        struct sev_user_data_get_id2 input;
-       struct sev_data_get_id *data;
+       struct sev_data_get_id data;
        void __user *input_address;
        void *id_blob = NULL;
        int ret;
@@ -638,28 +643,25 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 
        input_address = (void __user *)input.address;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
        if (input.address && input.length) {
                id_blob = kmalloc(input.length, GFP_KERNEL);
-               if (!id_blob) {
-                       kfree(data);
+               if (!id_blob)
                        return -ENOMEM;
-               }
 
-               data->address = __psp_pa(id_blob);
-               data->len = input.length;
+               data.address = __psp_pa(id_blob);
+               data.len = input.length;
+       } else {
+               data.address = 0;
+               data.len = 0;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, &data, &argp->error);
 
        /*
         * Firmware will return the length of the ID value (either the minimum
         * required length or the actual length written), return it to the user.
         */
-       input.length = data->len;
+       input.length = data.len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -667,7 +669,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
        }
 
        if (id_blob) {
-               if (copy_to_user(input_address, id_blob, data->len)) {
+               if (copy_to_user(input_address, id_blob, data.len)) {
                        ret = -EFAULT;
                        goto e_free;
                }
@@ -675,7 +677,6 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 
 e_free:
        kfree(id_blob);
-       kfree(data);
 
        return ret;
 }
@@ -725,7 +726,7 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pdh_cert_export input;
        void *pdh_blob = NULL, *cert_blob = NULL;
-       struct sev_data_pdh_cert_export *data;
+       struct sev_data_pdh_cert_export data;
        void __user *input_cert_chain_address;
        void __user *input_pdh_cert_address;
        int ret;
@@ -743,9 +744,7 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* Userspace wants to query the certificate length. */
        if (!input.pdh_cert_address ||
@@ -757,25 +756,19 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        input_cert_chain_address = (void __user *)input.cert_chain_address;
 
        /* Allocate a physically contiguous buffer to store the PDH blob. */
-       if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        /* Allocate a physically contiguous buffer to store the cert chain blob. */
-       if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        pdh_blob = kmalloc(input.pdh_cert_len, GFP_KERNEL);
-       if (!pdh_blob) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (!pdh_blob)
+               return -ENOMEM;
 
-       data->pdh_cert_address = __psp_pa(pdh_blob);
-       data->pdh_cert_len = input.pdh_cert_len;
+       data.pdh_cert_address = __psp_pa(pdh_blob);
+       data.pdh_cert_len = input.pdh_cert_len;
 
        cert_blob = kmalloc(input.cert_chain_len, GFP_KERNEL);
        if (!cert_blob) {
@@ -783,15 +776,15 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
                goto e_free_pdh;
        }
 
-       data->cert_chain_address = __psp_pa(cert_blob);
-       data->cert_chain_len = input.cert_chain_len;
+       data.cert_chain_address = __psp_pa(cert_blob);
+       data.cert_chain_len = input.cert_chain_len;
 
 cmd:
-       ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, &data, &argp->error);
 
        /* If we query the length, FW responded with expected data. */
-       input.cert_chain_len = data->cert_chain_len;
-       input.pdh_cert_len = data->pdh_cert_len;
+       input.cert_chain_len = data.cert_chain_len;
+       input.pdh_cert_len = data.pdh_cert_len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -816,8 +809,6 @@ e_free_cert:
        kfree(cert_blob);
 e_free_pdh:
        kfree(pdh_blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -982,6 +973,10 @@ int sev_dev_init(struct psp_device *psp)
        if (!sev)
                goto e_err;
 
+       sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 0);
+       if (!sev->cmd_buf)
+               goto e_sev;
+
        psp->sev_data = sev;
 
        sev->dev = dev;
@@ -993,7 +988,7 @@ int sev_dev_init(struct psp_device *psp)
        if (!sev->vdata) {
                ret = -ENODEV;
                dev_err(dev, "sev: missing driver data\n");
-               goto e_err;
+               goto e_buf;
        }
 
        psp_set_sev_irq_handler(psp, sev_irq_handler, sev);
@@ -1008,6 +1003,10 @@ int sev_dev_init(struct psp_device *psp)
 
 e_irq:
        psp_clear_sev_irq_handler(psp);
+e_buf:
+       devm_free_pages(dev, (unsigned long)sev->cmd_buf);
+e_sev:
+       devm_kfree(dev, sev);
 e_err:
        psp->sev_data = NULL;
 
index dd5c4fe..666c21e 100644 (file)
@@ -46,12 +46,12 @@ struct sev_device {
        unsigned int int_rcvd;
        wait_queue_head_t int_queue;
        struct sev_misc_dev *misc;
-       struct sev_user_data_status status_cmd_buf;
-       struct sev_data_init init_cmd_buf;
 
        u8 api_major;
        u8 api_minor;
        u8 build;
+
+       void *cmd_buf;
 };
 
 int sev_dev_init(struct psp_device *psp);
index a0836ff..6ab9d9a 100644 (file)
@@ -300,6 +300,18 @@ config INTEL_IDXD_SVM
        depends on PCI_PASID
        depends on PCI_IOV
 
+config INTEL_IDXD_PERFMON
+       bool "Intel Data Accelerators performance monitor support"
+       depends on INTEL_IDXD
+       help
+         Enable performance monitor (pmu) support for the Intel(R)
+         data accelerators present in Intel Xeon CPU.  With this
+         enabled, perf can be used to monitor the DSA (Intel Data
+         Streaming Accelerator) events described in the Intel DSA
+         spec.
+
+         If unsure, say N.
+
 config INTEL_IOATDMA
        tristate "Intel I/OAT DMA support"
        depends on PCI && X86_64
index fe45ad5..64a52bf 100644 (file)
@@ -344,17 +344,6 @@ static inline int at_xdmac_chan_is_paused(struct at_xdmac_chan *atchan)
        return test_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
 }
 
-static inline int at_xdmac_csize(u32 maxburst)
-{
-       int csize;
-
-       csize = ffs(maxburst) - 1;
-       if (csize > 4)
-               csize = -EINVAL;
-
-       return csize;
-};
-
 static inline bool at_xdmac_chan_is_peripheral_xfer(u32 cfg)
 {
        return cfg & AT_XDMAC_CC_TYPE_PER_TRAN;
index 08d71da..5328992 100644 (file)
@@ -81,8 +81,13 @@ static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
         *  - Even chunks originate CB equal to 1
         */
        chunk->cb = !(desc->chunks_alloc % 2);
-       chunk->ll_region.paddr = dw->ll_region.paddr + chan->ll_off;
-       chunk->ll_region.vaddr = dw->ll_region.vaddr + chan->ll_off;
+       if (chan->dir == EDMA_DIR_WRITE) {
+               chunk->ll_region.paddr = dw->ll_region_wr[chan->id].paddr;
+               chunk->ll_region.vaddr = dw->ll_region_wr[chan->id].vaddr;
+       } else {
+               chunk->ll_region.paddr = dw->ll_region_rd[chan->id].paddr;
+               chunk->ll_region.vaddr = dw->ll_region_rd[chan->id].vaddr;
+       }
 
        if (desc->chunk) {
                /* Create and add new element into the linked list */
@@ -329,22 +334,22 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
        struct dw_edma_chunk *chunk;
        struct dw_edma_burst *burst;
        struct dw_edma_desc *desc;
-       u32 cnt;
+       u32 cnt = 0;
        int i;
 
        if (!chan->configured)
                return NULL;
 
        switch (chan->config.direction) {
-       case DMA_DEV_TO_MEM: /* local dma */
+       case DMA_DEV_TO_MEM: /* local DMA */
                if (dir == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_READ)
                        break;
                return NULL;
-       case DMA_MEM_TO_DEV: /* local dma */
+       case DMA_MEM_TO_DEV: /* local DMA */
                if (dir == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_WRITE)
                        break;
                return NULL;
-       default: /* remote dma */
+       default: /* remote DMA */
                if (dir == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_READ)
                        break;
                if (dir == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_WRITE)
@@ -352,12 +357,19 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
                return NULL;
        }
 
-       if (xfer->cyclic) {
+       if (xfer->type == EDMA_XFER_CYCLIC) {
                if (!xfer->xfer.cyclic.len || !xfer->xfer.cyclic.cnt)
                        return NULL;
-       } else {
+       } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
                if (xfer->xfer.sg.len < 1)
                        return NULL;
+       } else if (xfer->type == EDMA_XFER_INTERLEAVED) {
+               if (!xfer->xfer.il->numf)
+                       return NULL;
+               if (xfer->xfer.il->numf > 0 && xfer->xfer.il->frame_size > 0)
+                       return NULL;
+       } else {
+               return NULL;
        }
 
        desc = dw_edma_alloc_desc(chan);
@@ -368,18 +380,28 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
        if (unlikely(!chunk))
                goto err_alloc;
 
-       src_addr = chan->config.src_addr;
-       dst_addr = chan->config.dst_addr;
+       if (xfer->type == EDMA_XFER_INTERLEAVED) {
+               src_addr = xfer->xfer.il->src_start;
+               dst_addr = xfer->xfer.il->dst_start;
+       } else {
+               src_addr = chan->config.src_addr;
+               dst_addr = chan->config.dst_addr;
+       }
 
-       if (xfer->cyclic) {
+       if (xfer->type == EDMA_XFER_CYCLIC) {
                cnt = xfer->xfer.cyclic.cnt;
-       } else {
+       } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
                cnt = xfer->xfer.sg.len;
                sg = xfer->xfer.sg.sgl;
+       } else if (xfer->type == EDMA_XFER_INTERLEAVED) {
+               if (xfer->xfer.il->numf > 0)
+                       cnt = xfer->xfer.il->numf;
+               else
+                       cnt = xfer->xfer.il->frame_size;
        }
 
        for (i = 0; i < cnt; i++) {
-               if (!xfer->cyclic && !sg)
+               if (xfer->type == EDMA_XFER_SCATTER_GATHER && !sg)
                        break;
 
                if (chunk->bursts_alloc == chan->ll_max) {
@@ -392,20 +414,23 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
                if (unlikely(!burst))
                        goto err_alloc;
 
-               if (xfer->cyclic)
+               if (xfer->type == EDMA_XFER_CYCLIC)
                        burst->sz = xfer->xfer.cyclic.len;
-               else
+               else if (xfer->type == EDMA_XFER_SCATTER_GATHER)
                        burst->sz = sg_dma_len(sg);
+               else if (xfer->type == EDMA_XFER_INTERLEAVED)
+                       burst->sz = xfer->xfer.il->sgl[i].size;
 
                chunk->ll_region.sz += burst->sz;
                desc->alloc_sz += burst->sz;
 
                if (chan->dir == EDMA_DIR_WRITE) {
                        burst->sar = src_addr;
-                       if (xfer->cyclic) {
+                       if (xfer->type == EDMA_XFER_CYCLIC) {
                                burst->dar = xfer->xfer.cyclic.paddr;
-                       } else {
-                               burst->dar = dst_addr;
+                       } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
+                               src_addr += sg_dma_len(sg);
+                               burst->dar = sg_dma_address(sg);
                                /* Unlike the typical assumption by other
                                 * drivers/IPs the peripheral memory isn't
                                 * a FIFO memory, in this case, it's a
@@ -416,10 +441,11 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
                        }
                } else {
                        burst->dar = dst_addr;
-                       if (xfer->cyclic) {
+                       if (xfer->type == EDMA_XFER_CYCLIC) {
                                burst->sar = xfer->xfer.cyclic.paddr;
-                       } else {
-                               burst->sar = src_addr;
+                       } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
+                               dst_addr += sg_dma_len(sg);
+                               burst->sar = sg_dma_address(sg);
                                /* Unlike the typical assumption by other
                                 * drivers/IPs the peripheral memory isn't
                                 * a FIFO memory, in this case, it's a
@@ -430,10 +456,22 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
                        }
                }
 
-               if (!xfer->cyclic) {
-                       src_addr += sg_dma_len(sg);
-                       dst_addr += sg_dma_len(sg);
+               if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
                        sg = sg_next(sg);
+               } else if (xfer->type == EDMA_XFER_INTERLEAVED &&
+                          xfer->xfer.il->frame_size > 0) {
+                       struct dma_interleaved_template *il = xfer->xfer.il;
+                       struct data_chunk *dc = &il->sgl[i];
+
+                       if (il->src_sgl) {
+                               src_addr += burst->sz;
+                               src_addr += dmaengine_get_src_icg(il, dc);
+                       }
+
+                       if (il->dst_sgl) {
+                               dst_addr += burst->sz;
+                               dst_addr += dmaengine_get_dst_icg(il, dc);
+                       }
                }
        }
 
@@ -459,7 +497,7 @@ dw_edma_device_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl,
        xfer.xfer.sg.sgl = sgl;
        xfer.xfer.sg.len = len;
        xfer.flags = flags;
-       xfer.cyclic = false;
+       xfer.type = EDMA_XFER_SCATTER_GATHER;
 
        return dw_edma_device_transfer(&xfer);
 }
@@ -478,7 +516,23 @@ dw_edma_device_prep_dma_cyclic(struct dma_chan *dchan, dma_addr_t paddr,
        xfer.xfer.cyclic.len = len;
        xfer.xfer.cyclic.cnt = count;
        xfer.flags = flags;
-       xfer.cyclic = true;
+       xfer.type = EDMA_XFER_CYCLIC;
+
+       return dw_edma_device_transfer(&xfer);
+}
+
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_interleaved_dma(struct dma_chan *dchan,
+                                   struct dma_interleaved_template *ilt,
+                                   unsigned long flags)
+{
+       struct dw_edma_transfer xfer;
+
+       xfer.dchan = dchan;
+       xfer.direction = ilt->dir;
+       xfer.xfer.il = ilt;
+       xfer.flags = flags;
+       xfer.type = EDMA_XFER_INTERLEAVED;
 
        return dw_edma_device_transfer(&xfer);
 }
@@ -642,24 +696,13 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
        struct device *dev = chip->dev;
        struct dw_edma *dw = chip->dw;
        struct dw_edma_chan *chan;
-       size_t ll_chunk, dt_chunk;
        struct dw_edma_irq *irq;
        struct dma_device *dma;
-       u32 i, j, cnt, ch_cnt;
        u32 alloc, off_alloc;
+       u32 i, j, cnt;
        int err = 0;
        u32 pos;
 
-       ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
-       ll_chunk = dw->ll_region.sz;
-       dt_chunk = dw->dt_region.sz;
-
-       /* Calculate linked list chunk for each channel */
-       ll_chunk /= roundup_pow_of_two(ch_cnt);
-
-       /* Calculate linked list chunk for each channel */
-       dt_chunk /= roundup_pow_of_two(ch_cnt);
-
        if (write) {
                i = 0;
                cnt = dw->wr_ch_cnt;
@@ -691,14 +734,14 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
                chan->request = EDMA_REQ_NONE;
                chan->status = EDMA_ST_IDLE;
 
-               chan->ll_off = (ll_chunk * i);
-               chan->ll_max = (ll_chunk / EDMA_LL_SZ) - 1;
-
-               chan->dt_off = (dt_chunk * i);
+               if (write)
+                       chan->ll_max = (dw->ll_region_wr[j].sz / EDMA_LL_SZ);
+               else
+                       chan->ll_max = (dw->ll_region_rd[j].sz / EDMA_LL_SZ);
+               chan->ll_max -= 1;
 
-               dev_vdbg(dev, "L. List:\tChannel %s[%u] off=0x%.8lx, max_cnt=%u\n",
-                        write ? "write" : "read", j,
-                        chan->ll_off, chan->ll_max);
+               dev_vdbg(dev, "L. List:\tChannel %s[%u] max_cnt=%u\n",
+                        write ? "write" : "read", j, chan->ll_max);
 
                if (dw->nr_irqs == 1)
                        pos = 0;
@@ -723,12 +766,15 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
                chan->vc.desc_free = vchan_free_desc;
                vchan_init(&chan->vc, dma);
 
-               dt_region->paddr = dw->dt_region.paddr + chan->dt_off;
-               dt_region->vaddr = dw->dt_region.vaddr + chan->dt_off;
-               dt_region->sz = dt_chunk;
-
-               dev_vdbg(dev, "Data:\tChannel %s[%u] off=0x%.8lx\n",
-                        write ? "write" : "read", j, chan->dt_off);
+               if (write) {
+                       dt_region->paddr = dw->dt_region_wr[j].paddr;
+                       dt_region->vaddr = dw->dt_region_wr[j].vaddr;
+                       dt_region->sz = dw->dt_region_wr[j].sz;
+               } else {
+                       dt_region->paddr = dw->dt_region_rd[j].paddr;
+                       dt_region->vaddr = dw->dt_region_rd[j].vaddr;
+                       dt_region->sz = dw->dt_region_rd[j].sz;
+               }
 
                dw_edma_v0_core_device_config(chan);
        }
@@ -738,6 +784,7 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
        dma_cap_set(DMA_SLAVE, dma->cap_mask);
        dma_cap_set(DMA_CYCLIC, dma->cap_mask);
        dma_cap_set(DMA_PRIVATE, dma->cap_mask);
+       dma_cap_set(DMA_INTERLEAVE, dma->cap_mask);
        dma->directions = BIT(write ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV);
        dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
        dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES);
@@ -756,6 +803,7 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
        dma->device_tx_status = dw_edma_device_tx_status;
        dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg;
        dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic;
+       dma->device_prep_interleaved_dma = dw_edma_device_prep_interleaved_dma;
 
        dma_set_max_seg_size(dma->dev, U32_MAX);
 
@@ -863,14 +911,15 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 
        raw_spin_lock_init(&dw->lock);
 
-       /* Find out how many write channels are supported by hardware */
-       dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE);
-       if (!dw->wr_ch_cnt)
-               return -EINVAL;
+       dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt,
+                             dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE));
+       dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, EDMA_MAX_WR_CH);
 
-       /* Find out how many read channels are supported by hardware */
-       dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ);
-       if (!dw->rd_ch_cnt)
+       dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt,
+                             dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ));
+       dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, EDMA_MAX_RD_CH);
+
+       if (!dw->wr_ch_cnt && !dw->rd_ch_cnt)
                return -EINVAL;
 
        dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n",
@@ -937,24 +986,23 @@ int dw_edma_remove(struct dw_edma_chip *chip)
        /* Power management */
        pm_runtime_disable(dev);
 
+       /* Deregister eDMA device */
+       dma_async_device_unregister(&dw->wr_edma);
        list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels,
                                 vc.chan.device_node) {
-               list_del(&chan->vc.chan.device_node);
                tasklet_kill(&chan->vc.task);
+               list_del(&chan->vc.chan.device_node);
        }
 
+       dma_async_device_unregister(&dw->rd_edma);
        list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels,
                                 vc.chan.device_node) {
-               list_del(&chan->vc.chan.device_node);
                tasklet_kill(&chan->vc.task);
+               list_del(&chan->vc.chan.device_node);
        }
 
-       /* Deregister eDMA device */
-       dma_async_device_unregister(&dw->wr_edma);
-       dma_async_device_unregister(&dw->rd_edma);
-
        /* Turn debugfs off */
-       dw_edma_v0_core_debugfs_off();
+       dw_edma_v0_core_debugfs_off(chip);
 
        return 0;
 }
index 31fc50d..60316d4 100644 (file)
 #include "../virt-dma.h"
 
 #define EDMA_LL_SZ                                     24
+#define EDMA_MAX_WR_CH                                 8
+#define EDMA_MAX_RD_CH                                 8
 
 enum dw_edma_dir {
        EDMA_DIR_WRITE = 0,
        EDMA_DIR_READ
 };
 
-enum dw_edma_mode {
-       EDMA_MODE_LEGACY = 0,
-       EDMA_MODE_UNROLL
+enum dw_edma_map_format {
+       EDMA_MF_EDMA_LEGACY = 0x0,
+       EDMA_MF_EDMA_UNROLL = 0x1,
+       EDMA_MF_HDMA_COMPAT = 0x5
 };
 
 enum dw_edma_request {
@@ -38,6 +41,12 @@ enum dw_edma_status {
        EDMA_ST_BUSY
 };
 
+enum dw_edma_xfer_type {
+       EDMA_XFER_SCATTER_GATHER = 0,
+       EDMA_XFER_CYCLIC,
+       EDMA_XFER_INTERLEAVED
+};
+
 struct dw_edma_chan;
 struct dw_edma_chunk;
 
@@ -82,11 +91,8 @@ struct dw_edma_chan {
        int                             id;
        enum dw_edma_dir                dir;
 
-       off_t                           ll_off;
        u32                             ll_max;
 
-       off_t                           dt_off;
-
        struct msi_msg                  msi;
 
        enum dw_edma_request            request;
@@ -117,19 +123,23 @@ struct dw_edma {
        u16                             rd_ch_cnt;
 
        struct dw_edma_region           rg_region;      /* Registers */
-       struct dw_edma_region           ll_region;      /* Linked list */
-       struct dw_edma_region           dt_region;      /* Data */
+       struct dw_edma_region           ll_region_wr[EDMA_MAX_WR_CH];
+       struct dw_edma_region           ll_region_rd[EDMA_MAX_RD_CH];
+       struct dw_edma_region           dt_region_wr[EDMA_MAX_WR_CH];
+       struct dw_edma_region           dt_region_rd[EDMA_MAX_RD_CH];
 
        struct dw_edma_irq              *irq;
        int                             nr_irqs;
 
-       u32                             version;
-       enum dw_edma_mode               mode;
+       enum dw_edma_map_format         mf;
 
        struct dw_edma_chan             *chan;
        const struct dw_edma_core_ops   *ops;
 
        raw_spinlock_t                  lock;           /* Only for legacy */
+#ifdef CONFIG_DEBUG_FS
+       struct dentry                   *debugfs;
+#endif /* CONFIG_DEBUG_FS */
 };
 
 struct dw_edma_sg {
@@ -146,12 +156,13 @@ struct dw_edma_cyclic {
 struct dw_edma_transfer {
        struct dma_chan                 *dchan;
        union dw_edma_xfer {
-               struct dw_edma_sg       sg;
-               struct dw_edma_cyclic   cyclic;
+               struct dw_edma_sg               sg;
+               struct dw_edma_cyclic           cyclic;
+               struct dma_interleaved_template *il;
        } xfer;
        enum dma_transfer_direction     direction;
        unsigned long                   flags;
-       bool                            cyclic;
+       enum dw_edma_xfer_type          type;
 };
 
 static inline
index 1eafc60..44f6e09 100644 (file)
 #include <linux/dma/edma.h>
 #include <linux/pci-epf.h>
 #include <linux/msi.h>
+#include <linux/bitfield.h>
 
 #include "dw-edma-core.h"
 
+#define DW_PCIE_VSEC_DMA_ID                    0x6
+#define DW_PCIE_VSEC_DMA_BAR                   GENMASK(10, 8)
+#define DW_PCIE_VSEC_DMA_MAP                   GENMASK(2, 0)
+#define DW_PCIE_VSEC_DMA_WR_CH                 GENMASK(9, 0)
+#define DW_PCIE_VSEC_DMA_RD_CH                 GENMASK(25, 16)
+
+#define DW_BLOCK(a, b, c) \
+       { \
+               .bar = a, \
+               .off = b, \
+               .sz = c, \
+       },
+
+struct dw_edma_block {
+       enum pci_barno                  bar;
+       off_t                           off;
+       size_t                          sz;
+};
+
 struct dw_edma_pcie_data {
        /* eDMA registers location */
-       enum pci_barno                  rg_bar;
-       off_t                           rg_off;
-       size_t                          rg_sz;
+       struct dw_edma_block            rg;
        /* eDMA memory linked list location */
-       enum pci_barno                  ll_bar;
-       off_t                           ll_off;
-       size_t                          ll_sz;
+       struct dw_edma_block            ll_wr[EDMA_MAX_WR_CH];
+       struct dw_edma_block            ll_rd[EDMA_MAX_RD_CH];
        /* eDMA memory data location */
-       enum pci_barno                  dt_bar;
-       off_t                           dt_off;
-       size_t                          dt_sz;
+       struct dw_edma_block            dt_wr[EDMA_MAX_WR_CH];
+       struct dw_edma_block            dt_rd[EDMA_MAX_RD_CH];
        /* Other */
-       u32                             version;
-       enum dw_edma_mode               mode;
+       enum dw_edma_map_format         mf;
        u8                              irqs;
+       u16                             wr_ch_cnt;
+       u16                             rd_ch_cnt;
 };
 
 static const struct dw_edma_pcie_data snps_edda_data = {
        /* eDMA registers location */
-       .rg_bar                         = BAR_0,
-       .rg_off                         = 0x00001000,   /*  4 Kbytes */
-       .rg_sz                          = 0x00002000,   /*  8 Kbytes */
+       .rg.bar                         = BAR_0,
+       .rg.off                         = 0x00001000,   /*  4 Kbytes */
+       .rg.sz                          = 0x00002000,   /*  8 Kbytes */
        /* eDMA memory linked list location */
-       .ll_bar                         = BAR_2,
-       .ll_off                         = 0x00000000,   /*  0 Kbytes */
-       .ll_sz                          = 0x00800000,   /*  8 Mbytes */
+       .ll_wr = {
+               /* Channel 0 - BAR 2, offset 0 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00000000, 0x00000800)
+               /* Channel 1 - BAR 2, offset 2 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00200000, 0x00000800)
+       },
+       .ll_rd = {
+               /* Channel 0 - BAR 2, offset 4 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00400000, 0x00000800)
+               /* Channel 1 - BAR 2, offset 6 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00600000, 0x00000800)
+       },
        /* eDMA memory data location */
-       .dt_bar                         = BAR_2,
-       .dt_off                         = 0x00800000,   /*  8 Mbytes */
-       .dt_sz                          = 0x03800000,   /* 56 Mbytes */
+       .dt_wr = {
+               /* Channel 0 - BAR 2, offset 8 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00800000, 0x00000800)
+               /* Channel 1 - BAR 2, offset 9 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00900000, 0x00000800)
+       },
+       .dt_rd = {
+               /* Channel 0 - BAR 2, offset 10 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00a00000, 0x00000800)
+               /* Channel 1 - BAR 2, offset 11 Mbytes, size 2 Kbytes */
+               DW_BLOCK(BAR_2, 0x00b00000, 0x00000800)
+       },
        /* Other */
-       .version                        = 0,
-       .mode                           = EDMA_MODE_UNROLL,
+       .mf                             = EDMA_MF_EDMA_UNROLL,
        .irqs                           = 1,
+       .wr_ch_cnt                      = 2,
+       .rd_ch_cnt                      = 2,
 };
 
 static int dw_edma_pcie_irq_vector(struct device *dev, unsigned int nr)
@@ -63,14 +99,58 @@ static const struct dw_edma_core_ops dw_edma_pcie_core_ops = {
        .irq_vector = dw_edma_pcie_irq_vector,
 };
 
+static void dw_edma_pcie_get_vsec_dma_data(struct pci_dev *pdev,
+                                          struct dw_edma_pcie_data *pdata)
+{
+       u32 val, map;
+       u16 vsec;
+       u64 off;
+
+       vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_SYNOPSYS,
+                                       DW_PCIE_VSEC_DMA_ID);
+       if (!vsec)
+               return;
+
+       pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val);
+       if (PCI_VNDR_HEADER_REV(val) != 0x00 ||
+           PCI_VNDR_HEADER_LEN(val) != 0x18)
+               return;
+
+       pci_dbg(pdev, "Detected PCIe Vendor-Specific Extended Capability DMA\n");
+       pci_read_config_dword(pdev, vsec + 0x8, &val);
+       map = FIELD_GET(DW_PCIE_VSEC_DMA_MAP, val);
+       if (map != EDMA_MF_EDMA_LEGACY &&
+           map != EDMA_MF_EDMA_UNROLL &&
+           map != EDMA_MF_HDMA_COMPAT)
+               return;
+
+       pdata->mf = map;
+       pdata->rg.bar = FIELD_GET(DW_PCIE_VSEC_DMA_BAR, val);
+
+       pci_read_config_dword(pdev, vsec + 0xc, &val);
+       pdata->wr_ch_cnt = min_t(u16, pdata->wr_ch_cnt,
+                                FIELD_GET(DW_PCIE_VSEC_DMA_WR_CH, val));
+       pdata->rd_ch_cnt = min_t(u16, pdata->rd_ch_cnt,
+                                FIELD_GET(DW_PCIE_VSEC_DMA_RD_CH, val));
+
+       pci_read_config_dword(pdev, vsec + 0x14, &val);
+       off = val;
+       pci_read_config_dword(pdev, vsec + 0x10, &val);
+       off <<= 32;
+       off |= val;
+       pdata->rg.off = off;
+}
+
 static int dw_edma_pcie_probe(struct pci_dev *pdev,
                              const struct pci_device_id *pid)
 {
-       const struct dw_edma_pcie_data *pdata = (void *)pid->driver_data;
+       struct dw_edma_pcie_data *pdata = (void *)pid->driver_data;
+       struct dw_edma_pcie_data vsec_data;
        struct device *dev = &pdev->dev;
        struct dw_edma_chip *chip;
-       int err, nr_irqs;
        struct dw_edma *dw;
+       int err, nr_irqs;
+       int i, mask;
 
        /* Enable PCI device */
        err = pcim_enable_device(pdev);
@@ -79,11 +159,25 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
                return err;
        }
 
+       memcpy(&vsec_data, pdata, sizeof(struct dw_edma_pcie_data));
+
+       /*
+        * Look for a PCIe Vendor-Specific Extended Capability describing the
+        * DMA; if one exists, its values override the defaults copied above.
+        */
+       dw_edma_pcie_get_vsec_dma_data(pdev, &vsec_data);
+
        /* Mapping PCI BAR regions */
-       err = pcim_iomap_regions(pdev, BIT(pdata->rg_bar) |
-                                      BIT(pdata->ll_bar) |
-                                      BIT(pdata->dt_bar),
-                                pci_name(pdev));
+       mask = BIT(vsec_data.rg.bar);
+       for (i = 0; i < vsec_data.wr_ch_cnt; i++) {
+               mask |= BIT(vsec_data.ll_wr[i].bar);
+               mask |= BIT(vsec_data.dt_wr[i].bar);
+       }
+       for (i = 0; i < vsec_data.rd_ch_cnt; i++) {
+               mask |= BIT(vsec_data.ll_rd[i].bar);
+               mask |= BIT(vsec_data.dt_rd[i].bar);
+       }
+       err = pcim_iomap_regions(pdev, mask, pci_name(pdev));
        if (err) {
                pci_err(pdev, "eDMA BAR I/O remapping failed\n");
                return err;
@@ -125,7 +219,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
                return -ENOMEM;
 
        /* IRQs allocation */
-       nr_irqs = pci_alloc_irq_vectors(pdev, 1, pdata->irqs,
+       nr_irqs = pci_alloc_irq_vectors(pdev, 1, vsec_data.irqs,
                                        PCI_IRQ_MSI | PCI_IRQ_MSIX);
        if (nr_irqs < 1) {
                pci_err(pdev, "fail to alloc IRQ vector (number of IRQs=%u)\n",
@@ -139,46 +233,109 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
        chip->id = pdev->devfn;
        chip->irq = pdev->irq;
 
-       dw->rg_region.vaddr = pcim_iomap_table(pdev)[pdata->rg_bar];
-       dw->rg_region.vaddr += pdata->rg_off;
-       dw->rg_region.paddr = pdev->resource[pdata->rg_bar].start;
-       dw->rg_region.paddr += pdata->rg_off;
-       dw->rg_region.sz = pdata->rg_sz;
-
-       dw->ll_region.vaddr = pcim_iomap_table(pdev)[pdata->ll_bar];
-       dw->ll_region.vaddr += pdata->ll_off;
-       dw->ll_region.paddr = pdev->resource[pdata->ll_bar].start;
-       dw->ll_region.paddr += pdata->ll_off;
-       dw->ll_region.sz = pdata->ll_sz;
-
-       dw->dt_region.vaddr = pcim_iomap_table(pdev)[pdata->dt_bar];
-       dw->dt_region.vaddr += pdata->dt_off;
-       dw->dt_region.paddr = pdev->resource[pdata->dt_bar].start;
-       dw->dt_region.paddr += pdata->dt_off;
-       dw->dt_region.sz = pdata->dt_sz;
-
-       dw->version = pdata->version;
-       dw->mode = pdata->mode;
+       dw->mf = vsec_data.mf;
        dw->nr_irqs = nr_irqs;
        dw->ops = &dw_edma_pcie_core_ops;
+       dw->wr_ch_cnt = vsec_data.wr_ch_cnt;
+       dw->rd_ch_cnt = vsec_data.rd_ch_cnt;
 
-       /* Debug info */
-       pci_dbg(pdev, "Version:\t%u\n", dw->version);
+       dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
+       if (!dw->rg_region.vaddr)
+               return -ENOMEM;
+
+       dw->rg_region.vaddr += vsec_data.rg.off;
+       dw->rg_region.paddr = pdev->resource[vsec_data.rg.bar].start;
+       dw->rg_region.paddr += vsec_data.rg.off;
+       dw->rg_region.sz = vsec_data.rg.sz;
+
+       for (i = 0; i < dw->wr_ch_cnt; i++) {
+               struct dw_edma_region *ll_region = &dw->ll_region_wr[i];
+               struct dw_edma_region *dt_region = &dw->dt_region_wr[i];
+               struct dw_edma_block *ll_block = &vsec_data.ll_wr[i];
+               struct dw_edma_block *dt_block = &vsec_data.dt_wr[i];
+
+               ll_region->vaddr = pcim_iomap_table(pdev)[ll_block->bar];
+               if (!ll_region->vaddr)
+                       return -ENOMEM;
+
+               ll_region->vaddr += ll_block->off;
+               ll_region->paddr = pdev->resource[ll_block->bar].start;
+               ll_region->paddr += ll_block->off;
+               ll_region->sz = ll_block->sz;
+
+               dt_region->vaddr = pcim_iomap_table(pdev)[dt_block->bar];
+               if (!dt_region->vaddr)
+                       return -ENOMEM;
+
+               dt_region->vaddr += dt_block->off;
+               dt_region->paddr = pdev->resource[dt_block->bar].start;
+               dt_region->paddr += dt_block->off;
+               dt_region->sz = dt_block->sz;
+       }
 
-       pci_dbg(pdev, "Mode:\t%s\n",
-               dw->mode == EDMA_MODE_LEGACY ? "Legacy" : "Unroll");
+       for (i = 0; i < dw->rd_ch_cnt; i++) {
+               struct dw_edma_region *ll_region = &dw->ll_region_rd[i];
+               struct dw_edma_region *dt_region = &dw->dt_region_rd[i];
+               struct dw_edma_block *ll_block = &vsec_data.ll_rd[i];
+               struct dw_edma_block *dt_block = &vsec_data.dt_rd[i];
+
+               ll_region->vaddr = pcim_iomap_table(pdev)[ll_block->bar];
+               if (!ll_region->vaddr)
+                       return -ENOMEM;
+
+               ll_region->vaddr += ll_block->off;
+               ll_region->paddr = pdev->resource[ll_block->bar].start;
+               ll_region->paddr += ll_block->off;
+               ll_region->sz = ll_block->sz;
+
+               dt_region->vaddr = pcim_iomap_table(pdev)[dt_block->bar];
+               if (!dt_region->vaddr)
+                       return -ENOMEM;
+
+               dt_region->vaddr += dt_block->off;
+               dt_region->paddr = pdev->resource[dt_block->bar].start;
+               dt_region->paddr += dt_block->off;
+               dt_region->sz = dt_block->sz;
+       }
+
+       /* Debug info */
+       if (dw->mf == EDMA_MF_EDMA_LEGACY)
+               pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", dw->mf);
+       else if (dw->mf == EDMA_MF_EDMA_UNROLL)
+               pci_dbg(pdev, "Version:\teDMA Unroll (0x%x)\n", dw->mf);
+       else if (dw->mf == EDMA_MF_HDMA_COMPAT)
+               pci_dbg(pdev, "Version:\tHDMA Compatible (0x%x)\n", dw->mf);
+       else
+               pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", dw->mf);
 
        pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
-               pdata->rg_bar, pdata->rg_off, pdata->rg_sz,
+               vsec_data.rg.bar, vsec_data.rg.off, vsec_data.rg.sz,
                dw->rg_region.vaddr, &dw->rg_region.paddr);
 
-       pci_dbg(pdev, "L. List:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
-               pdata->ll_bar, pdata->ll_off, pdata->ll_sz,
-               dw->ll_region.vaddr, &dw->ll_region.paddr);
 
-       pci_dbg(pdev, "Data:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
-               pdata->dt_bar, pdata->dt_off, pdata->dt_sz,
-               dw->dt_region.vaddr, &dw->dt_region.paddr);
+       for (i = 0; i < dw->wr_ch_cnt; i++) {
+               pci_dbg(pdev, "L. List:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
+                       i, vsec_data.ll_wr[i].bar,
+                       vsec_data.ll_wr[i].off, dw->ll_region_wr[i].sz,
+                       dw->ll_region_wr[i].vaddr, &dw->ll_region_wr[i].paddr);
+
+               pci_dbg(pdev, "Data:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
+                       i, vsec_data.dt_wr[i].bar,
+                       vsec_data.dt_wr[i].off, dw->dt_region_wr[i].sz,
+                       dw->dt_region_wr[i].vaddr, &dw->dt_region_wr[i].paddr);
+       }
+
+       for (i = 0; i < dw->rd_ch_cnt; i++) {
+               pci_dbg(pdev, "L. List:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
+                       i, vsec_data.ll_rd[i].bar,
+                       vsec_data.ll_rd[i].off, dw->ll_region_rd[i].sz,
+                       dw->ll_region_rd[i].vaddr, &dw->ll_region_rd[i].paddr);
+
+               pci_dbg(pdev, "Data:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
+                       i, vsec_data.dt_rd[i].bar,
+                       vsec_data.dt_rd[i].off, dw->dt_region_rd[i].sz,
+                       dw->dt_region_rd[i].vaddr, &dw->dt_region_rd[i].paddr);
+       }
 
        pci_dbg(pdev, "Nr. IRQs:\t%u\n", dw->nr_irqs);
 
index 692de47..329fc2e 100644 (file)
@@ -28,35 +28,75 @@ static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
        return dw->rg_region.vaddr;
 }
 
-#define SET(dw, name, value)                           \
+#define SET_32(dw, name, value)                                \
        writel(value, &(__dw_regs(dw)->name))
 
-#define GET(dw, name)                                  \
+#define GET_32(dw, name)                               \
        readl(&(__dw_regs(dw)->name))
 
-#define SET_RW(dw, dir, name, value)                   \
+#define SET_RW_32(dw, dir, name, value)                        \
        do {                                            \
                if ((dir) == EDMA_DIR_WRITE)            \
-                       SET(dw, wr_##name, value);      \
+                       SET_32(dw, wr_##name, value);   \
                else                                    \
-                       SET(dw, rd_##name, value);      \
+                       SET_32(dw, rd_##name, value);   \
        } while (0)
 
-#define GET_RW(dw, dir, name)                          \
+#define GET_RW_32(dw, dir, name)                       \
        ((dir) == EDMA_DIR_WRITE                        \
-         ? GET(dw, wr_##name)                          \
-         : GET(dw, rd_##name))
+         ? GET_32(dw, wr_##name)                       \
+         : GET_32(dw, rd_##name))
 
-#define SET_BOTH(dw, name, value)                      \
+#define SET_BOTH_32(dw, name, value)                   \
        do {                                            \
-               SET(dw, wr_##name, value);              \
-               SET(dw, rd_##name, value);              \
+               SET_32(dw, wr_##name, value);           \
+               SET_32(dw, rd_##name, value);           \
+       } while (0)
+
+#ifdef CONFIG_64BIT
+
+#define SET_64(dw, name, value)                                \
+       writeq(value, &(__dw_regs(dw)->name))
+
+#define GET_64(dw, name)                               \
+       readq(&(__dw_regs(dw)->name))
+
+#define SET_RW_64(dw, dir, name, value)                        \
+       do {                                            \
+               if ((dir) == EDMA_DIR_WRITE)            \
+                       SET_64(dw, wr_##name, value);   \
+               else                                    \
+                       SET_64(dw, rd_##name, value);   \
+       } while (0)
+
+#define GET_RW_64(dw, dir, name)                       \
+       ((dir) == EDMA_DIR_WRITE                        \
+         ? GET_64(dw, wr_##name)                       \
+         : GET_64(dw, rd_##name))
+
+#define SET_BOTH_64(dw, name, value)                   \
+       do {                                            \
+               SET_64(dw, wr_##name, value);           \
+               SET_64(dw, rd_##name, value);           \
+       } while (0)
+
+#endif /* CONFIG_64BIT */
+
+#define SET_COMPAT(dw, name, value)                    \
+       writel(value, &(__dw_regs(dw)->type.unroll.name))
+
+#define SET_RW_COMPAT(dw, dir, name, value)            \
+       do {                                            \
+               if ((dir) == EDMA_DIR_WRITE)            \
+                       SET_COMPAT(dw, wr_##name, value); \
+               else                                    \
+                       SET_COMPAT(dw, rd_##name, value); \
        } while (0)
 
 static inline struct dw_edma_v0_ch_regs __iomem *
 __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch)
 {
-       if (dw->mode == EDMA_MODE_LEGACY)
+       if (dw->mf == EDMA_MF_EDMA_LEGACY)
                return &(__dw_regs(dw)->type.legacy.ch);
 
        if (dir == EDMA_DIR_WRITE)
@@ -68,7 +108,7 @@ __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch)
 static inline void writel_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
                             u32 value, void __iomem *addr)
 {
-       if (dw->mode == EDMA_MODE_LEGACY) {
+       if (dw->mf == EDMA_MF_EDMA_LEGACY) {
                u32 viewport_sel;
                unsigned long flags;
 
@@ -93,7 +133,7 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 {
        u32 value;
 
-       if (dw->mode == EDMA_MODE_LEGACY) {
+       if (dw->mf == EDMA_MF_EDMA_LEGACY) {
                u32 viewport_sel;
                unsigned long flags;
 
@@ -115,21 +155,86 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
        return value;
 }
 
-#define SET_CH(dw, dir, ch, name, value) \
+#define SET_CH_32(dw, dir, ch, name, value) \
        writel_ch(dw, dir, ch, value, &(__dw_ch_regs(dw, dir, ch)->name))
 
-#define GET_CH(dw, dir, ch, name) \
+#define GET_CH_32(dw, dir, ch, name) \
        readl_ch(dw, dir, ch, &(__dw_ch_regs(dw, dir, ch)->name))
 
-#define SET_LL(ll, value) \
+#define SET_LL_32(ll, value) \
        writel(value, ll)
 
+#ifdef CONFIG_64BIT
+
+static inline void writeq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
+                            u64 value, void __iomem *addr)
+{
+       if (dw->mf == EDMA_MF_EDMA_LEGACY) { /* legacy: channel reached via viewport_sel */
+               u32 viewport_sel;
+               unsigned long flags;
+
+               raw_spin_lock_irqsave(&dw->lock, flags); /* select + write done under dw->lock */
+
+               viewport_sel = FIELD_PREP(EDMA_V0_VIEWPORT_MASK, ch);
+               if (dir == EDMA_DIR_READ)
+                       viewport_sel |= BIT(31); /* bit 31 is set for the read direction */
+
+               writel(viewport_sel,
+                      &(__dw_regs(dw)->type.legacy.viewport_sel));
+               writeq(value, addr);
+
+               raw_spin_unlock_irqrestore(&dw->lock, flags);
+       } else { /* other map formats: write addr directly, no viewport */
+               writeq(value, addr);
+       }
+}
+
+static inline u64 readq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
+                          const void __iomem *addr)
+{
+       u64 value; /* must be 64-bit: a u32 here truncates the readq() result */
+
+       if (dw->mf == EDMA_MF_EDMA_LEGACY) { /* legacy: channel reached via viewport_sel */
+               u32 viewport_sel;
+               unsigned long flags;
+
+               raw_spin_lock_irqsave(&dw->lock, flags); /* select + read done under dw->lock */
+
+               viewport_sel = FIELD_PREP(EDMA_V0_VIEWPORT_MASK, ch);
+               if (dir == EDMA_DIR_READ)
+                       viewport_sel |= BIT(31); /* bit 31 is set for the read direction */
+
+               writel(viewport_sel,
+                      &(__dw_regs(dw)->type.legacy.viewport_sel));
+               value = readq(addr);
+
+               raw_spin_unlock_irqrestore(&dw->lock, flags);
+       } else { /* other map formats: read addr directly, no viewport */
+               value = readq(addr);
+       }
+
+       return value;
+}
+
+#define SET_CH_64(dw, dir, ch, name, value) \
+       writeq_ch(dw, dir, ch, value, &(__dw_ch_regs(dw, dir, ch)->name))
+
+#define GET_CH_64(dw, dir, ch, name) \
+       readq_ch(dw, dir, ch, &(__dw_ch_regs(dw, dir, ch)->name))
+
+#define SET_LL_64(ll, value) \
+       writeq(value, ll)
+
+#endif /* CONFIG_64BIT */
+
 /* eDMA management callbacks */
 void dw_edma_v0_core_off(struct dw_edma *dw)
 {
-       SET_BOTH(dw, int_mask, EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK);
-       SET_BOTH(dw, int_clear, EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK);
-       SET_BOTH(dw, engine_en, 0);
+       SET_BOTH_32(dw, int_mask,
+                   EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK);
+       SET_BOTH_32(dw, int_clear,
+                   EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK);
+       SET_BOTH_32(dw, engine_en, 0);
 }
 
 u16 dw_edma_v0_core_ch_count(struct dw_edma *dw, enum dw_edma_dir dir)
@@ -137,9 +242,11 @@ u16 dw_edma_v0_core_ch_count(struct dw_edma *dw, enum dw_edma_dir dir)
        u32 num_ch;
 
        if (dir == EDMA_DIR_WRITE)
-               num_ch = FIELD_GET(EDMA_V0_WRITE_CH_COUNT_MASK, GET(dw, ctrl));
+               num_ch = FIELD_GET(EDMA_V0_WRITE_CH_COUNT_MASK,
+                                  GET_32(dw, ctrl));
        else
-               num_ch = FIELD_GET(EDMA_V0_READ_CH_COUNT_MASK, GET(dw, ctrl));
+               num_ch = FIELD_GET(EDMA_V0_READ_CH_COUNT_MASK,
+                                  GET_32(dw, ctrl));
 
        if (num_ch > EDMA_V0_MAX_NR_CH)
                num_ch = EDMA_V0_MAX_NR_CH;
@@ -153,7 +260,7 @@ enum dma_status dw_edma_v0_core_ch_status(struct dw_edma_chan *chan)
        u32 tmp;
 
        tmp = FIELD_GET(EDMA_V0_CH_STATUS_MASK,
-                       GET_CH(dw, chan->dir, chan->id, ch_control1));
+                       GET_CH_32(dw, chan->dir, chan->id, ch_control1));
 
        if (tmp == 1)
                return DMA_IN_PROGRESS;
@@ -167,26 +274,28 @@ void dw_edma_v0_core_clear_done_int(struct dw_edma_chan *chan)
 {
        struct dw_edma *dw = chan->chip->dw;
 
-       SET_RW(dw, chan->dir, int_clear,
-              FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id)));
+       SET_RW_32(dw, chan->dir, int_clear,
+                 FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id)));
 }
 
 void dw_edma_v0_core_clear_abort_int(struct dw_edma_chan *chan)
 {
        struct dw_edma *dw = chan->chip->dw;
 
-       SET_RW(dw, chan->dir, int_clear,
-              FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id)));
+       SET_RW_32(dw, chan->dir, int_clear,
+                 FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id)));
 }
 
 u32 dw_edma_v0_core_status_done_int(struct dw_edma *dw, enum dw_edma_dir dir)
 {
-       return FIELD_GET(EDMA_V0_DONE_INT_MASK, GET_RW(dw, dir, int_status));
+       return FIELD_GET(EDMA_V0_DONE_INT_MASK,
+                        GET_RW_32(dw, dir, int_status));
 }
 
 u32 dw_edma_v0_core_status_abort_int(struct dw_edma *dw, enum dw_edma_dir dir)
 {
-       return FIELD_GET(EDMA_V0_ABORT_INT_MASK, GET_RW(dw, dir, int_status));
+       return FIELD_GET(EDMA_V0_ABORT_INT_MASK,
+                        GET_RW_32(dw, dir, int_status));
 }
 
 static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
@@ -209,15 +318,23 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
                        control |= (DW_EDMA_V0_LIE | DW_EDMA_V0_RIE);
 
                /* Channel control */
-               SET_LL(&lli[i].control, control);
+               SET_LL_32(&lli[i].control, control);
                /* Transfer size */
-               SET_LL(&lli[i].transfer_size, child->sz);
-               /* SAR - low, high */
-               SET_LL(&lli[i].sar_low, lower_32_bits(child->sar));
-               SET_LL(&lli[i].sar_high, upper_32_bits(child->sar));
-               /* DAR - low, high */
-               SET_LL(&lli[i].dar_low, lower_32_bits(child->dar));
-               SET_LL(&lli[i].dar_high, upper_32_bits(child->dar));
+               SET_LL_32(&lli[i].transfer_size, child->sz);
+               /* SAR */
+               #ifdef CONFIG_64BIT
+                       SET_LL_64(&lli[i].sar.reg, child->sar);
+               #else /* CONFIG_64BIT */
+                       SET_LL_32(&lli[i].sar.lsb, lower_32_bits(child->sar));
+                       SET_LL_32(&lli[i].sar.msb, upper_32_bits(child->sar));
+               #endif /* CONFIG_64BIT */
+               /* DAR */
+               #ifdef CONFIG_64BIT
+                       SET_LL_64(&lli[i].dar.reg, child->dar);
+               #else /* CONFIG_64BIT */
+                       SET_LL_32(&lli[i].dar.lsb, lower_32_bits(child->dar));
+                       SET_LL_32(&lli[i].dar.msb, upper_32_bits(child->dar));
+               #endif /* CONFIG_64BIT */
                i++;
        }
 
@@ -227,10 +344,14 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
                control |= DW_EDMA_V0_CB;
 
        /* Channel control */
-       SET_LL(&llp->control, control);
-       /* Linked list  - low, high */
-       SET_LL(&llp->llp_low, lower_32_bits(chunk->ll_region.paddr));
-       SET_LL(&llp->llp_high, upper_32_bits(chunk->ll_region.paddr));
+       SET_LL_32(&llp->control, control);
+       /* Linked list */
+       #ifdef CONFIG_64BIT
+               SET_LL_64(&llp->llp.reg, chunk->ll_region.paddr);
+       #else /* CONFIG_64BIT */
+               SET_LL_32(&llp->llp.lsb, lower_32_bits(chunk->ll_region.paddr));
+               SET_LL_32(&llp->llp.msb, upper_32_bits(chunk->ll_region.paddr));
+       #endif /* CONFIG_64BIT */
 }
 
 void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
@@ -243,28 +364,69 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 
        if (first) {
                /* Enable engine */
-               SET_RW(dw, chan->dir, engine_en, BIT(0));
+               SET_RW_32(dw, chan->dir, engine_en, BIT(0));
+               if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+                       switch (chan->id) {
+                       case 0:
+                               SET_RW_COMPAT(dw, chan->dir, ch0_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 1:
+                               SET_RW_COMPAT(dw, chan->dir, ch1_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 2:
+                               SET_RW_COMPAT(dw, chan->dir, ch2_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 3:
+                               SET_RW_COMPAT(dw, chan->dir, ch3_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 4:
+                               SET_RW_COMPAT(dw, chan->dir, ch4_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 5:
+                               SET_RW_COMPAT(dw, chan->dir, ch5_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 6:
+                               SET_RW_COMPAT(dw, chan->dir, ch6_pwr_en,
+                                             BIT(0));
+                               break;
+                       case 7:
+                               SET_RW_COMPAT(dw, chan->dir, ch7_pwr_en,
+                                             BIT(0));
+                               break;
+                       }
+               }
                /* Interrupt unmask - done, abort */
-               tmp = GET_RW(dw, chan->dir, int_mask);
+               tmp = GET_RW_32(dw, chan->dir, int_mask);
                tmp &= ~FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id));
                tmp &= ~FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id));
-               SET_RW(dw, chan->dir, int_mask, tmp);
+               SET_RW_32(dw, chan->dir, int_mask, tmp);
                /* Linked list error */
-               tmp = GET_RW(dw, chan->dir, linked_list_err_en);
+               tmp = GET_RW_32(dw, chan->dir, linked_list_err_en);
                tmp |= FIELD_PREP(EDMA_V0_LINKED_LIST_ERR_MASK, BIT(chan->id));
-               SET_RW(dw, chan->dir, linked_list_err_en, tmp);
+               SET_RW_32(dw, chan->dir, linked_list_err_en, tmp);
                /* Channel control */
-               SET_CH(dw, chan->dir, chan->id, ch_control1,
-                      (DW_EDMA_V0_CCS | DW_EDMA_V0_LLE));
-               /* Linked list - low, high */
-               SET_CH(dw, chan->dir, chan->id, llp_low,
-                      lower_32_bits(chunk->ll_region.paddr));
-               SET_CH(dw, chan->dir, chan->id, llp_high,
-                      upper_32_bits(chunk->ll_region.paddr));
+               SET_CH_32(dw, chan->dir, chan->id, ch_control1,
+                         (DW_EDMA_V0_CCS | DW_EDMA_V0_LLE));
+               /* Linked list */
+               #ifdef CONFIG_64BIT
+                       SET_CH_64(dw, chan->dir, chan->id, llp.reg,
+                                 chunk->ll_region.paddr);
+               #else /* CONFIG_64BIT */
+                       SET_CH_32(dw, chan->dir, chan->id, llp.lsb,
+                                 lower_32_bits(chunk->ll_region.paddr));
+                       SET_CH_32(dw, chan->dir, chan->id, llp.msb,
+                                 upper_32_bits(chunk->ll_region.paddr));
+               #endif /* CONFIG_64BIT */
        }
        /* Doorbell */
-       SET_RW(dw, chan->dir, doorbell,
-              FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id));
+       SET_RW_32(dw, chan->dir, doorbell,
+                 FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id));
 }
 
 int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
@@ -273,31 +435,31 @@ int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
        u32 tmp = 0;
 
        /* MSI done addr - low, high */
-       SET_RW(dw, chan->dir, done_imwr_low, chan->msi.address_lo);
-       SET_RW(dw, chan->dir, done_imwr_high, chan->msi.address_hi);
+       SET_RW_32(dw, chan->dir, done_imwr.lsb, chan->msi.address_lo);
+       SET_RW_32(dw, chan->dir, done_imwr.msb, chan->msi.address_hi);
        /* MSI abort addr - low, high */
-       SET_RW(dw, chan->dir, abort_imwr_low, chan->msi.address_lo);
-       SET_RW(dw, chan->dir, abort_imwr_high, chan->msi.address_hi);
+       SET_RW_32(dw, chan->dir, abort_imwr.lsb, chan->msi.address_lo);
+       SET_RW_32(dw, chan->dir, abort_imwr.msb, chan->msi.address_hi);
        /* MSI data - low, high */
        switch (chan->id) {
        case 0:
        case 1:
-               tmp = GET_RW(dw, chan->dir, ch01_imwr_data);
+               tmp = GET_RW_32(dw, chan->dir, ch01_imwr_data);
                break;
 
        case 2:
        case 3:
-               tmp = GET_RW(dw, chan->dir, ch23_imwr_data);
+               tmp = GET_RW_32(dw, chan->dir, ch23_imwr_data);
                break;
 
        case 4:
        case 5:
-               tmp = GET_RW(dw, chan->dir, ch45_imwr_data);
+               tmp = GET_RW_32(dw, chan->dir, ch45_imwr_data);
                break;
 
        case 6:
        case 7:
-               tmp = GET_RW(dw, chan->dir, ch67_imwr_data);
+               tmp = GET_RW_32(dw, chan->dir, ch67_imwr_data);
                break;
        }
 
@@ -316,22 +478,22 @@ int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
        switch (chan->id) {
        case 0:
        case 1:
-               SET_RW(dw, chan->dir, ch01_imwr_data, tmp);
+               SET_RW_32(dw, chan->dir, ch01_imwr_data, tmp);
                break;
 
        case 2:
        case 3:
-               SET_RW(dw, chan->dir, ch23_imwr_data, tmp);
+               SET_RW_32(dw, chan->dir, ch23_imwr_data, tmp);
                break;
 
        case 4:
        case 5:
-               SET_RW(dw, chan->dir, ch45_imwr_data, tmp);
+               SET_RW_32(dw, chan->dir, ch45_imwr_data, tmp);
                break;
 
        case 6:
        case 7:
-               SET_RW(dw, chan->dir, ch67_imwr_data, tmp);
+               SET_RW_32(dw, chan->dir, ch67_imwr_data, tmp);
                break;
        }
 
@@ -344,7 +506,7 @@ void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip)
        dw_edma_v0_debugfs_on(chip);
 }
 
-void dw_edma_v0_core_debugfs_off(void)
+void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip)
 {
-       dw_edma_v0_debugfs_off();
+       dw_edma_v0_debugfs_off(chip);
 }
index abae152..2afa626 100644 (file)
@@ -23,6 +23,6 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first);
 int dw_edma_v0_core_device_config(struct dw_edma_chan *chan);
 /* eDMA debug fs callbacks */
 void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip);
-void dw_edma_v0_core_debugfs_off(void);
+void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip);
 
 #endif /* _DW_EDMA_V0_CORE_H */
index 6f62711..4b3bcff 100644 (file)
@@ -38,7 +38,6 @@
 #define CHANNEL_STR                            "channel"
 #define REGISTERS_STR                          "registers"
 
-static struct dentry                           *base_dir;
 static struct dw_edma                          *dw;
 static struct dw_edma_v0_regs                  __iomem *regs;
 
@@ -55,7 +54,7 @@ struct debugfs_entries {
 static int dw_edma_debugfs_u32_get(void *data, u64 *val)
 {
        void __iomem *reg = (void __force __iomem *)data;
-       if (dw->mode == EDMA_MODE_LEGACY &&
+       if (dw->mf == EDMA_MF_EDMA_LEGACY &&
            reg >= (void __iomem *)&regs->type.legacy.ch) {
                void __iomem *ptr = &regs->type.legacy.ch;
                u32 viewport_sel = 0;
@@ -114,12 +113,12 @@ static void dw_edma_debugfs_regs_ch(struct dw_edma_v0_ch_regs __iomem *regs,
                REGISTER(ch_control1),
                REGISTER(ch_control2),
                REGISTER(transfer_size),
-               REGISTER(sar_low),
-               REGISTER(sar_high),
-               REGISTER(dar_low),
-               REGISTER(dar_high),
-               REGISTER(llp_low),
-               REGISTER(llp_high),
+               REGISTER(sar.lsb),
+               REGISTER(sar.msb),
+               REGISTER(dar.lsb),
+               REGISTER(dar.msb),
+               REGISTER(llp.lsb),
+               REGISTER(llp.msb),
        };
 
        nr_entries = ARRAY_SIZE(debugfs_regs);
@@ -132,17 +131,17 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir)
                /* eDMA global registers */
                WR_REGISTER(engine_en),
                WR_REGISTER(doorbell),
-               WR_REGISTER(ch_arb_weight_low),
-               WR_REGISTER(ch_arb_weight_high),
+               WR_REGISTER(ch_arb_weight.lsb),
+               WR_REGISTER(ch_arb_weight.msb),
                /* eDMA interrupts registers */
                WR_REGISTER(int_status),
                WR_REGISTER(int_mask),
                WR_REGISTER(int_clear),
                WR_REGISTER(err_status),
-               WR_REGISTER(done_imwr_low),
-               WR_REGISTER(done_imwr_high),
-               WR_REGISTER(abort_imwr_low),
-               WR_REGISTER(abort_imwr_high),
+               WR_REGISTER(done_imwr.lsb),
+               WR_REGISTER(done_imwr.msb),
+               WR_REGISTER(abort_imwr.lsb),
+               WR_REGISTER(abort_imwr.msb),
                WR_REGISTER(ch01_imwr_data),
                WR_REGISTER(ch23_imwr_data),
                WR_REGISTER(ch45_imwr_data),
@@ -152,8 +151,8 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir)
        const struct debugfs_entries debugfs_unroll_regs[] = {
                /* eDMA channel context grouping */
                WR_REGISTER_UNROLL(engine_chgroup),
-               WR_REGISTER_UNROLL(engine_hshake_cnt_low),
-               WR_REGISTER_UNROLL(engine_hshake_cnt_high),
+               WR_REGISTER_UNROLL(engine_hshake_cnt.lsb),
+               WR_REGISTER_UNROLL(engine_hshake_cnt.msb),
                WR_REGISTER_UNROLL(ch0_pwr_en),
                WR_REGISTER_UNROLL(ch1_pwr_en),
                WR_REGISTER_UNROLL(ch2_pwr_en),
@@ -174,7 +173,7 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir)
        nr_entries = ARRAY_SIZE(debugfs_regs);
        dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir);
 
-       if (dw->mode == EDMA_MODE_UNROLL) {
+       if (dw->mf == EDMA_MF_HDMA_COMPAT) {
                nr_entries = ARRAY_SIZE(debugfs_unroll_regs);
                dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries,
                                           regs_dir);
@@ -200,19 +199,19 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir)
                /* eDMA global registers */
                RD_REGISTER(engine_en),
                RD_REGISTER(doorbell),
-               RD_REGISTER(ch_arb_weight_low),
-               RD_REGISTER(ch_arb_weight_high),
+               RD_REGISTER(ch_arb_weight.lsb),
+               RD_REGISTER(ch_arb_weight.msb),
                /* eDMA interrupts registers */
                RD_REGISTER(int_status),
                RD_REGISTER(int_mask),
                RD_REGISTER(int_clear),
-               RD_REGISTER(err_status_low),
-               RD_REGISTER(err_status_high),
+               RD_REGISTER(err_status.lsb),
+               RD_REGISTER(err_status.msb),
                RD_REGISTER(linked_list_err_en),
-               RD_REGISTER(done_imwr_low),
-               RD_REGISTER(done_imwr_high),
-               RD_REGISTER(abort_imwr_low),
-               RD_REGISTER(abort_imwr_high),
+               RD_REGISTER(done_imwr.lsb),
+               RD_REGISTER(done_imwr.msb),
+               RD_REGISTER(abort_imwr.lsb),
+               RD_REGISTER(abort_imwr.msb),
                RD_REGISTER(ch01_imwr_data),
                RD_REGISTER(ch23_imwr_data),
                RD_REGISTER(ch45_imwr_data),
@@ -221,8 +220,8 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir)
        const struct debugfs_entries debugfs_unroll_regs[] = {
                /* eDMA channel context grouping */
                RD_REGISTER_UNROLL(engine_chgroup),
-               RD_REGISTER_UNROLL(engine_hshake_cnt_low),
-               RD_REGISTER_UNROLL(engine_hshake_cnt_high),
+               RD_REGISTER_UNROLL(engine_hshake_cnt.lsb),
+               RD_REGISTER_UNROLL(engine_hshake_cnt.msb),
                RD_REGISTER_UNROLL(ch0_pwr_en),
                RD_REGISTER_UNROLL(ch1_pwr_en),
                RD_REGISTER_UNROLL(ch2_pwr_en),
@@ -243,7 +242,7 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir)
        nr_entries = ARRAY_SIZE(debugfs_regs);
        dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir);
 
-       if (dw->mode == EDMA_MODE_UNROLL) {
+       if (dw->mf == EDMA_MF_HDMA_COMPAT) {
                nr_entries = ARRAY_SIZE(debugfs_unroll_regs);
                dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries,
                                           regs_dir);
@@ -272,7 +271,7 @@ static void dw_edma_debugfs_regs(void)
        struct dentry *regs_dir;
        int nr_entries;
 
-       regs_dir = debugfs_create_dir(REGISTERS_STR, base_dir);
+       regs_dir = debugfs_create_dir(REGISTERS_STR, dw->debugfs);
        if (!regs_dir)
                return;
 
@@ -293,19 +292,23 @@ void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
        if (!regs)
                return;
 
-       base_dir = debugfs_create_dir(dw->name, NULL);
-       if (!base_dir)
+       dw->debugfs = debugfs_create_dir(dw->name, NULL);
+       if (!dw->debugfs)
                return;
 
-       debugfs_create_u32("version", 0444, base_dir, &dw->version);
-       debugfs_create_u32("mode", 0444, base_dir, &dw->mode);
-       debugfs_create_u16("wr_ch_cnt", 0444, base_dir, &dw->wr_ch_cnt);
-       debugfs_create_u16("rd_ch_cnt", 0444, base_dir, &dw->rd_ch_cnt);
+       debugfs_create_u32("mf", 0444, dw->debugfs, &dw->mf);
+       debugfs_create_u16("wr_ch_cnt", 0444, dw->debugfs, &dw->wr_ch_cnt);
+       debugfs_create_u16("rd_ch_cnt", 0444, dw->debugfs, &dw->rd_ch_cnt);
 
        dw_edma_debugfs_regs();
 }
 
-void dw_edma_v0_debugfs_off(void)
+void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip)
 {
-       debugfs_remove_recursive(base_dir);
+       dw = chip->dw;
+       if (!dw)
+               return;
+
+       debugfs_remove_recursive(dw->debugfs);
+       dw->debugfs = NULL;
 }
index 5450a0a..d0ff25a 100644 (file)
 
 #ifdef CONFIG_DEBUG_FS
 void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip);
-void dw_edma_v0_debugfs_off(void);
+void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip);
 #else
 static inline void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
 {
 }
 
-static inline void dw_edma_v0_debugfs_off(void)
+static inline void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip)
 {
 }
 #endif /* CONFIG_DEBUG_FS */
index dfd70e2..e175f7b 100644 (file)
 #define EDMA_V0_CH_EVEN_MSI_DATA_MASK                  GENMASK(15, 0)
 
 struct dw_edma_v0_ch_regs {
-       u32 ch_control1;                                /* 0x000 */
-       u32 ch_control2;                                /* 0x004 */
-       u32 transfer_size;                              /* 0x008 */
-       u32 sar_low;                                    /* 0x00c */
-       u32 sar_high;                                   /* 0x010 */
-       u32 dar_low;                                    /* 0x014 */
-       u32 dar_high;                                   /* 0x018 */
-       u32 llp_low;                                    /* 0x01c */
-       u32 llp_high;                                   /* 0x020 */
-};
+       u32 ch_control1;                                /* 0x0000 */
+       u32 ch_control2;                                /* 0x0004 */
+       u32 transfer_size;                              /* 0x0008 */
+       union {
+               u64 reg;                                /* 0x000c..0x0010 */
+               struct {
+                       u32 lsb;                        /* 0x000c */
+                       u32 msb;                        /* 0x0010 */
+               };
+       } sar;
+       union {
+               u64 reg;                                /* 0x0014..0x0018 */
+               struct {
+                       u32 lsb;                        /* 0x0014 */
+                       u32 msb;                        /* 0x0018 */
+               };
+       } dar;
+       union {
+               u64 reg;                                /* 0x001c..0x0020 */
+               struct {
+                       u32 lsb;                        /* 0x001c */
+                       u32 msb;                        /* 0x0020 */
+               };
+       } llp;
+} __packed;
 
 struct dw_edma_v0_ch {
-       struct dw_edma_v0_ch_regs wr;                   /* 0x200 */
-       u32 padding_1[55];                              /* [0x224..0x2fc] */
-       struct dw_edma_v0_ch_regs rd;                   /* 0x300 */
-       u32 padding_2[55];                              /* [0x324..0x3fc] */
-};
+       struct dw_edma_v0_ch_regs wr;                   /* 0x0200 */
+       u32 padding_1[55];                              /* 0x0224..0x02fc */
+       struct dw_edma_v0_ch_regs rd;                   /* 0x0300 */
+       u32 padding_2[55];                              /* 0x0324..0x03fc */
+} __packed;
 
 struct dw_edma_v0_unroll {
-       u32 padding_1;                                  /* 0x0f8 */
-       u32 wr_engine_chgroup;                          /* 0x100 */
-       u32 rd_engine_chgroup;                          /* 0x104 */
-       u32 wr_engine_hshake_cnt_low;                   /* 0x108 */
-       u32 wr_engine_hshake_cnt_high;                  /* 0x10c */
-       u32 padding_2[2];                               /* [0x110..0x114] */
-       u32 rd_engine_hshake_cnt_low;                   /* 0x118 */
-       u32 rd_engine_hshake_cnt_high;                  /* 0x11c */
-       u32 padding_3[2];                               /* [0x120..0x124] */
-       u32 wr_ch0_pwr_en;                              /* 0x128 */
-       u32 wr_ch1_pwr_en;                              /* 0x12c */
-       u32 wr_ch2_pwr_en;                              /* 0x130 */
-       u32 wr_ch3_pwr_en;                              /* 0x134 */
-       u32 wr_ch4_pwr_en;                              /* 0x138 */
-       u32 wr_ch5_pwr_en;                              /* 0x13c */
-       u32 wr_ch6_pwr_en;                              /* 0x140 */
-       u32 wr_ch7_pwr_en;                              /* 0x144 */
-       u32 padding_4[8];                               /* [0x148..0x164] */
-       u32 rd_ch0_pwr_en;                              /* 0x168 */
-       u32 rd_ch1_pwr_en;                              /* 0x16c */
-       u32 rd_ch2_pwr_en;                              /* 0x170 */
-       u32 rd_ch3_pwr_en;                              /* 0x174 */
-       u32 rd_ch4_pwr_en;                              /* 0x178 */
-       u32 rd_ch5_pwr_en;                              /* 0x18c */
-       u32 rd_ch6_pwr_en;                              /* 0x180 */
-       u32 rd_ch7_pwr_en;                              /* 0x184 */
-       u32 padding_5[30];                              /* [0x188..0x1fc] */
-       struct dw_edma_v0_ch ch[EDMA_V0_MAX_NR_CH];     /* [0x200..0x1120] */
-};
+       u32 padding_1;                                  /* 0x00fc */
+       u32 wr_engine_chgroup;                          /* 0x0100 */
+       u32 rd_engine_chgroup;                          /* 0x0104 */
+       union {
+               u64 reg;                                /* 0x0108..0x010c */
+               struct {
+                       u32 lsb;                        /* 0x0108 */
+                       u32 msb;                        /* 0x010c */
+               };
+       } wr_engine_hshake_cnt;
+       u32 padding_2[2];                               /* 0x0110..0x0114 */
+       union {
+               u64 reg;                                /* 0x0118..0x011c */
+               struct {
+                       u32 lsb;                        /* 0x0118 */
+                       u32 msb;                        /* 0x011c */
+               };
+       } rd_engine_hshake_cnt;
+       u32 padding_3[2];                               /* 0x0120..0x0124 */
+       u32 wr_ch0_pwr_en;                              /* 0x0128 */
+       u32 wr_ch1_pwr_en;                              /* 0x012c */
+       u32 wr_ch2_pwr_en;                              /* 0x0130 */
+       u32 wr_ch3_pwr_en;                              /* 0x0134 */
+       u32 wr_ch4_pwr_en;                              /* 0x0138 */
+       u32 wr_ch5_pwr_en;                              /* 0x013c */
+       u32 wr_ch6_pwr_en;                              /* 0x0140 */
+       u32 wr_ch7_pwr_en;                              /* 0x0144 */
+       u32 padding_4[8];                               /* 0x0148..0x0164 */
+       u32 rd_ch0_pwr_en;                              /* 0x0168 */
+       u32 rd_ch1_pwr_en;                              /* 0x016c */
+       u32 rd_ch2_pwr_en;                              /* 0x0170 */
+       u32 rd_ch3_pwr_en;                              /* 0x0174 */
+       u32 rd_ch4_pwr_en;                              /* 0x0178 */
+       u32 rd_ch5_pwr_en;                              /* 0x017c */
+       u32 rd_ch6_pwr_en;                              /* 0x0180 */
+       u32 rd_ch7_pwr_en;                              /* 0x0184 */
+       u32 padding_5[30];                              /* 0x0188..0x01fc */
+       struct dw_edma_v0_ch ch[EDMA_V0_MAX_NR_CH];     /* 0x0200..0x1120 */
+} __packed;
 
 struct dw_edma_v0_legacy {
-       u32 viewport_sel;                               /* 0x0f8 */
-       struct dw_edma_v0_ch_regs ch;                   /* [0x100..0x120] */
-};
+       u32 viewport_sel;                               /* 0x00fc */
+       struct dw_edma_v0_ch_regs ch;                   /* 0x0100..0x0120 */
+} __packed;
 
 struct dw_edma_v0_regs {
        /* eDMA global registers */
-       u32 ctrl_data_arb_prior;                        /* 0x000 */
-       u32 padding_1;                                  /* 0x004 */
-       u32 ctrl;                                       /* 0x008 */
-       u32 wr_engine_en;                               /* 0x00c */
-       u32 wr_doorbell;                                /* 0x010 */
-       u32 padding_2;                                  /* 0x014 */
-       u32 wr_ch_arb_weight_low;                       /* 0x018 */
-       u32 wr_ch_arb_weight_high;                      /* 0x01c */
-       u32 padding_3[3];                               /* [0x020..0x028] */
-       u32 rd_engine_en;                               /* 0x02c */
-       u32 rd_doorbell;                                /* 0x030 */
-       u32 padding_4;                                  /* 0x034 */
-       u32 rd_ch_arb_weight_low;                       /* 0x038 */
-       u32 rd_ch_arb_weight_high;                      /* 0x03c */
-       u32 padding_5[3];                               /* [0x040..0x048] */
+       u32 ctrl_data_arb_prior;                        /* 0x0000 */
+       u32 padding_1;                                  /* 0x0004 */
+       u32 ctrl;                                       /* 0x0008 */
+       u32 wr_engine_en;                               /* 0x000c */
+       u32 wr_doorbell;                                /* 0x0010 */
+       u32 padding_2;                                  /* 0x0014 */
+       union {
+               u64 reg;                                /* 0x0018..0x001c */
+               struct {
+                       u32 lsb;                        /* 0x0018 */
+                       u32 msb;                        /* 0x001c */
+               };
+       } wr_ch_arb_weight;
+       u32 padding_3[3];                               /* 0x0020..0x0028 */
+       u32 rd_engine_en;                               /* 0x002c */
+       u32 rd_doorbell;                                /* 0x0030 */
+       u32 padding_4;                                  /* 0x0034 */
+       union {
+               u64 reg;                                /* 0x0038..0x003c */
+               struct {
+                       u32 lsb;                        /* 0x0038 */
+                       u32 msb;                        /* 0x003c */
+               };
+       } rd_ch_arb_weight;
+       u32 padding_5[3];                               /* 0x0040..0x0048 */
        /* eDMA interrupts registers */
-       u32 wr_int_status;                              /* 0x04c */
-       u32 padding_6;                                  /* 0x050 */
-       u32 wr_int_mask;                                /* 0x054 */
-       u32 wr_int_clear;                               /* 0x058 */
-       u32 wr_err_status;                              /* 0x05c */
-       u32 wr_done_imwr_low;                           /* 0x060 */
-       u32 wr_done_imwr_high;                          /* 0x064 */
-       u32 wr_abort_imwr_low;                          /* 0x068 */
-       u32 wr_abort_imwr_high;                         /* 0x06c */
-       u32 wr_ch01_imwr_data;                          /* 0x070 */
-       u32 wr_ch23_imwr_data;                          /* 0x074 */
-       u32 wr_ch45_imwr_data;                          /* 0x078 */
-       u32 wr_ch67_imwr_data;                          /* 0x07c */
-       u32 padding_7[4];                               /* [0x080..0x08c] */
-       u32 wr_linked_list_err_en;                      /* 0x090 */
-       u32 padding_8[3];                               /* [0x094..0x09c] */
-       u32 rd_int_status;                              /* 0x0a0 */
-       u32 padding_9;                                  /* 0x0a4 */
-       u32 rd_int_mask;                                /* 0x0a8 */
-       u32 rd_int_clear;                               /* 0x0ac */
-       u32 padding_10;                                 /* 0x0b0 */
-       u32 rd_err_status_low;                          /* 0x0b4 */
-       u32 rd_err_status_high;                         /* 0x0b8 */
-       u32 padding_11[2];                              /* [0x0bc..0x0c0] */
-       u32 rd_linked_list_err_en;                      /* 0x0c4 */
-       u32 padding_12;                                 /* 0x0c8 */
-       u32 rd_done_imwr_low;                           /* 0x0cc */
-       u32 rd_done_imwr_high;                          /* 0x0d0 */
-       u32 rd_abort_imwr_low;                          /* 0x0d4 */
-       u32 rd_abort_imwr_high;                         /* 0x0d8 */
-       u32 rd_ch01_imwr_data;                          /* 0x0dc */
-       u32 rd_ch23_imwr_data;                          /* 0x0e0 */
-       u32 rd_ch45_imwr_data;                          /* 0x0e4 */
-       u32 rd_ch67_imwr_data;                          /* 0x0e8 */
-       u32 padding_13[4];                              /* [0x0ec..0x0f8] */
+       u32 wr_int_status;                              /* 0x004c */
+       u32 padding_6;                                  /* 0x0050 */
+       u32 wr_int_mask;                                /* 0x0054 */
+       u32 wr_int_clear;                               /* 0x0058 */
+       u32 wr_err_status;                              /* 0x005c */
+       union {
+               u64 reg;                                /* 0x0060..0x0064 */
+               struct {
+                       u32 lsb;                        /* 0x0060 */
+                       u32 msb;                        /* 0x0064 */
+               };
+       } wr_done_imwr;
+       union {
+               u64 reg;                                /* 0x0068..0x006c */
+               struct {
+                       u32 lsb;                        /* 0x0068 */
+                       u32 msb;                        /* 0x006c */
+               };
+       } wr_abort_imwr;
+       u32 wr_ch01_imwr_data;                          /* 0x0070 */
+       u32 wr_ch23_imwr_data;                          /* 0x0074 */
+       u32 wr_ch45_imwr_data;                          /* 0x0078 */
+       u32 wr_ch67_imwr_data;                          /* 0x007c */
+       u32 padding_7[4];                               /* 0x0080..0x008c */
+       u32 wr_linked_list_err_en;                      /* 0x0090 */
+       u32 padding_8[3];                               /* 0x0094..0x009c */
+       u32 rd_int_status;                              /* 0x00a0 */
+       u32 padding_9;                                  /* 0x00a4 */
+       u32 rd_int_mask;                                /* 0x00a8 */
+       u32 rd_int_clear;                               /* 0x00ac */
+       u32 padding_10;                                 /* 0x00b0 */
+       union {
+               u64 reg;                                /* 0x00b4..0x00b8 */
+               struct {
+                       u32 lsb;                        /* 0x00b4 */
+                       u32 msb;                        /* 0x00b8 */
+               };
+       } rd_err_status;
+       u32 padding_11[2];                              /* 0x00bc..0x00c0 */
+       u32 rd_linked_list_err_en;                      /* 0x00c4 */
+       u32 padding_12;                                 /* 0x00c8 */
+       union {
+               u64 reg;                                /* 0x00cc..0x00d0 */
+               struct {
+                       u32 lsb;                        /* 0x00cc */
+                       u32 msb;                        /* 0x00d0 */
+               };
+       } rd_done_imwr;
+       union {
+               u64 reg;                                /* 0x00d4..0x00d8 */
+               struct {
+                       u32 lsb;                        /* 0x00d4 */
+                       u32 msb;                        /* 0x00d8 */
+               };
+       } rd_abort_imwr;
+       u32 rd_ch01_imwr_data;                          /* 0x00dc */
+       u32 rd_ch23_imwr_data;                          /* 0x00e0 */
+       u32 rd_ch45_imwr_data;                          /* 0x00e4 */
+       u32 rd_ch67_imwr_data;                          /* 0x00e8 */
+       u32 padding_13[4];                              /* 0x00ec..0x00f8 */
        /* eDMA channel context grouping */
        union dw_edma_v0_type {
-               struct dw_edma_v0_legacy legacy;        /* [0x0f8..0x120] */
-               struct dw_edma_v0_unroll unroll;        /* [0x0f8..0x1120] */
+               struct dw_edma_v0_legacy legacy;        /* 0x00fc..0x0120 */
+               struct dw_edma_v0_unroll unroll;        /* 0x00fc..0x1120 */
        } type;
-};
+} __packed;
 
 struct dw_edma_v0_lli {
        u32 control;
        u32 transfer_size;
-       u32 sar_low;
-       u32 sar_high;
-       u32 dar_low;
-       u32 dar_high;
-};
+       union {
+               u64 reg;
+               struct {
+                       u32 lsb;
+                       u32 msb;
+               };
+       } sar;
+       union {
+               u64 reg;
+               struct {
+                       u32 lsb;
+                       u32 msb;
+               };
+       } dar;
+} __packed;
 
 struct dw_edma_v0_llp {
        u32 control;
        u32 reserved;
-       u32 llp_low;
-       u32 llp_high;
-};
+       union {
+               u64 reg;
+               struct {
+                       u32 lsb;
+                       u32 msb;
+               };
+       } llp;
+} __packed;
 
 #endif /* _DW_EDMA_V0_REGS_H */
index 8978b89..6d11558 100644 (file)
@@ -1,2 +1,4 @@
 obj-$(CONFIG_INTEL_IDXD) += idxd.o
 idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o
+
+idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o
index 0db9b82..302cba5 100644 (file)
@@ -39,15 +39,15 @@ struct idxd_user_context {
        struct iommu_sva *sva;
 };
 
-enum idxd_cdev_cleanup {
-       CDEV_NORMAL = 0,
-       CDEV_FAILED,
-};
-
 static void idxd_cdev_dev_release(struct device *dev)
 {
-       dev_dbg(dev, "releasing cdev device\n");
-       kfree(dev);
+       struct idxd_cdev *idxd_cdev = container_of(dev, struct idxd_cdev, dev);
+       struct idxd_cdev_context *cdev_ctx;
+       struct idxd_wq *wq = idxd_cdev->wq;
+
+       cdev_ctx = &ictx[wq->idxd->data->type];
+       ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor);
+       kfree(idxd_cdev);
 }
 
 static struct device_type idxd_cdev_device_type = {
@@ -62,14 +62,11 @@ static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode)
        return container_of(cdev, struct idxd_cdev, cdev);
 }
 
-static inline struct idxd_wq *idxd_cdev_wq(struct idxd_cdev *idxd_cdev)
-{
-       return container_of(idxd_cdev, struct idxd_wq, idxd_cdev);
-}
-
 static inline struct idxd_wq *inode_wq(struct inode *inode)
 {
-       return idxd_cdev_wq(inode_idxd_cdev(inode));
+       struct idxd_cdev *idxd_cdev = inode_idxd_cdev(inode);
+
+       return idxd_cdev->wq;
 }
 
 static int idxd_cdev_open(struct inode *inode, struct file *filp)
@@ -220,11 +217,10 @@ static __poll_t idxd_cdev_poll(struct file *filp,
        struct idxd_user_context *ctx = filp->private_data;
        struct idxd_wq *wq = ctx->wq;
        struct idxd_device *idxd = wq->idxd;
-       struct idxd_cdev *idxd_cdev = &wq->idxd_cdev;
        unsigned long flags;
        __poll_t out = 0;
 
-       poll_wait(filp, &idxd_cdev->err_queue, wait);
+       poll_wait(filp, &wq->err_queue, wait);
        spin_lock_irqsave(&idxd->dev_lock, flags);
        if (idxd->sw_err.valid)
                out = EPOLLIN | EPOLLRDNORM;
@@ -243,101 +239,69 @@ static const struct file_operations idxd_cdev_fops = {
 
 int idxd_cdev_get_major(struct idxd_device *idxd)
 {
-       return MAJOR(ictx[idxd->type].devt);
+       return MAJOR(ictx[idxd->data->type].devt);
 }
 
-static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq)
+int idxd_wq_add_cdev(struct idxd_wq *wq)
 {
        struct idxd_device *idxd = wq->idxd;
-       struct idxd_cdev *idxd_cdev = &wq->idxd_cdev;
-       struct idxd_cdev_context *cdev_ctx;
+       struct idxd_cdev *idxd_cdev;
+       struct cdev *cdev;
        struct device *dev;
-       int minor, rc;
+       struct idxd_cdev_context *cdev_ctx;
+       int rc, minor;
 
-       idxd_cdev->dev = kzalloc(sizeof(*idxd_cdev->dev), GFP_KERNEL);
-       if (!idxd_cdev->dev)
+       idxd_cdev = kzalloc(sizeof(*idxd_cdev), GFP_KERNEL);
+       if (!idxd_cdev)
                return -ENOMEM;
 
-       dev = idxd_cdev->dev;
-       dev->parent = &idxd->pdev->dev;
-       dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd),
-                    idxd->id, wq->id);
-       dev->bus = idxd_get_bus_type(idxd);
-
-       cdev_ctx = &ictx[wq->idxd->type];
+       idxd_cdev->wq = wq;
+       cdev = &idxd_cdev->cdev;
+       dev = &idxd_cdev->dev;
+       cdev_ctx = &ictx[wq->idxd->data->type];
        minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL);
        if (minor < 0) {
-               rc = minor;
-               kfree(dev);
-               goto ida_err;
-       }
-
-       dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor);
-       dev->type = &idxd_cdev_device_type;
-       rc = device_register(dev);
-       if (rc < 0) {
-               dev_err(&idxd->pdev->dev, "device register failed\n");
-               goto dev_reg_err;
+               kfree(idxd_cdev);
+               return minor;
        }
        idxd_cdev->minor = minor;
 
-       return 0;
-
- dev_reg_err:
-       ida_simple_remove(&cdev_ctx->minor_ida, MINOR(dev->devt));
-       put_device(dev);
- ida_err:
-       idxd_cdev->dev = NULL;
-       return rc;
-}
-
-static void idxd_wq_cdev_cleanup(struct idxd_wq *wq,
-                                enum idxd_cdev_cleanup cdev_state)
-{
-       struct idxd_cdev *idxd_cdev = &wq->idxd_cdev;
-       struct idxd_cdev_context *cdev_ctx;
-
-       cdev_ctx = &ictx[wq->idxd->type];
-       if (cdev_state == CDEV_NORMAL)
-               cdev_del(&idxd_cdev->cdev);
-       device_unregister(idxd_cdev->dev);
-       /*
-        * The device_type->release() will be called on the device and free
-        * the allocated struct device. We can just forget it.
-        */
-       ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor);
-       idxd_cdev->dev = NULL;
-       idxd_cdev->minor = -1;
-}
-
-int idxd_wq_add_cdev(struct idxd_wq *wq)
-{
-       struct idxd_cdev *idxd_cdev = &wq->idxd_cdev;
-       struct cdev *cdev = &idxd_cdev->cdev;
-       struct device *dev;
-       int rc;
+       device_initialize(dev);
+       dev->parent = &wq->conf_dev;
+       dev->bus = &dsa_bus_type;
+       dev->type = &idxd_cdev_device_type;
+       dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor);
 
-       rc = idxd_wq_cdev_dev_setup(wq);
+       rc = dev_set_name(dev, "%s/wq%u.%u", idxd->data->name_prefix, idxd->id, wq->id);
        if (rc < 0)
-               return rc;
+               goto err;
 
-       dev = idxd_cdev->dev;
+       wq->idxd_cdev = idxd_cdev;
        cdev_init(cdev, &idxd_cdev_fops);
-       cdev_set_parent(cdev, &dev->kobj);
-       rc = cdev_add(cdev, dev->devt, 1);
+       rc = cdev_device_add(cdev, dev);
        if (rc) {
                dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc);
-               idxd_wq_cdev_cleanup(wq, CDEV_FAILED);
-               return rc;
+               goto err;
        }
 
-       init_waitqueue_head(&idxd_cdev->err_queue);
        return 0;
+
+ err:
+       put_device(dev);
+       wq->idxd_cdev = NULL;
+       return rc;
 }
 
 void idxd_wq_del_cdev(struct idxd_wq *wq)
 {
-       idxd_wq_cdev_cleanup(wq, CDEV_NORMAL);
+       struct idxd_cdev *idxd_cdev;
+       struct idxd_cdev_context *cdev_ctx;
+
+       cdev_ctx = &ictx[wq->idxd->data->type];
+       idxd_cdev = wq->idxd_cdev;
+       wq->idxd_cdev = NULL;
+       cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev);
+       put_device(&idxd_cdev->dev);
 }
 
 int idxd_cdev_register(void)
index 31c8195..420b93f 100644 (file)
@@ -19,7 +19,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
 /* Interrupt control bits */
 void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id)
 {
-       struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector);
+       struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector);
 
        pci_msi_mask_irq(data);
 }
@@ -36,7 +36,7 @@ void idxd_mask_msix_vectors(struct idxd_device *idxd)
 
 void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id)
 {
-       struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector);
+       struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector);
 
        pci_msi_unmask_irq(data);
 }
@@ -47,6 +47,7 @@ void idxd_unmask_error_interrupts(struct idxd_device *idxd)
 
        genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET);
        genctrl.softerr_int_en = 1;
+       genctrl.halt_int_en = 1;
        iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET);
 }
 
@@ -56,6 +57,7 @@ void idxd_mask_error_interrupts(struct idxd_device *idxd)
 
        genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET);
        genctrl.softerr_int_en = 0;
+       genctrl.halt_int_en = 0;
        iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET);
 }
 
@@ -144,14 +146,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq)
        if (rc < 0)
                return rc;
 
-       if (idxd->type == IDXD_TYPE_DSA)
-               align = 32;
-       else if (idxd->type == IDXD_TYPE_IAX)
-               align = 64;
-       else
-               return -ENODEV;
-
-       wq->compls_size = num_descs * idxd->compl_size + align;
+       align = idxd->data->align;
+       wq->compls_size = num_descs * idxd->data->compl_size + align;
        wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size,
                                            &wq->compls_addr_raw, GFP_KERNEL);
        if (!wq->compls_raw) {
@@ -178,16 +174,14 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq)
                struct idxd_desc *desc = wq->descs[i];
 
                desc->hw = wq->hw_descs[i];
-               if (idxd->type == IDXD_TYPE_DSA)
+               if (idxd->data->type == IDXD_TYPE_DSA)
                        desc->completion = &wq->compls[i];
-               else if (idxd->type == IDXD_TYPE_IAX)
+               else if (idxd->data->type == IDXD_TYPE_IAX)
                        desc->iax_completion = &wq->iax_compls[i];
-               desc->compl_dma = wq->compls_addr + idxd->compl_size * i;
+               desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i;
                desc->id = i;
                desc->wq = wq;
                desc->cpu = -1;
-               dma_async_tx_descriptor_init(&desc->txd, &wq->dma_chan);
-               desc->txd.tx_submit = idxd_dma_tx_submit;
        }
 
        return 0;
@@ -320,6 +314,19 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq)
        struct device *dev = &wq->idxd->pdev->dev;
 
        devm_iounmap(dev, wq->portal);
+       wq->portal = NULL;
+}
+
+void idxd_wqs_unmap_portal(struct idxd_device *idxd)
+{
+       int i;
+
+       for (i = 0; i < idxd->max_wqs; i++) {
+               struct idxd_wq *wq = idxd->wqs[i];
+
+               if (wq->portal)
+                       idxd_wq_unmap_portal(wq);
+       }
 }
 
 int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid)
@@ -392,6 +399,32 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq)
        memset(wq->name, 0, WQ_NAME_SIZE);
 }
 
+static void idxd_wq_ref_release(struct percpu_ref *ref)
+{
+       struct idxd_wq *wq = container_of(ref, struct idxd_wq, wq_active);
+
+       complete(&wq->wq_dead);
+}
+
+int idxd_wq_init_percpu_ref(struct idxd_wq *wq)
+{
+       int rc;
+
+       memset(&wq->wq_active, 0, sizeof(wq->wq_active));
+       rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, 0, GFP_KERNEL);
+       if (rc < 0)
+               return rc;
+       reinit_completion(&wq->wq_dead);
+       return 0;
+}
+
+void idxd_wq_quiesce(struct idxd_wq *wq)
+{
+       percpu_ref_kill(&wq->wq_active);
+       wait_for_completion(&wq->wq_dead);
+       percpu_ref_exit(&wq->wq_active);
+}
+
 /* Device control bits */
 static inline bool idxd_is_enabled(struct idxd_device *idxd)
 {
@@ -432,13 +465,13 @@ int idxd_device_init_reset(struct idxd_device *idxd)
        memset(&cmd, 0, sizeof(cmd));
        cmd.cmd = IDXD_CMD_RESET_DEVICE;
        dev_dbg(dev, "%s: sending reset for init.\n", __func__);
-       spin_lock_irqsave(&idxd->dev_lock, flags);
+       spin_lock_irqsave(&idxd->cmd_lock, flags);
        iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET);
 
        while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) &
               IDXD_CMDSTS_ACTIVE)
                cpu_relax();
-       spin_unlock_irqrestore(&idxd->dev_lock, flags);
+       spin_unlock_irqrestore(&idxd->cmd_lock, flags);
        return 0;
 }
 
@@ -451,7 +484,8 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
 
        if (idxd_device_is_halted(idxd)) {
                dev_warn(&idxd->pdev->dev, "Device is HALTED!\n");
-               *status = IDXD_CMDSTS_HW_ERR;
+               if (status)
+                       *status = IDXD_CMDSTS_HW_ERR;
                return;
        }
 
@@ -460,10 +494,10 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
        cmd.operand = operand;
        cmd.int_req = 1;
 
-       spin_lock_irqsave(&idxd->dev_lock, flags);
+       spin_lock_irqsave(&idxd->cmd_lock, flags);
        wait_event_lock_irq(idxd->cmd_waitq,
                            !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags),
-                           idxd->dev_lock);
+                           idxd->cmd_lock);
 
        dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n",
                __func__, cmd_code, operand);
@@ -477,9 +511,9 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
         * After command submitted, release lock and go to sleep until
         * the command completes via interrupt.
         */
-       spin_unlock_irqrestore(&idxd->dev_lock, flags);
+       spin_unlock_irqrestore(&idxd->cmd_lock, flags);
        wait_for_completion(&done);
-       spin_lock_irqsave(&idxd->dev_lock, flags);
+       spin_lock_irqsave(&idxd->cmd_lock, flags);
        if (status) {
                *status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET);
                idxd->cmd_status = *status & GENMASK(7, 0);
@@ -488,7 +522,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
        __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags);
        /* Wake up other pending commands */
        wake_up(&idxd->cmd_waitq);
-       spin_unlock_irqrestore(&idxd->dev_lock, flags);
+       spin_unlock_irqrestore(&idxd->cmd_lock, flags);
 }
 
 int idxd_device_enable(struct idxd_device *idxd)
@@ -521,7 +555,7 @@ void idxd_device_wqs_clear_state(struct idxd_device *idxd)
        lockdep_assert_held(&idxd->dev_lock);
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                if (wq->state == IDXD_WQ_ENABLED) {
                        idxd_wq_disable_cleanup(wq);
@@ -579,6 +613,77 @@ void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid)
        dev_dbg(dev, "pasid %d drained\n", pasid);
 }
 
+int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle,
+                                  enum idxd_interrupt_type irq_type)
+{ /* Request an interrupt handle for index idx; returns 0 with *handle set, or a negative errno. */
+       struct device *dev = &idxd->pdev->dev;
+       u32 operand, status;
+
+       if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE))) /* command bit not set in cmd_cap */
+               return -EOPNOTSUPP;
+
+       dev_dbg(dev, "get int handle, idx %d\n", idx);
+
+       operand = idx & GENMASK(15, 0); /* only the low 16 bits of idx go into the operand */
+       if (irq_type == IDXD_IRQ_IMS)
+               operand |= CMD_INT_HANDLE_IMS; /* nothing is OR-ed in for any other irq_type */
+
+       dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_REQUEST_INT_HANDLE, operand);
+
+       idxd_cmd_exec(idxd, IDXD_CMD_REQUEST_INT_HANDLE, operand, &status); /* command status comes back through &status */
+
+       if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) {
+               dev_dbg(dev, "request int handle failed: %#x\n", status);
+               return -ENXIO; /* *handle is not written on this path */
+       }
+
+       *handle = (status >> IDXD_CMDSTS_RES_SHIFT) & GENMASK(15, 0); /* handle = 16-bit field of status above IDXD_CMDSTS_RES_SHIFT */
+
+       dev_dbg(dev, "int handle acquired: %u\n", *handle);
+       return 0;
+}
+
+int idxd_device_release_int_handle(struct idxd_device *idxd, int handle,
+                                  enum idxd_interrupt_type irq_type)
+{ /* Release interrupt handle `handle`; returns 0 on success or a negative errno. */
+       struct device *dev = &idxd->pdev->dev;
+       u32 operand, status;
+       union idxd_command_reg cmd;
+       unsigned long flags;
+
+       if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE))) /* command bit not set in cmd_cap */
+               return -EOPNOTSUPP;
+
+       dev_dbg(dev, "release int handle, handle %d\n", handle);
+
+       memset(&cmd, 0, sizeof(cmd)); /* every field not assigned below stays 0 */
+       operand = handle & GENMASK(15, 0); /* only the low 16 bits of handle are sent */
+
+       if (irq_type == IDXD_IRQ_IMS)
+               operand |= CMD_INT_HANDLE_IMS;
+
+       cmd.cmd = IDXD_CMD_RELEASE_INT_HANDLE;
+       cmd.operand = operand;
+
+       dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand);
+
+       spin_lock_irqsave(&idxd->cmd_lock, flags); /* NOTE(review): written without the IDXD_FLAG_CMD_RUNNING wait idxd_cmd_exec() does — confirm no command can be in flight */
+       iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET);
+
+       while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) /* busy-poll under the lock; NOTE(review): no timeout — confirm the device always clears ACTIVE */
+               cpu_relax();
+       status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); /* final status, read once ACTIVE has cleared */
+       spin_unlock_irqrestore(&idxd->cmd_lock, flags);
+
+       if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) {
+               dev_dbg(dev, "release int handle failed: %#x\n", status);
+               return -ENXIO;
+       }
+
+       dev_dbg(dev, "int handle released.\n");
+       return 0;
+}
+
 /* Device configuration bits */
 void idxd_msix_perm_setup(struct idxd_device *idxd)
 {
@@ -660,7 +765,7 @@ static int idxd_groups_config_write(struct idxd_device *idxd)
                ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET));
 
        for (i = 0; i < idxd->max_groups; i++) {
-               struct idxd_group *group = &idxd->groups[i];
+               struct idxd_group *group = idxd->groups[i];
 
                idxd_group_config_write(group);
        }
@@ -739,7 +844,7 @@ static int idxd_wqs_config_write(struct idxd_device *idxd)
        int i, rc;
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                rc = idxd_wq_config_write(wq);
                if (rc < 0)
@@ -755,7 +860,7 @@ static void idxd_group_flags_setup(struct idxd_device *idxd)
 
        /* TC-A 0 and TC-B 1 should be defaults */
        for (i = 0; i < idxd->max_groups; i++) {
-               struct idxd_group *group = &idxd->groups[i];
+               struct idxd_group *group = idxd->groups[i];
 
                if (group->tc_a == -1)
                        group->tc_a = group->grpcfg.flags.tc_a = 0;
@@ -782,12 +887,12 @@ static int idxd_engines_setup(struct idxd_device *idxd)
        struct idxd_group *group;
 
        for (i = 0; i < idxd->max_groups; i++) {
-               group = &idxd->groups[i];
+               group = idxd->groups[i];
                group->grpcfg.engines = 0;
        }
 
        for (i = 0; i < idxd->max_engines; i++) {
-               eng = &idxd->engines[i];
+               eng = idxd->engines[i];
                group = eng->group;
 
                if (!group)
@@ -811,13 +916,13 @@ static int idxd_wqs_setup(struct idxd_device *idxd)
        struct device *dev = &idxd->pdev->dev;
 
        for (i = 0; i < idxd->max_groups; i++) {
-               group = &idxd->groups[i];
+               group = idxd->groups[i];
                for (j = 0; j < 4; j++)
                        group->grpcfg.wqs[j] = 0;
        }
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               wq = &idxd->wqs[i];
+               wq = idxd->wqs[i];
                group = wq->group;
 
                if (!wq->group)
@@ -865,3 +970,119 @@ int idxd_device_config(struct idxd_device *idxd)
 
        return 0;
 }
+
+static int idxd_wq_load_config(struct idxd_wq *wq)
+{ /* Read this wq's WQCFG registers into wq->wqcfg and mirror selected fields into the wq; 0 or -EOPNOTSUPP. */
+       struct idxd_device *idxd = wq->idxd;
+       struct device *dev = &idxd->pdev->dev;
+       int wqcfg_offset;
+       int i;
+
+       wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, 0);
+       memcpy_fromio(wq->wqcfg, idxd->reg_base + wqcfg_offset, idxd->wqcfg_size); /* copies wqcfg_size bytes starting at stride 0 */
+
+       wq->size = wq->wqcfg->wq_size;
+       wq->threshold = wq->wqcfg->wq_thresh;
+       if (wq->wqcfg->priv)
+               wq->type = IDXD_WQT_KERNEL; /* type is left unchanged when priv is clear */
+
+       /* The driver does not support shared WQ mode in read-only config yet */
+       if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) /* NOTE(review): size/threshold/type above are already updated when this fails */
+               return -EOPNOTSUPP;
+
+       set_bit(WQ_FLAG_DEDICATED, &wq->flags);
+
+       wq->priority = wq->wqcfg->priority;
+
+       for (i = 0; i < WQCFG_STRIDES(idxd); i++) { /* debug dump only: prints the copy already held in wq->wqcfg */
+               wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i);
+               dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]);
+       }
+
+       return 0;
+}
+
+static void idxd_group_load_config(struct idxd_group *group)
+{ /* Read the group's WQ, engine and flags registers into group->grpcfg and point member wqs/engines at the group. */
+       struct idxd_device *idxd = group->idxd;
+       struct device *dev = &idxd->pdev->dev;
+       int i, j, grpcfg_offset;
+
+       /*
+        * Load WQS bit fields
+        * Iterate through all 256 bits 64 bits at a time
+        */
+       for (i = 0; i < GRPWQCFG_STRIDES; i++) {
+               struct idxd_wq *wq;
+
+               grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i);
+               group->grpcfg.wqs[i] = ioread64(idxd->reg_base + grpcfg_offset);
+               dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n",
+                       group->id, i, grpcfg_offset, group->grpcfg.wqs[i]);
+
+               if (i * 64 >= idxd->max_wqs) /* register was still read and stored above; later strides are not read */
+                       break;
+
+               /* Iterate through all 64 bits and check for wq set */
+               for (j = 0; j < 64; j++) {
+                       int id = i * 64 + j; /* wq index covered by bit j of stride i */
+
+                       /* No need to check beyond max wqs */
+                       if (id >= idxd->max_wqs)
+                               break;
+
+                       /* Set group assignment for wq if wq bit is set */
+                       if (group->grpcfg.wqs[i] & BIT(j)) { /* NOTE(review): j reaches 63 — confirm BIT() is 64 bits wide on every supported build */
+                               wq = idxd->wqs[id];
+                               wq->group = group;
+                       }
+               }
+       }
+
+       grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id);
+       group->grpcfg.engines = ioread64(idxd->reg_base + grpcfg_offset);
+       dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id,
+               grpcfg_offset, group->grpcfg.engines);
+
+       /* Iterate through all 64 bits to check engines set */
+       for (i = 0; i < 64; i++) {
+               if (i >= idxd->max_engines) /* bits at or above max_engines are ignored */
+                       break;
+
+               if (group->grpcfg.engines & BIT(i)) {
+                       struct idxd_engine *engine = idxd->engines[i];
+
+                       engine->group = group;
+               }
+       }
+
+       grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id);
+       group->grpcfg.flags.bits = ioread32(idxd->reg_base + grpcfg_offset); /* flags are stored as read; nothing else is derived from them here */
+       dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n",
+               group->id, grpcfg_offset, group->grpcfg.flags.bits);
+}
+
+int idxd_device_load_config(struct idxd_device *idxd)
+{ /* Load token_limit, then every group's and every wq's configuration from the device registers; 0 or the first wq error. */
+       union gencfg_reg reg;
+       int i, rc;
+
+       reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET);
+       idxd->token_limit = reg.token_limit;
+
+       for (i = 0; i < idxd->max_groups; i++) { /* groups first: this pass sets wq->group and engine->group */
+               struct idxd_group *group = idxd->groups[i];
+
+               idxd_group_load_config(group);
+       }
+
+       for (i = 0; i < idxd->max_wqs; i++) {
+               struct idxd_wq *wq = idxd->wqs[i];
+
+               rc = idxd_wq_load_config(wq);
+               if (rc < 0)
+                       return rc; /* stops at the first failing wq; state loaded so far is kept */
+       }
+
+       return 0;
+}
index a15e501..77439b6 100644 (file)
 
 static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c)
 {
-       return container_of(c, struct idxd_wq, dma_chan);
+       struct idxd_dma_chan *idxd_chan; /* wrapper that embeds the dma_chan */
+
+       idxd_chan = container_of(c, struct idxd_dma_chan, chan); /* c must be the chan member of a struct idxd_dma_chan */
+       return idxd_chan->wq; /* wq back-pointer, assigned in idxd_register_dma_channel() */
 }
 
 void idxd_dma_complete_txd(struct idxd_desc *desc,
@@ -135,7 +138,7 @@ static void idxd_dma_issue_pending(struct dma_chan *dma_chan)
 {
 }
 
-dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx)
+static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
        struct dma_chan *c = tx->chan;
        struct idxd_wq *wq = to_idxd_wq(c);
@@ -156,14 +159,25 @@ dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 
 static void idxd_dma_release(struct dma_device *device)
 {
+       struct idxd_dma_dev *idxd_dma = container_of(device, struct idxd_dma_dev, dma); /* device must be the dma member of a struct idxd_dma_dev */
+
+       kfree(idxd_dma); /* frees the wrapper allocated in idxd_register_dma_device() */
 }
 
 int idxd_register_dma_device(struct idxd_device *idxd)
 {
-       struct dma_device *dma = &idxd->dma_dev;
+       struct idxd_dma_dev *idxd_dma;
+       struct dma_device *dma;
+       struct device *dev = &idxd->pdev->dev;
+       int rc;
 
+       idxd_dma = kzalloc_node(sizeof(*idxd_dma), GFP_KERNEL, dev_to_node(dev));
+       if (!idxd_dma)
+               return -ENOMEM;
+
+       dma = &idxd_dma->dma;
        INIT_LIST_HEAD(&dma->channels);
-       dma->dev = &idxd->pdev->dev;
+       dma->dev = dev;
 
        dma_cap_set(DMA_PRIVATE, dma->cap_mask);
        dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask);
@@ -179,35 +193,72 @@ int idxd_register_dma_device(struct idxd_device *idxd)
        dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources;
        dma->device_free_chan_resources = idxd_dma_free_chan_resources;
 
-       return dma_async_device_register(&idxd->dma_dev);
+       rc = dma_async_device_register(dma);
+       if (rc < 0) {
+               kfree(idxd_dma);
+               return rc;
+       }
+
+       idxd_dma->idxd = idxd;
+       /*
+        * This pointer is protected by the refs taken by the dma_chan. It will remain valid
+        * as long as there are outstanding channels.
+        */
+       idxd->idxd_dma = idxd_dma;
+       return 0;
 }
 
 void idxd_unregister_dma_device(struct idxd_device *idxd)
 {
-       dma_async_device_unregister(&idxd->dma_dev);
+       dma_async_device_unregister(&idxd->idxd_dma->dma); /* idxd->idxd_dma is neither freed nor cleared here */
 }
 
 int idxd_register_dma_channel(struct idxd_wq *wq)
 {
        struct idxd_device *idxd = wq->idxd;
-       struct dma_device *dma = &idxd->dma_dev;
-       struct dma_chan *chan = &wq->dma_chan;
-       int rc;
+       struct dma_device *dma = &idxd->idxd_dma->dma; /* dereferences idxd->idxd_dma, which idxd_register_dma_device() assigns */
+       struct device *dev = &idxd->pdev->dev;
+       struct idxd_dma_chan *idxd_chan;
+       struct dma_chan *chan;
+       int rc, i;
+
+       idxd_chan = kzalloc_node(sizeof(*idxd_chan), GFP_KERNEL, dev_to_node(dev)); /* zeroed channel wrapper, allocated on the device's node */
+       if (!idxd_chan)
+               return -ENOMEM;
 
-       memset(&wq->dma_chan, 0, sizeof(struct dma_chan));
+       chan = &idxd_chan->chan;
        chan->device = dma;
        list_add_tail(&chan->device_node, &dma->channels);
+
+       for (i = 0; i < wq->num_descs; i++) { /* point every wq descriptor at the new channel */
+               struct idxd_desc *desc = wq->descs[i];
+
+               dma_async_tx_descriptor_init(&desc->txd, chan);
+               desc->txd.tx_submit = idxd_dma_tx_submit;
+       }
+
        rc = dma_async_device_channel_register(dma, chan);
-       if (rc < 0)
+       if (rc < 0) {
+               kfree(idxd_chan); /* NOTE(review): chan is still linked on dma->channels and was passed to the descriptor init above — confirm a list_del() is not needed first */
                return rc;
+       }
+
+       wq->idxd_chan = idxd_chan;
+       idxd_chan->wq = wq;
+       get_device(&wq->conf_dev); /* paired with put_device() in idxd_unregister_dma_channel() */
 
        return 0;
 }
 
 void idxd_unregister_dma_channel(struct idxd_wq *wq)
 {
-       struct dma_chan *chan = &wq->dma_chan;
+       struct idxd_dma_chan *idxd_chan = wq->idxd_chan; /* wrapper stored by idxd_register_dma_channel() */
+       struct dma_chan *chan = &idxd_chan->chan;
+       struct idxd_dma_dev *idxd_dma = wq->idxd->idxd_dma;
 
-       dma_async_device_channel_unregister(&wq->idxd->dma_dev, chan);
+       dma_async_device_channel_unregister(&idxd_dma->dma, chan);
        list_del(&chan->device_node);
+       kfree(wq->idxd_chan); /* same object as idxd_chan above */
+       wq->idxd_chan = NULL;
+       put_device(&wq->conf_dev); /* drops the reference taken in idxd_register_dma_channel() */
 }
index 76014c1..26482c7 100644 (file)
@@ -8,12 +8,18 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/wait.h>
 #include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
 #include "registers.h"
 
 #define IDXD_DRIVER_VERSION    "1.00"
 
 extern struct kmem_cache *idxd_desc_pool;
 
+struct idxd_device;
+struct idxd_wq;
+
 #define IDXD_REG_TIMEOUT       50
 #define IDXD_DRAIN_TIMEOUT     5000
 
@@ -25,6 +31,7 @@ enum idxd_type {
 };
 
 #define IDXD_NAME_SIZE         128
+#define IDXD_PMU_EVENT_MAX     64
 
 struct idxd_device_driver {
        struct device_driver drv;
@@ -33,6 +40,7 @@ struct idxd_device_driver {
 struct idxd_irq_entry {
        struct idxd_device *idxd;
        int id;
+       int vector;
        struct llist_head pending_llist;
        struct list_head work_list;
        /*
@@ -56,6 +64,31 @@ struct idxd_group {
        int tc_b;
 };
 
+struct idxd_pmu { /* Perfmon state for one device; struct idxd_device holds a pointer to it (idxd_pmu). */
+       struct idxd_device *idxd;
+
+       struct perf_event *event_list[IDXD_PMU_EVENT_MAX]; /* capacity fixed by IDXD_PMU_EVENT_MAX */
+       int n_events;
+
+       DECLARE_BITMAP(used_mask, IDXD_PMU_EVENT_MAX); /* one bit per event_list slot — presumably marks slots in use; confirm in the perfmon code */
+
+       struct pmu pmu;
+       char name[IDXD_NAME_SIZE];
+       int cpu;
+
+       int n_counters;
+       int counter_width;
+       int n_event_categories;
+
+       bool per_counter_caps_supported;
+       unsigned long supported_event_categories;
+
+       unsigned long supported_filters;
+       int n_filters;
+
+       struct hlist_node cpuhp_node; /* presumably the CPU-hotplug instance node — confirm against the perfmon code */
+};
+
 #define IDXD_MAX_PRIORITY      0xf
 
 enum idxd_wq_state {
@@ -75,10 +108,10 @@ enum idxd_wq_type {
 };
 
 struct idxd_cdev {
+       struct idxd_wq *wq; /* pointer to a wq; struct idxd_wq in turn holds a pointer to its idxd_cdev */
        struct cdev cdev;
-       struct device *dev;
+       struct device dev; /* now embedded in the structure instead of referenced by pointer */
        int minor;
-       struct wait_queue_head err_queue;
 };
 
 #define IDXD_ALLOCATED_BATCH_SIZE      128U
@@ -96,10 +129,18 @@ enum idxd_complete_type {
        IDXD_COMPLETE_DEV_FAIL,
 };
 
+struct idxd_dma_chan { /* Separately allocated dma_chan wrapper; struct idxd_wq points to it through idxd_chan. */
+       struct dma_chan chan; /* to_idxd_wq() recovers the wrapper from this member via container_of() */
+       struct idxd_wq *wq; /* the wq this channel was registered for */
+};
+
 struct idxd_wq {
        void __iomem *portal;
+       struct percpu_ref wq_active;
+       struct completion wq_dead;
        struct device conf_dev;
-       struct idxd_cdev idxd_cdev;
+       struct idxd_cdev *idxd_cdev;
+       struct wait_queue_head err_queue;
        struct idxd_device *idxd;
        int id;
        enum idxd_wq_type type;
@@ -125,7 +166,7 @@ struct idxd_wq {
        int compls_size;
        struct idxd_desc **descs;
        struct sbitmap_queue sbq;
-       struct dma_chan dma_chan;
+       struct idxd_dma_chan *idxd_chan;
        char name[WQ_NAME_SIZE + 1];
        u64 max_xfer_bytes;
        u32 max_batch_size;
@@ -147,6 +188,7 @@ struct idxd_hw {
        union group_cap_reg group_cap;
        union engine_cap_reg engine_cap;
        struct opcap opcap;
+       u32 cmd_cap;
 };
 
 enum idxd_device_state {
@@ -162,9 +204,22 @@ enum idxd_device_flag {
        IDXD_FLAG_PASID_ENABLED,
 };
 
-struct idxd_device {
+struct idxd_dma_dev { /* Separately allocated dma_device wrapper; struct idxd_device points to it through idxd_dma. */
+       struct idxd_device *idxd; /* back-pointer, set after dma_async_device_register() succeeds */
+       struct dma_device dma; /* idxd_dma_release() recovers the wrapper from this member via container_of() */
+};
+
+struct idxd_driver_data { /* Per-device-type constants; struct idxd_device points to one through its data member. */
+       const char *name_prefix; /* "dsa" or "iax" in the idxd_driver_data[] table */
        enum idxd_type type;
+       struct device_type *dev_type;
+       int compl_size; /* sizeof() the type's completion record in the idxd_driver_data[] table */
+       int align; /* 32 for DSA, 64 for IAX in that table — presumably completion record alignment; confirm at the use site */
+};
+
+struct idxd_device {
        struct device conf_dev;
+       struct idxd_driver_data *data;
        struct list_head list;
        struct idxd_hw hw;
        enum idxd_device_state state;
@@ -177,10 +232,11 @@ struct idxd_device {
        void __iomem *reg_base;
 
        spinlock_t dev_lock;    /* spinlock for device */
+       spinlock_t cmd_lock;    /* spinlock for device commands */
        struct completion *cmd_done;
-       struct idxd_group *groups;
-       struct idxd_wq *wqs;
-       struct idxd_engine *engines;
+       struct idxd_group **groups;
+       struct idxd_wq **wqs;
+       struct idxd_engine **engines;
 
        struct iommu_sva *sva;
        unsigned int pasid;
@@ -202,17 +258,19 @@ struct idxd_device {
        int token_limit;
        int nr_tokens;          /* non-reserved tokens */
        unsigned int wqcfg_size;
-       int compl_size;
 
        union sw_err_reg sw_err;
        wait_queue_head_t cmd_waitq;
-       struct msix_entry *msix_entries;
        int num_wq_irqs;
        struct idxd_irq_entry *irq_entries;
 
-       struct dma_device dma_dev;
+       struct idxd_dma_dev *idxd_dma;
        struct workqueue_struct *wq;
        struct work_struct work;
+
+       int *int_handles;
+
+       struct idxd_pmu *idxd_pmu;
 };
 
 /* IDXD software descriptor */
@@ -232,6 +290,7 @@ struct idxd_desc {
        struct list_head list;
        int id;
        int cpu;
+       unsigned int vector;
        struct idxd_wq *wq;
 };
 
@@ -242,6 +301,44 @@ extern struct bus_type dsa_bus_type;
 extern struct bus_type iax_bus_type;
 
 extern bool support_enqcmd;
+extern struct ida idxd_ida;
+extern struct device_type dsa_device_type;
+extern struct device_type iax_device_type;
+extern struct device_type idxd_wq_device_type;
+extern struct device_type idxd_engine_device_type;
+extern struct device_type idxd_group_device_type;
+
+static inline bool is_dsa_dev(struct device *dev)
+{
+       return dev->type == &dsa_device_type; /* identity check on the device_type pointer */
+}
+
+static inline bool is_iax_dev(struct device *dev)
+{
+       return dev->type == &iax_device_type; /* identity check on the device_type pointer */
+}
+
+static inline bool is_idxd_dev(struct device *dev)
+{
+       return is_dsa_dev(dev) || is_iax_dev(dev); /* true for either of the two device types */
+}
+
+static inline bool is_idxd_wq_dev(struct device *dev)
+{
+       return dev->type == &idxd_wq_device_type; /* identity check on the device_type pointer */
+}
+
+static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq)
+{
+       if (wq->type == IDXD_WQT_KERNEL && strcmp(wq->name, "dmaengine") == 0) /* kernel-type wq whose name is exactly "dmaengine" */
+               return true;
+       return false;
+}
+
+static inline bool is_idxd_wq_cdev(struct idxd_wq *wq)
+{
+       return wq->type == IDXD_WQT_USER; /* decided by wq type alone; the name is not checked */
+}
 
 static inline bool wq_dedicated(struct idxd_wq *wq)
 {
@@ -268,6 +365,11 @@ enum idxd_portal_prot {
        IDXD_PORTAL_LIMITED,
 };
 
+enum idxd_interrupt_type { /* Selects the interrupt type for the int-handle request/release commands. */
+       IDXD_IRQ_MSIX = 0,
+       IDXD_IRQ_IMS, /* the int-handle helpers OR CMD_INT_HANDLE_IMS into the operand for this value */
+};
+
 static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot)
 {
        return prot * 0x1000;
@@ -279,18 +381,6 @@ static inline int idxd_get_wq_portal_full_offset(int wq_id,
        return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot);
 }
 
-static inline void idxd_set_type(struct idxd_device *idxd)
-{
-       struct pci_dev *pdev = idxd->pdev;
-
-       if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0)
-               idxd->type = IDXD_TYPE_DSA;
-       else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0)
-               idxd->type = IDXD_TYPE_IAX;
-       else
-               idxd->type = IDXD_TYPE_UNKNOWN;
-}
-
 static inline void idxd_wq_get(struct idxd_wq *wq)
 {
        wq->client_count++;
@@ -306,19 +396,17 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq)
        return wq->client_count;
 };
 
-const char *idxd_get_dev_name(struct idxd_device *idxd);
 int idxd_register_bus_type(void);
 void idxd_unregister_bus_type(void);
-int idxd_setup_sysfs(struct idxd_device *idxd);
-void idxd_cleanup_sysfs(struct idxd_device *idxd);
+int idxd_register_devices(struct idxd_device *idxd);
+void idxd_unregister_devices(struct idxd_device *idxd);
 int idxd_register_driver(void);
 void idxd_unregister_driver(void);
-struct bus_type *idxd_get_bus_type(struct idxd_device *idxd);
+void idxd_wqs_quiesce(struct idxd_device *idxd);
 
 /* device interrupt control */
 void idxd_msix_perm_setup(struct idxd_device *idxd);
 void idxd_msix_perm_clear(struct idxd_device *idxd);
-irqreturn_t idxd_irq_handler(int vec, void *data);
 irqreturn_t idxd_misc_thread(int vec, void *data);
 irqreturn_t idxd_wq_thread(int irq, void *data);
 void idxd_mask_error_interrupts(struct idxd_device *idxd);
@@ -336,8 +424,14 @@ void idxd_device_cleanup(struct idxd_device *idxd);
 int idxd_device_config(struct idxd_device *idxd);
 void idxd_device_wqs_clear_state(struct idxd_device *idxd);
 void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid);
+int idxd_device_load_config(struct idxd_device *idxd);
+int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle,
+                                  enum idxd_interrupt_type irq_type);
+int idxd_device_release_int_handle(struct idxd_device *idxd, int handle,
+                                  enum idxd_interrupt_type irq_type);
 
 /* work queue control */
+void idxd_wqs_unmap_portal(struct idxd_device *idxd);
 int idxd_wq_alloc_resources(struct idxd_wq *wq);
 void idxd_wq_free_resources(struct idxd_wq *wq);
 int idxd_wq_enable(struct idxd_wq *wq);
@@ -349,6 +443,8 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq);
 void idxd_wq_disable_cleanup(struct idxd_wq *wq);
 int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid);
 int idxd_wq_disable_pasid(struct idxd_wq *wq);
+void idxd_wq_quiesce(struct idxd_wq *wq);
+int idxd_wq_init_percpu_ref(struct idxd_wq *wq);
 
 /* submission */
 int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc);
@@ -363,7 +459,6 @@ void idxd_unregister_dma_channel(struct idxd_wq *wq);
 void idxd_parse_completion_status(u8 status, enum dmaengine_tx_result *res);
 void idxd_dma_complete_txd(struct idxd_desc *desc,
                           enum idxd_complete_type comp_type);
-dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx);
 
 /* cdev */
 int idxd_cdev_register(void);
@@ -372,4 +467,19 @@ int idxd_cdev_get_major(struct idxd_device *idxd);
 int idxd_wq_add_cdev(struct idxd_wq *wq);
 void idxd_wq_del_cdev(struct idxd_wq *wq);
 
+/* perfmon */
+#if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON) /* real implementations are only declared here */
+int perfmon_pmu_init(struct idxd_device *idxd);
+void perfmon_pmu_remove(struct idxd_device *idxd);
+void perfmon_counter_overflow(struct idxd_device *idxd);
+void perfmon_init(void);
+void perfmon_exit(void);
+#else /* perfmon disabled: inline stubs so callers need no #ifdef */
+static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } /* stub reports success */
+static inline void perfmon_pmu_remove(struct idxd_device *idxd) {}
+static inline void perfmon_counter_overflow(struct idxd_device *idxd) {}
+static inline void perfmon_init(void) {}
+static inline void perfmon_exit(void) {}
+#endif /* CONFIG_INTEL_IDXD_PERFMON */
+
 #endif
index 6584b0e..2a926be 100644 (file)
@@ -21,6 +21,7 @@
 #include "../dmaengine.h"
 #include "registers.h"
 #include "idxd.h"
+#include "perfmon.h"
 
 MODULE_VERSION(IDXD_DRIVER_VERSION);
 MODULE_LICENSE("GPL v2");
@@ -33,35 +34,39 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off");
 #define DRV_NAME "idxd"
 
 bool support_enqcmd;
-
-static struct idr idxd_idrs[IDXD_TYPE_MAX];
-static DEFINE_MUTEX(idxd_idr_lock);
+DEFINE_IDA(idxd_ida);
+
+static struct idxd_driver_data idxd_driver_data[] = { /* indexed by enum idxd_type; entries are referenced from idxd_pci_tbl */
+       [IDXD_TYPE_DSA] = {
+               .name_prefix = "dsa",
+               .type = IDXD_TYPE_DSA,
+               .compl_size = sizeof(struct dsa_completion_record),
+               .align = 32,
+               .dev_type = &dsa_device_type,
+       },
+       [IDXD_TYPE_IAX] = {
+               .name_prefix = "iax",
+               .type = IDXD_TYPE_IAX,
+               .compl_size = sizeof(struct iax_completion_record),
+               .align = 64, /* differs from the DSA entry, which uses 32 */
+               .dev_type = &iax_device_type,
+       },
+};
 
 static struct pci_device_id idxd_pci_tbl[] = {
        /* DSA ver 1.0 platforms */
-       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) },
+       { PCI_DEVICE_DATA(INTEL, DSA_SPR0, &idxd_driver_data[IDXD_TYPE_DSA]) },
 
        /* IAX ver 1.0 platforms */
-       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IAX_SPR0) },
+       { PCI_DEVICE_DATA(INTEL, IAX_SPR0, &idxd_driver_data[IDXD_TYPE_IAX]) },
        { 0, }
 };
 MODULE_DEVICE_TABLE(pci, idxd_pci_tbl);
 
-static char *idxd_name[] = {
-       "dsa",
-       "iax"
-};
-
-const char *idxd_get_dev_name(struct idxd_device *idxd)
-{
-       return idxd_name[idxd->type];
-}
-
 static int idxd_setup_interrupts(struct idxd_device *idxd)
 {
        struct pci_dev *pdev = idxd->pdev;
        struct device *dev = &pdev->dev;
-       struct msix_entry *msix;
        struct idxd_irq_entry *irq_entry;
        int i, msixcnt;
        int rc = 0;
@@ -69,23 +74,13 @@ static int idxd_setup_interrupts(struct idxd_device *idxd)
        msixcnt = pci_msix_vec_count(pdev);
        if (msixcnt < 0) {
                dev_err(dev, "Not MSI-X interrupt capable.\n");
-               goto err_no_irq;
+               return -ENOSPC;
        }
 
-       idxd->msix_entries = devm_kzalloc(dev, sizeof(struct msix_entry) *
-                       msixcnt, GFP_KERNEL);
-       if (!idxd->msix_entries) {
-               rc = -ENOMEM;
-               goto err_no_irq;
-       }
-
-       for (i = 0; i < msixcnt; i++)
-               idxd->msix_entries[i].entry = i;
-
-       rc = pci_enable_msix_exact(pdev, idxd->msix_entries, msixcnt);
-       if (rc) {
-               dev_err(dev, "Failed enabling %d MSIX entries.\n", msixcnt);
-               goto err_no_irq;
+       rc = pci_alloc_irq_vectors(pdev, msixcnt, msixcnt, PCI_IRQ_MSIX);
+       if (rc != msixcnt) {
+               dev_err(dev, "Failed enabling %d MSIX entries: %d\n", msixcnt, rc);
+               return -ENOSPC;
        }
        dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt);
 
@@ -93,119 +88,266 @@ static int idxd_setup_interrupts(struct idxd_device *idxd)
         * We implement 1 completion list per MSI-X entry except for
         * entry 0, which is for errors and others.
         */
-       idxd->irq_entries = devm_kcalloc(dev, msixcnt,
-                                        sizeof(struct idxd_irq_entry),
-                                        GFP_KERNEL);
+       idxd->irq_entries = kcalloc_node(msixcnt, sizeof(struct idxd_irq_entry),
+                                        GFP_KERNEL, dev_to_node(dev));
        if (!idxd->irq_entries) {
                rc = -ENOMEM;
-               goto err_no_irq;
+               goto err_irq_entries;
        }
 
        for (i = 0; i < msixcnt; i++) {
                idxd->irq_entries[i].id = i;
                idxd->irq_entries[i].idxd = idxd;
+               idxd->irq_entries[i].vector = pci_irq_vector(pdev, i);
                spin_lock_init(&idxd->irq_entries[i].list_lock);
        }
 
-       msix = &idxd->msix_entries[0];
        irq_entry = &idxd->irq_entries[0];
-       rc = devm_request_threaded_irq(dev, msix->vector, idxd_irq_handler,
-                                      idxd_misc_thread, 0, "idxd-misc",
-                                      irq_entry);
+       rc = request_threaded_irq(irq_entry->vector, NULL, idxd_misc_thread,
+                                 0, "idxd-misc", irq_entry);
        if (rc < 0) {
                dev_err(dev, "Failed to allocate misc interrupt.\n");
-               goto err_no_irq;
+               goto err_misc_irq;
        }
 
-       dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n",
-               msix->vector);
+       dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", irq_entry->vector);
 
        /* first MSI-X entry is not for wq interrupts */
        idxd->num_wq_irqs = msixcnt - 1;
 
        for (i = 1; i < msixcnt; i++) {
-               msix = &idxd->msix_entries[i];
                irq_entry = &idxd->irq_entries[i];
 
                init_llist_head(&idxd->irq_entries[i].pending_llist);
                INIT_LIST_HEAD(&idxd->irq_entries[i].work_list);
-               rc = devm_request_threaded_irq(dev, msix->vector,
-                                              idxd_irq_handler,
-                                              idxd_wq_thread, 0,
-                                              "idxd-portal", irq_entry);
+               rc = request_threaded_irq(irq_entry->vector, NULL,
+                                         idxd_wq_thread, 0, "idxd-portal", irq_entry);
                if (rc < 0) {
-                       dev_err(dev, "Failed to allocate irq %d.\n",
-                               msix->vector);
-                       goto err_no_irq;
+                       dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector);
+                       goto err_wq_irqs;
+               }
+
+               dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector);
+               if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) {
+                       /*
+                        * The MSIX vector enumeration starts at 1 with vector 0 being the
+                        * misc interrupt that handles non I/O completion events. The
+                        * interrupt handles are for IMS enumeration on guest. The misc
+                        * interrupt vector does not require a handle and therefore we start
+                        * the int_handles at index 0. Since 'i' starts at 1, the first
+                        * int_handles index will be 0.
+                        */
+                       rc = idxd_device_request_int_handle(idxd, i, &idxd->int_handles[i - 1],
+                                                           IDXD_IRQ_MSIX);
+                       if (rc < 0) {
+                               free_irq(irq_entry->vector, irq_entry);
+                               goto err_wq_irqs;
+                       }
+                       dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i - 1]);
                }
-               dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n",
-                       i, msix->vector);
        }
 
        idxd_unmask_error_interrupts(idxd);
        idxd_msix_perm_setup(idxd);
        return 0;
 
- err_no_irq:
+ err_wq_irqs:
+       while (--i >= 0) {
+               irq_entry = &idxd->irq_entries[i];
+               free_irq(irq_entry->vector, irq_entry);
+               if (i != 0)
+                       idxd_device_release_int_handle(idxd,
+                                                      idxd->int_handles[i], IDXD_IRQ_MSIX);
+       }
+ err_misc_irq:
        /* Disable error interrupt generation */
        idxd_mask_error_interrupts(idxd);
-       pci_disable_msix(pdev);
+ err_irq_entries:
+       pci_free_irq_vectors(pdev);
        dev_err(dev, "No usable interrupts\n");
        return rc;
 }
 
-static int idxd_setup_internals(struct idxd_device *idxd)
+static int idxd_setup_wqs(struct idxd_device *idxd)
 {
        struct device *dev = &idxd->pdev->dev;
-       int i;
+       struct idxd_wq *wq;
+       int i, rc;
 
-       init_waitqueue_head(&idxd->cmd_waitq);
-       idxd->groups = devm_kcalloc(dev, idxd->max_groups,
-                                   sizeof(struct idxd_group), GFP_KERNEL);
-       if (!idxd->groups)
-               return -ENOMEM;
-
-       for (i = 0; i < idxd->max_groups; i++) {
-               idxd->groups[i].idxd = idxd;
-               idxd->groups[i].id = i;
-               idxd->groups[i].tc_a = -1;
-               idxd->groups[i].tc_b = -1;
-       }
-
-       idxd->wqs = devm_kcalloc(dev, idxd->max_wqs, sizeof(struct idxd_wq),
-                                GFP_KERNEL);
+       idxd->wqs = kcalloc_node(idxd->max_wqs, sizeof(struct idxd_wq *),
+                                GFP_KERNEL, dev_to_node(dev));
        if (!idxd->wqs)
                return -ENOMEM;
 
-       idxd->engines = devm_kcalloc(dev, idxd->max_engines,
-                                    sizeof(struct idxd_engine), GFP_KERNEL);
-       if (!idxd->engines)
-               return -ENOMEM;
-
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev));
+               if (!wq) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
 
                wq->id = i;
                wq->idxd = idxd;
+               device_initialize(&wq->conf_dev);
+               wq->conf_dev.parent = &idxd->conf_dev;
+               wq->conf_dev.bus = &dsa_bus_type;
+               wq->conf_dev.type = &idxd_wq_device_type;
+               rc = dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id);
+               if (rc < 0) {
+                       put_device(&wq->conf_dev);
+                       goto err;
+               }
+
                mutex_init(&wq->wq_lock);
-               wq->idxd_cdev.minor = -1;
+               init_waitqueue_head(&wq->err_queue);
+               init_completion(&wq->wq_dead);
                wq->max_xfer_bytes = idxd->max_xfer_bytes;
                wq->max_batch_size = idxd->max_batch_size;
-               wq->wqcfg = devm_kzalloc(dev, idxd->wqcfg_size, GFP_KERNEL);
-               if (!wq->wqcfg)
-                       return -ENOMEM;
+               wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev));
+               if (!wq->wqcfg) {
+                       put_device(&wq->conf_dev);
+                       rc = -ENOMEM;
+                       goto err;
+               }
+               idxd->wqs[i] = wq;
        }
 
+       return 0;
+
+ err:
+       while (--i >= 0)
+               put_device(&idxd->wqs[i]->conf_dev);
+       return rc;
+}
+
+static int idxd_setup_engines(struct idxd_device *idxd)
+{
+       struct idxd_engine *engine;
+       struct device *dev = &idxd->pdev->dev;
+       int i, rc;
+
+       idxd->engines = kcalloc_node(idxd->max_engines, sizeof(struct idxd_engine *),
+                                    GFP_KERNEL, dev_to_node(dev));
+       if (!idxd->engines)
+               return -ENOMEM;
+
        for (i = 0; i < idxd->max_engines; i++) {
-               idxd->engines[i].idxd = idxd;
-               idxd->engines[i].id = i;
+               engine = kzalloc_node(sizeof(*engine), GFP_KERNEL, dev_to_node(dev));
+               if (!engine) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
+
+               engine->id = i;
+               engine->idxd = idxd;
+               device_initialize(&engine->conf_dev);
+               engine->conf_dev.parent = &idxd->conf_dev;
+               engine->conf_dev.type = &idxd_engine_device_type;
+               rc = dev_set_name(&engine->conf_dev, "engine%d.%d", idxd->id, engine->id);
+               if (rc < 0) {
+                       put_device(&engine->conf_dev);
+                       goto err;
+               }
+
+               idxd->engines[i] = engine;
        }
 
-       idxd->wq = create_workqueue(dev_name(dev));
-       if (!idxd->wq)
+       return 0;
+
+ err:
+       while (--i >= 0)
+               put_device(&idxd->engines[i]->conf_dev);
+       return rc;
+}
+
+static int idxd_setup_groups(struct idxd_device *idxd)
+{
+       struct device *dev = &idxd->pdev->dev;
+       struct idxd_group *group;
+       int i, rc;
+
+       idxd->groups = kcalloc_node(idxd->max_groups, sizeof(struct idxd_group *),
+                                   GFP_KERNEL, dev_to_node(dev));
+       if (!idxd->groups)
                return -ENOMEM;
 
+       for (i = 0; i < idxd->max_groups; i++) {
+               group = kzalloc_node(sizeof(*group), GFP_KERNEL, dev_to_node(dev));
+               if (!group) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
+
+               group->id = i;
+               group->idxd = idxd;
+               device_initialize(&group->conf_dev);
+               group->conf_dev.parent = &idxd->conf_dev;
+               group->conf_dev.bus = &dsa_bus_type;
+               group->conf_dev.type = &idxd_group_device_type;
+               rc = dev_set_name(&group->conf_dev, "group%d.%d", idxd->id, group->id);
+               if (rc < 0) {
+                       put_device(&group->conf_dev);
+                       goto err;
+               }
+
+               idxd->groups[i] = group;
+               group->tc_a = -1;
+               group->tc_b = -1;
+       }
+
+       return 0;
+
+ err:
+       while (--i >= 0)
+               put_device(&idxd->groups[i]->conf_dev);
+       return rc;
+}
+
+static int idxd_setup_internals(struct idxd_device *idxd)
+{
+       struct device *dev = &idxd->pdev->dev;
+       int rc, i;
+
+       init_waitqueue_head(&idxd->cmd_waitq);
+
+       if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) {
+               idxd->int_handles = devm_kcalloc(dev, idxd->max_wqs, sizeof(int), GFP_KERNEL);
+               if (!idxd->int_handles)
+                       return -ENOMEM;
+       }
+
+       rc = idxd_setup_wqs(idxd);
+       if (rc < 0)
+               goto err_wqs;
+
+       rc = idxd_setup_engines(idxd);
+       if (rc < 0)
+               goto err_engine;
+
+       rc = idxd_setup_groups(idxd);
+       if (rc < 0)
+               goto err_group;
+
+       idxd->wq = create_workqueue(dev_name(dev));
+       if (!idxd->wq) {
+               rc = -ENOMEM;
+               goto err_wkq_create;
+       }
+
        return 0;
+
+ err_wkq_create:
+       for (i = 0; i < idxd->max_groups; i++)
+               put_device(&idxd->groups[i]->conf_dev);
+ err_group:
+       for (i = 0; i < idxd->max_engines; i++)
+               put_device(&idxd->engines[i]->conf_dev);
+ err_engine:
+       for (i = 0; i < idxd->max_wqs; i++)
+               put_device(&idxd->wqs[i]->conf_dev);
+ err_wqs:
+       kfree(idxd->int_handles);
+       return rc;
 }
 
 static void idxd_read_table_offsets(struct idxd_device *idxd)
@@ -233,6 +375,12 @@ static void idxd_read_caps(struct idxd_device *idxd)
        /* reading generic capabilities */
        idxd->hw.gen_cap.bits = ioread64(idxd->reg_base + IDXD_GENCAP_OFFSET);
        dev_dbg(dev, "gen_cap: %#llx\n", idxd->hw.gen_cap.bits);
+
+       if (idxd->hw.gen_cap.cmd_cap) {
+               idxd->hw.cmd_cap = ioread32(idxd->reg_base + IDXD_CMDCAP_OFFSET);
+               dev_dbg(dev, "cmd_cap: %#x\n", idxd->hw.cmd_cap);
+       }
+
        idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift;
        dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
        idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
@@ -275,17 +423,34 @@ static void idxd_read_caps(struct idxd_device *idxd)
        }
 }
 
-static struct idxd_device *idxd_alloc(struct pci_dev *pdev)
+static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data)
 {
        struct device *dev = &pdev->dev;
        struct idxd_device *idxd;
+       int rc;
 
-       idxd = devm_kzalloc(dev, sizeof(struct idxd_device), GFP_KERNEL);
+       idxd = kzalloc_node(sizeof(*idxd), GFP_KERNEL, dev_to_node(dev));
        if (!idxd)
                return NULL;
 
        idxd->pdev = pdev;
+       idxd->data = data;
+       idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL);
+       if (idxd->id < 0)
+               return NULL;
+
+       device_initialize(&idxd->conf_dev);
+       idxd->conf_dev.parent = dev;
+       idxd->conf_dev.bus = &dsa_bus_type;
+       idxd->conf_dev.type = idxd->data->dev_type;
+       rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd->data->name_prefix, idxd->id);
+       if (rc < 0) {
+               put_device(&idxd->conf_dev);
+               return NULL;
+       }
+
        spin_lock_init(&idxd->dev_lock);
+       spin_lock_init(&idxd->cmd_lock);
 
        return idxd;
 }
@@ -338,11 +503,18 @@ static int idxd_probe(struct idxd_device *idxd)
        dev_dbg(dev, "IDXD reset complete\n");
 
        if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) {
-               rc = idxd_enable_system_pasid(idxd);
-               if (rc < 0)
-                       dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc);
-               else
-                       set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags);
+               rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
+               if (rc == 0) {
+                       rc = idxd_enable_system_pasid(idxd);
+                       if (rc < 0) {
+                               iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
+                               dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc);
+                       } else {
+                               set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags);
+                       }
+               } else {
+                       dev_warn(dev, "Unable to turn on SVA feature.\n");
+               }
        } else if (!sva) {
                dev_warn(dev, "User forced SVA off via module param.\n");
        }
@@ -352,80 +524,75 @@ static int idxd_probe(struct idxd_device *idxd)
 
        rc = idxd_setup_internals(idxd);
        if (rc)
-               goto err_setup;
+               goto err;
+
+       /* If the configs are readonly, then load them from device */
+       if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) {
+               dev_dbg(dev, "Loading RO device config\n");
+               rc = idxd_device_load_config(idxd);
+               if (rc < 0)
+                       goto err;
+       }
 
        rc = idxd_setup_interrupts(idxd);
        if (rc)
-               goto err_setup;
+               goto err;
 
        dev_dbg(dev, "IDXD interrupt setup complete.\n");
 
-       mutex_lock(&idxd_idr_lock);
-       idxd->id = idr_alloc(&idxd_idrs[idxd->type], idxd, 0, 0, GFP_KERNEL);
-       mutex_unlock(&idxd_idr_lock);
-       if (idxd->id < 0) {
-               rc = -ENOMEM;
-               goto err_idr_fail;
-       }
-
        idxd->major = idxd_cdev_get_major(idxd);
 
+       rc = perfmon_pmu_init(idxd);
+       if (rc < 0)
+               dev_warn(dev, "Failed to initialize perfmon. No PMU support: %d\n", rc);
+
        dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id);
        return 0;
 
- err_idr_fail:
-       idxd_mask_error_interrupts(idxd);
-       idxd_mask_msix_vectors(idxd);
- err_setup:
+ err:
        if (device_pasid_enabled(idxd))
                idxd_disable_system_pasid(idxd);
+       iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
        return rc;
 }
 
-static void idxd_type_init(struct idxd_device *idxd)
-{
-       if (idxd->type == IDXD_TYPE_DSA)
-               idxd->compl_size = sizeof(struct dsa_completion_record);
-       else if (idxd->type == IDXD_TYPE_IAX)
-               idxd->compl_size = sizeof(struct iax_completion_record);
-}
-
 static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
        struct device *dev = &pdev->dev;
        struct idxd_device *idxd;
+       struct idxd_driver_data *data = (struct idxd_driver_data *)id->driver_data;
        int rc;
 
-       rc = pcim_enable_device(pdev);
+       rc = pci_enable_device(pdev);
        if (rc)
                return rc;
 
        dev_dbg(dev, "Alloc IDXD context\n");
-       idxd = idxd_alloc(pdev);
-       if (!idxd)
-               return -ENOMEM;
+       idxd = idxd_alloc(pdev, data);
+       if (!idxd) {
+               rc = -ENOMEM;
+               goto err_idxd_alloc;
+       }
 
        dev_dbg(dev, "Mapping BARs\n");
-       idxd->reg_base = pcim_iomap(pdev, IDXD_MMIO_BAR, 0);
-       if (!idxd->reg_base)
-               return -ENOMEM;
+       idxd->reg_base = pci_iomap(pdev, IDXD_MMIO_BAR, 0);
+       if (!idxd->reg_base) {
+               rc = -ENOMEM;
+               goto err_iomap;
+       }
 
        dev_dbg(dev, "Set DMA masks\n");
        rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
        if (rc)
                rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
        if (rc)
-               return rc;
+               goto err;
 
        rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
        if (rc)
                rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
        if (rc)
-               return rc;
-
-       idxd_set_type(idxd);
-
-       idxd_type_init(idxd);
+               goto err;
 
        dev_dbg(dev, "Set PCI master\n");
        pci_set_master(pdev);
@@ -435,13 +602,13 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        rc = idxd_probe(idxd);
        if (rc) {
                dev_err(dev, "Intel(R) IDXD DMA Engine init failed\n");
-               return -ENODEV;
+               goto err;
        }
 
-       rc = idxd_setup_sysfs(idxd);
+       rc = idxd_register_devices(idxd);
        if (rc) {
                dev_err(dev, "IDXD sysfs setup failed\n");
-               return -ENODEV;
+               goto err;
        }
 
        idxd->state = IDXD_DEV_CONF_READY;
@@ -450,6 +617,14 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                 idxd->hw.version);
 
        return 0;
+
+ err:
+       pci_iounmap(pdev, idxd->reg_base);
+ err_iomap:
+       put_device(&idxd->conf_dev);
+ err_idxd_alloc:
+       pci_disable_device(pdev);
+       return rc;
 }
 
 static void idxd_flush_pending_llist(struct idxd_irq_entry *ie)
@@ -478,6 +653,36 @@ static void idxd_flush_work_list(struct idxd_irq_entry *ie)
        }
 }
 
+void idxd_wqs_quiesce(struct idxd_device *idxd)
+{
+       struct idxd_wq *wq;
+       int i;
+
+       for (i = 0; i < idxd->max_wqs; i++) {
+               wq = idxd->wqs[i];
+               if (wq->state == IDXD_WQ_ENABLED && wq->type == IDXD_WQT_KERNEL)
+                       idxd_wq_quiesce(wq);
+       }
+}
+
+static void idxd_release_int_handles(struct idxd_device *idxd)
+{
+       struct device *dev = &idxd->pdev->dev;
+       int i, rc;
+
+       for (i = 0; i < idxd->num_wq_irqs; i++) {
+               if (idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE)) {
+                       rc = idxd_device_release_int_handle(idxd, idxd->int_handles[i],
+                                                           IDXD_IRQ_MSIX);
+                       if (rc < 0)
+                               dev_warn(dev, "irq handle %d release failed\n",
+                                        idxd->int_handles[i]);
+                       else
+                               dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i]);
+               }
+       }
+}
+
 static void idxd_shutdown(struct pci_dev *pdev)
 {
        struct idxd_device *idxd = pci_get_drvdata(pdev);
@@ -495,7 +700,8 @@ static void idxd_shutdown(struct pci_dev *pdev)
 
        for (i = 0; i < msixcnt; i++) {
                irq_entry = &idxd->irq_entries[i];
-               synchronize_irq(idxd->msix_entries[i].vector);
+               synchronize_irq(irq_entry->vector);
+               free_irq(irq_entry->vector, irq_entry);
                if (i == 0)
                        continue;
                idxd_flush_pending_llist(irq_entry);
@@ -503,6 +709,10 @@ static void idxd_shutdown(struct pci_dev *pdev)
        }
 
        idxd_msix_perm_clear(idxd);
+       idxd_release_int_handles(idxd);
+       pci_free_irq_vectors(pdev);
+       pci_iounmap(pdev, idxd->reg_base);
+       pci_disable_device(pdev);
        destroy_workqueue(idxd->wq);
 }
 
@@ -511,13 +721,12 @@ static void idxd_remove(struct pci_dev *pdev)
        struct idxd_device *idxd = pci_get_drvdata(pdev);
 
        dev_dbg(&pdev->dev, "%s called\n", __func__);
-       idxd_cleanup_sysfs(idxd);
        idxd_shutdown(pdev);
        if (device_pasid_enabled(idxd))
                idxd_disable_system_pasid(idxd);
-       mutex_lock(&idxd_idr_lock);
-       idr_remove(&idxd_idrs[idxd->type], idxd->id);
-       mutex_unlock(&idxd_idr_lock);
+       idxd_unregister_devices(idxd);
+       perfmon_pmu_remove(idxd);
+       iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
 }
 
 static struct pci_driver idxd_pci_driver = {
@@ -530,7 +739,7 @@ static struct pci_driver idxd_pci_driver = {
 
 static int __init idxd_init_module(void)
 {
-       int err, i;
+       int err;
 
        /*
         * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
@@ -546,8 +755,7 @@ static int __init idxd_init_module(void)
        else
                support_enqcmd = true;
 
-       for (i = 0; i < IDXD_TYPE_MAX; i++)
-               idr_init(&idxd_idrs[i]);
+       perfmon_init();
 
        err = idxd_register_bus_type();
        if (err < 0)
@@ -582,5 +790,6 @@ static void __exit idxd_exit_module(void)
        pci_unregister_driver(&idxd_pci_driver);
        idxd_cdev_remove();
        idxd_unregister_bus_type();
+       perfmon_exit();
 }
 module_exit(idxd_exit_module);
index f1463fc..ae68e1e 100644 (file)
@@ -45,7 +45,7 @@ static void idxd_device_reinit(struct work_struct *work)
                goto out;
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                if (wq->state == IDXD_WQ_ENABLED) {
                        rc = idxd_wq_enable(wq);
@@ -102,15 +102,6 @@ static int idxd_device_schedule_fault_process(struct idxd_device *idxd,
        return 0;
 }
 
-irqreturn_t idxd_irq_handler(int vec, void *data)
-{
-       struct idxd_irq_entry *irq_entry = data;
-       struct idxd_device *idxd = irq_entry->idxd;
-
-       idxd_mask_msix_vector(idxd, irq_entry->id);
-       return IRQ_WAKE_THREAD;
-}
-
 static int process_misc_interrupts(struct idxd_device *idxd, u32 cause)
 {
        struct device *dev = &idxd->pdev->dev;
@@ -130,18 +121,18 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause)
 
                if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) {
                        int id = idxd->sw_err.wq_idx;
-                       struct idxd_wq *wq = &idxd->wqs[id];
+                       struct idxd_wq *wq = idxd->wqs[id];
 
                        if (wq->type == IDXD_WQT_USER)
-                               wake_up_interruptible(&wq->idxd_cdev.err_queue);
+                               wake_up_interruptible(&wq->err_queue);
                } else {
                        int i;
 
                        for (i = 0; i < idxd->max_wqs; i++) {
-                               struct idxd_wq *wq = &idxd->wqs[i];
+                               struct idxd_wq *wq = idxd->wqs[i];
 
                                if (wq->type == IDXD_WQT_USER)
-                                       wake_up_interruptible(&wq->idxd_cdev.err_queue);
+                                       wake_up_interruptible(&wq->err_queue);
                        }
                }
 
@@ -165,11 +156,8 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause)
        }
 
        if (cause & IDXD_INTC_PERFMON_OVFL) {
-               /*
-                * Driver does not utilize perfmon counter overflow interrupt
-                * yet.
-                */
                val |= IDXD_INTC_PERFMON_OVFL;
+               perfmon_counter_overflow(idxd);
        }
 
        val ^= cause;
@@ -202,6 +190,8 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause)
                        queue_work(idxd->wq, &idxd->work);
                } else {
                        spin_lock_bh(&idxd->dev_lock);
+                       idxd_wqs_quiesce(idxd);
+                       idxd_wqs_unmap_portal(idxd);
                        idxd_device_wqs_clear_state(idxd);
                        dev_err(&idxd->pdev->dev,
                                "idxd halted, need %s.\n",
@@ -235,7 +225,6 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
                        iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET);
        }
 
-       idxd_unmask_msix_vector(idxd, irq_entry->id);
        return IRQ_HANDLED;
 }
 
@@ -392,8 +381,6 @@ irqreturn_t idxd_wq_thread(int irq, void *data)
        int processed;
 
        processed = idxd_desc_process(irq_entry);
-       idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id);
-
        if (processed == 0)
                return IRQ_NONE;
 
diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c
new file mode 100644 (file)
index 0000000..d73004f
--- /dev/null
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */
+
+#include <linux/sched/task.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include "idxd.h"
+#include "perfmon.h"
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+                           char *buf);
+
+static cpumask_t               perfmon_dsa_cpu_mask;
+static bool                    cpuhp_set_up;
+static enum cpuhp_state                cpuhp_slot;
+
+/*
+ * perf userspace reads this attribute to determine which cpus to open
+ * counters on.  It's connected to perfmon_dsa_cpu_mask, which is
+ * maintained by the cpu hotplug handlers.
+ */
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *perfmon_cpumask_attrs[] = {
+       &dev_attr_cpumask.attr,
+       NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+       .attrs = perfmon_cpumask_attrs,
+};
+
+/*
+ * These attributes specify the bits in the config word that the perf
+ * syscall uses to pass the event ids and categories to perfmon.
+ */
+DEFINE_PERFMON_FORMAT_ATTR(event_category, "config:0-3");
+DEFINE_PERFMON_FORMAT_ATTR(event, "config:4-31");
+
+/*
+ * These attributes specify the bits in the config1 word that the perf
+ * syscall uses to pass filter data to perfmon.
+ */
+DEFINE_PERFMON_FORMAT_ATTR(filter_wq, "config1:0-31");
+DEFINE_PERFMON_FORMAT_ATTR(filter_tc, "config1:32-39");
+DEFINE_PERFMON_FORMAT_ATTR(filter_pgsz, "config1:40-43");
+DEFINE_PERFMON_FORMAT_ATTR(filter_sz, "config1:44-51");
+DEFINE_PERFMON_FORMAT_ATTR(filter_eng, "config1:52-59");
+
+#define PERFMON_FILTERS_START  2
+#define PERFMON_FILTERS_MAX    5
+
+static struct attribute *perfmon_format_attrs[] = {
+       &format_attr_idxd_event_category.attr,
+       &format_attr_idxd_event.attr,
+       &format_attr_idxd_filter_wq.attr,
+       &format_attr_idxd_filter_tc.attr,
+       &format_attr_idxd_filter_pgsz.attr,
+       &format_attr_idxd_filter_sz.attr,
+       &format_attr_idxd_filter_eng.attr,
+       NULL,
+};
+
+static struct attribute_group perfmon_format_attr_group = {
+       .name = "format",
+       .attrs = perfmon_format_attrs,
+};
+
+static const struct attribute_group *perfmon_attr_groups[] = {
+       &perfmon_format_attr_group,
+       &cpumask_attr_group,
+       NULL,
+};
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask);
+}
+
+static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event)
+{
+       return &idxd_pmu->pmu == event->pmu;
+}
+
+static int perfmon_collect_events(struct idxd_pmu *idxd_pmu,
+                                 struct perf_event *leader,
+                                 bool do_grp)
+{
+       struct perf_event *event;
+       int n, max_count;
+
+       max_count = idxd_pmu->n_counters;
+       n = idxd_pmu->n_events;
+
+       if (n >= max_count)
+               return -EINVAL;
+
+       if (is_idxd_event(idxd_pmu, leader)) {
+               idxd_pmu->event_list[n] = leader;
+               idxd_pmu->event_list[n]->hw.idx = n;
+               n++;
+       }
+
+       if (!do_grp)
+               return n;
+
+       for_each_sibling_event(event, leader) {
+               if (!is_idxd_event(idxd_pmu, event) ||
+                   event->state <= PERF_EVENT_STATE_OFF)
+                       continue;
+
+               if (n >= max_count)
+                       return -EINVAL;
+
+               idxd_pmu->event_list[n] = event;
+               idxd_pmu->event_list[n]->hw.idx = n;
+               n++;
+       }
+
+       return n;
+}
+
+static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu,
+                                   struct perf_event *event, int idx)
+{
+       struct idxd_device *idxd = idxd_pmu->idxd;
+       struct hw_perf_event *hwc = &event->hw;
+
+       hwc->idx = idx;
+       hwc->config_base = ioread64(CNTRCFG_REG(idxd, idx));
+       hwc->event_base = ioread64(CNTRCFG_REG(idxd, idx));
+}
+
+static int perfmon_assign_event(struct idxd_pmu *idxd_pmu,
+                               struct perf_event *event)
+{
+       int i;
+
+       for (i = 0; i < IDXD_PMU_EVENT_MAX; i++)
+               if (!test_and_set_bit(i, idxd_pmu->used_mask))
+                       return i;
+
+       return -EINVAL;
+}
+
+/*
+ * Dry-run the scheduling of an event group.
+ *
+ * The group leader and then the new event are collected onto a
+ * zero-initialised scratch idxd_pmu, and a counter slot is claimed on it
+ * for every collected event.  Only the scratch object's bookkeeping is
+ * used, so the event list and slot mask of the real idxd_pmu are not
+ * modified.
+ *
+ * Returns 0 if every event obtained a slot, -ENOMEM if the scratch
+ * object could not be allocated, or the negative value reported by the
+ * collect/assign helpers.
+ */
+static int perfmon_validate_group(struct idxd_pmu *pmu,
+                                 struct perf_event *event)
+{
+       struct idxd_pmu *scratch;
+       int pos, count, slot, err = 0;
+
+       scratch = kzalloc(sizeof(*scratch), GFP_KERNEL);
+       if (!scratch)
+               return -ENOMEM;
+
+       scratch->pmu.name = pmu->pmu.name;
+       scratch->n_counters = pmu->n_counters;
+
+       /* the group leader is collected first */
+       count = perfmon_collect_events(scratch, event->group_leader, true);
+       if (count < 0) {
+               err = count;
+               goto release;
+       }
+       scratch->n_events = count;
+
+       /* then the event that is being initialised */
+       count = perfmon_collect_events(scratch, event, false);
+       if (count < 0) {
+               err = count;
+               goto release;
+       }
+       scratch->n_events = count;
+
+       /* every collected event has to be able to claim a slot */
+       for (pos = 0; pos < count; pos++) {
+               slot = perfmon_assign_event(scratch, scratch->event_list[pos]);
+               if (slot < 0) {
+                       err = slot;
+                       break;
+               }
+       }
+
+release:
+       kfree(scratch);
+
+       return err;
+}
+
+/*
+ * pmu::event_init callback.
+ *
+ * Returns -ENOENT when the event's type is not this PMU's type, -EINVAL
+ * for sampling events, events without a CPU, or events whose pmu is not
+ * the one owned by the device, and otherwise the result of group
+ * validation (0 for an event that is its own group leader).
+ */
+static int perfmon_pmu_event_init(struct perf_event *event)
+{
+       struct idxd_device *idxd = event_to_idxd(event);
+
+       /* no hardware counter assigned yet */
+       event->hw.idx = -1;
+
+       if (event->attr.type != event->pmu->type)
+               return -ENOENT;
+
+       /*
+        * Sampling is not supported, the event must name a CPU, and it
+        * must belong to the PMU of this device.
+        */
+       if (event->attr.sample_period || event->cpu < 0 ||
+           event->pmu != &idxd->idxd_pmu->pmu)
+               return -EINVAL;
+
+       event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd));
+       event->cpu = idxd->idxd_pmu->cpu;
+       event->hw.config = event->attr.config;
+
+       /* non-group events have themselves as leader: nothing to validate */
+       if (event->group_leader == event)
+               return 0;
+
+       return perfmon_validate_group(idxd->idxd_pmu, event);
+}
+
+/* Return the raw CNTRDATA value of the counter recorded in event->hw.idx. */
+static inline u64 perfmon_pmu_read_counter(struct perf_event *event)
+{
+       struct idxd_device *idxd = event_to_idxd(event);
+
+       return ioread64(CNTRDATA_REG(idxd, event->hw.idx));
+}
+
+/*
+ * Fold the progress of @event's hardware counter since the last update
+ * into event->count.  Also installed as the pmu::read callback.
+ */
+static void perfmon_pmu_event_update(struct perf_event *event)
+{
+       struct idxd_device *idxd = event_to_idxd(event);
+       u64 prev_raw_count, new_raw_count, delta, p, n;
+       /* number of unused high bits in a 64-bit read of the counter */
+       int shift = 64 - idxd->idxd_pmu->counter_width;
+       struct hw_perf_event *hwc = &event->hw;
+
+       /*
+        * Publish the new raw value as prev_count; retry if prev_count
+        * changed between our read of it and the cmpxchg.
+        */
+       do {
+               prev_raw_count = local64_read(&hwc->prev_count);
+               new_raw_count = perfmon_pmu_read_counter(event);
+       } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                       new_raw_count) != prev_raw_count);
+
+       /*
+        * Shift both samples up so the counter's top bit sits in bit 63,
+        * subtract, and shift back: the difference is then taken modulo
+        * 2^counter_width.
+        */
+       n = (new_raw_count << shift);
+       p = (prev_raw_count << shift);
+
+       delta = ((n - p) >> shift);
+
+       local64_add(delta, &event->count);
+}
+
+/*
+ * Service counter overflows: for every bit set in OVFSTATUS, update the
+ * corresponding event's count and acknowledge the bit, repeating until
+ * the register reads back as zero or OVERFLOW_SIZE passes have been made.
+ */
+void perfmon_counter_overflow(struct idxd_device *idxd)
+{
+       int i, n_counters, max_loop = OVERFLOW_SIZE;
+       struct perf_event *event;
+       unsigned long ovfstatus;
+
+       /* OVFSTATUS is read as 32 bits, so look at no more than that many counters */
+       n_counters = min(idxd->idxd_pmu->n_counters, OVERFLOW_SIZE);
+
+       ovfstatus = ioread32(OVFSTATUS_REG(idxd));
+
+       /*
+        * While updating overflowed counters, other counters behind
+        * them could overflow and be missed in a given pass.
+        * Normally this could happen at most n_counters times, but in
+        * theory a tiny counter width could result in continual
+        * overflows and endless looping.  max_loop provides a
+        * failsafe in that highly unlikely case.
+        */
+       while (ovfstatus && max_loop--) {
+               /* Figure out which counter(s) overflowed */
+               for_each_set_bit(i, &ovfstatus, n_counters) {
+                       unsigned long ovfstatus_clear = 0;
+
+                       /*
+                        * Update event->count for overflowed counter
+                        *
+                        * NOTE(review): event_list[] is indexed by the
+                        * overflow bit number here, while
+                        * perfmon_pmu_event_stop() compacts event_list[].
+                        * Confirm that list position always equals the
+                        * counter index and that the entry is valid.
+                        */
+                       event = idxd->idxd_pmu->event_list[i];
+                       perfmon_pmu_event_update(event);
+                       /* Writing 1 to OVFSTATUS bit clears it */
+                       set_bit(i, &ovfstatus_clear);
+                       iowrite32(ovfstatus_clear, OVFSTATUS_REG(idxd));
+               }
+
+               /* pick up overflows that were raised during this pass */
+               ovfstatus = ioread32(OVFSTATUS_REG(idxd));
+       }
+
+       /*
+        * Should never happen.  If so, it means a counter(s) looped
+        * around twice while this handler was running.
+        */
+       WARN_ON_ONCE(ovfstatus);
+}
+
+/*
+ * Write CONFIG_RESET to the reset register, then zero the overflow
+ * status and freeze registers.
+ */
+static inline void perfmon_reset_config(struct idxd_device *idxd)
+{
+       iowrite32(CONFIG_RESET, PERFRST_REG(idxd));
+       iowrite32(0, OVFSTATUS_REG(idxd));
+       iowrite32(0, PERFFRZ_REG(idxd));
+}
+
+/* Write CNTR_RESET to the reset register. */
+static inline void perfmon_reset_counters(struct idxd_device *idxd)
+{
+       iowrite32(CNTR_RESET, PERFRST_REG(idxd));
+}
+
+/* Full perfmon reset: configuration first, then the counters. */
+static inline void perfmon_reset(struct idxd_device *idxd)
+{
+       perfmon_reset_config(idxd);
+       perfmon_reset_counters(idxd);
+}
+
+/*
+ * pmu::start callback: program the filters and the counter configuration
+ * for @event on its assigned counter (event->hw.idx) and enable it.
+ *
+ * The event category/encoding come from attr.config and the filter
+ * values from attr.config1.  @mode is not used.
+ */
+static void perfmon_pmu_event_start(struct perf_event *event, int mode)
+{
+       u32 flt_wq, flt_tc, flt_pg_sz, flt_xfer_sz, flt_eng = 0;
+       u64 cntr_cfg, cntrdata, event_enc, event_cat = 0;
+       struct hw_perf_event *hwc = &event->hw;
+       union filter_cfg flt_cfg;
+       union event_cfg event_cfg;
+       struct idxd_device *idxd;
+       int cntr;
+
+       idxd = event_to_idxd(event);
+
+       /* hwc points at event->hw, so this assignment leaves hw.idx unchanged */
+       event->hw.idx = hwc->idx;
+       cntr = hwc->idx;
+
+       /* Obtain event category and event value from user space */
+       event_cfg.val = event->attr.config;
+       flt_cfg.val = event->attr.config1;
+       event_cat = event_cfg.event_cat;
+       event_enc = event_cfg.event_enc;
+
+       /* Obtain filter configuration from user space */
+       flt_wq = flt_cfg.wq;
+       flt_tc = flt_cfg.tc;
+       flt_pg_sz = flt_cfg.pg_sz;
+       flt_xfer_sz = flt_cfg.xfer_sz;
+       flt_eng = flt_cfg.eng;
+
+       /* a filter is only written when it is non-zero and the device supports it */
+       if (flt_wq && test_bit(FLT_WQ, &idxd->idxd_pmu->supported_filters))
+               iowrite32(flt_wq, FLTCFG_REG(idxd, cntr, FLT_WQ));
+       if (flt_tc && test_bit(FLT_TC, &idxd->idxd_pmu->supported_filters))
+               iowrite32(flt_tc, FLTCFG_REG(idxd, cntr, FLT_TC));
+       if (flt_pg_sz && test_bit(FLT_PG_SZ, &idxd->idxd_pmu->supported_filters))
+               iowrite32(flt_pg_sz, FLTCFG_REG(idxd, cntr, FLT_PG_SZ));
+       if (flt_xfer_sz && test_bit(FLT_XFER_SZ, &idxd->idxd_pmu->supported_filters))
+               iowrite32(flt_xfer_sz, FLTCFG_REG(idxd, cntr, FLT_XFER_SZ));
+       if (flt_eng && test_bit(FLT_ENG, &idxd->idxd_pmu->supported_filters))
+               iowrite32(flt_eng, FLTCFG_REG(idxd, cntr, FLT_ENG));
+
+       /* Read the start value */
+       cntrdata = ioread64(CNTRDATA_REG(idxd, cntr));
+       local64_set(&event->hw.prev_count, cntrdata);
+
+       /* Set counter to event/category */
+       cntr_cfg = event_cat << CNTRCFG_CATEGORY_SHIFT;
+       cntr_cfg |= event_enc << CNTRCFG_EVENT_SHIFT;
+       /* Set interrupt on overflow and counter enable bits */
+       cntr_cfg |= (CNTRCFG_IRQ_OVERFLOW | CNTRCFG_ENABLE);
+
+       iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr));
+}
+
+/*
+ * pmu::stop callback: detach @event from its hardware counter.
+ *
+ * The event is removed from event_list[] (later entries move down one
+ * place), the counter's enable bit is cleared, event->count is brought
+ * up to date if the caller asked for it, and the counter slot is
+ * released in used_mask.
+ *
+ * @mode is a word of PERF_EF_* flags; only PERF_EF_UPDATE is examined.
+ */
+static void perfmon_pmu_event_stop(struct perf_event *event, int mode)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct idxd_device *idxd;
+       int i, cntr = hwc->idx;
+       u64 cntr_cfg;
+
+       idxd = event_to_idxd(event);
+
+       /* remove this event from event list */
+       for (i = 0; i < idxd->idxd_pmu->n_events; i++) {
+               if (event != idxd->idxd_pmu->event_list[i])
+                       continue;
+
+               for (++i; i < idxd->idxd_pmu->n_events; i++)
+                       idxd->idxd_pmu->event_list[i - 1] = idxd->idxd_pmu->event_list[i];
+               --idxd->idxd_pmu->n_events;
+               break;
+       }
+
+       /* disable the counter but keep the rest of its configuration */
+       cntr_cfg = ioread64(CNTRCFG_REG(idxd, cntr));
+       cntr_cfg &= ~CNTRCFG_ENABLE;
+       iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr));
+
+       /*
+        * @mode is a bit mask: test the flag instead of comparing for
+        * equality, so the final count is not lost if another flag is
+        * ever passed along with PERF_EF_UPDATE.
+        */
+       if (mode & PERF_EF_UPDATE)
+               perfmon_pmu_event_update(event);
+
+       event->hw.idx = -1;
+       clear_bit(cntr, idxd->idxd_pmu->used_mask);
+}
+
+/*
+ * pmu::del callback: stop the event, always requesting a final count
+ * update.  @mode is ignored.
+ */
+static void perfmon_pmu_event_del(struct perf_event *event, int mode)
+{
+       perfmon_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+/*
+ * pmu::add callback: collect @event into the PMU's event list, claim a
+ * counter slot for it and, when PERF_EF_START is set in @flags, start it.
+ *
+ * Returns 0 on success or the negative value reported by the collect or
+ * assign step.
+ */
+static int perfmon_pmu_event_add(struct perf_event *event, int flags)
+{
+       struct idxd_pmu *idxd_pmu = event_to_idxd(event)->idxd_pmu;
+       bool start_now = !!(flags & PERF_EF_START);
+       int count, slot;
+
+       count = perfmon_collect_events(idxd_pmu, event, false);
+       if (count < 0)
+               return count;
+
+       /* the event begins stopped; PERF_HES_ARCH is added when it is not started here */
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!start_now)
+               event->hw.state |= PERF_HES_ARCH;
+
+       slot = perfmon_assign_event(idxd_pmu, event);
+       if (slot < 0)
+               return slot;
+
+       perfmon_assign_hw_event(idxd_pmu, event, slot);
+
+       if (start_now)
+               perfmon_pmu_event_start(event, 0);
+
+       /* only now does the PMU's event count include the new event */
+       idxd_pmu->n_events = count;
+
+       return 0;
+}
+
+/* Write COUNTER_UNFREEZE to the device's freeze register. */
+static void enable_perfmon_pmu(struct idxd_device *idxd)
+{
+       iowrite32(COUNTER_UNFREEZE, PERFFRZ_REG(idxd));
+}
+
+/* Write COUNTER_FREEZE to the device's freeze register. */
+static void disable_perfmon_pmu(struct idxd_device *idxd)
+{
+       iowrite32(COUNTER_FREEZE, PERFFRZ_REG(idxd));
+}
+
+/* pmu::pmu_enable callback: unfreeze the counters of the device owning @pmu. */
+static void perfmon_pmu_enable(struct pmu *pmu)
+{
+       enable_perfmon_pmu(pmu_to_idxd(pmu));
+}
+
+/* pmu::pmu_disable callback: freeze the counters of the device owning @pmu. */
+static void perfmon_pmu_disable(struct pmu *pmu)
+{
+       disable_perfmon_pmu(pmu_to_idxd(pmu));
+}
+
+/*
+ * Remove filter slot @i from perfmon_format_attrs[]: every entry from
+ * that slot up to the end of the filter range is overwritten with its
+ * successor.
+ */
+static void skip_filter(int i)
+{
+       int dst = PERFMON_FILTERS_START + i;
+       int end = PERFMON_FILTERS_START + PERFMON_FILTERS_MAX;
+
+       while (dst < end) {
+               perfmon_format_attrs[dst] = perfmon_format_attrs[dst + 1];
+               dst++;
+       }
+}
+
+/*
+ * Drop the format attributes of filters the device does not support and
+ * fill in the struct pmu embedded in @idxd_pmu (name, attribute groups,
+ * callbacks, capabilities).
+ */
+static void idxd_pmu_init(struct idxd_pmu *idxd_pmu)
+{
+       int i;
+
+       /*
+        * Walk the filters from the highest index down.  skip_filter()
+        * moves every later attribute down by one slot, so removing in
+        * ascending order would leave the remaining slots out of step
+        * with their filter bit numbers and drop the wrong attributes
+        * once more than one filter is unsupported.
+        *
+        * NOTE(review): perfmon_format_attrs[] is not local to this PMU
+        * and is modified in place on every call - confirm the behaviour
+        * when several devices are initialised.
+        */
+       for (i = PERFMON_FILTERS_MAX - 1; i >= 0; i--) {
+               if (!test_bit(i, &idxd_pmu->supported_filters))
+                       skip_filter(i);
+       }
+
+       idxd_pmu->pmu.name              = idxd_pmu->name;
+       idxd_pmu->pmu.attr_groups       = perfmon_attr_groups;
+       idxd_pmu->pmu.task_ctx_nr       = perf_invalid_context;
+       idxd_pmu->pmu.event_init        = perfmon_pmu_event_init;
+       idxd_pmu->pmu.pmu_enable        = perfmon_pmu_enable;
+       idxd_pmu->pmu.pmu_disable       = perfmon_pmu_disable;
+       idxd_pmu->pmu.add               = perfmon_pmu_event_add;
+       idxd_pmu->pmu.del               = perfmon_pmu_event_del;
+       idxd_pmu->pmu.start             = perfmon_pmu_event_start;
+       idxd_pmu->pmu.stop              = perfmon_pmu_event_stop;
+       idxd_pmu->pmu.read              = perfmon_pmu_event_update;
+       idxd_pmu->pmu.capabilities      = PERF_PMU_CAP_NO_EXCLUDE;
+       idxd_pmu->pmu.module            = THIS_MODULE;
+}
+
+/*
+ * Tear down the PMU of @idxd, if it has one: remove the cpuhp instance,
+ * unregister the PMU, free it and clear idxd->idxd_pmu.
+ */
+void perfmon_pmu_remove(struct idxd_device *idxd)
+{
+       struct idxd_pmu *idxd_pmu = idxd->idxd_pmu;
+
+       if (idxd_pmu) {
+               cpuhp_state_remove_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
+               perf_pmu_unregister(&idxd_pmu->pmu);
+               kfree(idxd_pmu);
+               idxd->idxd_pmu = NULL;
+       }
+}
+
+/*
+ * CPU hotplug "online" callback.  While perfmon_dsa_cpu_mask is empty,
+ * the CPU coming online is recorded in the mask and as this PMU's cpu;
+ * otherwise nothing is changed.  Always returns 0.
+ */
+static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+       struct idxd_pmu *idxd_pmu;
+
+       idxd_pmu = hlist_entry_safe(node, struct idxd_pmu, cpuhp_node);
+
+       /* a designated reader has already been selected */
+       if (!cpumask_empty(&perfmon_dsa_cpu_mask))
+               return 0;
+
+       /* select the first online CPU as the designated reader */
+       cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
+       idxd_pmu->cpu = cpu;
+
+       return 0;
+}
+
+/*
+ * CPU hotplug "offline" callback.
+ *
+ * If the CPU going away is the one recorded in perfmon_dsa_cpu_mask, pick
+ * any other online CPU as the new designated reader and migrate this
+ * PMU's perf context to it.  Always returns 0.
+ */
+static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+       struct idxd_pmu *idxd_pmu;
+       unsigned int target;
+
+       idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
+
+       /* nothing to do unless @cpu was the designated reader */
+       if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
+               return 0;
+
+       target = cpumask_any_but(cpu_online_mask, cpu);
+
+       /*
+        * No other CPU is online, so there is nowhere to migrate to.
+        * Return instead of handing an invalid CPU number to
+        * perf_pmu_migrate_context(), which uses it to look up per-CPU
+        * data.
+        */
+       if (target >= nr_cpu_ids)
+               return 0;
+
+       /* migrate events to the new designated reader */
+       cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
+       /* perfmon_pmu_event_init() pins new events to idxd_pmu->cpu */
+       idxd_pmu->cpu = target;
+
+       perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
+
+       return 0;
+}
+
+/*
+ * Set up and register the perf PMU of an idxd device.
+ *
+ * Returns 0 on success.  -ENODEV is returned when the cpuhp state was
+ * never set up, when the device has no perfmon table, has an unknown
+ * type, or reports capabilities this driver cannot work with; -ENOMEM
+ * when the idxd_pmu allocation fails; otherwise the error reported by
+ * perf_pmu_register() or cpuhp_state_add_instance().  On every failure
+ * after the allocation the idxd_pmu is freed and idxd->idxd_pmu is
+ * reset to NULL.
+ */
+int perfmon_pmu_init(struct idxd_device *idxd)
+{
+       union idxd_perfcap perfcap;
+       struct idxd_pmu *idxd_pmu;
+       int rc = -ENODEV;
+
+       /*
+        * perfmon module initialization failed, nothing to do
+        */
+       if (!cpuhp_set_up)
+               return -ENODEV;
+
+       /*
+        * If perfmon_offset or num_counters is 0, it means perfmon is
+        * not supported on this hardware.
+        */
+       if (idxd->perfmon_offset == 0)
+               return -ENODEV;
+
+       idxd_pmu = kzalloc(sizeof(*idxd_pmu), GFP_KERNEL);
+       if (!idxd_pmu)
+               return -ENOMEM;
+
+       idxd_pmu->idxd = idxd;
+       idxd->idxd_pmu = idxd_pmu;
+
+       if (idxd->data->type == IDXD_TYPE_DSA) {
+               rc = sprintf(idxd_pmu->name, "dsa%d", idxd->id);
+               if (rc < 0)
+                       goto free;
+       } else if (idxd->data->type == IDXD_TYPE_IAX) {
+               rc = sprintf(idxd_pmu->name, "iax%d", idxd->id);
+               if (rc < 0)
+                       goto free;
+       } else {
+               goto free;
+       }
+
+       /*
+        * sprintf() left a positive character count in rc.  Restore the
+        * default error so that the capability checks below, which bail
+        * out with a bare "goto free", return -ENODEV rather than that
+        * count.
+        */
+       rc = -ENODEV;
+
+       perfmon_reset(idxd);
+
+       perfcap.bits = ioread64(PERFCAP_REG(idxd));
+
+       /*
+        * If total perf counter is 0, stop further registration.
+        * This is necessary in order to support driver running on
+        * guest which does not have pmon support.
+        */
+       if (perfcap.num_perf_counter == 0)
+               goto free;
+
+       /* A counter width of 0 means it can't count */
+       if (perfcap.counter_width == 0)
+               goto free;
+
+       /* Overflow interrupt and counter freeze support must be available */
+       if (!perfcap.overflow_interrupt || !perfcap.counter_freeze)
+               goto free;
+
+       /* Number of event categories cannot be 0 */
+       if (perfcap.num_event_category == 0)
+               goto free;
+
+       /*
+        * We don't support per-counter capabilities for now.
+        */
+       if (perfcap.cap_per_counter)
+               goto free;
+
+       idxd_pmu->n_event_categories = perfcap.num_event_category;
+       idxd_pmu->supported_event_categories = perfcap.global_event_category;
+       idxd_pmu->per_counter_caps_supported = perfcap.cap_per_counter;
+
+       /* check filter capability.  If 0, then filters are not supported */
+       idxd_pmu->supported_filters = perfcap.filter;
+       if (perfcap.filter)
+               idxd_pmu->n_filters = hweight8(perfcap.filter);
+
+       /* Store the total number of counters categories, and counter width */
+       idxd_pmu->n_counters = perfcap.num_perf_counter;
+       idxd_pmu->counter_width = perfcap.counter_width;
+
+       idxd_pmu_init(idxd_pmu);
+
+       rc = perf_pmu_register(&idxd_pmu->pmu, idxd_pmu->name, -1);
+       if (rc)
+               goto free;
+
+       rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
+       if (rc) {
+               /* undo the registration before the PMU is freed */
+               perf_pmu_unregister(&idxd->idxd_pmu->pmu);
+               goto free;
+       }
+
+       return 0;
+free:
+       kfree(idxd_pmu);
+       idxd->idxd_pmu = NULL;
+
+       return rc;
+}
+
+/*
+ * Register the multi-instance CPU hotplug state used by all idxd PMUs.
+ * On success the allocated state is stored in cpuhp_slot and
+ * cpuhp_set_up is set; on failure a warning is raised and both are left
+ * untouched.
+ */
+void __init perfmon_init(void)
+{
+       int state;
+
+       state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+                                       "driver/dma/idxd/perf:online",
+                                       perf_event_cpu_online,
+                                       perf_event_cpu_offline);
+       if (WARN_ON(state < 0))
+               return;
+
+       cpuhp_slot = state;
+       cpuhp_set_up = true;
+}
+
+/* Remove the CPU hotplug state, provided perfmon_init() managed to set it up. */
+void __exit perfmon_exit(void)
+{
+       if (!cpuhp_set_up)
+               return;
+
+       cpuhp_remove_multi_state(cpuhp_slot);
+}
diff --git a/drivers/dma/idxd/perfmon.h b/drivers/dma/idxd/perfmon.h
new file mode 100644 (file)
index 0000000..9a081a1
--- /dev/null
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */
+
+#ifndef _PERFMON_H_
+#define _PERFMON_H_
+
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/sbitmap.h>
+#include <linux/dmaengine.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/uuid.h>
+#include <linux/idxd.h>
+#include <linux/perf_event.h>
+#include "registers.h"
+
+/* Map a perf event to the idxd_pmu that embeds the event's struct pmu. */
+static inline struct idxd_pmu *event_to_pmu(struct perf_event *event)
+{
+       return container_of(event->pmu, struct idxd_pmu, pmu);
+}
+
+/* Map a perf event to the idxd device behind the event's PMU. */
+static inline struct idxd_device *event_to_idxd(struct perf_event *event)
+{
+       struct idxd_pmu *owner = container_of(event->pmu, struct idxd_pmu, pmu);
+
+       return owner->idxd;
+}
+
+/* Map a struct pmu to the idxd device of the idxd_pmu that embeds it. */
+static inline struct idxd_device *pmu_to_idxd(struct pmu *pmu)
+{
+       struct idxd_pmu *owner = container_of(pmu, struct idxd_pmu, pmu);
+
+       return owner->idxd;
+}
+
+/*
+ * Perf event identifiers, numbered from 0.
+ * NOTE(review): presumably these are the DSA event category numbers -
+ * confirm against the hardware specification.
+ */
+enum dsa_perf_events {
+       DSA_PERF_EVENT_WQ = 0,
+       DSA_PERF_EVENT_ENGINE,
+       DSA_PERF_EVENT_ADDR_TRANS,
+       DSA_PERF_EVENT_OP,
+       DSA_PERF_EVENT_COMPL,
+       DSA_PERF_EVENT_MAX,     /* number of entries, not an event */
+};
+
+/*
+ * Filter numbers.  Each value is used both as the bit number of the
+ * filter in idxd_pmu->supported_filters and as the filter index passed
+ * to FLTCFG_REG().
+ */
+enum filter_enc {
+       FLT_WQ = 0,
+       FLT_TC,
+       FLT_PG_SZ,
+       FLT_XFER_SZ,
+       FLT_ENG,
+       FLT_MAX,        /* number of filters, not a filter */
+};
+
+/* Values written to PERFRST_REG: reset the configuration / the counters */
+#define CONFIG_RESET           0x0000000000000001
+#define CNTR_RESET             0x0000000000000002
+/* NOTE(review): no user of the next two is visible in perfmon.c; see CNTRCFG_* below */
+#define CNTR_ENABLE            0x0000000000000001
+#define INTR_OVFL              0x0000000000000002
+
+/* Values written to PERFFRZ_REG to freeze / unfreeze the counters */
+#define COUNTER_FREEZE         0x00000000FFFFFFFF
+#define COUNTER_UNFREEZE       0x0000000000000000
+/* Upper bound on the counters examined and passes made by the overflow handler */
+#define OVERFLOW_SIZE          32
+
+/* Bits and field positions of the per-counter CNTRCFG register */
+#define CNTRCFG_ENABLE         BIT(0)
+#define CNTRCFG_IRQ_OVERFLOW   BIT(1)
+#define CNTRCFG_CATEGORY_SHIFT 8
+#define CNTRCFG_EVENT_SHIFT    32
+
+/*
+ * Address of the perfmon register table: the device's register base plus
+ * its perfmon_offset.  The argument is evaluated once.
+ */
+#define PERFMON_TABLE_OFFSET(_idxd)                            \
+({                                                             \
+       typeof(_idxd) __idxd = (_idxd);                         \
+       ((__idxd)->reg_base + (__idxd)->perfmon_offset);        \
+})
+/* Address of the register at byte @offset inside the perfmon table */
+#define PERFMON_REG_OFFSET(idxd, offset)                       \
+       (PERFMON_TABLE_OFFSET(idxd) + (offset))
+
+/* Device-wide registers */
+#define PERFCAP_REG(idxd)      (PERFMON_REG_OFFSET(idxd, IDXD_PERFCAP_OFFSET))
+#define PERFRST_REG(idxd)      (PERFMON_REG_OFFSET(idxd, IDXD_PERFRST_OFFSET))
+#define OVFSTATUS_REG(idxd)    (PERFMON_REG_OFFSET(idxd, IDXD_OVFSTATUS_OFFSET))
+#define PERFFRZ_REG(idxd)      (PERFMON_REG_OFFSET(idxd, IDXD_PERFFRZ_OFFSET))
+
+/* Filter registers: a 32-byte stride per counter, 4 bytes per filter */
+#define FLTCFG_REG(idxd, cntr, flt)                            \
+       (PERFMON_REG_OFFSET(idxd, IDXD_FLTCFG_OFFSET) + ((cntr) * 32) + ((flt) * 4))
+
+/* Per-counter registers, 8 bytes per counter */
+#define CNTRCFG_REG(idxd, cntr)                                        \
+       (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCFG_OFFSET) + ((cntr) * 8))
+#define CNTRDATA_REG(idxd, cntr)                                       \
+       (PERFMON_REG_OFFSET(idxd, IDXD_CNTRDATA_OFFSET) + ((cntr) * 8))
+#define CNTRCAP_REG(idxd, cntr)                                        \
+       (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCAP_OFFSET) + ((cntr) * 8))
+
+/* Per-category event capability registers, 8 bytes per category */
+#define EVNTCAP_REG(idxd, category) \
+       (PERFMON_REG_OFFSET(idxd, IDXD_EVNTCAP_OFFSET) + ((category) * 8))
+
+/*
+ * Define a read-only (0444) kobj_attribute named format_attr_idxd_<_name>
+ * whose show routine prints the string literal @_format followed by a
+ * newline.  @_format is pasted into the sprintf() format string itself,
+ * so it must not contain conversion specifiers; the build fails if it
+ * does not fit in a page.
+ */
+#define DEFINE_PERFMON_FORMAT_ATTR(_name, _format)                     \
+static ssize_t __perfmon_idxd_##_name##_show(struct kobject *kobj,     \
+                               struct kobj_attribute *attr,            \
+                               char *page)                             \
+{                                                                      \
+       BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                     \
+       return sprintf(page, _format "\n");                             \
+}                                                                      \
+static struct kobj_attribute format_attr_idxd_##_name =                        \
+       __ATTR(_name, 0444, __perfmon_idxd_##_name##_show, NULL)
+
+#endif
index 751ecb4..c970c3f 100644 (file)
@@ -24,8 +24,8 @@ union gen_cap_reg {
                u64 overlap_copy:1;
                u64 cache_control_mem:1;
                u64 cache_control_cache:1;
+               u64 cmd_cap:1;
                u64 rsvd:3;
-               u64 int_handle_req:1;
                u64 dest_readback:1;
                u64 drain_readback:1;
                u64 rsvd2:6;
@@ -120,7 +120,8 @@ union gencfg_reg {
 union genctrl_reg {
        struct {
                u32 softerr_int_en:1;
-               u32 rsvd:31;
+               u32 halt_int_en:1;
+               u32 rsvd:30;
        };
        u32 bits;
 } __packed;
@@ -180,8 +181,11 @@ enum idxd_cmd {
        IDXD_CMD_DRAIN_PASID,
        IDXD_CMD_ABORT_PASID,
        IDXD_CMD_REQUEST_INT_HANDLE,
+       IDXD_CMD_RELEASE_INT_HANDLE,
 };
 
+#define CMD_INT_HANDLE_IMS             0x10000
+
 #define IDXD_CMDSTS_OFFSET             0xa8
 union cmdsts_reg {
        struct {
@@ -193,6 +197,8 @@ union cmdsts_reg {
        u32 bits;
 } __packed;
 #define IDXD_CMDSTS_ACTIVE             0x80000000
+#define IDXD_CMDSTS_ERR_MASK           0xff
+#define IDXD_CMDSTS_RES_SHIFT          8
 
 enum idxd_cmdsts_err {
        IDXD_CMDSTS_SUCCESS = 0,
@@ -228,6 +234,8 @@ enum idxd_cmdsts_err {
        IDXD_CMDSTS_ERR_NO_HANDLE,
 };
 
+#define IDXD_CMDCAP_OFFSET             0xb0
+
 #define IDXD_SWERR_OFFSET              0xc0
 #define IDXD_SWERR_VALID               0x00000001
 #define IDXD_SWERR_OVERFLOW            0x00000002
@@ -378,4 +386,112 @@ union wqcfg {
 #define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32)
 #define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40)
 
+/* Performance monitor registers follow */
+#define IDXD_PERFCAP_OFFSET            0x0
+/*
+ * Layout of the 64-bit perfmon capability register, read through
+ * PERFCAP_REG() when the PMU is initialised.
+ */
+union idxd_perfcap {
+       struct {
+               u64 num_perf_counter:6;
+               u64 rsvd1:2;
+               u64 counter_width:8;
+               u64 num_event_category:4;
+               u64 global_event_category:16;
+               u64 filter:8;           /* one bit per filter, numbered as enum filter_enc */
+               u64 rsvd2:8;
+               u64 cap_per_counter:1;
+               u64 writeable_counter:1;
+               u64 counter_freeze:1;
+               u64 overflow_interrupt:1;
+               u64 rsvd3:8;
+       };
+       u64 bits;       /* the whole register */
+} __packed;
+
+#define IDXD_EVNTCAP_OFFSET            0x80
+/* Layout of a 64-bit event capability register (see EVNTCAP_REG()) */
+union idxd_evntcap {
+       struct {
+               u64 events:28;
+               u64 rsvd:36;
+       };
+       u64 bits;       /* the whole register */
+} __packed;
+
+/* A 32-bit word combining a 4-bit event category with a 28-bit events field */
+struct idxd_event {
+       union {
+               struct {
+                       u32 event_category:4;
+                       u32 events:28;
+               };
+               u32 val;        /* the whole word */
+       };
+} __packed;
+
+#define IDXD_CNTRCAP_OFFSET            0x800
+/*
+ * Per-counter capability layout: a 32-bit header followed by a variable
+ * number of idxd_event entries.
+ * NOTE(review): presumably num_events gives the length of events[] -
+ * confirm against the hardware specification.
+ */
+struct idxd_cntrcap {
+       union {
+               struct {
+                       u32 counter_width:8;
+                       u32 rsvd:20;
+                       u32 num_events:4;
+               };
+               u32 val;        /* the whole header word */
+       };
+       struct idxd_event events[];
+} __packed;
+
+#define IDXD_PERFRST_OFFSET            0x10
+/*
+ * Layout of the 32-bit perfmon reset register.  Bit 0 and bit 1 have the
+ * same values as CONFIG_RESET and CNTR_RESET in perfmon.h.
+ */
+union idxd_perfrst {
+       struct {
+               u32 perfrst_config:1;
+               u32 perfrst_counter:1;
+               u32 rsvd:30;
+       };
+       u32 val;        /* the whole register */
+} __packed;
+
+#define IDXD_OVFSTATUS_OFFSET          0x30
+#define IDXD_PERFFRZ_OFFSET            0x20
+#define IDXD_CNTRCFG_OFFSET            0x100
+/*
+ * Layout of a 64-bit counter configuration register.  event_category
+ * starts at bit 8 and events at bit 32, matching CNTRCFG_CATEGORY_SHIFT
+ * and CNTRCFG_EVENT_SHIFT in perfmon.h.
+ */
+union idxd_cntrcfg {
+       struct {
+               u64 enable:1;
+               u64 interrupt_ovf:1;
+               u64 global_freeze_ovf:1;
+               u64 rsvd1:5;
+               u64 event_category:4;
+               u64 rsvd2:20;
+               u64 events:28;
+               u64 rsvd3:4;
+       };
+       u64 val;        /* the whole register */
+} __packed;
+
+/* Start of the filter configuration registers (see FLTCFG_REG()) */
+#define IDXD_FLTCFG_OFFSET             0x300
+
+#define IDXD_CNTRDATA_OFFSET           0x200
+/* Layout of a 64-bit counter data register: the count occupies the whole word */
+union idxd_cntrdata {
+       struct {
+               u64 event_count_value;
+       };
+       u64 val;
+} __packed;
+
+/*
+ * Decoding of perf_event_attr.config as used by the perfmon start
+ * callback: event category in bits 0-3, event encoding in the 28 bits
+ * above it.
+ */
+union event_cfg {
+       struct {
+               u64 event_cat:4;
+               u64 event_enc:28;
+       };
+       u64 val;        /* the raw config value */
+} __packed;
+
+/*
+ * Decoding of perf_event_attr.config1 as used by the perfmon start
+ * callback: one field per filter, packed from bit 0 upwards.
+ */
+union filter_cfg {
+       struct {
+               u64 wq:32;
+               u64 tc:8;
+               u64 pg_sz:4;
+               u64 xfer_sz:8;
+               u64 eng:8;
+       };
+       u64 val;        /* the raw config1 value */
+} __packed;
+
 #endif
index a7a61bc..19afb62 100644 (file)
@@ -15,18 +15,30 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu)
 
        desc = wq->descs[idx];
        memset(desc->hw, 0, sizeof(struct dsa_hw_desc));
-       memset(desc->completion, 0, idxd->compl_size);
+       memset(desc->completion, 0, idxd->data->compl_size);
        desc->cpu = cpu;
 
        if (device_pasid_enabled(idxd))
                desc->hw->pasid = idxd->pasid;
 
        /*
-        * Descriptor completion vectors are 1-8 for MSIX. We will round
-        * robin through the 8 vectors.
+        * Descriptor completion vectors are 1...N for MSIX. We will round
+        * robin through the N vectors.
         */
        wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1;
-       desc->hw->int_handle = wq->vec_ptr;
+       if (!idxd->int_handles) {
+               desc->hw->int_handle = wq->vec_ptr;
+       } else {
+               desc->vector = wq->vec_ptr;
+               /*
+                * int_handles are only for descriptor completion. However for device
+                * MSIX enumeration, vec 0 is used for misc interrupts. Therefore even
+                * though we are rotating through 1...N for descriptor interrupts, we
+                * need to acquire the int_handles from 0..N-1.
+                */
+               desc->hw->int_handle = idxd->int_handles[desc->vector - 1];
+       }
+
        return desc;
 }
 
@@ -79,13 +91,15 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc)
 int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
 {
        struct idxd_device *idxd = wq->idxd;
-       int vec = desc->hw->int_handle;
        void __iomem *portal;
        int rc;
 
        if (idxd->state != IDXD_DEV_ENABLED)
                return -EIO;
 
+       if (!percpu_ref_tryget_live(&wq->wq_active))
+               return -ENXIO;
+
        portal = wq->portal;
 
        /*
@@ -108,13 +122,25 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
                        return rc;
        }
 
+       percpu_ref_put(&wq->wq_active);
+
        /*
         * Pending the descriptor to the lockless list for the irq_entry
         * that we designated the descriptor to.
         */
-       if (desc->hw->flags & IDXD_OP_FLAG_RCI)
-               llist_add(&desc->llnode,
-                         &idxd->irq_entries[vec].pending_llist);
+       if (desc->hw->flags & IDXD_OP_FLAG_RCI) {
+               int vec;
+
+               /*
+                * If the driver is on host kernel, it would be the value
+                * assigned to interrupt handle, which is index for MSIX
+                * vector. If it's guest then can't use the int_handle since
+                * that is the index to IMS for the entire device. The guest
+                * device local index will be used.
+                */
+               vec = !idxd->int_handles ? desc->hw->int_handle : desc->vector;
+               llist_add(&desc->llnode, &idxd->irq_entries[vec].pending_llist);
+       }
 
        return 0;
 }
index 18bf4d1..0460d58 100644 (file)
@@ -16,69 +16,6 @@ static char *idxd_wq_type_names[] = {
        [IDXD_WQT_USER]         = "user",
 };
 
-static void idxd_conf_device_release(struct device *dev)
-{
-       dev_dbg(dev, "%s for %s\n", __func__, dev_name(dev));
-}
-
-static struct device_type idxd_group_device_type = {
-       .name = "group",
-       .release = idxd_conf_device_release,
-};
-
-static struct device_type idxd_wq_device_type = {
-       .name = "wq",
-       .release = idxd_conf_device_release,
-};
-
-static struct device_type idxd_engine_device_type = {
-       .name = "engine",
-       .release = idxd_conf_device_release,
-};
-
-static struct device_type dsa_device_type = {
-       .name = "dsa",
-       .release = idxd_conf_device_release,
-};
-
-static struct device_type iax_device_type = {
-       .name = "iax",
-       .release = idxd_conf_device_release,
-};
-
-static inline bool is_dsa_dev(struct device *dev)
-{
-       return dev ? dev->type == &dsa_device_type : false;
-}
-
-static inline bool is_iax_dev(struct device *dev)
-{
-       return dev ? dev->type == &iax_device_type : false;
-}
-
-static inline bool is_idxd_dev(struct device *dev)
-{
-       return is_dsa_dev(dev) || is_iax_dev(dev);
-}
-
-static inline bool is_idxd_wq_dev(struct device *dev)
-{
-       return dev ? dev->type == &idxd_wq_device_type : false;
-}
-
-static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq)
-{
-       if (wq->type == IDXD_WQT_KERNEL &&
-           strcmp(wq->name, "dmaengine") == 0)
-               return true;
-       return false;
-}
-
-static inline bool is_idxd_wq_cdev(struct idxd_wq *wq)
-{
-       return wq->type == IDXD_WQT_USER;
-}
-
 static int idxd_config_bus_match(struct device *dev,
                                 struct device_driver *drv)
 {
@@ -110,9 +47,131 @@ static int idxd_config_bus_match(struct device *dev,
        return matched;
 }
 
-static int idxd_config_bus_probe(struct device *dev)
+/*
+ * Validate, configure and enable a work queue, then register it as a
+ * dmaengine channel or create its char device, depending on what
+ * is_idxd_wq_dmaengine() and is_idxd_wq_cdev() report.
+ *
+ * wq->wq_lock is held for the whole sequence.  Returns 0 on success or
+ * the negative errno of the step that failed.
+ *
+ * NOTE(review): the error returns after idxd_wq_alloc_resources() do not
+ * release what earlier steps set up - confirm where that cleanup is
+ * expected to happen.
+ */
+static int enable_wq(struct idxd_wq *wq)
 {
+       struct idxd_device *idxd = wq->idxd;
+       struct device *dev = &idxd->pdev->dev;
+       unsigned long flags;
        int rc;
+
+       mutex_lock(&wq->wq_lock);
+
+       if (idxd->state != IDXD_DEV_ENABLED) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "Enabling while device not enabled.\n");
+               return -EPERM;
+       }
+
+       if (wq->state != IDXD_WQ_DISABLED) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "WQ %d already enabled.\n", wq->id);
+               return -EBUSY;
+       }
+
+       if (!wq->group) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "WQ not attached to group.\n");
+               return -EINVAL;
+       }
+
+       if (strlen(wq->name) == 0) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "WQ name not set.\n");
+               return -EINVAL;
+       }
+
+       /* Shared WQ checks */
+       if (wq_shared(wq)) {
+               if (!device_swq_supported(idxd)) {
+                       dev_warn(dev, "PASID not enabled and shared WQ.\n");
+                       mutex_unlock(&wq->wq_lock);
+                       return -ENXIO;
+               }
+               /*
+                * Shared wq with the threshold set to 0 means the user
+                * did not set the threshold or transitioned from a
+                * dedicated wq but did not set threshold. A value
+                * of 0 would effectively disable the shared wq. The
+                * driver does not allow a value of 0 to be set for
+                * threshold via sysfs.
+                */
+               if (wq->threshold == 0) {
+                       dev_warn(dev, "Shared WQ and threshold 0.\n");
+                       mutex_unlock(&wq->wq_lock);
+                       return -EINVAL;
+               }
+       }
+
+       rc = idxd_wq_alloc_resources(wq);
+       if (rc < 0) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "WQ resource alloc failed\n");
+               return rc;
+       }
+
+       /* the configuration is only written when the device is configurable */
+       spin_lock_irqsave(&idxd->dev_lock, flags);
+       if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+               rc = idxd_device_config(idxd);
+       spin_unlock_irqrestore(&idxd->dev_lock, flags);
+       if (rc < 0) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "Writing WQ %d config failed: %d\n", wq->id, rc);
+               return rc;
+       }
+
+       rc = idxd_wq_enable(wq);
+       if (rc < 0) {
+               mutex_unlock(&wq->wq_lock);
+               dev_warn(dev, "WQ %d enabling failed: %d\n", wq->id, rc);
+               return rc;
+       }
+
+       rc = idxd_wq_map_portal(wq);
+       if (rc < 0) {
+               dev_warn(dev, "wq portal mapping failed: %d\n", rc);
+               /*
+                * Keep rc as the mapping error: storing the result of
+                * the disable in it would make this function return 0
+                * whenever the disable itself succeeds.
+                */
+               if (idxd_wq_disable(wq) < 0)
+                       dev_warn(dev, "IDXD wq disable failed\n");
+               mutex_unlock(&wq->wq_lock);
+               return rc;
+       }
+
+       wq->client_count = 0;
+
+       if (wq->type == IDXD_WQT_KERNEL) {
+               rc = idxd_wq_init_percpu_ref(wq);
+               if (rc < 0) {
+                       dev_dbg(dev, "percpu_ref setup failed\n");
+                       mutex_unlock(&wq->wq_lock);
+                       return rc;
+               }
+       }
+
+       if (is_idxd_wq_dmaengine(wq)) {
+               rc = idxd_register_dma_channel(wq);
+               if (rc < 0) {
+                       dev_dbg(dev, "DMA channel register failed\n");
+                       mutex_unlock(&wq->wq_lock);
+                       return rc;
+               }
+       } else if (is_idxd_wq_cdev(wq)) {
+               rc = idxd_wq_add_cdev(wq);
+               if (rc < 0) {
+                       dev_dbg(dev, "Cdev creation failed\n");
+                       mutex_unlock(&wq->wq_lock);
+                       return rc;
+               }
+       }
+
+       mutex_unlock(&wq->wq_lock);
+       dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev));
+
+       return 0;
+}
+
+static int idxd_config_bus_probe(struct device *dev)
+{
+       int rc = 0;
        unsigned long flags;
 
        dev_dbg(dev, "%s called\n", __func__);
@@ -130,7 +189,8 @@ static int idxd_config_bus_probe(struct device *dev)
 
                /* Perform IDXD configuration and enabling */
                spin_lock_irqsave(&idxd->dev_lock, flags);
-               rc = idxd_device_config(idxd);
+               if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+                       rc = idxd_device_config(idxd);
                spin_unlock_irqrestore(&idxd->dev_lock, flags);
                if (rc < 0) {
                        module_put(THIS_MODULE);
@@ -157,115 +217,8 @@ static int idxd_config_bus_probe(struct device *dev)
                return 0;
        } else if (is_idxd_wq_dev(dev)) {
                struct idxd_wq *wq = confdev_to_wq(dev);
-               struct idxd_device *idxd = wq->idxd;
-
-               mutex_lock(&wq->wq_lock);
-
-               if (idxd->state != IDXD_DEV_ENABLED) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "Enabling while device not enabled.\n");
-                       return -EPERM;
-               }
-
-               if (wq->state != IDXD_WQ_DISABLED) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "WQ %d already enabled.\n", wq->id);
-                       return -EBUSY;
-               }
-
-               if (!wq->group) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "WQ not attached to group.\n");
-                       return -EINVAL;
-               }
-
-               if (strlen(wq->name) == 0) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "WQ name not set.\n");
-                       return -EINVAL;
-               }
-
-               /* Shared WQ checks */
-               if (wq_shared(wq)) {
-                       if (!device_swq_supported(idxd)) {
-                               dev_warn(dev,
-                                        "PASID not enabled and shared WQ.\n");
-                               mutex_unlock(&wq->wq_lock);
-                               return -ENXIO;
-                       }
-                       /*
-                        * Shared wq with the threshold set to 0 means the user
-                        * did not set the threshold or transitioned from a
-                        * dedicated wq but did not set threshold. A value
-                        * of 0 would effectively disable the shared wq. The
-                        * driver does not allow a value of 0 to be set for
-                        * threshold via sysfs.
-                        */
-                       if (wq->threshold == 0) {
-                               dev_warn(dev,
-                                        "Shared WQ and threshold 0.\n");
-                               mutex_unlock(&wq->wq_lock);
-                               return -EINVAL;
-                       }
-               }
-
-               rc = idxd_wq_alloc_resources(wq);
-               if (rc < 0) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "WQ resource alloc failed\n");
-                       return rc;
-               }
-
-               spin_lock_irqsave(&idxd->dev_lock, flags);
-               rc = idxd_device_config(idxd);
-               spin_unlock_irqrestore(&idxd->dev_lock, flags);
-               if (rc < 0) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "Writing WQ %d config failed: %d\n",
-                                wq->id, rc);
-                       return rc;
-               }
-
-               rc = idxd_wq_enable(wq);
-               if (rc < 0) {
-                       mutex_unlock(&wq->wq_lock);
-                       dev_warn(dev, "WQ %d enabling failed: %d\n",
-                                wq->id, rc);
-                       return rc;
-               }
-
-               rc = idxd_wq_map_portal(wq);
-               if (rc < 0) {
-                       dev_warn(dev, "wq portal mapping failed: %d\n", rc);
-                       rc = idxd_wq_disable(wq);
-                       if (rc < 0)
-                               dev_warn(dev, "IDXD wq disable failed\n");
-                       mutex_unlock(&wq->wq_lock);
-                       return rc;
-               }
-
-               wq->client_count = 0;
-
-               dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev));
 
-               if (is_idxd_wq_dmaengine(wq)) {
-                       rc = idxd_register_dma_channel(wq);
-                       if (rc < 0) {
-                               dev_dbg(dev, "DMA channel register failed\n");
-                               mutex_unlock(&wq->wq_lock);
-                               return rc;
-                       }
-               } else if (is_idxd_wq_cdev(wq)) {
-                       rc = idxd_wq_add_cdev(wq);
-                       if (rc < 0) {
-                               dev_dbg(dev, "Cdev creation failed\n");
-                               mutex_unlock(&wq->wq_lock);
-                               return rc;
-                       }
-               }
-
-               mutex_unlock(&wq->wq_lock);
-               return 0;
+               return enable_wq(wq);
        }
 
        return -ENODEV;
@@ -283,6 +236,9 @@ static void disable_wq(struct idxd_wq *wq)
                return;
        }
 
+       if (wq->type == IDXD_WQT_KERNEL)
+               idxd_wq_quiesce(wq);
+
        if (is_idxd_wq_dmaengine(wq))
                idxd_unregister_dma_channel(wq);
        else if (is_idxd_wq_cdev(wq))
@@ -322,7 +278,7 @@ static int idxd_config_bus_remove(struct device *dev)
                dev_dbg(dev, "%s removing dev %s\n", __func__,
                        dev_name(&idxd->conf_dev));
                for (i = 0; i < idxd->max_wqs; i++) {
-                       struct idxd_wq *wq = &idxd->wqs[i];
+                       struct idxd_wq *wq = idxd->wqs[i];
 
                        if (wq->state == IDXD_WQ_DISABLED)
                                continue;
@@ -333,12 +289,14 @@ static int idxd_config_bus_remove(struct device *dev)
 
                idxd_unregister_dma_device(idxd);
                rc = idxd_device_disable(idxd);
-               for (i = 0; i < idxd->max_wqs; i++) {
-                       struct idxd_wq *wq = &idxd->wqs[i];
+               if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) {
+                       for (i = 0; i < idxd->max_wqs; i++) {
+                               struct idxd_wq *wq = idxd->wqs[i];
 
-                       mutex_lock(&wq->wq_lock);
-                       idxd_wq_disable_cleanup(wq);
-                       mutex_unlock(&wq->wq_lock);
+                               mutex_lock(&wq->wq_lock);
+                               idxd_wq_disable_cleanup(wq);
+                               mutex_unlock(&wq->wq_lock);
+                       }
                }
                module_put(THIS_MODULE);
                if (rc < 0)
@@ -364,19 +322,6 @@ struct bus_type dsa_bus_type = {
        .shutdown = idxd_config_bus_shutdown,
 };
 
-struct bus_type iax_bus_type = {
-       .name = "iax",
-       .match = idxd_config_bus_match,
-       .probe = idxd_config_bus_probe,
-       .remove = idxd_config_bus_remove,
-       .shutdown = idxd_config_bus_shutdown,
-};
-
-static struct bus_type *idxd_bus_types[] = {
-       &dsa_bus_type,
-       &iax_bus_type
-};
-
 static struct idxd_device_driver dsa_drv = {
        .drv = {
                .name = "dsa",
@@ -386,60 +331,15 @@ static struct idxd_device_driver dsa_drv = {
        },
 };
 
-static struct idxd_device_driver iax_drv = {
-       .drv = {
-               .name = "iax",
-               .bus = &iax_bus_type,
-               .owner = THIS_MODULE,
-               .mod_name = KBUILD_MODNAME,
-       },
-};
-
-static struct idxd_device_driver *idxd_drvs[] = {
-       &dsa_drv,
-       &iax_drv
-};
-
-struct bus_type *idxd_get_bus_type(struct idxd_device *idxd)
-{
-       return idxd_bus_types[idxd->type];
-}
-
-static struct device_type *idxd_get_device_type(struct idxd_device *idxd)
-{
-       if (idxd->type == IDXD_TYPE_DSA)
-               return &dsa_device_type;
-       else if (idxd->type == IDXD_TYPE_IAX)
-               return &iax_device_type;
-       else
-               return NULL;
-}
-
 /* IDXD generic driver setup */
 int idxd_register_driver(void)
 {
-       int i, rc;
-
-       for (i = 0; i < IDXD_TYPE_MAX; i++) {
-               rc = driver_register(&idxd_drvs[i]->drv);
-               if (rc < 0)
-                       goto drv_fail;
-       }
-
-       return 0;
-
-drv_fail:
-       while (--i >= 0)
-               driver_unregister(&idxd_drvs[i]->drv);
-       return rc;
+       return driver_register(&dsa_drv.drv); /* single driver: result is returned as-is */
 }
 
 void idxd_unregister_driver(void)
 {
-       int i;
-
-       for (i = 0; i < IDXD_TYPE_MAX; i++)
-               driver_unregister(&idxd_drvs[i]->drv);
+       driver_unregister(&dsa_drv.drv); /* undoes idxd_register_driver(): dsa_drv only */
 }
 
 /* IDXD engine attributes */
@@ -450,9 +350,9 @@ static ssize_t engine_group_id_show(struct device *dev,
                container_of(dev, struct idxd_engine, conf_dev);
 
        if (engine->group)
-               return sprintf(buf, "%d\n", engine->group->id);
+               return sysfs_emit(buf, "%d\n", engine->group->id);
        else
-               return sprintf(buf, "%d\n", -1);
+               return sysfs_emit(buf, "%d\n", -1);
 }
 
 static ssize_t engine_group_id_store(struct device *dev,
@@ -488,7 +388,7 @@ static ssize_t engine_group_id_store(struct device *dev,
 
        if (prevg)
                prevg->num_engines--;
-       engine->group = &idxd->groups[id];
+       engine->group = idxd->groups[id];
        engine->group->num_engines++;
 
        return count;
@@ -512,6 +412,19 @@ static const struct attribute_group *idxd_engine_attribute_groups[] = {
        NULL,
 };
 
+static void idxd_conf_engine_release(struct device *dev)
+{
+       struct idxd_engine *engine = container_of(dev, struct idxd_engine, conf_dev);
+
+       kfree(engine); /* frees the idxd_engine that embeds @dev as conf_dev */
+}
+
+struct device_type idxd_engine_device_type = {
+       .name = "engine",
+       .release = idxd_conf_engine_release, /* kfree()s the enclosing idxd_engine */
+       .groups = idxd_engine_attribute_groups, /* attribute groups for engine devices */
+};
+
 /* Group attributes */
 
 static void idxd_set_free_tokens(struct idxd_device *idxd)
@@ -519,7 +432,7 @@ static void idxd_set_free_tokens(struct idxd_device *idxd)
        int i, tokens;
 
        for (i = 0, tokens = 0; i < idxd->max_groups; i++) {
-               struct idxd_group *g = &idxd->groups[i];
+               struct idxd_group *g = idxd->groups[i];
 
                tokens += g->tokens_reserved;
        }
@@ -534,7 +447,7 @@ static ssize_t group_tokens_reserved_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
 
-       return sprintf(buf, "%u\n", group->tokens_reserved);
+       return sysfs_emit(buf, "%u\n", group->tokens_reserved);
 }
 
 static ssize_t group_tokens_reserved_store(struct device *dev,
@@ -551,7 +464,7 @@ static ssize_t group_tokens_reserved_store(struct device *dev,
        if (rc < 0)
                return -EINVAL;
 
-       if (idxd->type == IDXD_TYPE_IAX)
+       if (idxd->data->type == IDXD_TYPE_IAX)
                return -EOPNOTSUPP;
 
        if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
@@ -582,7 +495,7 @@ static ssize_t group_tokens_allowed_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
 
-       return sprintf(buf, "%u\n", group->tokens_allowed);
+       return sysfs_emit(buf, "%u\n", group->tokens_allowed);
 }
 
 static ssize_t group_tokens_allowed_store(struct device *dev,
@@ -599,7 +512,7 @@ static ssize_t group_tokens_allowed_store(struct device *dev,
        if (rc < 0)
                return -EINVAL;
 
-       if (idxd->type == IDXD_TYPE_IAX)
+       if (idxd->data->type == IDXD_TYPE_IAX)
                return -EOPNOTSUPP;
 
        if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
@@ -627,7 +540,7 @@ static ssize_t group_use_token_limit_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
 
-       return sprintf(buf, "%u\n", group->use_token_limit);
+       return sysfs_emit(buf, "%u\n", group->use_token_limit);
 }
 
 static ssize_t group_use_token_limit_store(struct device *dev,
@@ -644,7 +557,7 @@ static ssize_t group_use_token_limit_store(struct device *dev,
        if (rc < 0)
                return -EINVAL;
 
-       if (idxd->type == IDXD_TYPE_IAX)
+       if (idxd->data->type == IDXD_TYPE_IAX)
                return -EOPNOTSUPP;
 
        if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
@@ -670,22 +583,22 @@ static ssize_t group_engines_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
        int i, rc = 0;
-       char *tmp = buf;
        struct idxd_device *idxd = group->idxd;
 
        for (i = 0; i < idxd->max_engines; i++) {
-               struct idxd_engine *engine = &idxd->engines[i];
+               struct idxd_engine *engine = idxd->engines[i];
 
                if (!engine->group)
                        continue;
 
                if (engine->group->id == group->id)
-                       rc += sprintf(tmp + rc, "engine%d.%d ",
-                                       idxd->id, engine->id);
+                       rc += sysfs_emit_at(buf, rc, "engine%d.%d ", idxd->id, engine->id);
        }
 
+       if (!rc)
+               return 0;
        rc--;
-       rc += sprintf(tmp + rc, "\n");
+       rc += sysfs_emit_at(buf, rc, "\n");
 
        return rc;
 }
@@ -699,22 +612,22 @@ static ssize_t group_work_queues_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
        int i, rc = 0;
-       char *tmp = buf;
        struct idxd_device *idxd = group->idxd;
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                if (!wq->group)
                        continue;
 
                if (wq->group->id == group->id)
-                       rc += sprintf(tmp + rc, "wq%d.%d ",
-                                       idxd->id, wq->id);
+                       rc += sysfs_emit_at(buf, rc, "wq%d.%d ", idxd->id, wq->id);
        }
 
+       if (!rc)
+               return 0;
        rc--;
-       rc += sprintf(tmp + rc, "\n");
+       rc += sysfs_emit_at(buf, rc, "\n");
 
        return rc;
 }
@@ -729,7 +642,7 @@ static ssize_t group_traffic_class_a_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
 
-       return sprintf(buf, "%d\n", group->tc_a);
+       return sysfs_emit(buf, "%d\n", group->tc_a);
 }
 
 static ssize_t group_traffic_class_a_store(struct device *dev,
@@ -770,7 +683,7 @@ static ssize_t group_traffic_class_b_show(struct device *dev,
        struct idxd_group *group =
                container_of(dev, struct idxd_group, conf_dev);
 
-       return sprintf(buf, "%d\n", group->tc_b);
+       return sysfs_emit(buf, "%d\n", group->tc_b);
 }
 
 static ssize_t group_traffic_class_b_store(struct device *dev,
@@ -824,13 +737,26 @@ static const struct attribute_group *idxd_group_attribute_groups[] = {
        NULL,
 };
 
+static void idxd_conf_group_release(struct device *dev)
+{
+       struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev);
+
+       kfree(group); /* frees the idxd_group that embeds @dev as conf_dev */
+}
+
+struct device_type idxd_group_device_type = {
+       .name = "group",
+       .release = idxd_conf_group_release, /* kfree()s the enclosing idxd_group */
+       .groups = idxd_group_attribute_groups, /* attribute groups for group devices */
+};
+
 /* IDXD work queue attribs */
 static ssize_t wq_clients_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
 {
         struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%d\n", wq->client_count);
+       return sysfs_emit(buf, "%d\n", wq->client_count); /* wq's client count, one line */
 }
 
 static struct device_attribute dev_attr_wq_clients =
@@ -843,12 +769,12 @@ static ssize_t wq_state_show(struct device *dev,
 
        switch (wq->state) {
        case IDXD_WQ_DISABLED:
-               return sprintf(buf, "disabled\n");
+               return sysfs_emit(buf, "disabled\n");
        case IDXD_WQ_ENABLED:
-               return sprintf(buf, "enabled\n");
+               return sysfs_emit(buf, "enabled\n");
        }
 
-       return sprintf(buf, "unknown\n");
+       return sysfs_emit(buf, "unknown\n");
 }
 
 static struct device_attribute dev_attr_wq_state =
@@ -860,9 +786,9 @@ static ssize_t wq_group_id_show(struct device *dev,
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
        if (wq->group)
-               return sprintf(buf, "%u\n", wq->group->id);
+               return sysfs_emit(buf, "%u\n", wq->group->id);
        else
-               return sprintf(buf, "-1\n");
+               return sysfs_emit(buf, "-1\n");
 }
 
 static ssize_t wq_group_id_store(struct device *dev,
@@ -896,7 +822,7 @@ static ssize_t wq_group_id_store(struct device *dev,
                return count;
        }
 
-       group = &idxd->groups[id];
+       group = idxd->groups[id];
        prevg = wq->group;
 
        if (prevg)
@@ -914,8 +840,7 @@ static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr,
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%s\n",
-                       wq_dedicated(wq) ? "dedicated" : "shared");
+       return sysfs_emit(buf, "%s\n", wq_dedicated(wq) ? "dedicated" : "shared");
 }
 
 static ssize_t wq_mode_store(struct device *dev,
@@ -951,7 +876,7 @@ static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr,
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%u\n", wq->size);
+       return sysfs_emit(buf, "%u\n", wq->size);
 }
 
 static int total_claimed_wq_size(struct idxd_device *idxd)
@@ -960,7 +885,7 @@ static int total_claimed_wq_size(struct idxd_device *idxd)
        int wq_size = 0;
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                wq_size += wq->size;
        }
@@ -1002,7 +927,7 @@ static ssize_t wq_priority_show(struct device *dev,
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%u\n", wq->priority);
+       return sysfs_emit(buf, "%u\n", wq->priority);
 }
 
 static ssize_t wq_priority_store(struct device *dev,
@@ -1039,8 +964,7 @@ static ssize_t wq_block_on_fault_show(struct device *dev,
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%u\n",
-                      test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags));
+       return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags));
 }
 
 static ssize_t wq_block_on_fault_store(struct device *dev,
@@ -1079,7 +1003,7 @@ static ssize_t wq_threshold_show(struct device *dev,
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%u\n", wq->threshold);
+       return sysfs_emit(buf, "%u\n", wq->threshold);
 }
 
 static ssize_t wq_threshold_store(struct device *dev,
@@ -1122,15 +1046,12 @@ static ssize_t wq_type_show(struct device *dev,
 
        switch (wq->type) {
        case IDXD_WQT_KERNEL:
-               return sprintf(buf, "%s\n",
-                              idxd_wq_type_names[IDXD_WQT_KERNEL]);
+               return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]);
        case IDXD_WQT_USER:
-               return sprintf(buf, "%s\n",
-                              idxd_wq_type_names[IDXD_WQT_USER]);
+               return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]);
        case IDXD_WQT_NONE:
        default:
-               return sprintf(buf, "%s\n",
-                              idxd_wq_type_names[IDXD_WQT_NONE]);
+               return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_NONE]);
        }
 
        return -EINVAL;
@@ -1171,7 +1092,7 @@ static ssize_t wq_name_show(struct device *dev,
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%s\n", wq->name);
+       return sysfs_emit(buf, "%s\n", wq->name);
 }
 
 static ssize_t wq_name_store(struct device *dev,
@@ -1206,8 +1127,16 @@ static ssize_t wq_cdev_minor_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+       int minor = -1;
 
-       return sprintf(buf, "%d\n", wq->idxd_cdev.minor);
+       mutex_lock(&wq->wq_lock);
+       if (wq->idxd_cdev)
+               minor = wq->idxd_cdev->minor;
+       mutex_unlock(&wq->wq_lock);
+
+       if (minor == -1)
+               return -ENXIO;
+       return sysfs_emit(buf, "%d\n", minor);
 }
 
 static struct device_attribute dev_attr_wq_cdev_minor =
@@ -1233,7 +1162,7 @@ static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attri
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%llu\n", wq->max_xfer_bytes);
+       return sysfs_emit(buf, "%llu\n", wq->max_xfer_bytes);
 }
 
 static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr,
@@ -1267,7 +1196,7 @@ static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribut
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%u\n", wq->max_batch_size);
+       return sysfs_emit(buf, "%u\n", wq->max_batch_size);
 }
 
 static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr,
@@ -1300,7 +1229,7 @@ static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *
 {
        struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
 
-       return sprintf(buf, "%u\n", wq->ats_dis);
+       return sysfs_emit(buf, "%u\n", wq->ats_dis);
 }
 
 static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr,
@@ -1356,6 +1285,20 @@ static const struct attribute_group *idxd_wq_attribute_groups[] = {
        NULL,
 };
 
+static void idxd_conf_wq_release(struct device *dev)
+{
+       struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+
+       kfree(wq->wqcfg); /* must precede kfree(wq): wqcfg is reached through wq */
+       kfree(wq); /* frees the idxd_wq that embeds @dev as conf_dev */
+}
+
+struct device_type idxd_wq_device_type = {
+       .name = "wq",
+       .release = idxd_conf_wq_release, /* kfree()s wq->wqcfg, then the idxd_wq */
+       .groups = idxd_wq_attribute_groups, /* attribute groups for wq devices */
+};
+
 /* IDXD device attribs */
 static ssize_t version_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
@@ -1363,7 +1306,7 @@ static ssize_t version_show(struct device *dev, struct device_attribute *attr,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%#x\n", idxd->hw.version);
+       return sysfs_emit(buf, "%#x\n", idxd->hw.version);
 }
 static DEVICE_ATTR_RO(version);
 
@@ -1374,7 +1317,7 @@ static ssize_t max_work_queues_size_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->max_wq_size);
+       return sysfs_emit(buf, "%u\n", idxd->max_wq_size);
 }
 static DEVICE_ATTR_RO(max_work_queues_size);
 
@@ -1384,7 +1327,7 @@ static ssize_t max_groups_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->max_groups);
+       return sysfs_emit(buf, "%u\n", idxd->max_groups);
 }
 static DEVICE_ATTR_RO(max_groups);
 
@@ -1394,7 +1337,7 @@ static ssize_t max_work_queues_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->max_wqs);
+       return sysfs_emit(buf, "%u\n", idxd->max_wqs);
 }
 static DEVICE_ATTR_RO(max_work_queues);
 
@@ -1404,7 +1347,7 @@ static ssize_t max_engines_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->max_engines);
+       return sysfs_emit(buf, "%u\n", idxd->max_engines);
 }
 static DEVICE_ATTR_RO(max_engines);
 
@@ -1414,7 +1357,7 @@ static ssize_t numa_node_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%d\n", dev_to_node(&idxd->pdev->dev));
+       return sysfs_emit(buf, "%d\n", dev_to_node(&idxd->pdev->dev));
 }
 static DEVICE_ATTR_RO(numa_node);
 
@@ -1424,7 +1367,7 @@ static ssize_t max_batch_size_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->max_batch_size);
+       return sysfs_emit(buf, "%u\n", idxd->max_batch_size);
 }
 static DEVICE_ATTR_RO(max_batch_size);
 
@@ -1435,7 +1378,7 @@ static ssize_t max_transfer_size_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%llu\n", idxd->max_xfer_bytes);
+       return sysfs_emit(buf, "%llu\n", idxd->max_xfer_bytes);
 }
 static DEVICE_ATTR_RO(max_transfer_size);
 
@@ -1461,7 +1404,7 @@ static ssize_t gen_cap_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%#llx\n", idxd->hw.gen_cap.bits);
+       return sysfs_emit(buf, "%#llx\n", idxd->hw.gen_cap.bits);
 }
 static DEVICE_ATTR_RO(gen_cap);
 
@@ -1471,8 +1414,7 @@ static ssize_t configurable_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n",
-                       test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags));
+       return sysfs_emit(buf, "%u\n", test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags));
 }
 static DEVICE_ATTR_RO(configurable);
 
@@ -1486,13 +1428,13 @@ static ssize_t clients_show(struct device *dev,
 
        spin_lock_irqsave(&idxd->dev_lock, flags);
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                count += wq->client_count;
        }
        spin_unlock_irqrestore(&idxd->dev_lock, flags);
 
-       return sprintf(buf, "%d\n", count);
+       return sysfs_emit(buf, "%d\n", count);
 }
 static DEVICE_ATTR_RO(clients);
 
@@ -1502,7 +1444,7 @@ static ssize_t pasid_enabled_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", device_pasid_enabled(idxd));
+       return sysfs_emit(buf, "%u\n", device_pasid_enabled(idxd));
 }
 static DEVICE_ATTR_RO(pasid_enabled);
 
@@ -1515,14 +1457,14 @@ static ssize_t state_show(struct device *dev,
        switch (idxd->state) {
        case IDXD_DEV_DISABLED:
        case IDXD_DEV_CONF_READY:
-               return sprintf(buf, "disabled\n");
+               return sysfs_emit(buf, "disabled\n");
        case IDXD_DEV_ENABLED:
-               return sprintf(buf, "enabled\n");
+               return sysfs_emit(buf, "enabled\n");
        case IDXD_DEV_HALTED:
-               return sprintf(buf, "halted\n");
+               return sysfs_emit(buf, "halted\n");
        }
 
-       return sprintf(buf, "unknown\n");
+       return sysfs_emit(buf, "unknown\n");
 }
 static DEVICE_ATTR_RO(state);
 
@@ -1536,10 +1478,10 @@ static ssize_t errors_show(struct device *dev,
 
        spin_lock_irqsave(&idxd->dev_lock, flags);
        for (i = 0; i < 4; i++)
-               out += sprintf(buf + out, "%#018llx ", idxd->sw_err.bits[i]);
+               out += sysfs_emit_at(buf, out, "%#018llx ", idxd->sw_err.bits[i]);
        spin_unlock_irqrestore(&idxd->dev_lock, flags);
        out--;
-       out += sprintf(buf + out, "\n");
+       out += sysfs_emit_at(buf, out, "\n");
        return out;
 }
 static DEVICE_ATTR_RO(errors);
@@ -1550,7 +1492,7 @@ static ssize_t max_tokens_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->max_tokens);
+       return sysfs_emit(buf, "%u\n", idxd->max_tokens);
 }
 static DEVICE_ATTR_RO(max_tokens);
 
@@ -1560,7 +1502,7 @@ static ssize_t token_limit_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->token_limit);
+       return sysfs_emit(buf, "%u\n", idxd->token_limit);
 }
 
 static ssize_t token_limit_store(struct device *dev,
@@ -1599,7 +1541,7 @@ static ssize_t cdev_major_show(struct device *dev,
        struct idxd_device *idxd =
                container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%u\n", idxd->major);
+       return sysfs_emit(buf, "%u\n", idxd->major);
 }
 static DEVICE_ATTR_RO(cdev_major);
 
@@ -1608,7 +1550,7 @@ static ssize_t cmd_status_show(struct device *dev,
 {
        struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev);
 
-       return sprintf(buf, "%#x\n", idxd->cmd_status);
+       return sysfs_emit(buf, "%#x\n", idxd->cmd_status);
 }
 static DEVICE_ATTR_RO(cmd_status);
 
@@ -1644,183 +1586,161 @@ static const struct attribute_group *idxd_attribute_groups[] = {
        NULL,
 };
 
-static int idxd_setup_engine_sysfs(struct idxd_device *idxd)
+static void idxd_conf_device_release(struct device *dev)
 {
-       struct device *dev = &idxd->pdev->dev;
-       int i, rc;
+       struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev);
+
+       kfree(idxd->groups);
+       kfree(idxd->wqs);
+       kfree(idxd->engines);
+       kfree(idxd->irq_entries);
+       kfree(idxd->int_handles);
+       ida_free(&idxd_ida, idxd->id);
+       kfree(idxd);
+}
+
+struct device_type dsa_device_type = {
+       .name = "dsa",
+       .release = idxd_conf_device_release,
+       .groups = idxd_attribute_groups,
+};
+
+struct device_type iax_device_type = {
+       .name = "iax",
+       .release = idxd_conf_device_release,
+       .groups = idxd_attribute_groups,
+};
+
+static int idxd_register_engine_devices(struct idxd_device *idxd)
+{
+       int i, j, rc;
 
        for (i = 0; i < idxd->max_engines; i++) {
-               struct idxd_engine *engine = &idxd->engines[i];
-
-               engine->conf_dev.parent = &idxd->conf_dev;
-               dev_set_name(&engine->conf_dev, "engine%d.%d",
-                            idxd->id, engine->id);
-               engine->conf_dev.bus = idxd_get_bus_type(idxd);
-               engine->conf_dev.groups = idxd_engine_attribute_groups;
-               engine->conf_dev.type = &idxd_engine_device_type;
-               dev_dbg(dev, "Engine device register: %s\n",
-                       dev_name(&engine->conf_dev));
-               rc = device_register(&engine->conf_dev);
-               if (rc < 0) {
-                       put_device(&engine->conf_dev);
+               struct idxd_engine *engine = idxd->engines[i];
+
+               rc = device_add(&engine->conf_dev);
+               if (rc < 0)
                        goto cleanup;
-               }
        }
 
        return 0;
 
 cleanup:
-       while (i--) {
-               struct idxd_engine *engine = &idxd->engines[i];
+       j = i - 1;
+       for (; i < idxd->max_engines; i++)
+               put_device(&idxd->engines[i]->conf_dev);
 
-               device_unregister(&engine->conf_dev);
-       }
+       while (j--)
+               device_unregister(&idxd->engines[j]->conf_dev);
        return rc;
 }
 
-static int idxd_setup_group_sysfs(struct idxd_device *idxd)
+static int idxd_register_group_devices(struct idxd_device *idxd)
 {
-       struct device *dev = &idxd->pdev->dev;
-       int i, rc;
+       int i, j, rc;
 
        for (i = 0; i < idxd->max_groups; i++) {
-               struct idxd_group *group = &idxd->groups[i];
-
-               group->conf_dev.parent = &idxd->conf_dev;
-               dev_set_name(&group->conf_dev, "group%d.%d",
-                            idxd->id, group->id);
-               group->conf_dev.bus = idxd_get_bus_type(idxd);
-               group->conf_dev.groups = idxd_group_attribute_groups;
-               group->conf_dev.type = &idxd_group_device_type;
-               dev_dbg(dev, "Group device register: %s\n",
-                       dev_name(&group->conf_dev));
-               rc = device_register(&group->conf_dev);
-               if (rc < 0) {
-                       put_device(&group->conf_dev);
+               struct idxd_group *group = idxd->groups[i];
+
+               rc = device_add(&group->conf_dev);
+               if (rc < 0)
                        goto cleanup;
-               }
        }
 
        return 0;
 
 cleanup:
-       while (i--) {
-               struct idxd_group *group = &idxd->groups[i];
+       j = i - 1;
+       for (; i < idxd->max_groups; i++)
+               put_device(&idxd->groups[i]->conf_dev);
 
-               device_unregister(&group->conf_dev);
-       }
+       while (j--)
+               device_unregister(&idxd->groups[j]->conf_dev);
        return rc;
 }
 
-static int idxd_setup_wq_sysfs(struct idxd_device *idxd)
+static int idxd_register_wq_devices(struct idxd_device *idxd)
 {
-       struct device *dev = &idxd->pdev->dev;
-       int i, rc;
+       int i, rc, j;
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
-
-               wq->conf_dev.parent = &idxd->conf_dev;
-               dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id);
-               wq->conf_dev.bus = idxd_get_bus_type(idxd);
-               wq->conf_dev.groups = idxd_wq_attribute_groups;
-               wq->conf_dev.type = &idxd_wq_device_type;
-               dev_dbg(dev, "WQ device register: %s\n",
-                       dev_name(&wq->conf_dev));
-               rc = device_register(&wq->conf_dev);
-               if (rc < 0) {
-                       put_device(&wq->conf_dev);
+               struct idxd_wq *wq = idxd->wqs[i];
+
+               rc = device_add(&wq->conf_dev);
+               if (rc < 0)
                        goto cleanup;
-               }
        }
 
        return 0;
 
 cleanup:
-       while (i--) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+       j = i - 1;
+       for (; i < idxd->max_wqs; i++)
+               put_device(&idxd->wqs[i]->conf_dev);
 
-               device_unregister(&wq->conf_dev);
-       }
+       while (j--)
+               device_unregister(&idxd->wqs[j]->conf_dev);
        return rc;
 }
 
-static int idxd_setup_device_sysfs(struct idxd_device *idxd)
+int idxd_register_devices(struct idxd_device *idxd)
 {
        struct device *dev = &idxd->pdev->dev;
-       int rc;
-       char devname[IDXD_NAME_SIZE];
+       int rc, i;
 
-       sprintf(devname, "%s%d", idxd_get_dev_name(idxd), idxd->id);
-       idxd->conf_dev.parent = dev;
-       dev_set_name(&idxd->conf_dev, "%s", devname);
-       idxd->conf_dev.bus = idxd_get_bus_type(idxd);
-       idxd->conf_dev.groups = idxd_attribute_groups;
-       idxd->conf_dev.type = idxd_get_device_type(idxd);
-
-       dev_dbg(dev, "IDXD device register: %s\n", dev_name(&idxd->conf_dev));
-       rc = device_register(&idxd->conf_dev);
-       if (rc < 0) {
-               put_device(&idxd->conf_dev);
-               return rc;
-       }
-
-       return 0;
-}
-
-int idxd_setup_sysfs(struct idxd_device *idxd)
-{
-       struct device *dev = &idxd->pdev->dev;
-       int rc;
-
-       rc = idxd_setup_device_sysfs(idxd);
-       if (rc < 0) {
-               dev_dbg(dev, "Device sysfs registering failed: %d\n", rc);
+       rc = device_add(&idxd->conf_dev);
+       if (rc < 0)
                return rc;
-       }
 
-       rc = idxd_setup_wq_sysfs(idxd);
+       rc = idxd_register_wq_devices(idxd);
        if (rc < 0) {
-               /* unregister conf dev */
-               dev_dbg(dev, "Work Queue sysfs registering failed: %d\n", rc);
-               return rc;
+               dev_dbg(dev, "WQ devices registering failed: %d\n", rc);
+               goto err_wq;
        }
 
-       rc = idxd_setup_group_sysfs(idxd);
+       rc = idxd_register_engine_devices(idxd);
        if (rc < 0) {
-               /* unregister conf dev */
-               dev_dbg(dev, "Group sysfs registering failed: %d\n", rc);
-               return rc;
+               dev_dbg(dev, "Engine devices registering failed: %d\n", rc);
+               goto err_engine;
        }
 
-       rc = idxd_setup_engine_sysfs(idxd);
+       rc = idxd_register_group_devices(idxd);
        if (rc < 0) {
-               /* unregister conf dev */
-               dev_dbg(dev, "Engine sysfs registering failed: %d\n", rc);
-               return rc;
+               dev_dbg(dev, "Group device registering failed: %d\n", rc);
+               goto err_group;
        }
 
        return 0;
+
+ err_group:
+       for (i = 0; i < idxd->max_engines; i++)
+               device_unregister(&idxd->engines[i]->conf_dev);
+ err_engine:
+       for (i = 0; i < idxd->max_wqs; i++)
+               device_unregister(&idxd->wqs[i]->conf_dev);
+ err_wq:
+       device_del(&idxd->conf_dev);
+       return rc;
 }
 
-void idxd_cleanup_sysfs(struct idxd_device *idxd)
+void idxd_unregister_devices(struct idxd_device *idxd)
 {
        int i;
 
        for (i = 0; i < idxd->max_wqs; i++) {
-               struct idxd_wq *wq = &idxd->wqs[i];
+               struct idxd_wq *wq = idxd->wqs[i];
 
                device_unregister(&wq->conf_dev);
        }
 
        for (i = 0; i < idxd->max_engines; i++) {
-               struct idxd_engine *engine = &idxd->engines[i];
+               struct idxd_engine *engine = idxd->engines[i];
 
                device_unregister(&engine->conf_dev);
        }
 
        for (i = 0; i < idxd->max_groups; i++) {
-               struct idxd_group *group = &idxd->groups[i];
+               struct idxd_group *group = idxd->groups[i];
 
                device_unregister(&group->conf_dev);
        }
@@ -1830,26 +1750,10 @@ void idxd_cleanup_sysfs(struct idxd_device *idxd)
 
 int idxd_register_bus_type(void)
 {
-       int i, rc;
-
-       for (i = 0; i < IDXD_TYPE_MAX; i++) {
-               rc = bus_register(idxd_bus_types[i]);
-               if (rc < 0)
-                       goto bus_err;
-       }
-
-       return 0;
-
-bus_err:
-       while (--i >= 0)
-               bus_unregister(idxd_bus_types[i]);
-       return rc;
+       return bus_register(&dsa_bus_type);
 }
 
 void idxd_unregister_bus_type(void)
 {
-       int i;
-
-       for (i = 0; i < IDXD_TYPE_MAX; i++)
-               bus_unregister(idxd_bus_types[i]);
+       bus_unregister(&dsa_bus_type);
 }
index d0b2e60..ecdaada 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2013 - 2015 Linaro Ltd.
- * Copyright (c) 2013 Hisilicon Limited.
+ * Copyright (c) 2013 HiSilicon Limited.
  */
 #include <linux/sched.h>
 #include <linux/device.h>
@@ -1039,6 +1039,6 @@ static struct platform_driver k3_pdma_driver = {
 
 module_platform_driver(k3_pdma_driver);
 
-MODULE_DESCRIPTION("Hisilicon k3 DMA Driver");
+MODULE_DESCRIPTION("HiSilicon k3 DMA Driver");
 MODULE_ALIAS("platform:k3dma");
 MODULE_LICENSE("GPL v2");
index 57f5ee4..43ac3ab 100644 (file)
@@ -2281,6 +2281,7 @@ static int gpi_probe(struct platform_device *pdev)
 
 static const struct of_device_id gpi_of_match[] = {
        { .compatible = "qcom,sdm845-gpi-dma" },
+       { .compatible = "qcom,sm8150-gpi-dma" },
        { },
 };
 MODULE_DEVICE_TABLE(of, gpi_of_match);
index 6c0f9eb..23d6448 100644 (file)
@@ -90,12 +90,6 @@ static inline struct hidma_chan *to_hidma_chan(struct dma_chan *dmach)
        return container_of(dmach, struct hidma_chan, chan);
 }
 
-static inline
-struct hidma_desc *to_hidma_desc(struct dma_async_tx_descriptor *t)
-{
-       return container_of(t, struct hidma_desc, desc);
-}
-
 static void hidma_free(struct hidma_dev *dmadev)
 {
        INIT_LIST_HEAD(&dmadev->ddev.channels);
index 3aded78..75c0b8e 100644 (file)
@@ -2453,6 +2453,13 @@ static int xilinx_dma_terminate_all(struct dma_chan *dchan)
        return 0;
 }
 
+static void xilinx_dma_synchronize(struct dma_chan *dchan)
+{
+       struct xilinx_dma_chan *chan = to_xilinx_chan(dchan);
+
+       tasklet_kill(&chan->tasklet);
+}
+
 /**
  * xilinx_dma_channel_set_config - Configure VDMA channel
  * Run-time configuration for Axi VDMA, supports:
@@ -3074,6 +3081,7 @@ static int xilinx_dma_probe(struct platform_device *pdev)
        xdev->common.device_free_chan_resources =
                                xilinx_dma_free_chan_resources;
        xdev->common.device_terminate_all = xilinx_dma_terminate_all;
+       xdev->common.device_synchronize = xilinx_dma_synchronize;
        xdev->common.device_tx_status = xilinx_dma_tx_status;
        xdev->common.device_issue_pending = xilinx_dma_issue_pending;
        if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) {
index 64344e8..3c1c5da 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <asm/cpuidle.h>
 #include <asm/cputype.h>
+#include <asm/hypervisor.h>
 #include <asm/system_misc.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
@@ -138,7 +139,7 @@ static int psci_to_linux_errno(int errno)
                return -EINVAL;
        case PSCI_RET_DENIED:
                return -EPERM;
-       };
+       }
 
        return -EINVAL;
 }
@@ -501,6 +502,7 @@ static int __init psci_probe(void)
                psci_init_cpu_suspend();
                psci_init_system_suspend();
                psci_init_system_reset2();
+               kvm_init_hyp_services();
        }
 
        return 0;
index 72ab840..40d1914 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o
+obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o kvm_guest.o
 obj-$(CONFIG_ARM_SMCCC_SOC_ID) += soc_id.o
diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c
new file mode 100644 (file)
index 0000000..2d3e866
--- /dev/null
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "smccc: KVM: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/hypervisor.h>
+
+static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { };
+
+void __init kvm_init_hyp_services(void)
+{
+       struct arm_smccc_res res;
+       u32 val[4];
+
+       if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC)
+               return;
+
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
+       if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
+           res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
+           res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
+           res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
+               return;
+
+       memset(&res, 0, sizeof(res));
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
+
+       val[0] = lower_32_bits(res.a0);
+       val[1] = lower_32_bits(res.a1);
+       val[2] = lower_32_bits(res.a2);
+       val[3] = lower_32_bits(res.a3);
+
+       bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS);
+
+       pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n",
+                res.a3, res.a2, res.a1, res.a0);
+}
+
+bool kvm_arm_hyp_service_available(u32 func_id)
+{
+       if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS)
+               return false;
+
+       return test_bit(func_id, __kvm_arm_hyp_services);
+}
+EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available);
index d52bfc5..028f81d 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/arm-smccc.h>
+#include <linux/kernel.h>
 #include <asm/archrandom.h>
 
 static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
index d3b3de5..1dd0ec6 100644 (file)
@@ -321,9 +321,8 @@ config GPIO_HLWD
 
 config GPIO_ICH
        tristate "Intel ICH GPIO"
-       depends on PCI && X86
-       select MFD_CORE
-       select LPC_ICH
+       depends on X86
+       depends on LPC_ICH
        help
          Say yes here to support the GPIO functionality of a number of Intel
          ICH-based chipsets.  Currently supported devices: ICH6, ICH7, ICH8
@@ -502,6 +501,19 @@ config GPIO_RDA
        help
          Say Y here to support RDA Micro GPIO controller.
 
+config GPIO_REALTEK_OTTO
+       tristate "Realtek Otto GPIO support"
+       depends on MACH_REALTEK_RTL
+       default MACH_REALTEK_RTL
+       select GPIO_GENERIC
+       select GPIOLIB_IRQCHIP
+       help
+         The GPIO controller on the Otto MIPS platform supports up to two
+         banks of 32 GPIOs, with edge triggered interrupts. The 32 GPIOs
+         are grouped in four 8-bit wide ports.
+
+         When built as a module, the module will be called realtek_otto_gpio.
+
 config GPIO_REG
        bool
        help
@@ -847,9 +859,9 @@ config GPIO_IT87
 
 config GPIO_SCH
        tristate "Intel SCH/TunnelCreek/Centerton/Quark X1000 GPIO"
-       depends on (X86 || COMPILE_TEST) && PCI
-       select MFD_CORE
-       select LPC_SCH
+       depends on (X86 || COMPILE_TEST) && ACPI
+       depends on LPC_SCH
+       select GPIOLIB_IRQCHIP
        help
          Say yes here to support GPIO interface on Intel Poulsbo SCH,
          Intel Tunnel Creek processor, Intel Centerton processor or
index 4c12f31..d7c81e1 100644 (file)
@@ -125,6 +125,7 @@ obj-$(CONFIG_GPIO_RC5T583)          += gpio-rc5t583.o
 obj-$(CONFIG_GPIO_RCAR)                        += gpio-rcar.o
 obj-$(CONFIG_GPIO_RDA)                 += gpio-rda.o
 obj-$(CONFIG_GPIO_RDC321X)             += gpio-rdc321x.o
+obj-$(CONFIG_GPIO_REALTEK_OTTO)                += gpio-realtek-otto.o
 obj-$(CONFIG_GPIO_REG)                 += gpio-reg.o
 obj-$(CONFIG_ARCH_SA1100)              += gpio-sa1100.o
 obj-$(CONFIG_GPIO_SAMA5D2_PIOBU)       += gpio-sama5d2-piobu.o
index 7a9021c..71c0bea 100644 (file)
@@ -49,15 +49,15 @@ struct dio48e_gpio {
        unsigned char out_state[6];
        unsigned char control[2];
        raw_spinlock_t lock;
-       unsigned base;
+       unsigned int base;
        unsigned char irq_mask;
 };
 
-static int dio48e_gpio_get_direction(struct gpio_chip *chip, unsigned offset)
+static int dio48e_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
 {
        struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip);
-       const unsigned port = offset / 8;
-       const unsigned mask = BIT(offset % 8);
+       const unsigned int port = offset / 8;
+       const unsigned int mask = BIT(offset % 8);
 
        if (dio48egpio->io_state[port] & mask)
                return  GPIO_LINE_DIRECTION_IN;
@@ -65,14 +65,14 @@ static int dio48e_gpio_get_direction(struct gpio_chip *chip, unsigned offset)
        return GPIO_LINE_DIRECTION_OUT;
 }
 
-static int dio48e_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
+static int dio48e_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
 {
        struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip);
-       const unsigned io_port = offset / 8;
+       const unsigned int io_port = offset / 8;
        const unsigned int control_port = io_port / 3;
-       const unsigned control_addr = dio48egpio->base + 3 + control_port*4;
+       const unsigned int control_addr = dio48egpio->base + 3 + control_port * 4;
        unsigned long flags;
-       unsigned control;
+       unsigned int control;
 
        raw_spin_lock_irqsave(&dio48egpio->lock, flags);
 
@@ -104,17 +104,17 @@ static int dio48e_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
        return 0;
 }
 
-static int dio48e_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
-       int value)
+static int dio48e_gpio_direction_output(struct gpio_chip *chip, unsigned int offset,
+                                       int value)
 {
        struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip);
-       const unsigned io_port = offset / 8;
+       const unsigned int io_port = offset / 8;
        const unsigned int control_port = io_port / 3;
-       const unsigned mask = BIT(offset % 8);
-       const unsigned control_addr = dio48egpio->base + 3 + control_port*4;
-       const unsigned out_port = (io_port > 2) ? io_port + 1 : io_port;
+       const unsigned int mask = BIT(offset % 8);
+       const unsigned int control_addr = dio48egpio->base + 3 + control_port * 4;
+       const unsigned int out_port = (io_port > 2) ? io_port + 1 : io_port;
        unsigned long flags;
-       unsigned control;
+       unsigned int control;
 
        raw_spin_lock_irqsave(&dio48egpio->lock, flags);
 
@@ -154,14 +154,14 @@ static int dio48e_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
        return 0;
 }
 
-static int dio48e_gpio_get(struct gpio_chip *chip, unsigned offset)
+static int dio48e_gpio_get(struct gpio_chip *chip, unsigned int offset)
 {
        struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip);
-       const unsigned port = offset / 8;
-       const unsigned mask = BIT(offset % 8);
-       const unsigned in_port = (port > 2) ? port + 1 : port;
+       const unsigned int port = offset / 8;
+       const unsigned int mask = BIT(offset % 8);
+       const unsigned int in_port = (port > 2) ? port + 1 : port;
        unsigned long flags;
-       unsigned port_state;
+       unsigned int port_state;
 
        raw_spin_lock_irqsave(&dio48egpio->lock, flags);
 
@@ -202,12 +202,12 @@ static int dio48e_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask,
        return 0;
 }
 
-static void dio48e_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+static void dio48e_gpio_set(struct gpio_chip *chip, unsigned int offset, int value)
 {
        struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip);
-       const unsigned port = offset / 8;
-       const unsigned mask = BIT(offset % 8);
-       const unsigned out_port = (port > 2) ? port + 1 : port;
+       const unsigned int port = offset / 8;
+       const unsigned int mask = BIT(offset % 8);
+       const unsigned int out_port = (port > 2) ? port + 1 : port;
        unsigned long flags;
 
        raw_spin_lock_irqsave(&dio48egpio->lock, flags);
@@ -306,7 +306,7 @@ static void dio48e_irq_unmask(struct irq_data *data)
        raw_spin_unlock_irqrestore(&dio48egpio->lock, flags);
 }
 
-static int dio48e_irq_set_type(struct irq_data *data, unsigned flow_type)
+static int dio48e_irq_set_type(struct irq_data *data, unsigned int flow_type)
 {
        const unsigned long offset = irqd_to_hwirq(data);
 
index 0817143..34e35b6 100644 (file)
@@ -37,31 +37,6 @@ struct gpio_aggregator {
 static DEFINE_MUTEX(gpio_aggregator_lock);     /* protects idr */
 static DEFINE_IDR(gpio_aggregator_idr);
 
-static char *get_arg(char **args)
-{
-       char *start, *end;
-
-       start = skip_spaces(*args);
-       if (!*start)
-               return NULL;
-
-       if (*start == '"') {
-               /* Quoted arg */
-               end = strchr(++start, '"');
-               if (!end)
-                       return ERR_PTR(-EINVAL);
-       } else {
-               /* Unquoted arg */
-               for (end = start; *end && !isspace(*end); end++) ;
-       }
-
-       if (*end)
-               *end++ = '\0';
-
-       *args = end;
-       return start;
-}
-
 static int aggr_add_gpio(struct gpio_aggregator *aggr, const char *key,
                         int hwnum, unsigned int *n)
 {
@@ -83,8 +58,8 @@ static int aggr_add_gpio(struct gpio_aggregator *aggr, const char *key,
 
 static int aggr_parse(struct gpio_aggregator *aggr)
 {
+       char *args = skip_spaces(aggr->args);
        char *name, *offsets, *p;
-       char *args = aggr->args;
        unsigned long *bitmap;
        unsigned int i, n = 0;
        int error = 0;
@@ -93,13 +68,9 @@ static int aggr_parse(struct gpio_aggregator *aggr)
        if (!bitmap)
                return -ENOMEM;
 
-       for (name = get_arg(&args), offsets = get_arg(&args); name;
-            offsets = get_arg(&args)) {
-               if (IS_ERR(name)) {
-                       pr_err("Cannot get GPIO specifier: %pe\n", name);
-                       error = PTR_ERR(name);
-                       goto free_bitmap;
-               }
+       args = next_arg(args, &name, &p);
+       while (*args) {
+               args = next_arg(args, &offsets, &p);
 
                p = get_options(offsets, 0, &error);
                if (error == 0 || *p) {
@@ -125,7 +96,7 @@ static int aggr_parse(struct gpio_aggregator *aggr)
                                goto free_bitmap;
                }
 
-               name = get_arg(&args);
+               args = next_arg(args, &name, &p);
        }
 
        if (!n) {
index de56c01..3b31f5e 100644 (file)
@@ -5,13 +5,11 @@
  * Copyright (C) 2010 Extreme Engineering Solutions.
  */
 
-
 #include <linux/bitops.h>
 #include <linux/gpio/driver.h>
 #include <linux/ioport.h>
 #include <linux/mfd/lpc_ich.h>
 #include <linux/module.h>
-#include <linux/pci.h>
 #include <linux/platform_device.h>
 
 #define DRV_NAME "gpio_ich"
index 8f1be34..f332341 100644 (file)
@@ -125,14 +125,6 @@ static inline int superio_inw(int reg)
        return val;
 }
 
-static inline void superio_outw(int val, int reg)
-{
-       outb(reg++, REG);
-       outb(val >> 8, VAL);
-       outb(reg, REG);
-       outb(val, VAL);
-}
-
 static inline void superio_set_mask(int mask, int reg)
 {
        u8 curr_val = superio_inb(reg);
index 28b757d..d7e7387 100644 (file)
@@ -479,15 +479,10 @@ static struct platform_device *gpio_mockup_pdevs[GPIO_MOCKUP_MAX_GC];
 
 static void gpio_mockup_unregister_pdevs(void)
 {
-       struct platform_device *pdev;
        int i;
 
-       for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) {
-               pdev = gpio_mockup_pdevs[i];
-
-               if (pdev)
-                       platform_device_unregister(pdev);
-       }
+       for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++)
+               platform_device_unregister(gpio_mockup_pdevs[i]);
 }
 
 static __init char **gpio_mockup_make_line_names(const char *label,
index 6dfca83..4b9157a 100644 (file)
@@ -9,6 +9,7 @@
  * kind, whether express or implied.
  */
 
+#include <linux/acpi.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
@@ -18,6 +19,8 @@
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
+#include <linux/property.h>
+#include <linux/mod_devicetable.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/gpio/driver.h>
@@ -303,8 +306,8 @@ static int mpc8xxx_probe(struct platform_device *pdev)
        struct device_node *np = pdev->dev.of_node;
        struct mpc8xxx_gpio_chip *mpc8xxx_gc;
        struct gpio_chip        *gc;
-       const struct mpc8xxx_gpio_devtype *devtype =
-               of_device_get_match_data(&pdev->dev);
+       const struct mpc8xxx_gpio_devtype *devtype = NULL;
+       struct fwnode_handle *fwnode;
        int ret;
 
        mpc8xxx_gc = devm_kzalloc(&pdev->dev, sizeof(*mpc8xxx_gc), GFP_KERNEL);
@@ -315,14 +318,14 @@ static int mpc8xxx_probe(struct platform_device *pdev)
 
        raw_spin_lock_init(&mpc8xxx_gc->lock);
 
-       mpc8xxx_gc->regs = of_iomap(np, 0);
-       if (!mpc8xxx_gc->regs)
-               return -ENOMEM;
+       mpc8xxx_gc->regs = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(mpc8xxx_gc->regs))
+               return PTR_ERR(mpc8xxx_gc->regs);
 
        gc = &mpc8xxx_gc->gc;
        gc->parent = &pdev->dev;
 
-       if (of_property_read_bool(np, "little-endian")) {
+       if (device_property_read_bool(&pdev->dev, "little-endian")) {
                ret = bgpio_init(gc, &pdev->dev, 4,
                                 mpc8xxx_gc->regs + GPIO_DAT,
                                 NULL, NULL,
@@ -345,6 +348,7 @@ static int mpc8xxx_probe(struct platform_device *pdev)
 
        mpc8xxx_gc->direction_output = gc->direction_output;
 
+       devtype = device_get_match_data(&pdev->dev);
        if (!devtype)
                devtype = &mpc8xxx_gpio_devtype_default;
 
@@ -369,24 +373,29 @@ static int mpc8xxx_probe(struct platform_device *pdev)
         * associated input enable must be set (GPIOxGPIE[IEn]=1) to propagate
         * the port value to the GPIO Data Register.
         */
+       fwnode = dev_fwnode(&pdev->dev);
        if (of_device_is_compatible(np, "fsl,qoriq-gpio") ||
            of_device_is_compatible(np, "fsl,ls1028a-gpio") ||
-           of_device_is_compatible(np, "fsl,ls1088a-gpio"))
+           of_device_is_compatible(np, "fsl,ls1088a-gpio") ||
+           is_acpi_node(fwnode))
                gc->write_reg(mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff);
 
        ret = gpiochip_add_data(gc, mpc8xxx_gc);
        if (ret) {
-               pr_err("%pOF: GPIO chip registration failed with status %d\n",
-                      np, ret);
+               dev_err(&pdev->dev,
+                       "GPIO chip registration failed with status %d\n", ret);
                goto err;
        }
 
-       mpc8xxx_gc->irqn = irq_of_parse_and_map(np, 0);
+       mpc8xxx_gc->irqn = platform_get_irq(pdev, 0);
        if (!mpc8xxx_gc->irqn)
                return 0;
 
-       mpc8xxx_gc->irq = irq_domain_add_linear(np, MPC8XXX_GPIO_PINS,
-                                       &mpc8xxx_gpio_irq_ops, mpc8xxx_gc);
+       mpc8xxx_gc->irq = irq_domain_create_linear(fwnode,
+                                                  MPC8XXX_GPIO_PINS,
+                                                  &mpc8xxx_gpio_irq_ops,
+                                                  mpc8xxx_gc);
+
        if (!mpc8xxx_gc->irq)
                return 0;
 
@@ -399,8 +408,9 @@ static int mpc8xxx_probe(struct platform_device *pdev)
                               IRQF_SHARED, "gpio-cascade",
                               mpc8xxx_gc);
        if (ret) {
-               dev_err(&pdev->dev, "%s: failed to devm_request_irq(%d), ret = %d\n",
-                       np->full_name, mpc8xxx_gc->irqn, ret);
+               dev_err(&pdev->dev,
+                       "failed to devm_request_irq(%d), ret = %d\n",
+                       mpc8xxx_gc->irqn, ret);
                goto err;
        }
 
@@ -425,12 +435,21 @@ static int mpc8xxx_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id gpio_acpi_ids[] = {
+       {"NXP0031",},
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, gpio_acpi_ids);
+#endif
+
 static struct platform_driver mpc8xxx_plat_driver = {
        .probe          = mpc8xxx_probe,
        .remove         = mpc8xxx_remove,
        .driver         = {
                .name = "gpio-mpc8xxx",
                .of_match_table = mpc8xxx_gpio_ids,
+               .acpi_match_table = ACPI_PTR(gpio_acpi_ids),
        },
 };
 
index dfc0c1e..524b668 100644 (file)
@@ -60,11 +60,6 @@ static inline int is_imx23_gpio(struct mxs_gpio_port *port)
        return port->devid == IMX23_GPIO;
 }
 
-static inline int is_imx28_gpio(struct mxs_gpio_port *port)
-{
-       return port->devid == IMX28_GPIO;
-}
-
 /* Note: This driver assumes 32 GPIOs are handled in one register */
 
 static int mxs_gpio_set_irq_type(struct irq_data *d, unsigned int type)
index 5615226..ca23f72 100644 (file)
@@ -1373,15 +1373,14 @@ static int omap_gpio_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        struct device_node *node = dev->of_node;
-       const struct of_device_id *match;
        const struct omap_gpio_platform_data *pdata;
        struct gpio_bank *bank;
        struct irq_chip *irqc;
        int ret;
 
-       match = of_match_device(of_match_ptr(omap_gpio_match), dev);
+       pdata = device_get_match_data(dev);
 
-       pdata = match ? match->data : dev_get_platdata(dev);
+       pdata = pdata ?: dev_get_platdata(dev);
        if (!pdata)
                return -EINVAL;
 
diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c
new file mode 100644 (file)
index 0000000..cb64fb5
--- /dev/null
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/gpio/driver.h>
+#include <linux/irq.h>
+#include <linux/minmax.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+
+/*
+ * Total register block size is 0x1C for one bank of four ports (A, B, C, D).
+ * An optional second bank, with ports E, F, G, and H, may be present, starting
+ * at register offset 0x1C.
+ */
+
+/*
+ * Pin select: (0) "normal", (1) "dedicate peripheral"
+ * Not used on RTL8380/RTL8390, peripheral selection is managed by control bits
+ * in the peripheral registers.
+ */
+#define REALTEK_GPIO_REG_CNR           0x00
+/* Clear bit (0) for input, set bit (1) for output */
+#define REALTEK_GPIO_REG_DIR           0x08
+#define REALTEK_GPIO_REG_DATA          0x0C
+/* Read bit for IRQ status, write 1 to clear IRQ */
+#define REALTEK_GPIO_REG_ISR           0x10
+/* Two bits per GPIO in IMR registers */
+#define REALTEK_GPIO_REG_IMR           0x14
+#define REALTEK_GPIO_REG_IMR_AB                0x14
+#define REALTEK_GPIO_REG_IMR_CD                0x18
+#define REALTEK_GPIO_IMR_LINE_MASK     GENMASK(1, 0)
+#define REALTEK_GPIO_IRQ_EDGE_FALLING  1
+#define REALTEK_GPIO_IRQ_EDGE_RISING   2
+#define REALTEK_GPIO_IRQ_EDGE_BOTH     3
+
+#define REALTEK_GPIO_MAX               32
+#define REALTEK_GPIO_PORTS_PER_BANK    4
+
+/**
+ * struct realtek_gpio_ctrl - Realtek Otto GPIO driver data
+ *
+ * @gc: Associated gpio_chip instance
+ * @base: Base address of the register block for a GPIO bank
+ * @lock: Lock for accessing the IRQ registers and values
+ * @intr_mask: Per-port mask for interrupt lines, two bits per line
+ * @intr_type: Per-port interrupt type selection, two bits per line
+ *
+ * Because the interrupt mask register (IMR) combines the function of IRQ type
+ * selection and masking, two extra values are stored. @intr_mask is used to
+ * mask/unmask the interrupts for a GPIO port, and @intr_type is used to store
+ * the selected interrupt types. The bitwise AND of these values is written to
+ * IMR on changes.
+ */
+struct realtek_gpio_ctrl {
+       struct gpio_chip gc;
+       void __iomem *base;
+       raw_spinlock_t lock;
+       u16 intr_mask[REALTEK_GPIO_PORTS_PER_BANK];
+       u16 intr_type[REALTEK_GPIO_PORTS_PER_BANK];
+};
+
+/*
+ * Per-compatible quirk flags. A flag word is stored in the .data field of
+ * the OF match table and read back in probe via device_get_match_data().
+ * Expand with more flags as devices with other quirks are added.
+ */
+enum realtek_gpio_flags {
+       /*
+        * Allow disabling interrupts, for cases where the port order is
+        * unknown. This may result in a port mismatch between ISR and IMR:
+        * an interrupt would appear to come from a different line than the
+        * line the IRQ handler was assigned to, causing uncaught interrupts.
+        */
+       GPIO_INTERRUPTS_DISABLED = BIT(0),
+};
+
+/* Map an irq_data back to the driver state embedding its gpio_chip */
+static struct realtek_gpio_ctrl *irq_data_to_ctrl(struct irq_data *data)
+{
+       struct gpio_chip *chip;
+
+       chip = irq_data_get_irq_chip_data(data);
+       return container_of(chip, struct realtek_gpio_ctrl, gc);
+}
+
+/*
+ * Normal port order register access
+ *
+ * Port information is stored with the first port at offset 0, followed by the
+ * second, etc. Most registers store one bit per GPIO and use a u8 value per
+ * port. The two interrupt mask registers store two bits per GPIO, so use u16
+ * values.
+ */
+/* Write a port's 16-bit IMR: only type bits that are also unmasked are set */
+static void realtek_gpio_write_imr(struct realtek_gpio_ctrl *ctrl,
+       unsigned int port, u16 irq_type, u16 irq_mask)
+{
+       void __iomem *imr = ctrl->base + REALTEK_GPIO_REG_IMR + 2 * port;
+       u16 enabled = irq_type & irq_mask;
+
+       iowrite16(enabled, imr);
+}
+
+/* Write @mask to a port's 8-bit ISR; set bits clear the matching IRQ status */
+static void realtek_gpio_clear_isr(struct realtek_gpio_ctrl *ctrl,
+       unsigned int port, u8 mask)
+{
+       void __iomem *isr = ctrl->base + REALTEK_GPIO_REG_ISR + port;
+
+       iowrite8(mask, isr);
+}
+
+/* Read a port's 8-bit IRQ status register, one bit per line */
+static u8 realtek_gpio_read_isr(struct realtek_gpio_ctrl *ctrl, unsigned int port)
+{
+       void __iomem *isr = ctrl->base + REALTEK_GPIO_REG_ISR + port;
+
+       return ioread8(isr);
+}
+
+/*
+ * Position a two-bit (rising/falling edge) value in the IMR field that
+ * belongs to @pin. Bits of @value outside the two-bit line mask are dropped.
+ */
+static u16 realtek_gpio_imr_bits(unsigned int pin, u16 value)
+{
+       unsigned int shift = 2 * pin;
+
+       return (value & REALTEK_GPIO_IMR_LINE_MASK) << shift;
+}
+
+/* irq_chip .irq_ack callback: clear the line's bit in its port's ISR */
+static void realtek_gpio_irq_ack(struct irq_data *data)
+{
+       irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+       /* Eight lines per port: the quotient selects the port, the remainder the pin */
+       realtek_gpio_clear_isr(irq_data_to_ctrl(data), hwirq / 8, BIT(hwirq % 8));
+}
+
+/* irq_chip .irq_unmask callback: enable both mask bits of the line */
+static void realtek_gpio_irq_unmask(struct irq_data *data)
+{
+       struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data);
+       unsigned int hwirq = irqd_to_hwirq(data);
+       unsigned int port = hwirq / 8;
+       unsigned int pin = hwirq % 8;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ctrl->lock, flags);
+       /* Update the cached mask first, then push type & mask to the IMR */
+       ctrl->intr_mask[port] |= realtek_gpio_imr_bits(pin, REALTEK_GPIO_IMR_LINE_MASK);
+       realtek_gpio_write_imr(ctrl, port, ctrl->intr_type[port], ctrl->intr_mask[port]);
+       raw_spin_unlock_irqrestore(&ctrl->lock, flags);
+}
+
+/* irq_chip .irq_mask callback: clear both mask bits of the line */
+static void realtek_gpio_irq_mask(struct irq_data *data)
+{
+       struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data);
+       unsigned int hwirq = irqd_to_hwirq(data);
+       unsigned int port = hwirq / 8;
+       unsigned int pin = hwirq % 8;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ctrl->lock, flags);
+       /* The cached type is kept, so a later unmask restores the trigger */
+       ctrl->intr_mask[port] &= ~realtek_gpio_imr_bits(pin, REALTEK_GPIO_IMR_LINE_MASK);
+       realtek_gpio_write_imr(ctrl, port, ctrl->intr_type[port], ctrl->intr_mask[port]);
+       raw_spin_unlock_irqrestore(&ctrl->lock, flags);
+}
+
+/*
+ * irq_chip .irq_set_type callback. Only edge triggers are accepted; any
+ * other flow type returns -EINVAL before any driver state is touched.
+ */
+static int realtek_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type)
+{
+       struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data);
+       unsigned int hwirq = irqd_to_hwirq(data);
+       unsigned int port = hwirq / 8;
+       unsigned int pin = hwirq % 8;
+       unsigned long flags;
+       u16 edge;
+
+       switch (flow_type & IRQ_TYPE_SENSE_MASK) {
+       case IRQ_TYPE_EDGE_FALLING:
+               edge = REALTEK_GPIO_IRQ_EDGE_FALLING;
+               break;
+       case IRQ_TYPE_EDGE_RISING:
+               edge = REALTEK_GPIO_IRQ_EDGE_RISING;
+               break;
+       case IRQ_TYPE_EDGE_BOTH:
+               edge = REALTEK_GPIO_IRQ_EDGE_BOTH;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       irq_set_handler_locked(data, handle_edge_irq);
+
+       raw_spin_lock_irqsave(&ctrl->lock, flags);
+       /* Replace this pin's two-bit field, keeping the port's other pins */
+       ctrl->intr_type[port] &= ~realtek_gpio_imr_bits(pin, REALTEK_GPIO_IMR_LINE_MASK);
+       ctrl->intr_type[port] |= realtek_gpio_imr_bits(pin, edge);
+       realtek_gpio_write_imr(ctrl, port, ctrl->intr_type[port], ctrl->intr_mask[port]);
+       raw_spin_unlock_irqrestore(&ctrl->lock, flags);
+
+       return 0;
+}
+
+/*
+ * Chained handler for the parent interrupt: scan the ISR of every port and
+ * dispatch each pending line to its mapped IRQ.
+ *
+ * All lines of the chip share one IRQ domain, so the hwirq passed to
+ * irq_find_mapping() must be the line number counted from the first line
+ * of the first port (lines_done + offset), not the bit position within
+ * the port currently being scanned.
+ */
+static void realtek_gpio_irq_handler(struct irq_desc *desc)
+{
+       struct gpio_chip *gc = irq_desc_get_handler_data(desc);
+       struct realtek_gpio_ctrl *ctrl = gpiochip_get_data(gc);
+       struct irq_chip *irq_chip = irq_desc_get_chip(desc);
+       unsigned int lines_done;
+       unsigned int port_pin_count;
+       unsigned int irq;
+       unsigned long status;
+       int offset;
+
+       chained_irq_enter(irq_chip, desc);
+
+       for (lines_done = 0; lines_done < gc->ngpio; lines_done += 8) {
+               status = realtek_gpio_read_isr(ctrl, lines_done / 8);
+               /* The last port may expose fewer than eight lines */
+               port_pin_count = min(gc->ngpio - lines_done, 8U);
+               for_each_set_bit(offset, &status, port_pin_count) {
+                       irq = irq_find_mapping(gc->irq.domain,
+                                              lines_done + offset);
+                       generic_handle_irq(irq);
+               }
+       }
+
+       chained_irq_exit(irq_chip, desc);
+}
+
+/*
+ * gpio_irq_chip .init_hw callback: for every port in use, write zero to the
+ * IMR and clear all eight status bits in the ISR.
+ */
+static int realtek_gpio_irq_init(struct gpio_chip *gc)
+{
+       struct realtek_gpio_ctrl *ctrl = gpiochip_get_data(gc);
+       unsigned int line;
+
+       /* Step one port (eight lines) at a time */
+       for (line = 0; line < gc->ngpio; line += 8) {
+               realtek_gpio_write_imr(ctrl, line / 8, 0, 0);
+               realtek_gpio_clear_isr(ctrl, line / 8, GENMASK(7, 0));
+       }
+
+       return 0;
+}
+
+/* irq_chip callbacks for the GPIO lines; installed into gc.irq by probe */
+static struct irq_chip realtek_gpio_irq_chip = {
+       .name           = "realtek-otto-gpio",
+       .irq_set_type   = realtek_gpio_irq_set_type,
+       .irq_unmask     = realtek_gpio_irq_unmask,
+       .irq_mask       = realtek_gpio_irq_mask,
+       .irq_ack        = realtek_gpio_irq_ack,
+};
+
+/*
+ * Supported compatibles. The .data field carries realtek_gpio_flags; the
+ * generic "realtek,otto-gpio" entry sets GPIO_INTERRUPTS_DISABLED.
+ */
+static const struct of_device_id realtek_gpio_of_match[] = {
+       { .compatible = "realtek,otto-gpio", .data = (void *)GPIO_INTERRUPTS_DISABLED },
+       { .compatible = "realtek,rtl8380-gpio" },
+       { .compatible = "realtek,rtl8390-gpio" },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, realtek_gpio_of_match);
+
+/*
+ * Probe one GPIO bank: map its registers, register a generic big-endian
+ * MMIO gpio_chip and, unless the match data sets GPIO_INTERRUPTS_DISABLED
+ * or no parent IRQ is available, set up the chained interrupt chip.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int realtek_gpio_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       unsigned int dev_flags;
+       struct gpio_irq_chip *girq;
+       struct realtek_gpio_ctrl *ctrl;
+       u32 ngpios;
+       int err, irq;
+
+       ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL);
+       if (!ctrl)
+               return -ENOMEM;
+
+       /*
+        * The match data is a flag word stored in a pointer. Convert it
+        * through a pointer-sized integer so the cast is also valid on
+        * 64-bit builds.
+        */
+       dev_flags = (unsigned long)device_get_match_data(dev);
+
+       /* "ngpios" is optional; without it the full bank is exposed */
+       ngpios = REALTEK_GPIO_MAX;
+       device_property_read_u32(dev, "ngpios", &ngpios);
+
+       if (ngpios > REALTEK_GPIO_MAX) {
+               dev_err(dev, "invalid ngpios (max. %d)\n",
+                       REALTEK_GPIO_MAX);
+               return -EINVAL;
+       }
+
+       ctrl->base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(ctrl->base))
+               return PTR_ERR(ctrl->base);
+
+       raw_spin_lock_init(&ctrl->lock);
+
+       err = bgpio_init(&ctrl->gc, dev, 4,
+               ctrl->base + REALTEK_GPIO_REG_DATA, NULL, NULL,
+               ctrl->base + REALTEK_GPIO_REG_DIR, NULL,
+               BGPIOF_BIG_ENDIAN_BYTE_ORDER);
+       if (err) {
+               dev_err(dev, "unable to init generic GPIO\n");
+               return err;
+       }
+
+       ctrl->gc.ngpio = ngpios;
+       ctrl->gc.owner = THIS_MODULE;
+
+       /* Interrupt support is optional: skipped by quirk or missing IRQ */
+       irq = platform_get_irq_optional(pdev, 0);
+       if (!(dev_flags & GPIO_INTERRUPTS_DISABLED) && irq > 0) {
+               girq = &ctrl->gc.irq;
+               girq->chip = &realtek_gpio_irq_chip;
+               girq->default_type = IRQ_TYPE_NONE;
+               girq->handler = handle_bad_irq;
+               girq->parent_handler = realtek_gpio_irq_handler;
+               girq->num_parents = 1;
+               girq->parents = devm_kcalloc(dev, girq->num_parents,
+                                       sizeof(*girq->parents), GFP_KERNEL);
+               if (!girq->parents)
+                       return -ENOMEM;
+               girq->parents[0] = irq;
+               girq->init_hw = realtek_gpio_irq_init;
+       }
+
+       return devm_gpiochip_add_data(dev, &ctrl->gc, ctrl);
+}
+
+/* Platform driver glue; devices are bound through the OF match table */
+static struct platform_driver realtek_gpio_driver = {
+       .probe = realtek_gpio_probe,
+       .driver = {
+               .of_match_table = realtek_gpio_of_match,
+               .name = "realtek-otto-gpio",
+       },
+};
+module_platform_driver(realtek_gpio_driver);
+
+MODULE_DESCRIPTION("Realtek Otto GPIO support");
+MODULE_AUTHOR("Sander Vanheule <sander@svanheule.net>");
+MODULE_LICENSE("GPL v2");
index 3a1b1ad..a6f0421 100644 (file)
@@ -7,33 +7,55 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/gpio/driver.h>
 #include <linux/io.h>
+#include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci_ids.h>
 #include <linux/platform_device.h>
+#include <linux/types.h>
 
 #define GEN    0x00
 #define GIO    0x04
 #define GLV    0x08
+#define GTPE   0x0c
+#define GTNE   0x10
+#define GGPE   0x14
+#define GSMI   0x18
+#define GTS    0x1c
+
+#define CORE_BANK_OFFSET       0x00
+#define RESUME_BANK_OFFSET     0x20
+
+/*
+ * iLB datasheet describes GPE0BLK registers, in particular GPE0E.GPIO bit.
+ * Document Number: 328195-001
+ */
+#define GPE0E_GPIO     14
 
 struct sch_gpio {
        struct gpio_chip chip;
+       struct irq_chip irqchip;
        spinlock_t lock;
        unsigned short iobase;
        unsigned short resume_base;
+
+       /* GPE handling */
+       u32 gpe;
+       acpi_gpe_handler gpe_handler;
 };
 
 static unsigned int sch_gpio_offset(struct sch_gpio *sch, unsigned int gpio,
                                unsigned int reg)
 {
-       unsigned int base = 0;
+       unsigned int base = CORE_BANK_OFFSET;
 
        if (gpio >= sch->resume_base) {
                gpio -= sch->resume_base;
-               base += 0x20;
+               base = RESUME_BANK_OFFSET;
        }
 
        return base + reg + gpio / 8;
@@ -79,10 +101,11 @@ static void sch_gpio_reg_set(struct sch_gpio *sch, unsigned int gpio, unsigned i
 static int sch_gpio_direction_in(struct gpio_chip *gc, unsigned int gpio_num)
 {
        struct sch_gpio *sch = gpiochip_get_data(gc);
+       unsigned long flags;
 
-       spin_lock(&sch->lock);
+       spin_lock_irqsave(&sch->lock, flags);
        sch_gpio_reg_set(sch, gpio_num, GIO, 1);
-       spin_unlock(&sch->lock);
+       spin_unlock_irqrestore(&sch->lock, flags);
        return 0;
 }
 
@@ -96,20 +119,22 @@ static int sch_gpio_get(struct gpio_chip *gc, unsigned int gpio_num)
 static void sch_gpio_set(struct gpio_chip *gc, unsigned int gpio_num, int val)
 {
        struct sch_gpio *sch = gpiochip_get_data(gc);
+       unsigned long flags;
 
-       spin_lock(&sch->lock);
+       spin_lock_irqsave(&sch->lock, flags);
        sch_gpio_reg_set(sch, gpio_num, GLV, val);
-       spin_unlock(&sch->lock);
+       spin_unlock_irqrestore(&sch->lock, flags);
 }
 
 static int sch_gpio_direction_out(struct gpio_chip *gc, unsigned int gpio_num,
                                  int val)
 {
        struct sch_gpio *sch = gpiochip_get_data(gc);
+       unsigned long flags;
 
-       spin_lock(&sch->lock);
+       spin_lock_irqsave(&sch->lock, flags);
        sch_gpio_reg_set(sch, gpio_num, GIO, 0);
-       spin_unlock(&sch->lock);
+       spin_unlock_irqrestore(&sch->lock, flags);
 
        /*
         * according to the datasheet, writing to the level register has no
@@ -144,10 +169,145 @@ static const struct gpio_chip sch_gpio_chip = {
        .get_direction          = sch_gpio_get_direction,
 };
 
+/*
+ * irq_chip .irq_set_type callback: program the GTPE (rising) and GTNE
+ * (falling) bits of the line. Only edge triggers are accepted; anything
+ * else returns -EINVAL.
+ */
+static int sch_irq_type(struct irq_data *d, unsigned int type)
+{
+       struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+       struct sch_gpio *sch = gpiochip_get_data(gc);
+       irq_hw_number_t gpio_num = irqd_to_hwirq(d);
+       int rising = 0, falling = 0;
+       unsigned long flags;
+
+       switch (type & IRQ_TYPE_SENSE_MASK) {
+       case IRQ_TYPE_EDGE_RISING:
+               rising = 1;
+               break;
+       case IRQ_TYPE_EDGE_FALLING:
+               falling = 1;
+               break;
+       case IRQ_TYPE_EDGE_BOTH:
+               rising = 1;
+               falling = 1;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       spin_lock_irqsave(&sch->lock, flags);
+
+       sch_gpio_reg_set(sch, gpio_num, GTPE, rising);
+       sch_gpio_reg_set(sch, gpio_num, GTNE, falling);
+
+       irq_set_handler_locked(d, handle_edge_irq);
+
+       spin_unlock_irqrestore(&sch->lock, flags);
+
+       return 0;
+}
+
+/* irq_chip .irq_ack callback: write 1 to the line's GTS bit under the lock */
+static void sch_irq_ack(struct irq_data *d)
+{
+       struct sch_gpio *sch = gpiochip_get_data(irq_data_get_irq_chip_data(d));
+       irq_hw_number_t hwirq = irqd_to_hwirq(d);
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&sch->lock, irqflags);
+       sch_gpio_reg_set(sch, hwirq, GTS, 1);
+       spin_unlock_irqrestore(&sch->lock, irqflags);
+}
+
+/* Shared body of the mask/unmask callbacks: write @val to the line's GGPE bit */
+static void sch_irq_mask_unmask(struct irq_data *d, int val)
+{
+       struct sch_gpio *sch = gpiochip_get_data(irq_data_get_irq_chip_data(d));
+       irq_hw_number_t hwirq = irqd_to_hwirq(d);
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&sch->lock, irqflags);
+       sch_gpio_reg_set(sch, hwirq, GGPE, val);
+       spin_unlock_irqrestore(&sch->lock, irqflags);
+}
+
+/* irq_chip .irq_mask callback: write 0 to the line's GGPE bit */
+static void sch_irq_mask(struct irq_data *d)
+{
+       sch_irq_mask_unmask(d, 0);
+}
+
+/* irq_chip .irq_unmask callback: write 1 to the line's GGPE bit */
+static void sch_irq_unmask(struct irq_data *d)
+{
+       sch_irq_mask_unmask(d, 1);
+}
+
+/*
+ * ACPI GPE handler. @context is the struct sch_gpio passed at install time.
+ *
+ * Reads the GTS status of the core and resume banks, merges them into one
+ * bitmap indexed by GPIO number and dispatches every pending line to the
+ * IRQ mapped in the chip's IRQ domain.
+ *
+ * Returns ACPI_INTERRUPT_HANDLED or ACPI_INTERRUPT_NOT_HANDLED, always
+ * combined with ACPI_REENABLE_GPE.
+ */
+static u32 sch_gpio_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context)
+{
+       struct sch_gpio *sch = context;
+       struct gpio_chip *gc = &sch->chip;
+       unsigned long core_status, resume_status;
+       unsigned long pending;
+       unsigned long flags;
+       int offset;
+       u32 ret;
+
+       spin_lock_irqsave(&sch->lock, flags);
+
+       core_status = inl(sch->iobase + CORE_BANK_OFFSET + GTS);
+       resume_status = inl(sch->iobase + RESUME_BANK_OFFSET + GTS);
+
+       spin_unlock_irqrestore(&sch->lock, flags);
+
+       /* Resume-bank lines are numbered from resume_base upwards */
+       pending = (resume_status << sch->resume_base) | core_status;
+       for_each_set_bit(offset, &pending, sch->chip.ngpio)
+               generic_handle_irq(irq_find_mapping(gc->irq.domain, offset));
+
+       /* Set the return value depending on whether we handled an interrupt */
+       ret = pending ? ACPI_INTERRUPT_HANDLED : ACPI_INTERRUPT_NOT_HANDLED;
+
+       /* Acknowledge GPE to ACPICA */
+       ret |= ACPI_REENABLE_GPE;
+
+       return ret;
+}
+
+/*
+ * devm action registered by sch_gpio_install_gpe_handler(): disable the GPE
+ * and then remove its handler. @data is the struct sch_gpio.
+ */
+static void sch_gpio_remove_gpe_handler(void *data)
+{
+       struct sch_gpio *sch = data;
+
+       acpi_disable_gpe(NULL, sch->gpe);
+       acpi_remove_gpe_handler(NULL, sch->gpe, sch->gpe_handler);
+}
+
+/*
+ * Install sch->gpe_handler for GPE sch->gpe and enable that GPE, then
+ * register sch_gpio_remove_gpe_handler() as a devm action on the chip's
+ * parent device.
+ *
+ * Returns 0 on success, -ENODEV if the handler cannot be installed or the
+ * GPE cannot be enabled, or the result of devm_add_action_or_reset().
+ */
+static int sch_gpio_install_gpe_handler(struct sch_gpio *sch)
+{
+       struct device *dev = sch->chip.parent;
+       acpi_status status;
+
+       status = acpi_install_gpe_handler(NULL, sch->gpe, ACPI_GPE_LEVEL_TRIGGERED,
+                                         sch->gpe_handler, sch);
+       if (ACPI_FAILURE(status)) {
+               dev_err(dev, "Failed to install GPE handler for %u: %s\n",
+                       sch->gpe, acpi_format_exception(status));
+               return -ENODEV;
+       }
+
+       status = acpi_enable_gpe(NULL, sch->gpe);
+       if (ACPI_FAILURE(status)) {
+               dev_err(dev, "Failed to enable GPE handler for %u: %s\n",
+                       sch->gpe, acpi_format_exception(status));
+               /* Undo the install above; the devm action is not registered yet */
+               acpi_remove_gpe_handler(NULL, sch->gpe, sch->gpe_handler);
+               return -ENODEV;
+       }
+
+       return devm_add_action_or_reset(dev, sch_gpio_remove_gpe_handler, sch);
+}
+
 static int sch_gpio_probe(struct platform_device *pdev)
 {
+       struct gpio_irq_chip *girq;
        struct sch_gpio *sch;
        struct resource *res;
+       int ret;
 
        sch = devm_kzalloc(&pdev->dev, sizeof(*sch), GFP_KERNEL);
        if (!sch)
@@ -207,6 +367,28 @@ static int sch_gpio_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, sch);
 
+       sch->irqchip.name = "sch_gpio";
+       sch->irqchip.irq_ack = sch_irq_ack;
+       sch->irqchip.irq_mask = sch_irq_mask;
+       sch->irqchip.irq_unmask = sch_irq_unmask;
+       sch->irqchip.irq_set_type = sch_irq_type;
+
+       girq = &sch->chip.irq;
+       girq->chip = &sch->irqchip;
+       girq->num_parents = 0;
+       girq->parents = NULL;
+       girq->parent_handler = NULL;
+       girq->default_type = IRQ_TYPE_NONE;
+       girq->handler = handle_bad_irq;
+
+       /* GPE setup is optional */
+       sch->gpe = GPE0E_GPIO;
+       sch->gpe_handler = sch_gpio_gpe_handler;
+
+       ret = sch_gpio_install_gpe_handler(sch);
+       if (ret)
+               dev_warn(&pdev->dev, "Can't setup GPE, no IRQ support\n");
+
        return devm_gpiochip_add_data(&pdev->dev, &sch->chip, sch);
 }
 
index 1aacd2a..3ef22a3 100644 (file)
@@ -1291,6 +1291,13 @@ void acpi_gpiochip_remove(struct gpio_chip *chip)
        kfree(acpi_gpio);
 }
 
+/*
+ * Give the GPIO device the ACPI companion of the chip's parent device.
+ * Does nothing when the chip has no parent.
+ */
+void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev)
+{
+       struct device *parent = gc->parent;
+
+       if (!parent)
+               return;
+
+       /* Set default fwnode to parent's one */
+       ACPI_COMPANION_SET(&gdev->dev, ACPI_COMPANION(parent));
+}
+
 static int acpi_gpio_package_count(const union acpi_object *obj)
 {
        const union acpi_object *element = obj->package.elements;
@@ -1438,6 +1445,20 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = {
                        .no_edge_events_on_boot = true,
                },
        },
+       {
+               /*
+                * The Dell Venue 10 Pro 5055, with Bay Trail SoC + TI PMIC uses an
+                * external embedded-controller connected via I2C + an ACPI GPIO
+                * event handler on INT33FC:02 pin 12, causing spurious wakeups.
+                */
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Venue 10 Pro 5055"),
+               },
+               .driver_data = &(struct acpi_gpiolib_dmi_quirk) {
+                       .ignore_wake = "INT33FC:02@12",
+               },
+       },
        {
                /*
                 * HP X2 10 models with Cherry Trail SoC + TI PMIC use an
index e2edb63..e476558 100644 (file)
@@ -36,6 +36,8 @@ struct acpi_gpio_info {
 void acpi_gpiochip_add(struct gpio_chip *chip);
 void acpi_gpiochip_remove(struct gpio_chip *chip);
 
+void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev);
+
 void acpi_gpiochip_request_interrupts(struct gpio_chip *chip);
 void acpi_gpiochip_free_interrupts(struct gpio_chip *chip);
 
@@ -58,6 +60,8 @@ int acpi_gpio_count(struct device *dev, const char *con_id);
 static inline void acpi_gpiochip_add(struct gpio_chip *chip) { }
 static inline void acpi_gpiochip_remove(struct gpio_chip *chip) { }
 
+static inline void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev) { }
+
 static inline void
 acpi_gpiochip_request_interrupts(struct gpio_chip *chip) { }
 
index baf0153..bbcc7c0 100644 (file)
@@ -1042,11 +1042,13 @@ void of_gpiochip_remove(struct gpio_chip *chip)
 
 void of_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev)
 {
+       /* Set default OF node to parent's one if present */
+       if (gc->parent)
+               gdev->dev.of_node = gc->parent->of_node;
+
        /* If the gpiochip has an assigned OF node this takes precedence */
        if (gc->of_node)
                gdev->dev.of_node = gc->of_node;
        else
                gc->of_node = gdev->dev.of_node;
-       if (gdev->dev.of_node)
-               gdev->dev.fwnode = of_fwnode_handle(gdev->dev.of_node);
 }
index 6367646..1427c1b 100644 (file)
@@ -586,14 +586,12 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
        if (!gdev)
                return -ENOMEM;
        gdev->dev.bus = &gpio_bus_type;
+       gdev->dev.parent = gc->parent;
        gdev->chip = gc;
        gc->gpiodev = gdev;
-       if (gc->parent) {
-               gdev->dev.parent = gc->parent;
-               gdev->dev.of_node = gc->parent->of_node;
-       }
 
        of_gpio_dev_init(gc, gdev);
+       acpi_gpio_dev_init(gc, gdev);
 
        /*
         * Assign fwnode depending on the result of the previous calls,
@@ -1465,9 +1463,8 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
                                struct lock_class_key *lock_key,
                                struct lock_class_key *request_key)
 {
+       struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev);
        struct irq_chip *irqchip = gc->irq.chip;
-       const struct irq_domain_ops *ops = NULL;
-       struct device_node *np;
        unsigned int type;
        unsigned int i;
 
@@ -1479,7 +1476,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
                return -EINVAL;
        }
 
-       np = gc->gpiodev->dev.of_node;
        type = gc->irq.default_type;
 
        /*
@@ -1487,15 +1483,9 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
         * used to configure the interrupts, as you may end up with
         * conflicting triggers. Tell the user, and reset to NONE.
         */
-       if (WARN(np && type != IRQ_TYPE_NONE,
-                "%s: Ignoring %u default trigger\n", np->full_name, type))
-               type = IRQ_TYPE_NONE;
-
-       if (has_acpi_companion(gc->parent) && type != IRQ_TYPE_NONE) {
-               acpi_handle_warn(ACPI_HANDLE(gc->parent),
-                                "Ignoring %u default trigger\n", type);
+       if (WARN(fwnode && type != IRQ_TYPE_NONE,
+                "%pfw: Ignoring %u default trigger\n", fwnode, type))
                type = IRQ_TYPE_NONE;
-       }
 
        if (gc->to_irq)
                chip_warn(gc, "to_irq is redefined in %s and you shouldn't rely on it\n", __func__);
@@ -1512,15 +1502,11 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc,
                        return ret;
        } else {
                /* Some drivers provide custom irqdomain ops */
-               if (gc->irq.domain_ops)
-                       ops = gc->irq.domain_ops;
-
-               if (!ops)
-                       ops = &gpiochip_domain_ops;
-               gc->irq.domain = irq_domain_add_simple(np,
+               gc->irq.domain = irq_domain_create_simple(fwnode,
                        gc->ngpio,
                        gc->irq.first,
-                       ops, gc);
+                       gc->irq.domain_ops ?: &gpiochip_domain_ops,
+                       gc);
                if (!gc->irq.domain)
                        return -EINVAL;
        }
@@ -3684,11 +3670,12 @@ EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index);
  */
 int gpiod_count(struct device *dev, const char *con_id)
 {
+       const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL;
        int count = -ENOENT;
 
-       if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node)
+       if (is_of_node(fwnode))
                count = of_gpio_get_count(dev, con_id);
-       else if (IS_ENABLED(CONFIG_ACPI) && dev && ACPI_HANDLE(dev))
+       else if (is_acpi_node(fwnode))
                count = acpi_gpio_count(dev, con_id);
 
        if (count < 0)
@@ -3826,18 +3813,17 @@ struct gpio_desc *__must_check gpiod_get_index(struct device *dev,
        int ret;
        /* Maybe we have a device name, maybe not */
        const char *devname = dev ? dev_name(dev) : "?";
+       const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL;
 
        dev_dbg(dev, "GPIO lookup for consumer %s\n", con_id);
 
-       if (dev) {
-               /* Using device tree? */
-               if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
-                       dev_dbg(dev, "using device tree for GPIO lookup\n");
-                       desc = of_find_gpio(dev, con_id, idx, &lookupflags);
-               } else if (ACPI_COMPANION(dev)) {
-                       dev_dbg(dev, "using ACPI for GPIO lookup\n");
-                       desc = acpi_find_gpio(dev, con_id, idx, &flags, &lookupflags);
-               }
+       /* Using device tree? */
+       if (is_of_node(fwnode)) {
+               dev_dbg(dev, "using device tree for GPIO lookup\n");
+               desc = of_find_gpio(dev, con_id, idx, &lookupflags);
+       } else if (is_acpi_node(fwnode)) {
+               dev_dbg(dev, "using ACPI for GPIO lookup\n");
+               desc = acpi_find_gpio(dev, con_id, idx, &flags, &lookupflags);
        }
 
        /*
@@ -3921,9 +3907,6 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
        struct gpio_desc *desc = ERR_PTR(-ENODEV);
        int ret;
 
-       if (!fwnode)
-               return ERR_PTR(-EINVAL);
-
        if (is_of_node(fwnode)) {
                desc = gpiod_get_from_of_node(to_of_node(fwnode),
                                              propname, index,
@@ -3939,7 +3922,8 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 
                acpi_gpio_update_gpiod_flags(&dflags, &info);
                acpi_gpio_update_gpiod_lookup_flags(&lflags, &info);
-       }
+       } else
+               return ERR_PTR(-EINVAL);
 
        /* Currently only ACPI takes this path */
        ret = gpiod_request(desc, label);
@@ -4220,11 +4204,13 @@ EXPORT_SYMBOL_GPL(gpiod_put_array);
 
 static int gpio_bus_match(struct device *dev, struct device_driver *drv)
 {
+       struct fwnode_handle *fwnode = dev_fwnode(dev);
+
        /*
         * Only match if the fwnode doesn't already have a proper struct device
         * created for it.
         */
-       if (dev->fwnode && dev->fwnode->dev != dev)
+       if (fwnode && fwnode->dev != dev)
                return 0;
        return 1;
 }
index 5a1f243..73f2257 100644 (file)
@@ -333,10 +333,6 @@ int kfd_iommu_resume(struct kfd_dev *kfd)
        return 0;
 }
 
-extern bool amd_iommu_pc_supported(void);
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-
 /** kfd_iommu_add_perf_counters - Add IOMMU performance counters to topology
  */
 int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev)
index 21cc408..ce6b664 100644 (file)
@@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 
        max_order = MAX_ORDER;
 #ifdef CONFIG_SWIOTLB
-       if (swiotlb_nr_tbl()) {
+       if (is_swiotlb_active()) {
                unsigned int max_segment;
 
                max_segment = swiotlb_max_segment();
index 6a35a30..cf89729 100644 (file)
@@ -188,10 +188,7 @@ int adreno_zap_shader_load(struct msm_gpu *gpu, u32 pasid)
 
 void adreno_set_llc_attributes(struct iommu_domain *iommu)
 {
-       struct io_pgtable_domain_attr pgtbl_cfg;
-
-       pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
-       iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg);
+       iommu_set_pgtable_quirks(iommu, IO_PGTABLE_QUIRK_ARM_OUTER_WBWA);
 }
 
 struct msm_gem_address_space *
index b81ae90..e8b506a 100644 (file)
@@ -321,7 +321,7 @@ nouveau_ttm_init(struct nouveau_drm *drm)
        }
 
 #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
-       need_swiotlb = !!swiotlb_nr_tbl();
+       need_swiotlb = is_swiotlb_active();
 #endif
 
        ret = ttm_device_init(&drm->ttm.bdev, &nouveau_bo_driver, drm->dev->dev,
index 1864467..6754f57 100644 (file)
@@ -1,4 +1,3 @@
-/* vim: set ts=8 sw=8 tw=78 ai noexpandtab */
 /* qxl_drv.c -- QXL driver -*- linux-c -*-
  *
  * Copyright 2011 Red Hat, Inc.
index 32cd263..53e1347 100644 (file)
@@ -28,17 +28,6 @@ config HWSPINLOCK_QCOM
 
          If unsure, say N.
 
-config HWSPINLOCK_SIRF
-       tristate "SIRF Hardware Spinlock device"
-       depends on ARCH_SIRF || COMPILE_TEST
-       help
-         Say y here to support the SIRF Hardware Spinlock device, which
-         provides a synchronisation mechanism for the various processors
-         on the SoC.
-
-         It's safe to say n here if you're not interested in SIRF hardware
-         spinlock or just want a bare minimum kernel.
-
 config HWSPINLOCK_SPRD
        tristate "SPRD Hardware Spinlock device"
        depends on ARCH_SPRD || COMPILE_TEST
index ed053e3..1f8dd6f 100644 (file)
@@ -6,7 +6,6 @@
 obj-$(CONFIG_HWSPINLOCK)               += hwspinlock_core.o
 obj-$(CONFIG_HWSPINLOCK_OMAP)          += omap_hwspinlock.o
 obj-$(CONFIG_HWSPINLOCK_QCOM)          += qcom_hwspinlock.o
-obj-$(CONFIG_HWSPINLOCK_SIRF)          += sirf_hwspinlock.o
 obj-$(CONFIG_HWSPINLOCK_SPRD)          += sprd_hwspinlock.o
 obj-$(CONFIG_HWSPINLOCK_STM32)         += stm32_hwspinlock.o
 obj-$(CONFIG_HSEM_U8500)               += u8500_hsem.o
diff --git a/drivers/hwspinlock/sirf_hwspinlock.c b/drivers/hwspinlock/sirf_hwspinlock.c
deleted file mode 100644 (file)
index a3f7712..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * SIRF hardware spinlock driver
- *
- * Copyright (c) 2015 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/hwspinlock.h>
-#include <linux/platform_device.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include "hwspinlock_internal.h"
-
-struct sirf_hwspinlock {
-       void __iomem *io_base;
-       struct hwspinlock_device bank;
-};
-
-/* Number of Hardware Spinlocks*/
-#define        HW_SPINLOCK_NUMBER      30
-
-/* Hardware spinlock register offsets */
-#define HW_SPINLOCK_BASE       0x404
-#define HW_SPINLOCK_OFFSET(x)  (HW_SPINLOCK_BASE + 0x4 * (x))
-
-static int sirf_hwspinlock_trylock(struct hwspinlock *lock)
-{
-       void __iomem *lock_addr = lock->priv;
-
-       /* attempt to acquire the lock by reading value == 1 from it */
-       return !!readl(lock_addr);
-}
-
-static void sirf_hwspinlock_unlock(struct hwspinlock *lock)
-{
-       void __iomem *lock_addr = lock->priv;
-
-       /* release the lock by writing 0 to it */
-       writel(0, lock_addr);
-}
-
-static const struct hwspinlock_ops sirf_hwspinlock_ops = {
-       .trylock = sirf_hwspinlock_trylock,
-       .unlock = sirf_hwspinlock_unlock,
-};
-
-static int sirf_hwspinlock_probe(struct platform_device *pdev)
-{
-       struct sirf_hwspinlock *hwspin;
-       struct hwspinlock *hwlock;
-       int idx;
-
-       if (!pdev->dev.of_node)
-               return -ENODEV;
-
-       hwspin = devm_kzalloc(&pdev->dev,
-                             struct_size(hwspin, bank.lock,
-                                         HW_SPINLOCK_NUMBER),
-                             GFP_KERNEL);
-       if (!hwspin)
-               return -ENOMEM;
-
-       /* retrieve io base */
-       hwspin->io_base = devm_platform_ioremap_resource(pdev, 0);
-       if (IS_ERR(hwspin->io_base))
-               return PTR_ERR(hwspin->io_base);
-
-       for (idx = 0; idx < HW_SPINLOCK_NUMBER; idx++) {
-               hwlock = &hwspin->bank.lock[idx];
-               hwlock->priv = hwspin->io_base + HW_SPINLOCK_OFFSET(idx);
-       }
-
-       platform_set_drvdata(pdev, hwspin);
-
-       return devm_hwspin_lock_register(&pdev->dev, &hwspin->bank,
-                                        &sirf_hwspinlock_ops, 0,
-                                        HW_SPINLOCK_NUMBER);
-}
-
-static const struct of_device_id sirf_hwpinlock_ids[] = {
-       { .compatible = "sirf,hwspinlock", },
-       {},
-};
-MODULE_DEVICE_TABLE(of, sirf_hwpinlock_ids);
-
-static struct platform_driver sirf_hwspinlock_driver = {
-       .probe = sirf_hwspinlock_probe,
-       .driver = {
-               .name = "atlas7_hwspinlock",
-               .of_match_table = sirf_hwpinlock_ids,
-       },
-};
-
-module_platform_driver(sirf_hwspinlock_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("SIRF Hardware spinlock driver");
-MODULE_AUTHOR("Wei Chen <wei.chen@csr.com>");
index 7b44ba2..84530fd 100644 (file)
@@ -97,15 +97,15 @@ config CORESIGHT_SOURCE_ETM3X
          module will be called coresight-etm3x.
 
 config CORESIGHT_SOURCE_ETM4X
-       tristate "CoreSight Embedded Trace Macrocell 4.x driver"
+       tristate "CoreSight ETMv4.x / ETE driver"
        depends on ARM64
        select CORESIGHT_LINKS_AND_SINKS
        select PID_IN_CONTEXTIDR
        help
-         This driver provides support for the ETM4.x tracer module, tracing the
-         instructions that a processor is executing. This is primarily useful
-         for instruction level tracing. Depending on the implemented version
-         data tracing may also be available.
+         This driver provides support for the CoreSight Embedded Trace Macrocell
+         version 4.x and the Embedded Trace Extensions (ETE). Both are CPU tracer
+         modules, tracing the instructions that a processor is executing. This is
+         primarily useful for instruction level tracing.
 
          To compile this driver as a module, choose M here: the
          module will be called coresight-etm4x.
@@ -173,4 +173,18 @@ config CORESIGHT_CTI_INTEGRATION_REGS
          CTI trigger connections between this and other devices.These
          registers are not used in normal operation and can leave devices in
          an inconsistent state.
+
+config CORESIGHT_TRBE
+       tristate "Trace Buffer Extension (TRBE) driver"
+       depends on ARM64 && CORESIGHT_SOURCE_ETM4X
+       help
+         This driver provides support for percpu Trace Buffer Extension (TRBE).
+         TRBE always needs to be used along with its corresponding percpu ETE
+         component. ETE generates trace data which is then captured with TRBE.
+         Unlike traditional sink devices, TRBE is a CPU feature accessible via
+         system registers. But its explicit dependency on the trace unit (ETE)
+         requires it to be plugged in as a coresight sink device.
+
+         To compile this driver as a module, choose M here: the module will be
+         called coresight-trbe.
 endif
index f20e357..d608165 100644 (file)
@@ -21,5 +21,6 @@ obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o
 obj-$(CONFIG_CORESIGHT_CPU_DEBUG) += coresight-cpu-debug.o
 obj-$(CONFIG_CORESIGHT_CATU) += coresight-catu.o
 obj-$(CONFIG_CORESIGHT_CTI) += coresight-cti.o
+obj-$(CONFIG_CORESIGHT_TRBE) += coresight-trbe.o
 coresight-cti-y := coresight-cti-core.o        coresight-cti-platform.o \
                   coresight-cti-sysfs.o
index b57bea1..6c68d34 100644 (file)
@@ -23,6 +23,7 @@
 #include "coresight-priv.h"
 
 static DEFINE_MUTEX(coresight_mutex);
+static DEFINE_PER_CPU(struct coresight_device *, csdev_sink);
 
 /**
  * struct coresight_node - elements of a path, from source to sink
@@ -70,6 +71,18 @@ void coresight_remove_cti_ops(void)
 }
 EXPORT_SYMBOL_GPL(coresight_remove_cti_ops);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev)
+{
+       per_cpu(csdev_sink, cpu) = csdev;
+}
+EXPORT_SYMBOL_GPL(coresight_set_percpu_sink);
+
+struct coresight_device *coresight_get_percpu_sink(int cpu)
+{
+       return per_cpu(csdev_sink, cpu);
+}
+EXPORT_SYMBOL_GPL(coresight_get_percpu_sink);
+
 static int coresight_id_match(struct device *dev, void *data)
 {
        int trace_id, i_trace_id;
@@ -784,6 +797,14 @@ static int _coresight_build_path(struct coresight_device *csdev,
        if (csdev == sink)
                goto out;
 
+       if (coresight_is_percpu_source(csdev) && coresight_is_percpu_sink(sink) &&
+           sink == per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev))) {
+               if (_coresight_build_path(sink, sink, path) == 0) {
+                       found = true;
+                       goto out;
+               }
+       }
+
        /* Not a sink - recursively explore each port found on this element */
        for (i = 0; i < csdev->pdata->nr_outport; i++) {
                struct coresight_device *child_dev;
@@ -999,8 +1020,12 @@ coresight_find_default_sink(struct coresight_device *csdev)
        int depth = 0;
 
        /* look for a default sink if we have not found for this device */
-       if (!csdev->def_sink)
-               csdev->def_sink = coresight_find_sink(csdev, &depth);
+       if (!csdev->def_sink) {
+               if (coresight_is_percpu_source(csdev))
+                       csdev->def_sink = per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev));
+               if (!csdev->def_sink)
+                       csdev->def_sink = coresight_find_sink(csdev, &depth);
+       }
        return csdev->def_sink;
 }
 
index c1bec2a..6f39837 100644 (file)
 static struct pmu etm_pmu;
 static bool etm_perf_up;
 
-static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
+/*
+ * An ETM context for a running event includes the perf aux handle
+ * and aux_data. For ETM, the aux_data (etm_event_data), consists of
+ * the trace path and the sink configuration. The event data is accessible
+ * via perf_get_aux(handle). However, a sink could "end" a perf output
+ * handle via the IRQ handler. And if the "sink" encounters a failure
+ * to "begin" another session (e.g due to lack of space in the buffer),
+ * the handle will be cleared. Thus, the event_data may not be accessible
+ * from the handle when we get to the etm_event_stop(), which is required
+ * for stopping the trace path. The event_data is guaranteed to stay alive
+ * until "free_aux()", which cannot happen as long as the event is active on
+ * the ETM. Thus the event_data for the session must be part of the ETM context
+ * to make sure we can disable the trace path.
+ */
+struct etm_ctxt {
+       struct perf_output_handle handle;
+       struct etm_event_data *event_data;
+};
+
+static DEFINE_PER_CPU(struct etm_ctxt, etm_ctxt);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
 /*
@@ -232,6 +251,25 @@ static void etm_free_aux(void *data)
        schedule_work(&event_data->work);
 }
 
+/*
+ * Check if two given sinks are compatible with each other,
+ * so that they can use the same sink buffers, when an event
+ * moves around.
+ */
+static bool sinks_compatible(struct coresight_device *a,
+                            struct coresight_device *b)
+{
+       if (!a || !b)
+               return false;
+       /*
+        * If the sinks are of the same subtype and driven
+        * by the same driver, we can use the same buffer
+        * on these sinks.
+        */
+       return (a->subtype.sink_subtype == b->subtype.sink_subtype) &&
+              (sink_ops(a) == sink_ops(b));
+}
+
 static void *etm_setup_aux(struct perf_event *event, void **pages,
                           int nr_pages, bool overwrite)
 {
@@ -239,6 +277,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        int cpu = event->cpu;
        cpumask_t *mask;
        struct coresight_device *sink = NULL;
+       struct coresight_device *user_sink = NULL, *last_sink = NULL;
        struct etm_event_data *event_data = NULL;
 
        event_data = alloc_event_data(cpu);
@@ -249,7 +288,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        /* First get the selected sink from user space. */
        if (event->attr.config2) {
                id = (u32)event->attr.config2;
-               sink = coresight_get_sink_by_id(id);
+               sink = user_sink = coresight_get_sink_by_id(id);
        }
 
        mask = &event_data->mask;
@@ -277,14 +316,33 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
                }
 
                /*
-                * No sink provided - look for a default sink for one of the
-                * devices. At present we only support topology where all CPUs
-                * use the same sink [N:1], so only need to find one sink. The
-                * coresight_build_path later will remove any CPU that does not
-                * attach to the sink, or if we have not found a sink.
+                * No sink provided - look for a default sink for all the ETMs,
+                * where this event can be scheduled.
+                * We allocate the sink specific buffers only once for this
+                * event. If the ETMs have different default sink devices, we
+                * can only use a single "type" of sink as the event can carry
+                * only one sink specific buffer. Thus we have to make sure
+                * that the sinks are of the same type and driven by the same
+                * driver, as the one we allocate the buffer for. As such
+                * we choose the first sink and check if the remaining ETMs
+                * have a compatible default sink. We don't trace on a CPU
+                * if the sink is not compatible.
                 */
-               if (!sink)
+               if (!user_sink) {
+                       /* Find the default sink for this ETM */
                        sink = coresight_find_default_sink(csdev);
+                       if (!sink) {
+                               cpumask_clear_cpu(cpu, mask);
+                               continue;
+                       }
+
+                       /* Check if this sink is compatible with the last sink */
+                       if (last_sink && !sinks_compatible(last_sink, sink)) {
+                               cpumask_clear_cpu(cpu, mask);
+                               continue;
+                       }
+                       last_sink = sink;
+               }
 
                /*
                 * Building a path doesn't enable it, it simply builds a
@@ -312,7 +370,12 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        if (!sink_ops(sink)->alloc_buffer || !sink_ops(sink)->free_buffer)
                goto err;
 
-       /* Allocate the sink buffer for this session */
+       /*
+        * Allocate the sink buffer for this session. All the sinks
+        * where this event can be scheduled are ensured to be of the
+        * same type. Thus the same sink configuration is used by the
+        * sinks.
+        */
        event_data->snk_config =
                        sink_ops(sink)->alloc_buffer(sink, event, pages,
                                                     nr_pages, overwrite);
@@ -332,13 +395,18 @@ static void etm_event_start(struct perf_event *event, int flags)
 {
        int cpu = smp_processor_id();
        struct etm_event_data *event_data;
-       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
+       struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt);
+       struct perf_output_handle *handle = &ctxt->handle;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
        struct list_head *path;
 
        if (!csdev)
                goto fail;
 
+       /* Have we messed up our tracking ? */
+       if (WARN_ON(ctxt->event_data))
+               goto fail;
+
        /*
         * Deal with the ring buffer API and get a handle on the
         * session's information.
@@ -374,6 +442,8 @@ static void etm_event_start(struct perf_event *event, int flags)
        if (source_ops(csdev)->enable(csdev, event, CS_MODE_PERF))
                goto fail_disable_path;
 
+       /* Save the event_data for this ETM */
+       ctxt->event_data = event_data;
 out:
        return;
 
@@ -392,13 +462,30 @@ static void etm_event_stop(struct perf_event *event, int mode)
        int cpu = smp_processor_id();
        unsigned long size;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
-       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
-       struct etm_event_data *event_data = perf_get_aux(handle);
+       struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt);
+       struct perf_output_handle *handle = &ctxt->handle;
+       struct etm_event_data *event_data;
        struct list_head *path;
 
+       /*
+        * If we still have access to the event_data via handle,
+        * confirm that we haven't messed up the tracking.
+        */
+       if (handle->event &&
+           WARN_ON(perf_get_aux(handle) != ctxt->event_data))
+               return;
+
+       event_data = ctxt->event_data;
+       /* Clear the event_data as this ETM is stopping the trace. */
+       ctxt->event_data = NULL;
+
        if (event->hw.state == PERF_HES_STOPPED)
                return;
 
+       /* We must have a valid event_data for a running event */
+       if (WARN_ON(!event_data))
+               return;
+
        if (!csdev)
                return;
 
@@ -416,7 +503,13 @@ static void etm_event_stop(struct perf_event *event, int mode)
        /* tell the core */
        event->hw.state = PERF_HES_STOPPED;
 
-       if (mode & PERF_EF_UPDATE) {
+       /*
+        * If the handle is not bound to an event anymore
+        * (e.g, the sink driver was unable to restart the
+        * handle due to lack of buffer space), we don't
+        * have to do anything here.
+        */
+       if (handle->event && (mode & PERF_EF_UPDATE)) {
                if (WARN_ON_ONCE(handle->event != event))
                        return;
 
index a5b13a7..db88199 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/property.h>
 
+#include <asm/barrier.h>
 #include <asm/sections.h>
 #include <asm/sysreg.h>
 #include <asm/local.h>
@@ -114,30 +115,91 @@ void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit)
        }
 }
 
-static void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata, struct csdev_access *csa)
+static u64 ete_sysreg_read(u32 offset, bool _relaxed, bool _64bit)
 {
-       /* Writing 0 to TRCOSLAR unlocks the trace registers */
-       etm4x_relaxed_write32(csa, 0x0, TRCOSLAR);
-       drvdata->os_unlock = true;
+       u64 res = 0;
+
+       switch (offset) {
+       ETE_READ_CASES(res)
+       default :
+               pr_warn_ratelimited("ete: trying to read unsupported register @%x\n",
+                                   offset);
+       }
+
+       if (!_relaxed)
+               __iormb(res);   /* Imitate the !relaxed I/O helpers */
+
+       return res;
+}
+
+static void ete_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit)
+{
+       if (!_relaxed)
+               __iowmb();      /* Imitate the !relaxed I/O helpers */
+       if (!_64bit)
+               val &= GENMASK(31, 0);
+
+       switch (offset) {
+       ETE_WRITE_CASES(val)
+       default :
+               pr_warn_ratelimited("ete: trying to write to unsupported register @%x\n",
+                                   offset);
+       }
+}
+
+static void etm_detect_os_lock(struct etmv4_drvdata *drvdata,
+                              struct csdev_access *csa)
+{
+       u32 oslsr = etm4x_relaxed_read32(csa, TRCOSLSR);
+
+       drvdata->os_lock_model = ETM_OSLSR_OSLM(oslsr);
+}
+
+static void etm_write_os_lock(struct etmv4_drvdata *drvdata,
+                             struct csdev_access *csa, u32 val)
+{
+       val = !!val;
+
+       switch (drvdata->os_lock_model) {
+       case ETM_OSLOCK_PRESENT:
+               etm4x_relaxed_write32(csa, val, TRCOSLAR);
+               break;
+       case ETM_OSLOCK_PE:
+               write_sysreg_s(val, SYS_OSLAR_EL1);
+               break;
+       default:
+               pr_warn_once("CPU%d: Unsupported Trace OSLock model: %x\n",
+                            smp_processor_id(), drvdata->os_lock_model);
+               fallthrough;
+       case ETM_OSLOCK_NI:
+               return;
+       }
        isb();
 }
 
+static inline void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata,
+                                     struct csdev_access *csa)
+{
+       WARN_ON(drvdata->cpu != smp_processor_id());
+
+       /* Writing 0 to OS Lock unlocks the trace unit registers */
+       etm_write_os_lock(drvdata, csa, 0x0);
+       drvdata->os_unlock = true;
+}
+
 static void etm4_os_unlock(struct etmv4_drvdata *drvdata)
 {
        if (!WARN_ON(!drvdata->csdev))
                etm4_os_unlock_csa(drvdata, &drvdata->csdev->access);
-
 }
 
 static void etm4_os_lock(struct etmv4_drvdata *drvdata)
 {
        if (WARN_ON(!drvdata->csdev))
                return;
-
-       /* Writing 0x1 to TRCOSLAR locks the trace registers */
-       etm4x_relaxed_write32(&drvdata->csdev->access, 0x1, TRCOSLAR);
+       /* Writing 0x1 to OS Lock locks the trace registers */
+       etm_write_os_lock(drvdata, &drvdata->csdev->access, 0x1);
        drvdata->os_unlock = false;
-       isb();
 }
 
 static void etm4_cs_lock(struct etmv4_drvdata *drvdata,
@@ -371,6 +433,13 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
                etm4x_relaxed_write32(csa, trcpdcr | TRCPDCR_PU, TRCPDCR);
        }
 
+       /*
+        * ETE mandates that the TRCRSR is written to before
+        * enabling it.
+        */
+       if (etm4x_is_ete(drvdata))
+               etm4x_relaxed_write32(csa, TRCRSR_TA, TRCRSR);
+
        /* Enable the trace unit */
        etm4x_relaxed_write32(csa, 1, TRCPRGCTLR);
 
@@ -654,6 +723,7 @@ static int etm4_enable(struct coresight_device *csdev,
 static void etm4_disable_hw(void *info)
 {
        u32 control;
+       u64 trfcr;
        struct etmv4_drvdata *drvdata = info;
        struct etmv4_config *config = &drvdata->config;
        struct coresight_device *csdev = drvdata->csdev;
@@ -676,6 +746,16 @@ static void etm4_disable_hw(void *info)
        /* EN, bit[0] Trace unit enable bit */
        control &= ~0x1;
 
+       /*
+        * If the CPU supports v8.4 Trace filter Control,
+        * set the ETM to trace prohibited region.
+        */
+       if (drvdata->trfc) {
+               trfcr = read_sysreg_s(SYS_TRFCR_EL1);
+               write_sysreg_s(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE),
+                              SYS_TRFCR_EL1);
+               isb();
+       }
        /*
         * Make sure everything completes before disabling, as recommended
         * by section 7.3.77 ("TRCVICTLR, ViewInst Main Control Register,
@@ -683,12 +763,16 @@ static void etm4_disable_hw(void *info)
         */
        dsb(sy);
        isb();
+       /* Trace synchronization barrier, is a nop if not supported */
+       tsb_csync();
        etm4x_relaxed_write32(csa, control, TRCPRGCTLR);
 
        /* wait for TRCSTATR.PMSTABLE to go to '1' */
        if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1))
                dev_err(etm_dev,
                        "timeout while waiting for PM stable Trace Status\n");
+       if (drvdata->trfc)
+               write_sysreg_s(trfcr, SYS_TRFCR_EL1);
 
        /* read the status of the single shot comparators */
        for (i = 0; i < drvdata->nr_ss_cmp; i++) {
@@ -817,13 +901,24 @@ static bool etm4_init_sysreg_access(struct etmv4_drvdata *drvdata,
         * ETMs implementing sysreg access must implement TRCDEVARCH.
         */
        devarch = read_etm4x_sysreg_const_offset(TRCDEVARCH);
-       if ((devarch & ETM_DEVARCH_ID_MASK) != ETM_DEVARCH_ETMv4x_ARCH)
+       switch (devarch & ETM_DEVARCH_ID_MASK) {
+       case ETM_DEVARCH_ETMv4x_ARCH:
+               *csa = (struct csdev_access) {
+                       .io_mem = false,
+                       .read   = etm4x_sysreg_read,
+                       .write  = etm4x_sysreg_write,
+               };
+               break;
+       case ETM_DEVARCH_ETE_ARCH:
+               *csa = (struct csdev_access) {
+                       .io_mem = false,
+                       .read   = ete_sysreg_read,
+                       .write  = ete_sysreg_write,
+               };
+               break;
+       default:
                return false;
-       *csa = (struct csdev_access) {
-               .io_mem = false,
-               .read   = etm4x_sysreg_read,
-               .write  = etm4x_sysreg_write,
-       };
+       }
 
        drvdata->arch = etm_devarch_to_arch(devarch);
        return true;
@@ -873,7 +968,7 @@ static bool etm4_init_csdev_access(struct etmv4_drvdata *drvdata,
        return false;
 }
 
-static void cpu_enable_tracing(void)
+static void cpu_enable_tracing(struct etmv4_drvdata *drvdata)
 {
        u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
        u64 trfcr;
@@ -881,6 +976,7 @@ static void cpu_enable_tracing(void)
        if (!cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRACE_FILT_SHIFT))
                return;
 
+       drvdata->trfc = true;
        /*
         * If the CPU supports v8.4 SelfHosted Tracing, enable
         * tracing at the kernel EL and EL0, forcing to use the
@@ -920,6 +1016,9 @@ static void etm4_init_arch_data(void *info)
        if (!etm4_init_csdev_access(drvdata, csa))
                return;
 
+       /* Detect the support for OS Lock before we actually use it */
+       etm_detect_os_lock(drvdata, csa);
+
        /* Make sure all registers are accessible */
        etm4_os_unlock_csa(drvdata, csa);
        etm4_cs_unlock(drvdata, csa);
@@ -1082,7 +1181,7 @@ static void etm4_init_arch_data(void *info)
        /* NUMCNTR, bits[30:28] number of counters available for tracing */
        drvdata->nr_cntr = BMVAL(etmidr5, 28, 30);
        etm4_cs_lock(drvdata, csa);
-       cpu_enable_tracing();
+       cpu_enable_tracing(drvdata);
 }
 
 static inline u32 etm4_get_victlr_access_type(struct etmv4_config *config)
@@ -1760,6 +1859,8 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
        struct etmv4_drvdata *drvdata;
        struct coresight_desc desc = { 0 };
        struct etm4_init_arg init_arg = { 0 };
+       u8 major, minor;
+       char *type_name;
 
        drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
        if (!drvdata)
@@ -1786,10 +1887,6 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
        if (drvdata->cpu < 0)
                return drvdata->cpu;
 
-       desc.name = devm_kasprintf(dev, GFP_KERNEL, "etm%d", drvdata->cpu);
-       if (!desc.name)
-               return -ENOMEM;
-
        init_arg.drvdata = drvdata;
        init_arg.csa = &desc.access;
        init_arg.pid = etm_pid;
@@ -1806,6 +1903,22 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
            fwnode_property_present(dev_fwnode(dev), "qcom,skip-power-up"))
                drvdata->skip_power_up = true;
 
+       major = ETM_ARCH_MAJOR_VERSION(drvdata->arch);
+       minor = ETM_ARCH_MINOR_VERSION(drvdata->arch);
+
+       if (etm4x_is_ete(drvdata)) {
+               type_name = "ete";
+               /* ETE v1 has major version == 0b101. Adjust this for logging.*/
+               major -= 4;
+       } else {
+               type_name = "etm";
+       }
+
+       desc.name = devm_kasprintf(dev, GFP_KERNEL,
+                                  "%s%d", type_name, drvdata->cpu);
+       if (!desc.name)
+               return -ENOMEM;
+
        etm4_init_trace_id(drvdata);
        etm4_set_default(&drvdata->config);
 
@@ -1833,9 +1946,8 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
 
        etmdrvdata[drvdata->cpu] = drvdata;
 
-       dev_info(&drvdata->csdev->dev, "CPU%d: ETM v%d.%d initialized\n",
-                drvdata->cpu, ETM_ARCH_MAJOR_VERSION(drvdata->arch),
-                ETM_ARCH_MINOR_VERSION(drvdata->arch));
+       dev_info(&drvdata->csdev->dev, "CPU%d: %s v%d.%d initialized\n",
+                drvdata->cpu, type_name, major, minor);
 
        if (boot_enable) {
                coresight_enable(drvdata->csdev);
@@ -1979,6 +2091,7 @@ static struct amba_driver etm4x_amba_driver = {
 
 static const struct of_device_id etm4_sysreg_match[] = {
        { .compatible   = "arm,coresight-etm4x-sysreg" },
+       { .compatible   = "arm,embedded-trace-extension" },
        {}
 };
 
index 0995a10..007bad9 100644 (file)
@@ -2374,12 +2374,20 @@ static inline bool
 etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset)
 {
        switch (offset) {
-       ETM4x_SYSREG_LIST_CASES
+       ETM_COMMON_SYSREG_LIST_CASES
                /*
-                * Registers accessible via system instructions are always
-                * implemented.
+                * Common registers to ETE & ETM4x accessible via system
+                * instructions are always implemented.
                 */
                return true;
+
+       ETM4x_ONLY_SYSREG_LIST_CASES
+               /*
+                * We only support etm4x and ete. So if the device is not
+                * ETE, it must be ETMv4x.
+                */
+               return !etm4x_is_ete(drvdata);
+
        ETM4x_MMAP_LIST_CASES
                /*
                 * Registers accessible only via memory-mapped registers
@@ -2389,8 +2397,13 @@ etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset)
                 * coresight_register() and the csdev is not initialized
                 * until that is done. So rely on the drvdata->base to
                 * detect if we have a memory mapped access.
+                * Also ETE doesn't implement memory mapped access, thus
+                * it is sufficient to check that we are using mmio.
                 */
                return !!drvdata->base;
+
+       ETE_ONLY_SYSREG_LIST_CASES
+               return etm4x_is_ete(drvdata);
        }
 
        return false;
index 0af6057..e5b79bd 100644 (file)
@@ -29,6 +29,7 @@
 #define TRCAUXCTLR                     0x018
 #define TRCEVENTCTL0R                  0x020
 #define TRCEVENTCTL1R                  0x024
+#define TRCRSR                         0x028
 #define TRCSTALLCTLR                   0x02C
 #define TRCTSCTLR                      0x030
 #define TRCSYNCPR                      0x034
@@ -49,6 +50,7 @@
 #define TRCSEQRSTEVR                   0x118
 #define TRCSEQSTR                      0x11C
 #define TRCEXTINSELR                   0x120
+#define TRCEXTINSELRn(n)               (0x120 + (n * 4)) /* n = 0-3 */
 #define TRCCNTRLDVRn(n)                        (0x140 + (n * 4)) /* n = 0-3 */
 #define TRCCNTCTLRn(n)                 (0x150 + (n * 4)) /* n = 0-3 */
 #define TRCCNTVRn(n)                   (0x160 + (n * 4)) /* n = 0-3 */
 #define TRCCIDR2                       0xFF8
 #define TRCCIDR3                       0xFFC
 
+#define TRCRSR_TA                      BIT(12)
+
 /*
  * System instructions to access ETM registers.
  * See ETMv4.4 spec ARM IHI0064F section 4.3.6 System instructions
 #define CASE_NOP(__unused, x)                                  \
        case (x):       /* fall through */
 
+#define ETE_ONLY_SYSREG_LIST(op, val)          \
+       CASE_##op((val), TRCRSR)                \
+       CASE_##op((val), TRCEXTINSELRn(1))      \
+       CASE_##op((val), TRCEXTINSELRn(2))      \
+       CASE_##op((val), TRCEXTINSELRn(3))
+
 /* List of registers accessible via System instructions */
-#define ETM_SYSREG_LIST(op, val)               \
-       CASE_##op((val), TRCPRGCTLR)            \
+#define ETM4x_ONLY_SYSREG_LIST(op, val)                \
        CASE_##op((val), TRCPROCSELR)           \
+       CASE_##op((val), TRCVDCTLR)             \
+       CASE_##op((val), TRCVDSACCTLR)          \
+       CASE_##op((val), TRCVDARCCTLR)          \
+       CASE_##op((val), TRCOSLAR)
+
+#define ETM_COMMON_SYSREG_LIST(op, val)                \
+       CASE_##op((val), TRCPRGCTLR)            \
        CASE_##op((val), TRCSTATR)              \
        CASE_##op((val), TRCCONFIGR)            \
        CASE_##op((val), TRCAUXCTLR)            \
        CASE_##op((val), TRCVIIECTLR)           \
        CASE_##op((val), TRCVISSCTLR)           \
        CASE_##op((val), TRCVIPCSSCTLR)         \
-       CASE_##op((val), TRCVDCTLR)             \
-       CASE_##op((val), TRCVDSACCTLR)          \
-       CASE_##op((val), TRCVDARCCTLR)          \
        CASE_##op((val), TRCSEQEVRn(0))         \
        CASE_##op((val), TRCSEQEVRn(1))         \
        CASE_##op((val), TRCSEQEVRn(2))         \
        CASE_##op((val), TRCSSPCICRn(5))        \
        CASE_##op((val), TRCSSPCICRn(6))        \
        CASE_##op((val), TRCSSPCICRn(7))        \
-       CASE_##op((val), TRCOSLAR)              \
        CASE_##op((val), TRCOSLSR)              \
        CASE_##op((val), TRCACVRn(0))           \
        CASE_##op((val), TRCACVRn(1))           \
        CASE_##op((val), TRCPIDR2)              \
        CASE_##op((val), TRCPIDR3)
 
-#define ETM4x_READ_SYSREG_CASES(res)   ETM_SYSREG_LIST(READ, (res))
-#define ETM4x_WRITE_SYSREG_CASES(val)  ETM_SYSREG_LIST(WRITE, (val))
+#define ETM4x_READ_SYSREG_CASES(res)           \
+       ETM_COMMON_SYSREG_LIST(READ, (res))     \
+       ETM4x_ONLY_SYSREG_LIST(READ, (res))
+
+#define ETM4x_WRITE_SYSREG_CASES(val)          \
+       ETM_COMMON_SYSREG_LIST(WRITE, (val))    \
+       ETM4x_ONLY_SYSREG_LIST(WRITE, (val))
+
+#define ETM_COMMON_SYSREG_LIST_CASES           \
+       ETM_COMMON_SYSREG_LIST(NOP, __unused)
+
+#define ETM4x_ONLY_SYSREG_LIST_CASES           \
+       ETM4x_ONLY_SYSREG_LIST(NOP, __unused)
+
+#define ETM4x_SYSREG_LIST_CASES                        \
+       ETM_COMMON_SYSREG_LIST_CASES            \
+       ETM4x_ONLY_SYSREG_LIST(NOP, __unused)
 
-#define ETM4x_SYSREG_LIST_CASES                ETM_SYSREG_LIST(NOP, __unused)
 #define ETM4x_MMAP_LIST_CASES          ETM_MMAP_LIST(NOP, __unused)
 
+/* ETE only supports system register access */
+#define ETE_READ_CASES(res)                    \
+       ETM_COMMON_SYSREG_LIST(READ, (res))     \
+       ETE_ONLY_SYSREG_LIST(READ, (res))
+
+#define ETE_WRITE_CASES(val)                   \
+       ETM_COMMON_SYSREG_LIST(WRITE, (val))    \
+       ETE_ONLY_SYSREG_LIST(WRITE, (val))
+
+#define ETE_ONLY_SYSREG_LIST_CASES             \
+       ETE_ONLY_SYSREG_LIST(NOP, __unused)
+
 #define read_etm4x_sysreg_offset(offset, _64bit)                               \
        ({                                                                      \
                u64 __val;                                                      \
                                         ETM_MODE_EXCL_KERN | \
                                         ETM_MODE_EXCL_USER)
 
+/*
+ * TRCOSLSR.OSLM advertises the OS Lock model.
+ * OSLM[2:0] = TRCOSLSR[4:3,0]
+ *
+ *     0b000 - Trace OS Lock is not implemented.
+ *     0b010 - Trace OS Lock is implemented.
+ *     0b100 - Trace OS Lock is not implemented, unit is controlled by PE OS Lock.
+ */
+#define ETM_OSLOCK_NI          0b000
+#define ETM_OSLOCK_PRESENT     0b010
+#define ETM_OSLOCK_PE          0b100
+
+#define ETM_OSLSR_OSLM(oslsr)  ((((oslsr) & GENMASK(4, 3)) >> 2) | ((oslsr) & 0x1))
+
 /*
  * TRCDEVARCH Bit field definitions
  * Bits[31:21] - ARCHITECT = Always Arm Ltd.
        ((ETM_DEVARCH_MAKE_ARCHID_ARCH_VER(major)) | ETM_DEVARCH_ARCHID_ARCH_PART(0xA13))
 
 #define ETM_DEVARCH_ARCHID_ETMv4x              ETM_DEVARCH_MAKE_ARCHID(0x4)
+#define ETM_DEVARCH_ARCHID_ETE                 ETM_DEVARCH_MAKE_ARCHID(0x5)
 
 #define ETM_DEVARCH_ID_MASK                                            \
        (ETM_DEVARCH_ARCHITECT_MASK | ETM_DEVARCH_ARCHID_MASK | ETM_DEVARCH_PRESENT)
 #define ETM_DEVARCH_ETMv4x_ARCH                                                \
        (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETMv4x | ETM_DEVARCH_PRESENT)
+#define ETM_DEVARCH_ETE_ARCH                                           \
+       (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETE | ETM_DEVARCH_PRESENT)
 
 #define TRCSTATR_IDLE_BIT              0
 #define TRCSTATR_PMSTABLE_BIT          1
 #define ETM_ARCH_MINOR_VERSION(arch)   ((arch) & 0xfU)
 
 #define ETM_ARCH_V4    ETM_ARCH_VERSION(4, 0)
+#define ETM_ARCH_ETE   ETM_ARCH_VERSION(5, 0)
+
 /* Interpretation of resource numbers change at ETM v4.3 architecture */
 #define ETM_ARCH_V4_3  ETM_ARCH_VERSION(4, 3)
 
@@ -862,6 +919,7 @@ struct etmv4_save_state {
  * @nooverflow:        Indicate if overflow prevention is supported.
  * @atbtrig:   If the implementation can support ATB triggers
  * @lpoverride:        If the implementation can support low-power state over.
+ * @trfc:      If the implementation supports Arm v8.4 trace filter controls.
  * @config:    structure holding configuration parameters.
  * @save_state:        State to be preserved across power loss
  * @state_needs_restore: True when there is context to restore after PM exit
@@ -897,6 +955,7 @@ struct etmv4_drvdata {
        u8                              s_ex_level;
        u8                              ns_ex_level;
        u8                              q_support;
+       u8                              os_lock_model;
        bool                            sticky_enable;
        bool                            boot_enable;
        bool                            os_unlock;
@@ -912,6 +971,7 @@ struct etmv4_drvdata {
        bool                            nooverflow;
        bool                            atbtrig;
        bool                            lpoverride;
+       bool                            trfc;
        struct etmv4_config             config;
        struct etmv4_save_state         *save_state;
        bool                            state_needs_restore;
@@ -940,4 +1000,9 @@ void etm4_config_trace_mode(struct etmv4_config *config);
 
 u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);
 void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
+
+/*
+ * Return true when the architecture version recorded in drvdata->arch is
+ * at or above ETM_ARCH_ETE, i.e. ETM_ARCH_VERSION(5, 0).
+ */
+static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)
+{
+       return drvdata->arch >= ETM_ARCH_ETE;
+}
 #endif
index 3629b78..c594f45 100644 (file)
@@ -90,6 +90,12 @@ static void of_coresight_get_ports_legacy(const struct device_node *node,
        struct of_endpoint endpoint;
        int in = 0, out = 0;
 
+       /*
+        * Avoid warnings in of_graph_get_next_endpoint()
+        * if the device doesn't have any graph connections
+        */
+       if (!of_graph_is_present(node))
+               return;
        do {
                ep = of_graph_get_next_endpoint(node, ep);
                if (!ep)
index f5f654e..ff1dd20 100644 (file)
@@ -232,4 +232,7 @@ coresight_find_csdev_by_fwnode(struct fwnode_handle *r_fwnode);
 void coresight_set_assoc_ectdev_mutex(struct coresight_device *csdev,
                                      struct coresight_device *ect_csdev);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev);
+struct coresight_device *coresight_get_percpu_sink(int cpu);
+
 #endif
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
new file mode 100644 (file)
index 0000000..1768684
--- /dev/null
@@ -0,0 +1,1157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This driver enables the Trace Buffer Extension (TRBE) as a per-cpu coresight
+ * sink device, which can then pair with an appropriate per-cpu coresight source
+ * device (ETE), thus generating the required trace data. Trace can be enabled
+ * via the perf framework.
+ *
+ * The AUX buffer handling is inspired from Arm SPE PMU driver.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define DRVNAME "arm_trbe"
+
+#define pr_fmt(fmt) DRVNAME ": " fmt
+
+#include <asm/barrier.h>
+#include "coresight-trbe.h"
+
+#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
+
+/*
+ * A padding packet that will help the user space tools
+ * in skipping relevant sections in the captured trace
+ * data which could not be decoded. TRBE doesn't support
+ * formatting the trace data, unlike the legacy CoreSight
+ * sinks and thus we use ETE trace packets to pad the
+ * sections of the buffer.
+ */
+#define ETE_IGNORE_PACKET              0x70
+
+/*
+ * Minimum amount of meaningful trace will contain:
+ * A-Sync, Trace Info, Trace On, Address, Atom.
+ * This is about 44 bytes of ETE trace. To be on
+ * the safer side, we assume 64 bytes is the minimum
+ * space required for a meaningful session, before
+ * we hit a "WRAP" event.
+ */
+#define TRBE_TRACE_MIN_BUF_SIZE                64
+
+/* Action chosen by trbe_get_fault_act() after decoding a TRBSR_EL1 value. */
+enum trbe_fault_action {
+       TRBE_FAULT_ACT_WRAP,            /* buffer filled; write pointer is back at base */
+       TRBE_FAULT_ACT_SPURIOUS,        /* neither a fill-wrap nor a fatal condition */
+       TRBE_FAULT_ACT_FATAL,           /* trigger/abort status or stage 1/2 abort */
+};
+
+/* Describes the perf AUX buffer as it is programmed into the TRBE. */
+struct trbe_buf {
+       /*
+        * Even though trbe_base represents the start address
+        * of the vmap()-mapped buffer, it is kept as an
+        * unsigned long for various arithmetic and comparison
+        * operations, and also to be consistent with the
+        * trbe_write & trbe_limit sibling pointers.
+        */
+       unsigned long trbe_base;
+       unsigned long trbe_limit;       /* end of the area the TRBE may write into */
+       unsigned long trbe_write;       /* address at which the TRBE starts writing */
+       int nr_pages;                   /* number of pages backing the buffer */
+       void **pages;                   /* page array handed to alloc_buffer() */
+       bool snapshot;                  /* buffer was allocated for snapshot mode */
+       struct trbe_cpudata *cpudata;   /* set on enable, cleared on disable */
+};
+
+/* Per-CPU TRBE state; also the drvdata of the per-CPU coresight sink device. */
+struct trbe_cpudata {
+       bool trbe_flag;                 /* reported by the "flag" sysfs attribute */
+       u64 trbe_align;                 /* write pointer alignment ("align" sysfs attribute) */
+       int cpu;                        /* CPU this instance belongs to */
+       enum cs_mode mode;              /* CS_MODE_PERF once enabled, CS_MODE_DISABLED after disable */
+       struct trbe_buf *buf;           /* buffer attached on enable, NULL after disable */
+       struct trbe_drvdata *drvdata;   /* owning driver instance */
+};
+
+/* Driver-wide state shared by all per-CPU TRBE instances. */
+struct trbe_drvdata {
+       struct trbe_cpudata __percpu *cpudata;
+       /* Per-CPU slot with the perf handle in use; NULL once tracing was stopped */
+       struct perf_output_handle * __percpu *handle;
+       /* NOTE(review): presumably links into a CPU hotplug instance list - confirm */
+       struct hlist_node hotplug_node;
+       int irq;                        /* per-CPU interrupt, see arm_trbe_enable_cpu() */
+       cpumask_t supported_cpus;       /* checked by is_perf_trbe() for the running CPU */
+       /* NOTE(review): presumably the dynamically allocated hotplug state - confirm */
+       enum cpuhp_state trbe_online;
+       struct platform_device *pdev;
+};
+
+/*
+ * Pick the NUMA node for allocations made on behalf of @event: the node of
+ * the event's CPU, or NUMA_NO_NODE when the event is not bound to a CPU
+ * (event->cpu == -1).
+ */
+static int trbe_alloc_node(struct perf_event *event)
+{
+       int cpu = event->cpu;
+
+       return (cpu == -1) ? NUMA_NO_NODE : cpu_to_node(cpu);
+}
+
+/*
+ * Issue a TSB CSYNC followed by a non-shareable DSB. This is done before
+ * the TRBE is disabled, see trbe_drain_and_disable_local().
+ */
+static void trbe_drain_buffer(void)
+{
+       tsb_csync();
+       dsb(nsh);
+}
+
+/*
+ * Drain the trace buffer and disable the TRBE on the current CPU by clearing
+ * only the enable bit of TRBLIMITR_EL1, followed by an ISB.
+ */
+static void trbe_drain_and_disable_local(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       trbe_drain_buffer();
+
+       /*
+        * Disable the TRBE without clearing LIMITPTR which
+        * might be required for fetching the buffer limits.
+        */
+       trblimitr &= ~TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+       isb();
+}
+
+/*
+ * Disable the TRBE on the current CPU and then zero its limit, write
+ * pointer, base and status registers.
+ */
+static void trbe_reset_local(void)
+{
+       trbe_drain_and_disable_local();
+       write_sysreg_s(0, SYS_TRBLIMITR_EL1);
+       write_sysreg_s(0, SYS_TRBPTR_EL1);
+       write_sysreg_s(0, SYS_TRBBASER_EL1);
+       write_sysreg_s(0, SYS_TRBSR_EL1);
+}
+
+/*
+ * Give up on the current session: disable the TRBE, flag the AUX record as
+ * truncated and drop this CPU's reference to the perf handle.
+ */
+static void trbe_stop_and_truncate_event(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       /*
+        * We cannot proceed with the buffer collection and we
+        * do not have any data for the current session. The
+        * etm_perf driver expects to close out the aux_buffer
+        * at event_stop(). So disable the TRBE here and leave
+        * the update_buffer() to return a 0 size.
+        */
+       trbe_drain_and_disable_local();
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL;
+}
+
+/*
+ * TRBE Buffer Management
+ *
+ * The TRBE buffer spans from the base pointer till the limit pointer. When enabled,
+ * it starts writing trace data from the write pointer onward till the limit pointer.
+ * When the write pointer reaches the address just before the limit pointer, it gets
+ * wrapped around again to the base pointer. This is called a TRBE wrap event, which
+ * generates a maintenance interrupt when operated in WRAP or FILL mode. This driver
+ * uses FILL mode, where the TRBE stops the trace collection at wrap event. The IRQ
+ * handler updates the AUX buffer and re-enables the TRBE with updated WRITE and
+ * LIMIT pointers.
+ *
+ *     Wrap around with an IRQ
+ *     ------ < ------ < ------- < ----- < -----
+ *     |                                       |
+ *     ------ > ------ > ------- > ----- > -----
+ *
+ *     +---------------+-----------------------+
+ *     |               |                       |
+ *     +---------------+-----------------------+
+ *     Base Pointer    Write Pointer           Limit Pointer
+ *
+ * The base and limit pointers always need to be PAGE_SIZE aligned. But the write
+ * pointer can be aligned to the implementation defined TRBE trace buffer alignment
+ * as captured in trbe_cpudata->trbe_align.
+ *
+ *
+ *             head            tail            wakeup
+ *     +---------------------------------------+----- ~ ~ ------
+ *     |$$$$$$$|################|$$$$$$$$$$$$$$|               |
+ *     +---------------------------------------+----- ~ ~ ------
+ *     Base Pointer    Write Pointer           Limit Pointer
+ *
+ * The perf_output_handle indices (head, tail, wakeup) are monotonically increasing
+ * values which track all the driver writes and user reads from the perf auxiliary
+ * buffer. Generally [head..tail] is the area the driver can write into, unless
+ * the wakeup is behind the tail. The enabled TRBE buffer span needs to be adjusted
+ * and configured depending on the perf_output_handle indices, so that the driver
+ * does not overwrite areas in the perf auxiliary buffer which are being, or are yet
+ * to be, consumed by user space. The enabled TRBE buffer area is a moving subset of
+ * the allocated perf auxiliary buffer.
+ */
+static void trbe_pad_buf(struct perf_output_handle *handle, int len)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       /* Padding starts at the current perf head, taken as a buffer offset */
+       void *pad_start = (void *)buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+
+       /* Fill @len bytes with ignore packets so that decoders skip them */
+       memset(pad_start, ETE_IGNORE_PACKET, len);
+
+       /* In snapshot mode the head is left where it is */
+       if (buf->snapshot)
+               return;
+       perf_aux_output_skip(handle, len);
+}
+
+static unsigned long trbe_snapshot_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long whole_buffer = buf->nr_pages * PAGE_SIZE;
+
+       /*
+        * Snapshot mode may use the entire buffer: the ETE trace carries
+        * alignment synchronization packets, which let the decoder reset
+        * after an overflow or corruption.
+        */
+       return whole_buffer;
+}
+
+/*
+ * TRBE Limit Calculation
+ *
+ * The following markers are used to illustrate various TRBE buffer situations.
+ *
+ * $$$$ - Data area, unconsumed captured trace data, not to be overridden
+ * #### - Free area, enabled, trace will be written
+ * %%%% - Free area, disabled, trace will not be written
+ * ==== - Free area, padded with ETE_IGNORE_PACKET, trace will be skipped
+ */
+static unsigned long __trbe_normal_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       struct trbe_cpudata *cpudata = buf->cpudata;
+       const u64 bufsize = buf->nr_pages * PAGE_SIZE;
+       u64 limit = bufsize;
+       u64 head, tail, wakeup;
+
+       /*
+        * All offsets below are relative to trbe_base. The return value is
+        * the limit offset, or 0 if the TRBE cannot be configured.
+        */
+       head = PERF_IDX2OFF(handle->head, buf);
+
+       /*
+        *              head
+        *      ------->|
+        *      |
+        *      head    TRBE align      tail
+        * +----|-------|---------------|-------+
+        * |$$$$|=======|###############|$$$$$$$|
+        * +----|-------|---------------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * Perf aux buffer output head position can be misaligned depending on
+        * various factors including user space reads. In case misaligned, head
+        * needs to be aligned before TRBE can be configured. Pad the alignment
+        * gap with ETE_IGNORE_PACKET bytes that will be ignored by user tools
+        * and skip this section thus advancing the head.
+        */
+       if (!IS_ALIGNED(head, cpudata->trbe_align)) {
+               unsigned long delta = roundup(head, cpudata->trbe_align) - head;
+
+               delta = min(delta, handle->size);
+               trbe_pad_buf(handle, delta);
+               head = PERF_IDX2OFF(handle->head, buf);
+       }
+
+       /*
+        *      head = tail (size = 0)
+        * +----|-------------------------------+
+        * |$$$$|$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ |
+        * +----|-------------------------------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * Perf aux buffer does not have any space for the driver to write into.
+        * Just communicate trace truncation event to the user space by marking
+        * it with PERF_AUX_FLAG_TRUNCATED.
+        */
+       if (!handle->size) {
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+               return 0;
+       }
+
+       /* Compute the tail and wakeup indices now that we've aligned head */
+       tail = PERF_IDX2OFF(handle->head + handle->size, buf);
+       wakeup = PERF_IDX2OFF(handle->wakeup, buf);
+
+       /*
+        * Let's calculate the buffer area which TRBE could write into. There
+        * are three possible scenarios here. Limit needs to be aligned with
+        * PAGE_SIZE per the TRBE requirement. Always avoid clobbering the
+        * unconsumed data.
+        *
+        * 1) head < tail
+        *
+        *      head                    tail
+        * +----|-----------------------|-------+
+        * |$$$$|#######################|$$$$$$$|
+        * +----|-----------------------|-------+
+        * trbe_base                    limit   trbe_base + nr_pages
+        *
+        * TRBE could write into [head..tail] area. Unless the tail is right at
+        * the end of the buffer, neither a wrap around nor an IRQ is expected
+        * while being enabled.
+        *
+        * 2) head == tail
+        *
+        *      head = tail (size > 0)
+        * +----|-------------------------------+
+        * |%%%%|###############################|
+        * +----|-------------------------------+
+        * trbe_base                            limit = trbe_base + nr_pages
+        *
+        * TRBE should just write into [head..base + nr_pages] area even though
+        * the entire buffer is empty. Reason being, when the trace reaches the
+        * end of the buffer, it will just wrap around with an IRQ giving an
+        * opportunity to reconfigure the buffer.
+        *
+        * 3) tail < head
+        *
+        *      tail                    head
+        * +----|-----------------------|-------+
+        * |%%%%|$$$$$$$$$$$$$$$$$$$$$$$|#######|
+        * +----|-----------------------|-------+
+        * trbe_base                            limit = trbe_base + nr_pages
+        *
+        * TRBE should just write into [head..base + nr_pages] area even though
+        * the [trbe_base..tail] is also empty. Reason being, when the trace
+        * reaches the end of the buffer, it will just wrap around with an IRQ
+        * giving an opportunity to reconfigure the buffer.
+        */
+       if (head < tail)
+               limit = round_down(tail, PAGE_SIZE);
+
+       /*
+        * Wakeup may be arbitrarily far into the future. If it's not in the
+        * current generation, either we'll wrap before hitting it, or it's
+        * in the past and has been handled already.
+        *
+        * If there's a wakeup before we wrap, arrange to be woken up by the
+        * page boundary following it. Keep the tail boundary if that's lower.
+        *
+        *      head            wakeup  tail
+        * +----|---------------|-------|-------+
+        * |$$$$|###############|%%%%%%%|$$$$$$$|
+        * +----|---------------|-------|-------+
+        * trbe_base            limit           trbe_base + nr_pages
+        */
+       if (handle->wakeup < (handle->head + handle->size) && head <= wakeup)
+               limit = min(limit, round_up(wakeup, PAGE_SIZE));
+
+       /*
+        * There are two situations in which the limit does not lie beyond
+        * the head and hence the TRBE cannot be configured.
+        *
+        * 1) head < tail (aligned down with PAGE_SIZE) and also they are both
+        * within the same PAGE size range.
+        *
+        *                      PAGE_SIZE
+        *              |----------------------|
+        *
+        *              limit   head    tail
+        * +------------|------|--------|-------+
+        * |$$$$$$$$$$$$$$$$$$$|========|$$$$$$$|
+        * +------------|------|--------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * 2) head < wakeup (aligned up with PAGE_SIZE) < tail and also both
+        * head and wakeup are within same PAGE size range.
+        *
+        *              PAGE_SIZE
+        *      |----------------------|
+        *
+        *      limit   head    wakeup  tail
+        * +----|------|-------|--------|-------+
+        * |$$$$$$$$$$$|=======|========|$$$$$$$|
+        * +----|------|-------|--------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        */
+       if (limit > head)
+               return limit;
+
+       /* No usable space: pad out whatever is left and report truncation */
+       trbe_pad_buf(handle, handle->size);
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       return 0;
+}
+
+/*
+ * Compute the limit offset (relative to trbe_base) for a normal, i.e.
+ * non-snapshot, session, making sure that at least TRBE_TRACE_MIN_BUF_SIZE
+ * bytes lie between the head and the limit. Returns 0 when
+ * __trbe_normal_offset() finds no usable space.
+ */
+static unsigned long trbe_normal_offset(struct perf_output_handle *handle)
+{
+       /*
+        * The TRBE buffer descriptor is the sink config of the handle. The
+        * AUX private data returned by perf_get_aux() is not a struct
+        * trbe_buf and must not be used to compute buffer offsets.
+        */
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       u64 limit = __trbe_normal_offset(handle);
+       /* Read the head only now: the call above may have padded and moved it */
+       u64 head = PERF_IDX2OFF(handle->head, buf);
+
+       /*
+        * If the head is too close to the limit and we don't
+        * have space for a meaningful run, we rather pad it
+        * and start fresh.
+        */
+       if (limit && (limit - head < TRBE_TRACE_MIN_BUF_SIZE)) {
+               trbe_pad_buf(handle, limit - head);
+               limit = __trbe_normal_offset(handle);
+       }
+       return limit;
+}
+
+/*
+ * Return the absolute limit address for the TRBE. A result equal to
+ * buf->trbe_base (offset 0) tells the callers that the TRBE cannot be
+ * enabled.
+ */
+static unsigned long compute_trbe_buffer_limit(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long offset = buf->snapshot ? trbe_snapshot_offset(handle)
+                                            : trbe_normal_offset(handle);
+
+       return buf->trbe_base + offset;
+}
+
+/*
+ * Clear the IRQ, TRG, WRAP and STOP bits as well as the EC and BSC fields
+ * of TRBSR_EL1, leaving all other bits as read. Warns if the TRBE is still
+ * enabled when this is called.
+ */
+static void clr_trbe_status(void)
+{
+       u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1);
+
+       WARN_ON(is_trbe_enabled());
+       trbsr &= ~TRBSR_IRQ;
+       trbsr &= ~TRBSR_TRG;
+       trbsr &= ~TRBSR_WRAP;
+       trbsr &= ~(TRBSR_EC_MASK << TRBSR_EC_SHIFT);
+       trbsr &= ~(TRBSR_BSC_MASK << TRBSR_BSC_SHIFT);
+       trbsr &= ~TRBSR_STOP;
+       write_sysreg_s(trbsr, SYS_TRBSR_EL1);
+}
+
+/*
+ * Program TRBLIMITR_EL1 with the page aligned limit @addr, fill mode and
+ * ignored triggers, and set the enable bit in the same register write.
+ */
+static void set_trbe_limit_pointer_enabled(unsigned long addr)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       WARN_ON(!IS_ALIGNED(addr, (1UL << TRBLIMITR_LIMIT_SHIFT)));
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+
+       /* Start from the value read back, with every field set below cleared */
+       trblimitr &= ~TRBLIMITR_NVM;
+       trblimitr &= ~(TRBLIMITR_FILL_MODE_MASK << TRBLIMITR_FILL_MODE_SHIFT);
+       trblimitr &= ~(TRBLIMITR_TRIG_MODE_MASK << TRBLIMITR_TRIG_MODE_SHIFT);
+       trblimitr &= ~(TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT);
+
+       /*
+        * Fill trace buffer mode is used here while configuring the
+        * TRBE for trace capture. In this particular mode, the trace
+        * collection is stopped and a maintenance interrupt is raised
+        * when the current write pointer wraps. This pause in trace
+        * collection gives the software an opportunity to capture the
+        * trace data in the interrupt handler, before reconfiguring
+        * the TRBE.
+        */
+       trblimitr |= (TRBE_FILL_MODE_FILL & TRBLIMITR_FILL_MODE_MASK) << TRBLIMITR_FILL_MODE_SHIFT;
+
+       /*
+        * Trigger mode is not used here while configuring the TRBE for
+        * the trace capture. Hence just keep this in the ignore mode.
+        */
+       trblimitr |= (TRBE_TRIG_MODE_IGNORE & TRBLIMITR_TRIG_MODE_MASK) <<
+                     TRBLIMITR_TRIG_MODE_SHIFT;
+       trblimitr |= (addr & PAGE_MASK);
+
+       trblimitr |= TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+
+       /* Synchronize the TRBE enable event */
+       isb();
+}
+
+/*
+ * Program the TRBE from @buf and enable it: disable first, clear the status,
+ * set the base and write pointers, and finally set the limit together with
+ * the enable bit. Warns if trbe_write lies outside [trbe_base, trbe_limit).
+ */
+static void trbe_enable_hw(struct trbe_buf *buf)
+{
+       WARN_ON(buf->trbe_write < buf->trbe_base);
+       WARN_ON(buf->trbe_write >= buf->trbe_limit);
+       set_trbe_disabled();
+       isb();
+       clr_trbe_status();
+       set_trbe_base_pointer(buf->trbe_base);
+       set_trbe_write_pointer(buf->trbe_write);
+
+       /*
+        * Synchronize all the register updates
+        * till now before enabling the TRBE.
+        */
+       isb();
+       set_trbe_limit_pointer_enabled(buf->trbe_limit);
+}
+
+/*
+ * Classify the TRBSR_EL1 value @trbsr:
+ *  - FATAL    for a trigger or abort status, or a stage 1/2 abort class;
+ *  - WRAP     for a buffer-filled wrap with the write pointer back at base;
+ *  - SPURIOUS for everything else.
+ * Warns if the status says that the TRBE is still running.
+ */
+static enum trbe_fault_action trbe_get_fault_act(u64 trbsr)
+{
+       int ec = get_trbe_ec(trbsr);
+       int bsc = get_trbe_bsc(trbsr);
+
+       WARN_ON(is_trbe_running(trbsr));
+       if (is_trbe_trg(trbsr) || is_trbe_abort(trbsr))
+               return TRBE_FAULT_ACT_FATAL;
+
+       if ((ec == TRBE_EC_STAGE1_ABORT) || (ec == TRBE_EC_STAGE2_ABORT))
+               return TRBE_FAULT_ACT_FATAL;
+
+       if (is_trbe_wrap(trbsr) && (ec == TRBE_EC_OTHERS) && (bsc == TRBE_BSC_FILLED)) {
+               if (get_trbe_write_pointer() == get_trbe_base_pointer())
+                       return TRBE_FAULT_ACT_WRAP;
+       }
+       return TRBE_FAULT_ACT_SPURIOUS;
+}
+
+/*
+ * Sink alloc_buffer callback: wrap the @nr_pages perf AUX @pages into a
+ * struct trbe_buf and map them virtually contiguous with vmap().
+ *
+ * Returns the new struct trbe_buf, or NULL on any failure. Failures are
+ * reported uniformly as NULL, never as an ERR_PTR(), so that a caller which
+ * checks the result for NULL cannot mistake an error for a valid buffer.
+ * The buffer is released with arm_trbe_free_buffer().
+ */
+static void *arm_trbe_alloc_buffer(struct coresight_device *csdev,
+                                  struct perf_event *event, void **pages,
+                                  int nr_pages, bool snapshot)
+{
+       struct trbe_buf *buf;
+       struct page **pglist;
+       int i;
+
+       /*
+        * TRBE LIMIT and TRBE WRITE pointers must be page aligned. But with
+        * just a single page, there would not be any room left while writing
+        * into a partially filled TRBE buffer after the page size alignment.
+        * Hence restrict the minimum buffer size as two pages.
+        */
+       if (nr_pages < 2)
+               return NULL;
+
+       buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, trbe_alloc_node(event));
+       if (!buf)
+               return NULL;
+
+       /* Temporary struct page list, needed only for the vmap() call */
+       pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
+       if (!pglist) {
+               kfree(buf);
+               return NULL;
+       }
+
+       for (i = 0; i < nr_pages; i++)
+               pglist[i] = virt_to_page(pages[i]);
+
+       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
+       if (!buf->trbe_base) {
+               kfree(pglist);
+               kfree(buf);
+               return NULL;
+       }
+       buf->trbe_limit = buf->trbe_base + nr_pages * PAGE_SIZE;
+       buf->trbe_write = buf->trbe_base;
+       buf->snapshot = snapshot;
+       buf->nr_pages = nr_pages;
+       buf->pages = pages;
+       kfree(pglist);
+       return buf;
+}
+
+/*
+ * Sink free_buffer callback: undo arm_trbe_alloc_buffer() by removing the
+ * vmap() mapping and freeing the struct trbe_buf passed in as @config.
+ */
+static void arm_trbe_free_buffer(void *config)
+{
+       struct trbe_buf *buf = config;
+
+       vunmap((void *)buf->trbe_base);
+       kfree(buf);
+}
+
+/*
+ * Sink update_buffer callback: stop the TRBE on this CPU and report how many
+ * bytes of trace lie between the perf head and the hardware write pointer.
+ *
+ * Returns 0 when not in perf mode, when the TRBE was already disabled, when
+ * a pending interrupt was not caused by a WRAP event, or when the write
+ * pointer is unexpectedly behind the head. In snapshot mode handle->head is
+ * additionally advanced by the returned size.
+ */
+static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev,
+                                           struct perf_output_handle *handle,
+                                           void *config)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct trbe_buf *buf = config;
+       enum trbe_fault_action act;
+       unsigned long size, offset;
+       unsigned long write, base, status;
+       unsigned long flags;
+
+       WARN_ON(buf->cpudata != cpudata);
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (cpudata->mode != CS_MODE_PERF)
+               return 0;
+
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW);
+
+       /*
+        * We are about to disable the TRBE. And this could in turn
+        * fill up the buffer, triggering an IRQ. This could be consumed
+        * by the PE asynchronously, causing a race here against
+        * the IRQ handler in closing out the handle. So, let us
+        * make sure the IRQ can't trigger while we are collecting
+        * the buffer. We also make sure that a WRAP event is handled
+        * accordingly.
+        */
+       local_irq_save(flags);
+
+       /*
+        * If the TRBE was disabled due to lack of space in the AUX buffer or a
+        * spurious fault, the driver leaves it disabled, truncating the buffer.
+        * Since the etm_perf driver expects to close out the AUX buffer, the
+        * driver skips it. Thus, just pass in 0 size here to indicate that the
+        * buffer was truncated.
+        */
+       if (!is_trbe_enabled()) {
+               size = 0;
+               goto done;
+       }
+       /*
+        * perf handle structure needs to be shared with the TRBE IRQ handler for
+        * capturing trace data and restarting the handle. There is a probability
+        * of an undefined reference based crash when etm event is being stopped
+        * while a TRBE IRQ is also being processed. This happens due to the
+        * release of perf handle via perf_aux_output_end() in etm_event_stop().
+        * Stopping the TRBE here will ensure that no IRQ could be generated when
+        * the perf handle gets freed in etm_event_stop().
+        */
+       trbe_drain_and_disable_local();
+       write = get_trbe_write_pointer();
+       base = get_trbe_base_pointer();
+
+       /* Check if there is a pending interrupt and handle it here */
+       status = read_sysreg_s(SYS_TRBSR_EL1);
+       if (is_trbe_irq(status)) {
+
+               /*
+                * Now that we are handling the IRQ here, clear the IRQ
+                * from the status, to let the irq handler know that it
+                * is taken care of.
+                */
+               clr_trbe_irq();
+               isb();
+
+               act = trbe_get_fault_act(status);
+               /*
+                * If this was not due to a WRAP event, we have some
+                * errors and as such buffer is empty.
+                */
+               if (act != TRBE_FAULT_ACT_WRAP) {
+                       size = 0;
+                       goto done;
+               }
+
+               /*
+                * Otherwise, the buffer is full and the write pointer
+                * has reached base. Adjust this back to the Limit pointer
+                * for correct size. Also, mark the buffer truncated.
+                */
+               write = get_trbe_limit_pointer();
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       }
+
+       /* Bytes collected = write offset minus the head offset in the buffer */
+       offset = write - base;
+       if (WARN_ON_ONCE(offset < PERF_IDX2OFF(handle->head, buf)))
+               size = 0;
+       else
+               size = offset - PERF_IDX2OFF(handle->head, buf);
+
+done:
+       local_irq_restore(flags);
+
+       if (buf->snapshot)
+               handle->head += size;
+       return size;
+}
+
+/*
+ * Sink enable callback. Only CS_MODE_PERF is supported (-EINVAL otherwise);
+ * @data is the perf output handle of the session.
+ *
+ * Records the handle for this CPU, links the buffer and the per-CPU data to
+ * each other and starts the TRBE. When no usable limit can be computed the
+ * session is stopped and truncated instead, and 0 is still returned.
+ */
+static int arm_trbe_enable(struct coresight_device *csdev, u32 mode, void *data)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct perf_output_handle *handle = data;
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (mode != CS_MODE_PERF)
+               return -EINVAL;
+
+       *this_cpu_ptr(drvdata->handle) = handle;
+       cpudata->buf = buf;
+       cpudata->mode = mode;
+       buf->cpudata = cpudata;
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       /* A limit equal to the base means there is no room to trace into */
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_stop_and_truncate_event(handle);
+               return 0;
+       }
+       trbe_enable_hw(buf);
+       return 0;
+}
+
+/*
+ * Sink disable callback: stop the TRBE on this CPU and break the link
+ * between the per-CPU data and its buffer. Returns -EINVAL if the sink is
+ * not in perf mode, 0 otherwise.
+ */
+static int arm_trbe_disable(struct coresight_device *csdev)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct trbe_buf *buf = cpudata->buf;
+
+       WARN_ON(buf->cpudata != cpudata);
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (cpudata->mode != CS_MODE_PERF)
+               return -EINVAL;
+
+       trbe_drain_and_disable_local();
+       buf->cpudata = NULL;
+       cpudata->buf = NULL;
+       cpudata->mode = CS_MODE_DISABLED;
+       return 0;
+}
+
+/*
+ * Handle an interrupt classified as spurious: recompute the limit and the
+ * write pointer from the perf handle and restart the TRBE, or leave it
+ * disabled when no usable limit is available.
+ */
+static void trbe_handle_spurious(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_drain_and_disable_local();
+               return;
+       }
+       trbe_enable_hw(buf);
+}
+
+/*
+ * Handle a WRAP event: close out the current AUX record with everything
+ * from the perf head up to the limit pointer, begin a new record and, if
+ * there is room, re-enable the TRBE on it.
+ */
+static void trbe_handle_overflow(struct perf_output_handle *handle)
+{
+       struct perf_event *event = handle->event;
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long offset, size;
+       struct etm_event_data *event_data;
+
+       /* The buffer was filled up to the limit, so that is where the data ends */
+       offset = get_trbe_limit_pointer() - get_trbe_base_pointer();
+       size = offset - PERF_IDX2OFF(handle->head, buf);
+       if (buf->snapshot)
+               handle->head += size;
+
+       /*
+        * Mark the buffer as truncated, as we have stopped the trace
+        * collection upon the WRAP event, without stopping the source.
+        */
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW |
+                                    PERF_AUX_FLAG_TRUNCATED);
+       perf_aux_output_end(handle, size);
+       event_data = perf_aux_output_begin(handle, event);
+       if (!event_data) {
+               /*
+                * We are unable to restart the trace collection,
+                * thus leave the TRBE disabled. The etm-perf driver
+                * is able to detect this with a disconnected handle
+                * (handle->event = NULL).
+                */
+               trbe_drain_and_disable_local();
+               *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL;
+               return;
+       }
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_stop_and_truncate_event(handle);
+               return;
+       }
+       *this_cpu_ptr(buf->cpudata->drvdata->handle) = handle;
+       trbe_enable_hw(buf);
+}
+
+static bool is_perf_trbe(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       struct trbe_cpudata *cpudata = buf->cpudata;
+       struct trbe_drvdata *drvdata = cpudata->drvdata;
+       int this_cpu = smp_processor_id();
+
+       /* The hardware is expected to still hold what the driver programmed */
+       WARN_ON(get_trbe_base_pointer() != buf->trbe_base);
+       WARN_ON(get_trbe_limit_pointer() != buf->trbe_limit);
+
+       /*
+        * The handle belongs to a perf session of this TRBE only if the
+        * per-CPU data is in perf mode, was set up for the CPU we are
+        * running on, and that CPU is one the driver supports.
+        */
+       return cpudata->mode == CS_MODE_PERF &&
+              cpudata->cpu == this_cpu &&
+              cpumask_test_cpu(this_cpu, &drvdata->supported_cpus);
+}
+
+/*
+ * TRBE maintenance interrupt handler. @dev points at this CPU's slot holding
+ * the active perf output handle.
+ *
+ * Returns IRQ_NONE when no TRBE interrupt is pending, when there is no
+ * usable handle, or when the handle does not belong to a perf session of
+ * this TRBE; IRQ_HANDLED once the fault has been acted upon.
+ */
+static irqreturn_t arm_trbe_irq_handler(int irq, void *dev)
+{
+       struct perf_output_handle **handle_ptr = dev;
+       struct perf_output_handle *handle = *handle_ptr;
+       enum trbe_fault_action act;
+       u64 status;
+
+       /*
+        * Ensure the trace is visible to the CPUs and
+        * any external aborts have been resolved.
+        */
+       trbe_drain_and_disable_local();
+
+       status = read_sysreg_s(SYS_TRBSR_EL1);
+       /*
+        * If the pending IRQ was handled by update_buffer callback
+        * we have nothing to do here.
+        */
+       if (!is_trbe_irq(status))
+               return IRQ_NONE;
+
+       clr_trbe_irq();
+       isb();
+
+       if (WARN_ON_ONCE(!handle) || !perf_get_aux(handle))
+               return IRQ_NONE;
+
+       if (!is_perf_trbe(handle))
+               return IRQ_NONE;
+
+       /*
+        * Ensure perf callbacks have completed, which may disable
+        * the trace buffer in response to a TRUNCATION flag.
+        */
+       irq_work_run();
+
+       /* Classify using the status sampled before the IRQ bit was cleared */
+       act = trbe_get_fault_act(status);
+       switch (act) {
+       case TRBE_FAULT_ACT_WRAP:
+               trbe_handle_overflow(handle);
+               break;
+       case TRBE_FAULT_ACT_SPURIOUS:
+               trbe_handle_spurious(handle);
+               break;
+       case TRBE_FAULT_ACT_FATAL:
+               trbe_stop_and_truncate_event(handle);
+               break;
+       }
+       return IRQ_HANDLED;
+}
+
+/* Sink callbacks implemented by the TRBE driver. */
+static const struct coresight_ops_sink arm_trbe_sink_ops = {
+       .enable         = arm_trbe_enable,
+       .disable        = arm_trbe_disable,
+       .alloc_buffer   = arm_trbe_alloc_buffer,
+       .free_buffer    = arm_trbe_free_buffer,
+       .update_buffer  = arm_trbe_update_buffer,
+};
+
+/* Only the sink operations are populated; this is what desc.ops points at. */
+static const struct coresight_ops arm_trbe_cs_ops = {
+       .sink_ops       = &arm_trbe_sink_ops,
+};
+
+/*
+ * sysfs "align" attribute (read-only): print the TRBE alignment cached in
+ * the per-CPU data, in hexadecimal.
+ *
+ * sysfs_emit() is used rather than sprintf() because it is the helper
+ * intended for show() callbacks and is aware of the sysfs buffer size.
+ *
+ * Returns the number of bytes written to @buf.
+ */
+static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct trbe_cpudata *cpudata = dev_get_drvdata(dev);
+
+       return sysfs_emit(buf, "%llx\n", cpudata->trbe_align);
+}
+static DEVICE_ATTR_RO(align);
+
+/*
+ * sysfs "flag" attribute (read-only): print the trbe_flag value cached in
+ * the per-CPU data, in decimal.
+ *
+ * sysfs_emit() is used rather than sprintf() because it is the helper
+ * intended for show() callbacks and is aware of the sysfs buffer size.
+ *
+ * Returns the number of bytes written to @buf.
+ */
+static ssize_t flag_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct trbe_cpudata *cpudata = dev_get_drvdata(dev);
+
+       return sysfs_emit(buf, "%d\n", cpudata->trbe_flag);
+}
+static DEVICE_ATTR_RO(flag);
+
+/* sysfs attributes exposed by each TRBE sink device: "align" and "flag". */
+static struct attribute *arm_trbe_attrs[] = {
+       &dev_attr_align.attr,
+       &dev_attr_flag.attr,
+       NULL,
+};
+
+/* Unnamed group wrapping the attributes above. */
+static const struct attribute_group arm_trbe_group = {
+       .attrs = arm_trbe_attrs,
+};
+
+/* NULL-terminated group list assigned to desc.groups at registration. */
+static const struct attribute_group *arm_trbe_groups[] = {
+       &arm_trbe_group,
+       NULL,
+};
+
+/*
+ * Bring TRBE on the calling CPU into service: reset the local TRBE, then
+ * enable the driver's per-CPU interrupt. @info is the struct trbe_drvdata.
+ */
+static void arm_trbe_enable_cpu(void *info)
+{
+       const struct trbe_drvdata *drv = info;
+
+       /* Reset before the interrupt can fire. */
+       trbe_reset_local();
+       enable_percpu_irq(drv->irq, IRQ_TYPE_NONE);
+}
+
+/*
+ * Register a per-CPU coresight sink named "trbe<cpu>" for @cpu and record
+ * it as that CPU's per-CPU sink.
+ *
+ * Warns and does nothing if a per-CPU sink is already recorded for @cpu.
+ * If the name cannot be allocated or coresight_register() fails, @cpu is
+ * cleared from drvdata->supported_cpus instead.
+ *
+ * NOTE(review): every per-CPU sink is handed the same platform data pointer
+ * (dev_get_platdata() of the one platform device) - confirm the coresight
+ * core does not treat pdata as owned per registered device.
+ */
+static void arm_trbe_register_coresight_cpu(struct trbe_drvdata *drvdata, int cpu)
+{
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       struct coresight_device *sink = coresight_get_percpu_sink(cpu);
+       struct coresight_desc desc = { 0 };
+       struct device *dev;
+
+       if (WARN_ON(sink))
+               return;
+
+       dev = &cpudata->drvdata->pdev->dev;
+       desc.name = devm_kasprintf(dev, GFP_KERNEL, "trbe%d", cpu);
+       if (desc.name) {
+               desc.type = CORESIGHT_DEV_TYPE_SINK;
+               desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM;
+               desc.ops = &arm_trbe_cs_ops;
+               desc.pdata = dev_get_platdata(dev);
+               desc.groups = arm_trbe_groups;
+               desc.dev = dev;
+
+               sink = coresight_register(&desc);
+               if (!IS_ERR(sink)) {
+                       dev_set_drvdata(&sink->dev, cpudata);
+                       coresight_set_percpu_sink(cpu, sink);
+                       return;
+               }
+       }
+
+       /* Either step failed: stop treating this CPU as TRBE capable. */
+       cpumask_clear_cpu(cpu, &drvdata->supported_cpus);
+}
+
+/*
+ * Probe the TRBE of the calling CPU and fill in its per-CPU data.
+ * @info is the struct trbe_drvdata.
+ *
+ * The CPU is removed from drvdata->supported_cpus when its per-CPU data is
+ * missing, TRBE is not implemented, TRBIDR_EL1 reports it as not
+ * programmable, or the reported alignment is larger than SZ_2K.
+ */
+static void arm_trbe_probe_cpu(void *info)
+{
+       struct trbe_drvdata *drvdata = info;
+       int cpu = smp_processor_id();
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       u64 idr;
+
+       if (WARN_ON(!cpudata))
+               goto unsupported;
+
+       if (!is_trbe_available()) {
+               pr_err("TRBE is not implemented on cpu %d\n", cpu);
+               goto unsupported;
+       }
+
+       idr = read_sysreg_s(SYS_TRBIDR_EL1);
+       if (!is_trbe_programmable(idr)) {
+               pr_err("TRBE is owned in higher exception level on cpu %d\n", cpu);
+               goto unsupported;
+       }
+
+       /* The ID register field is used as a power-of-two exponent. */
+       cpudata->trbe_align = 1ULL << get_trbe_address_align(idr);
+       if (cpudata->trbe_align > SZ_2K) {
+               pr_err("Unsupported alignment on cpu %d\n", cpu);
+               goto unsupported;
+       }
+
+       cpudata->trbe_flag = get_trbe_flag_update(idr);
+       cpudata->cpu = cpu;
+       cpudata->drvdata = drvdata;
+       return;
+
+unsupported:
+       cpumask_clear_cpu(cpu, &drvdata->supported_cpus);
+}
+
+/*
+ * Tear down TRBE on the calling CPU. @info is the struct trbe_drvdata.
+ *
+ * The per-CPU interrupt is disabled and the TRBE reset unconditionally; the
+ * coresight sink is unregistered only if one is recorded for this CPU.
+ *
+ * NOTE(review): arm_trbe_remove_coresight() runs this function through
+ * smp_call_function_single(); confirm that coresight_unregister() is safe
+ * to call from that context.
+ */
+static void arm_trbe_remove_coresight_cpu(void *info)
+{
+       struct trbe_drvdata *drvdata = info;
+       int cpu = smp_processor_id();
+       struct coresight_device *sink = coresight_get_percpu_sink(cpu);
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+
+       disable_percpu_irq(drvdata->irq);
+       trbe_reset_local();
+
+       if (!sink)
+               return;
+
+       coresight_unregister(sink);
+       cpudata->drvdata = NULL;
+       coresight_set_percpu_sink(cpu, NULL);
+}
+
+/*
+ * Allocate the per-CPU data and, for every CPU in drvdata->supported_cpus,
+ * probe the TRBE, register its coresight sink and enable it.
+ *
+ * A CPU that fails the probe or registration step is cleared from the mask
+ * by those helpers and is skipped for the remaining steps; such failures
+ * are not reported to the caller.
+ *
+ * Returns 0, or -ENOMEM if the per-CPU data cannot be allocated.
+ */
+static int arm_trbe_probe_coresight(struct trbe_drvdata *drvdata)
+{
+       struct cpumask *supported = &drvdata->supported_cpus;
+       int cpu;
+
+       drvdata->cpudata = alloc_percpu(typeof(*drvdata->cpudata));
+       if (!drvdata->cpudata)
+               return -ENOMEM;
+
+       for_each_cpu(cpu, supported) {
+               /* Probing must run on the CPU being probed. */
+               smp_call_function_single(cpu, arm_trbe_probe_cpu, drvdata, 1);
+               if (!cpumask_test_cpu(cpu, supported))
+                       continue;
+
+               arm_trbe_register_coresight_cpu(drvdata, cpu);
+               if (!cpumask_test_cpu(cpu, supported))
+                       continue;
+
+               smp_call_function_single(cpu, arm_trbe_enable_cpu, drvdata, 1);
+       }
+
+       return 0;
+}
+
+/*
+ * Undo arm_trbe_probe_coresight(): run the per-CPU teardown on every CPU
+ * still in drvdata->supported_cpus, then free the per-CPU data.
+ *
+ * Always returns 0.
+ */
+static int arm_trbe_remove_coresight(struct trbe_drvdata *drvdata)
+{
+       int cpu;
+
+       for_each_cpu(cpu, &drvdata->supported_cpus) {
+               smp_call_function_single(cpu, arm_trbe_remove_coresight_cpu,
+                                        drvdata, 1);
+       }
+
+       free_percpu(drvdata->cpudata);
+
+       return 0;
+}
+
+/*
+ * CPU hotplug startup callback for the TRBE multi-instance state.
+ *
+ * CPUs outside drvdata->supported_cpus are ignored. A CPU that already has
+ * a per-CPU sink only needs re-enabling; otherwise it goes through probe,
+ * registration and enable, stopping as soon as a step clears it from the
+ * supported mask.
+ *
+ * Always returns 0.
+ */
+static int arm_trbe_cpu_startup(unsigned int cpu, struct hlist_node *node)
+{
+       struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node);
+
+       if (!cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+               return 0;
+
+       /* Already registered earlier: just switch it back on. */
+       if (coresight_get_percpu_sink(cpu)) {
+               arm_trbe_enable_cpu(drvdata);
+               return 0;
+       }
+
+       /* This CPU was not probed for TRBE yet, initialize it now. */
+       arm_trbe_probe_cpu(drvdata);
+       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+               arm_trbe_register_coresight_cpu(drvdata, cpu);
+       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+               arm_trbe_enable_cpu(drvdata);
+
+       return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: for a CPU in drvdata->supported_cpus,
+ * disable the per-CPU interrupt and reset the local TRBE. The coresight
+ * sink is left registered.
+ *
+ * Always returns 0.
+ */
+static int arm_trbe_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+       struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node);
+
+       if (!cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+               return 0;
+
+       disable_percpu_irq(drvdata->irq);
+       trbe_reset_local();
+
+       return 0;
+}
+
+/*
+ * Set up a dynamic multi-instance CPU hotplug state for the driver and add
+ * @drvdata to it as an instance. The allocated state is stored in
+ * drvdata->trbe_online on success.
+ *
+ * Returns 0 on success or the negative value reported by the cpuhp call
+ * that failed; the state is removed again if adding the instance fails.
+ */
+static int arm_trbe_probe_cpuhp(struct trbe_drvdata *drvdata)
+{
+       enum cpuhp_state state;
+       int err;
+
+       state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME,
+                                       arm_trbe_cpu_startup, arm_trbe_cpu_teardown);
+       if (state < 0)
+               return state;
+
+       err = cpuhp_state_add_instance(state, &drvdata->hotplug_node);
+       if (!err) {
+               drvdata->trbe_online = state;
+               return 0;
+       }
+
+       cpuhp_remove_multi_state(state);
+       return err;
+}
+
+/*
+ * Undo arm_trbe_probe_cpuhp().
+ *
+ * The instance added with cpuhp_state_add_instance() has to be detached
+ * before the multi-instance state itself is removed; removing a state that
+ * still has instances attached is reported as an error by the cpuhp core.
+ */
+static void arm_trbe_remove_cpuhp(struct trbe_drvdata *drvdata)
+{
+       cpuhp_state_remove_instance(drvdata->trbe_online, &drvdata->hotplug_node);
+       cpuhp_remove_multi_state(drvdata->trbe_online);
+}
+
+/*
+ * Look up the TRBE interrupt of @pdev, require it to be a per-CPU interrupt,
+ * fill drvdata->supported_cpus from the interrupt's partition, then allocate
+ * the per-CPU handle slots and request the interrupt.
+ *
+ * Returns 0 on success, the platform_get_irq() error, -EINVAL if the
+ * interrupt is not per-CPU or its partition cannot be read, -ENOMEM if the
+ * handle slots cannot be allocated, or the request_percpu_irq() error (the
+ * slots are freed again in that case).
+ */
+static int arm_trbe_probe_irq(struct platform_device *pdev,
+                             struct trbe_drvdata *drvdata)
+{
+       int err;
+
+       drvdata->irq = platform_get_irq(pdev, 0);
+       if (drvdata->irq < 0) {
+               pr_err("IRQ not found for the platform device\n");
+               return drvdata->irq;
+       }
+
+       if (!irq_is_percpu(drvdata->irq)) {
+               pr_err("IRQ is not a PPI\n");
+               return -EINVAL;
+       }
+
+       if (irq_get_percpu_devid_partition(drvdata->irq, &drvdata->supported_cpus))
+               return -EINVAL;
+
+       drvdata->handle = alloc_percpu(struct perf_output_handle *);
+       if (!drvdata->handle)
+               return -ENOMEM;
+
+       /* The per-CPU handle slot doubles as the interrupt's dev_id cookie. */
+       err = request_percpu_irq(drvdata->irq, arm_trbe_irq_handler, DRVNAME, drvdata->handle);
+       if (err)
+               free_percpu(drvdata->handle);
+
+       return err;
+}
+
+/*
+ * Undo arm_trbe_probe_irq(): release the per-CPU interrupt, then free the
+ * per-CPU handle slots that served as its dev_id cookie.
+ */
+static void arm_trbe_remove_irq(struct trbe_drvdata *drvdata)
+{
+       struct perf_output_handle * __percpu *slots = drvdata->handle;
+
+       free_percpu_irq(drvdata->irq, slots);
+       free_percpu(slots);
+}
+
+/*
+ * Platform driver probe: allocate the driver data, fetch the coresight
+ * platform data, then set up the interrupt, the per-CPU coresight sinks and
+ * the CPU hotplug state, in that order.
+ *
+ * On failure the steps already completed are undone in reverse order and
+ * the error of the failing step is returned. Returns 0 on success.
+ */
+static int arm_trbe_device_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct coresight_platform_data *pdata;
+       struct trbe_drvdata *drvdata;
+       int err;
+
+       drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
+       if (!drvdata)
+               return -ENOMEM;
+
+       pdata = coresight_get_platform_data(dev);
+       if (IS_ERR(pdata))
+               return PTR_ERR(pdata);
+
+       dev_set_drvdata(dev, drvdata);
+       dev->platform_data = pdata;
+       drvdata->pdev = pdev;
+
+       err = arm_trbe_probe_irq(pdev, drvdata);
+       if (err)
+               return err;
+
+       err = arm_trbe_probe_coresight(drvdata);
+       if (err)
+               goto err_remove_irq;
+
+       err = arm_trbe_probe_cpuhp(drvdata);
+       if (err)
+               goto err_remove_coresight;
+
+       return 0;
+
+err_remove_coresight:
+       arm_trbe_remove_coresight(drvdata);
+err_remove_irq:
+       arm_trbe_remove_irq(drvdata);
+       return err;
+}
+
+/*
+ * Platform driver remove: tear down in the reverse order of probe - CPU
+ * hotplug state, per-CPU coresight sinks, then the interrupt.
+ *
+ * Always returns 0.
+ */
+static int arm_trbe_device_remove(struct platform_device *pdev)
+{
+       struct trbe_drvdata *drv = platform_get_drvdata(pdev);
+
+       arm_trbe_remove_cpuhp(drv);
+       arm_trbe_remove_coresight(drv);
+       arm_trbe_remove_irq(drv);
+
+       return 0;
+}
+
+/* Device tree match table: binds to "arm,trace-buffer-extension" nodes. */
+static const struct of_device_id arm_trbe_of_match[] = {
+       { .compatible = "arm,trace-buffer-extension"},
+       {},
+};
+MODULE_DEVICE_TABLE(of, arm_trbe_of_match);
+
+/*
+ * Platform driver glue. suppress_bind_attrs is set, so the driver does not
+ * expose the sysfs bind/unbind files.
+ */
+static struct platform_driver arm_trbe_driver = {
+       .driver = {
+               .name = DRVNAME,
+               .of_match_table = of_match_ptr(arm_trbe_of_match),
+               .suppress_bind_attrs = true,
+       },
+       .probe  = arm_trbe_device_probe,
+       .remove = arm_trbe_device_remove,
+};
+
+/*
+ * Module init: refuse to load with -EOPNOTSUPP when the kernel is unmapped
+ * at EL0, otherwise register the platform driver.
+ *
+ * Returns 0 on success or the platform_driver_register() error.
+ */
+static int __init arm_trbe_init(void)
+{
+       int err;
+
+       if (arm64_kernel_unmapped_at_el0()) {
+               pr_err("TRBE wouldn't work if kernel gets unmapped at EL0\n");
+               return -EOPNOTSUPP;
+       }
+
+       err = platform_driver_register(&arm_trbe_driver);
+       if (err)
+               pr_err("Error registering %s platform driver\n", DRVNAME);
+
+       return err;
+}
+
+/* Module exit: unregister the platform driver registered by arm_trbe_init(). */
+static void __exit arm_trbe_exit(void)
+{
+       struct platform_driver *drv = &arm_trbe_driver;
+
+       platform_driver_unregister(drv);
+}
+module_init(arm_trbe_init);
+module_exit(arm_trbe_exit);
+
+MODULE_AUTHOR("Anshuman Khandual <anshuman.khandual@arm.com>");
+MODULE_DESCRIPTION("Arm Trace Buffer Extension (TRBE) driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/hwtracing/coresight/coresight-trbe.h b/drivers/hwtracing/coresight/coresight-trbe.h
new file mode 100644 (file)
index 0000000..abf3e36
--- /dev/null
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This contains all required hardware related helper functions for
+ * Trace Buffer Extension (TRBE) driver in the coresight framework.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include <linux/coresight.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+
+#include "coresight-etm-perf.h"
+
+/*
+ * Report whether the TRBE field of ID_AA64DFR0_EL1, read on the calling CPU,
+ * is at least 0b0001.
+ */
+static inline bool is_trbe_available(void)
+{
+       u64 dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
+       unsigned int field;
+
+       field = cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT);
+
+       return field >= 0b0001;
+}
+
+/* Report whether TRBLIMITR_ENABLE is currently set in TRBLIMITR_EL1. */
+static inline bool is_trbe_enabled(void)
+{
+       u64 limitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       return (limitr & TRBLIMITR_ENABLE) != 0;
+}
+
+#define TRBE_EC_OTHERS         0
+#define TRBE_EC_STAGE1_ABORT   36
+#define TRBE_EC_STAGE2_ABORT   37
+
+/*
+ * Extract the EC field from a TRBSR_EL1 value; the TRBE_EC_* constants
+ * above name the values of interest.
+ */
+static inline int get_trbe_ec(u64 trbsr)
+{
+       u64 shifted = trbsr >> TRBSR_EC_SHIFT;
+
+       return shifted & TRBSR_EC_MASK;
+}
+
+#define TRBE_BSC_NOT_STOPPED 0
+#define TRBE_BSC_FILLED      1
+#define TRBE_BSC_TRIGGERED   2
+
+/*
+ * Extract the BSC field from a TRBSR_EL1 value; the TRBE_BSC_* constants
+ * above name the values of interest.
+ */
+static inline int get_trbe_bsc(u64 trbsr)
+{
+       u64 shifted = trbsr >> TRBSR_BSC_SHIFT;
+
+       return shifted & TRBSR_BSC_MASK;
+}
+
+/* Read-modify-write TRBSR_EL1 to clear TRBSR_IRQ, leaving other bits as read. */
+static inline void clr_trbe_irq(void)
+{
+       u64 status = read_sysreg_s(SYS_TRBSR_EL1) & ~TRBSR_IRQ;
+
+       write_sysreg_s(status, SYS_TRBSR_EL1);
+}
+
+/* Test the TRBSR_IRQ bit of a TRBSR_EL1 value. */
+static inline bool is_trbe_irq(u64 trbsr)
+{
+       return (trbsr & TRBSR_IRQ) != 0;
+}
+
+/* Test the TRBSR_TRG bit of a TRBSR_EL1 value. */
+static inline bool is_trbe_trg(u64 trbsr)
+{
+       return (trbsr & TRBSR_TRG) != 0;
+}
+
+/* Test the TRBSR_WRAP bit of a TRBSR_EL1 value. */
+static inline bool is_trbe_wrap(u64 trbsr)
+{
+       return (trbsr & TRBSR_WRAP) != 0;
+}
+
+/* Test the TRBSR_ABORT bit of a TRBSR_EL1 value. */
+static inline bool is_trbe_abort(u64 trbsr)
+{
+       return (trbsr & TRBSR_ABORT) != 0;
+}
+
+/* True when the TRBSR_STOP bit of a TRBSR_EL1 value is clear. */
+static inline bool is_trbe_running(u64 trbsr)
+{
+       return (trbsr & TRBSR_STOP) == 0;
+}
+
+#define TRBE_TRIG_MODE_STOP            0
+#define TRBE_TRIG_MODE_IRQ             1
+#define TRBE_TRIG_MODE_IGNORE          3
+
+#define TRBE_FILL_MODE_FILL            0
+#define TRBE_FILL_MODE_WRAP            1
+#define TRBE_FILL_MODE_CIRCULAR_BUFFER 3
+
+/*
+ * Read-modify-write TRBLIMITR_EL1 to clear TRBLIMITR_ENABLE, leaving the
+ * other bits as read.
+ */
+static inline void set_trbe_disabled(void)
+{
+       u64 limitr = read_sysreg_s(SYS_TRBLIMITR_EL1) & ~TRBLIMITR_ENABLE;
+
+       write_sysreg_s(limitr, SYS_TRBLIMITR_EL1);
+}
+
+/* Test the TRBIDR_FLAG bit of a TRBIDR_EL1 value. */
+static inline bool get_trbe_flag_update(u64 trbidr)
+{
+       return (trbidr & TRBIDR_FLAG) != 0;
+}
+
+/* True when the TRBIDR_PROG bit of a TRBIDR_EL1 value is clear. */
+static inline bool is_trbe_programmable(u64 trbidr)
+{
+       return (trbidr & TRBIDR_PROG) == 0;
+}
+
+/* Extract the ALIGN field from a TRBIDR_EL1 value. */
+static inline int get_trbe_address_align(u64 trbidr)
+{
+       u64 shifted = trbidr >> TRBIDR_ALIGN_SHIFT;
+
+       return shifted & TRBIDR_ALIGN_MASK;
+}
+
+/* Return the current value of TRBPTR_EL1. */
+static inline unsigned long get_trbe_write_pointer(void)
+{
+       unsigned long ptr = read_sysreg_s(SYS_TRBPTR_EL1);
+
+       return ptr;
+}
+
+/*
+ * Write @addr to TRBPTR_EL1. Warns if the TRBE is enabled at the time of
+ * the call; the write is performed regardless.
+ */
+static inline void set_trbe_write_pointer(unsigned long addr)
+{
+       bool enabled = is_trbe_enabled();
+
+       WARN_ON(enabled);
+       write_sysreg_s(addr, SYS_TRBPTR_EL1);
+}
+
+/*
+ * Return the limit address field of TRBLIMITR_EL1 (other bits masked off).
+ * Warns if the result is not PAGE_SIZE aligned.
+ */
+static inline unsigned long get_trbe_limit_pointer(void)
+{
+       u64 limitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+       unsigned long limit;
+
+       limit = limitr & (TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT);
+       WARN_ON(!IS_ALIGNED(limit, PAGE_SIZE));
+
+       return limit;
+}
+
+/*
+ * Return the base address field of TRBBASER_EL1 (other bits masked off).
+ * Warns if the result is not PAGE_SIZE aligned.
+ */
+static inline unsigned long get_trbe_base_pointer(void)
+{
+       u64 baser = read_sysreg_s(SYS_TRBBASER_EL1);
+       unsigned long base;
+
+       base = baser & (TRBBASER_BASE_MASK << TRBBASER_BASE_SHIFT);
+       WARN_ON(!IS_ALIGNED(base, PAGE_SIZE));
+
+       return base;
+}
+
+/*
+ * Write @addr to TRBBASER_EL1.
+ *
+ * Warns if the TRBE is enabled, or if @addr is not aligned to both
+ * 1 << TRBBASER_BASE_SHIFT and PAGE_SIZE; the write is performed
+ * regardless.
+ */
+static inline void set_trbe_base_pointer(unsigned long addr)
+{
+       const unsigned long base_granule = 1UL << TRBBASER_BASE_SHIFT;
+
+       WARN_ON(is_trbe_enabled());
+       WARN_ON(!IS_ALIGNED(addr, base_granule));
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+
+       write_sysreg_s(addr, SYS_TRBBASER_EL1);
+}
index f8e9b73..e2e12a5 100644 (file)
@@ -2535,7 +2535,7 @@ int i3c_master_register(struct i3c_master_controller *master,
 
        ret = i3c_master_bus_init(master);
        if (ret)
-               goto err_destroy_wq;
+               goto err_put_dev;
 
        ret = device_add(&master->dev);
        if (ret)
@@ -2566,9 +2566,6 @@ err_del_dev:
 err_cleanup_bus:
        i3c_master_bus_cleanup(master);
 
-err_destroy_wq:
-       destroy_workqueue(master->wq);
-
 err_put_dev:
        put_device(&master->dev);
 
index 8d99069..1f6ba42 100644 (file)
@@ -1124,7 +1124,6 @@ static int svc_i3c_master_send_direct_ccc_cmd(struct svc_i3c_master *master,
        cmd->in = NULL;
        cmd->out = &ccc->id;
        cmd->len = 1;
-       cmd->read_len = xfer_len;
        cmd->read_len = 0;
        cmd->continued = true;
 
index 5c9fac7..3b0991f 100644 (file)
@@ -121,7 +121,7 @@ struct ib_gid_table {
        u32                             default_gid_indices;
 };
 
-static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port)
 {
        struct ib_event event;
 
@@ -197,7 +197,7 @@ int ib_cache_gid_parse_type_str(const char *buf)
 }
 EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
 
-static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
+static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port)
 {
        return device->port_data[port].cache.gid;
 }
@@ -237,10 +237,10 @@ static void put_gid_ndev(struct rcu_head *head)
 static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
 {
        struct ib_device *device = entry->attr.device;
-       u8 port_num = entry->attr.port_num;
+       u32 port_num = entry->attr.port_num;
        struct ib_gid_table *table = rdma_gid_table(device, port_num);
 
-       dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__,
+       dev_dbg(&device->dev, "%s port=%u index=%d gid %pI6\n", __func__,
                port_num, entry->attr.index, entry->attr.gid.raw);
 
        write_lock_irq(&table->rwlock);
@@ -282,7 +282,7 @@ static void free_gid_work(struct work_struct *work)
        struct ib_gid_table_entry *entry =
                container_of(work, struct ib_gid_table_entry, del_work);
        struct ib_device *device = entry->attr.device;
-       u8 port_num = entry->attr.port_num;
+       u32 port_num = entry->attr.port_num;
        struct ib_gid_table *table = rdma_gid_table(device, port_num);
 
        mutex_lock(&table->lock);
@@ -379,7 +379,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)
  * @ix:                GID entry index to delete
  *
  */
-static void del_gid(struct ib_device *ib_dev, u8 port,
+static void del_gid(struct ib_device *ib_dev, u32 port,
                    struct ib_gid_table *table, int ix)
 {
        struct roce_gid_ndev_storage *ndev_storage;
@@ -387,7 +387,7 @@ static void del_gid(struct ib_device *ib_dev, u8 port,
 
        lockdep_assert_held(&table->lock);
 
-       dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port,
+       dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port,
                ix, table->data_vec[ix]->attr.gid.raw);
 
        write_lock_irq(&table->rwlock);
@@ -543,7 +543,7 @@ static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
        addrconf_ifid_eui48(&gid->raw[8], dev);
 }
 
-static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
                              union ib_gid *gid, struct ib_gid_attr *attr,
                              unsigned long mask, bool default_gid)
 {
@@ -587,7 +587,7 @@ out_unlock:
        return ret;
 }
 
-int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
                     union ib_gid *gid, struct ib_gid_attr *attr)
 {
        unsigned long mask = GID_ATTR_FIND_MASK_GID |
@@ -598,7 +598,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 }
 
 static int
-_ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+_ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
                  union ib_gid *gid, struct ib_gid_attr *attr,
                  unsigned long mask, bool default_gid)
 {
@@ -627,7 +627,7 @@ out_unlock:
        return ret;
 }
 
-int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
                     union ib_gid *gid, struct ib_gid_attr *attr)
 {
        unsigned long mask = GID_ATTR_FIND_MASK_GID       |
@@ -638,7 +638,7 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
        return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false);
 }
 
-int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
                                     struct net_device *ndev)
 {
        struct ib_gid_table *table;
@@ -683,7 +683,7 @@ const struct ib_gid_attr *
 rdma_find_gid_by_port(struct ib_device *ib_dev,
                      const union ib_gid *gid,
                      enum ib_gid_type gid_type,
-                     u8 port, struct net_device *ndev)
+                     u32 port, struct net_device *ndev)
 {
        int local_index;
        struct ib_gid_table *table;
@@ -734,7 +734,7 @@ EXPORT_SYMBOL(rdma_find_gid_by_port);
  *
  */
 const struct ib_gid_attr *rdma_find_gid_by_filter(
-       struct ib_device *ib_dev, const union ib_gid *gid, u8 port,
+       struct ib_device *ib_dev, const union ib_gid *gid, u32 port,
        bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
                       void *),
        void *context)
@@ -818,7 +818,7 @@ static void release_gid_table(struct ib_device *device,
        kfree(table);
 }
 
-static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port,
                                   struct ib_gid_table *table)
 {
        int i;
@@ -834,7 +834,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
        mutex_unlock(&table->lock);
 }
 
-void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port,
                                  struct net_device *ndev,
                                  unsigned long gid_type_mask,
                                  enum ib_cache_gid_default_mode mode)
@@ -867,7 +867,7 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
        }
 }
 
-static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port,
                                      struct ib_gid_table *table)
 {
        unsigned int i;
@@ -884,7 +884,7 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
 
 static void gid_table_release_one(struct ib_device *ib_dev)
 {
-       unsigned int p;
+       u32 p;
 
        rdma_for_each_port (ib_dev, p) {
                release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
@@ -895,7 +895,7 @@ static void gid_table_release_one(struct ib_device *ib_dev)
 static int _gid_table_setup_one(struct ib_device *ib_dev)
 {
        struct ib_gid_table *table;
-       unsigned int rdma_port;
+       u32 rdma_port;
 
        rdma_for_each_port (ib_dev, rdma_port) {
                table = alloc_gid_table(
@@ -915,7 +915,7 @@ rollback_table_setup:
 
 static void gid_table_cleanup_one(struct ib_device *ib_dev)
 {
-       unsigned int p;
+       u32 p;
 
        rdma_for_each_port (ib_dev, p)
                cleanup_gid_table_port(ib_dev, p,
@@ -950,7 +950,7 @@ static int gid_table_setup_one(struct ib_device *ib_dev)
  * Returns 0 on success or appropriate error code.
  *
  */
-int rdma_query_gid(struct ib_device *device, u8 port_num,
+int rdma_query_gid(struct ib_device *device, u32 port_num,
                   int index, union ib_gid *gid)
 {
        struct ib_gid_table *table;
@@ -1014,7 +1014,7 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
        unsigned long mask = GID_ATTR_FIND_MASK_GID |
                             GID_ATTR_FIND_MASK_GID_TYPE;
        struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
-       unsigned int p;
+       u32 p;
 
        if (ndev)
                mask |= GID_ATTR_FIND_MASK_NETDEV;
@@ -1043,7 +1043,7 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
 EXPORT_SYMBOL(rdma_find_gid);
 
 int ib_get_cached_pkey(struct ib_device *device,
-                      u               port_num,
+                      u32               port_num,
                       int               index,
                       u16              *pkey)
 {
@@ -1069,9 +1069,8 @@ int ib_get_cached_pkey(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_get_cached_pkey);
 
-int ib_get_cached_subnet_prefix(struct ib_device *device,
-                               u8                port_num,
-                               u64              *sn_pfx)
+int ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num,
+                               u64 *sn_pfx)
 {
        unsigned long flags;
 
@@ -1086,10 +1085,8 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_get_cached_subnet_prefix);
 
-int ib_find_cached_pkey(struct ib_device *device,
-                       u8                port_num,
-                       u16               pkey,
-                       u16              *index)
+int ib_find_cached_pkey(struct ib_device *device, u32 port_num,
+                       u16 pkey, u16 *index)
 {
        struct ib_pkey_cache *cache;
        unsigned long flags;
@@ -1116,8 +1113,9 @@ int ib_find_cached_pkey(struct ib_device *device,
                                *index = i;
                                ret = 0;
                                break;
-                       } else
+                       } else {
                                partial_ix = i;
+                       }
                }
 
        if (ret && partial_ix >= 0) {
@@ -1132,10 +1130,8 @@ err:
 }
 EXPORT_SYMBOL(ib_find_cached_pkey);
 
-int ib_find_exact_cached_pkey(struct ib_device *device,
-                             u8                port_num,
-                             u16               pkey,
-                             u16              *index)
+int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num,
+                             u16 pkey, u16 *index)
 {
        struct ib_pkey_cache *cache;
        unsigned long flags;
@@ -1169,9 +1165,7 @@ err:
 }
 EXPORT_SYMBOL(ib_find_exact_cached_pkey);
 
-int ib_get_cached_lmc(struct ib_device *device,
-                     u8                port_num,
-                     u8                *lmc)
+int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc)
 {
        unsigned long flags;
        int ret = 0;
@@ -1187,8 +1181,7 @@ int ib_get_cached_lmc(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_get_cached_lmc);
 
-int ib_get_cached_port_state(struct ib_device   *device,
-                            u8                  port_num,
+int ib_get_cached_port_state(struct ib_device *device, u32 port_num,
                             enum ib_port_state *port_state)
 {
        unsigned long flags;
@@ -1222,7 +1215,7 @@ EXPORT_SYMBOL(ib_get_cached_port_state);
  * code.
  */
 const struct ib_gid_attr *
-rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index)
+rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index)
 {
        const struct ib_gid_attr *attr = ERR_PTR(-ENODATA);
        struct ib_gid_table *table;
@@ -1263,7 +1256,7 @@ ssize_t rdma_query_gid_table(struct ib_device *device,
        const struct ib_gid_attr *gid_attr;
        ssize_t num_entries = 0, ret;
        struct ib_gid_table *table;
-       unsigned int port_num, i;
+       u32 port_num, i;
        struct net_device *ndev;
        unsigned long flags;
 
@@ -1361,7 +1354,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
                        container_of(attr, struct ib_gid_table_entry, attr);
        struct ib_device *device = entry->attr.device;
        struct net_device *ndev = ERR_PTR(-EINVAL);
-       u8 port_num = entry->attr.port_num;
+       u32 port_num = entry->attr.port_num;
        struct ib_gid_table *table;
-                      u8                port_num,
        bool valid;
@@ -1441,7 +1434,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
 EXPORT_SYMBOL(rdma_read_gid_l2_fields);
 
 static int config_non_roce_gid_cache(struct ib_device *device,
-                                    u8 port, int gid_tbl_len)
+                                    u32 port, int gid_tbl_len)
 {
        struct ib_gid_attr gid_attr = {};
        struct ib_gid_table *table;
@@ -1472,7 +1465,7 @@ err:
 }
 
 static int
-ib_cache_update(struct ib_device *device, u8 port, bool enforce_security)
+ib_cache_update(struct ib_device *device, u32 port, bool enforce_security)
 {
        struct ib_port_attr       *tprops = NULL;
        struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
@@ -1621,7 +1614,7 @@ EXPORT_SYMBOL(ib_dispatch_event);
 
 int ib_cache_setup_one(struct ib_device *device)
 {
-       unsigned int p;
+       u32 p;
        int err;
 
        rwlock_init(&device->cache_lock);
@@ -1641,7 +1634,7 @@ int ib_cache_setup_one(struct ib_device *device)
 
 void ib_cache_release_one(struct ib_device *device)
 {
-       unsigned int p;
+       u32 p;
 
        /*
         * The release function frees all the cache elements.
index 3d194bb..0ead0d2 100644 (file)
@@ -202,7 +202,7 @@ static struct attribute *cm_counter_default_attrs[] = {
 struct cm_port {
        struct cm_device *cm_dev;
        struct ib_mad_agent *mad_agent;
-       u8 port_num;
+       u32 port_num;
        struct list_head cm_priv_prim_list;
        struct list_head cm_priv_altr_list;
        struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
@@ -255,7 +255,8 @@ struct cm_id_private {
        struct completion comp;
        refcount_t refcount;
        /* Number of clients sharing this ib_cm_id. Only valid for listeners.
-        * Protected by the cm.lock spinlock. */
+        * Protected by the cm.lock spinlock.
+        */
        int listen_sharecount;
        struct rcu_head rcu;
 
@@ -420,8 +421,7 @@ static int cm_alloc_response_msg(struct cm_port *port,
        return 0;
 }
 
-static void * cm_copy_private_data(const void *private_data,
-                                  u8 private_data_len)
+static void *cm_copy_private_data(const void *private_data, u8 private_data_len)
 {
        void *data;
 
@@ -680,8 +680,8 @@ static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
        return cm_id_priv;
 }
 
-static struct cm_id_private * cm_find_listen(struct ib_device *device,
-                                            __be64 service_id)
+static struct cm_id_private *cm_find_listen(struct ib_device *device,
+                                           __be64 service_id)
 {
        struct rb_node *node = cm.listen_service_table.rb_node;
        struct cm_id_private *cm_id_priv;
@@ -708,8 +708,8 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
        return NULL;
 }
 
-static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
-                                                    *timewait_info)
+static struct cm_timewait_info *
+cm_insert_remote_id(struct cm_timewait_info *timewait_info)
 {
        struct rb_node **link = &cm.remote_id_table.rb_node;
        struct rb_node *parent = NULL;
@@ -767,8 +767,8 @@ static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid,
        return res;
 }
 
-static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
-                                                     *timewait_info)
+static struct cm_timewait_info *
+cm_insert_remote_qpn(struct cm_timewait_info *timewait_info)
 {
        struct rb_node **link = &cm.remote_qp_table.rb_node;
        struct rb_node *parent = NULL;
@@ -797,8 +797,8 @@ static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
        return NULL;
 }
 
-static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
-                                                   *cm_id_priv)
+static struct cm_id_private *
+cm_insert_remote_sidr(struct cm_id_private *cm_id_priv)
 {
        struct rb_node **link = &cm.remote_sidr_table.rb_node;
        struct rb_node *parent = NULL;
@@ -897,7 +897,7 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_create_cm_id);
 
-static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv)
 {
        struct cm_work *work;
 
@@ -986,7 +986,7 @@ static void cm_remove_remote(struct cm_id_private *cm_id_priv)
        }
 }
 
-static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id)
 {
        struct cm_timewait_info *timewait_info;
 
@@ -1631,7 +1631,7 @@ static bool cm_req_has_alt_path(struct cm_req_msg *req_msg)
                                               req_msg))));
 }
 
-static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num,
+static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num,
                                 struct sa_path_rec *path, union ib_gid *gid)
 {
        if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num))
@@ -1750,7 +1750,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 static u16 cm_get_bth_pkey(struct cm_work *work)
 {
        struct ib_device *ib_dev = work->port->cm_dev->ib_device;
-       u8 port_num = work->port->port_num;
+       u32 port_num = work->port->port_num;
        u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
        u16 pkey;
        int ret;
@@ -1778,7 +1778,7 @@ static void cm_opa_to_ib_sgid(struct cm_work *work,
                              struct sa_path_rec *path)
 {
        struct ib_device *dev = work->port->cm_dev->ib_device;
-       u8 port_num = work->port->port_num;
+       u32 port_num = work->port->port_num;
 
        if (rdma_cap_opa_ah(dev, port_num) &&
            (ib_is_opa_gid(&path->sgid))) {
@@ -1977,8 +1977,8 @@ unlock:   spin_unlock_irq(&cm_id_priv->lock);
 free:  cm_free_msg(msg);
 }
 
-static struct cm_id_private * cm_match_req(struct cm_work *work,
-                                          struct cm_id_private *cm_id_priv)
+static struct cm_id_private *cm_match_req(struct cm_work *work,
+                                         struct cm_id_private *cm_id_priv)
 {
        struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
        struct cm_timewait_info *timewait_info;
@@ -2138,20 +2138,17 @@ static int cm_req_handler(struct cm_work *work)
                goto destroy;
        }
 
-       cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
-
        memset(&work->path[0], 0, sizeof(work->path[0]));
        if (cm_req_has_alt_path(req_msg))
                memset(&work->path[1], 0, sizeof(work->path[1]));
        grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
        gid_attr = grh->sgid_attr;
 
-       if (gid_attr &&
-           rdma_protocol_roce(work->port->cm_dev->ib_device,
-                              work->port->port_num)) {
+       if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) {
                work->path[0].rec_type =
                        sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
        } else {
+               cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
                cm_path_set_rec_type(
                        work->port->cm_dev->ib_device, work->port->port_num,
                        &work->path[0],
@@ -2993,7 +2990,7 @@ static void cm_format_rej_event(struct cm_work *work)
                IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg);
 }
 
-static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
 {
        struct cm_id_private *cm_id_priv;
        __be32 remote_id;
@@ -3098,7 +3095,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
        cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 
        spin_lock_irqsave(&cm_id_priv->lock, flags);
-       switch(cm_id_priv->id.state) {
+       switch (cm_id_priv->id.state) {
        case IB_CM_REQ_RCVD:
                cm_state = IB_CM_MRA_REQ_SENT;
                lap_state = cm_id->lap_state;
@@ -3155,7 +3152,7 @@ error2:   spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 }
 EXPORT_SYMBOL(ib_send_cm_mra);
 
-static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
 {
        switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) {
        case CM_MSG_RESPONSE_REQ:
@@ -3917,8 +3914,7 @@ static int cm_establish(struct ib_cm_id *cm_id)
 
        cm_id_priv = container_of(cm_id, struct cm_id_private, id);
        spin_lock_irqsave(&cm_id_priv->lock, flags);
-       switch (cm_id->state)
-       {
+       switch (cm_id->state) {
        case IB_CM_REP_SENT:
        case IB_CM_MRA_REP_RCVD:
                cm_id->state = IB_CM_ESTABLISHED;
@@ -4334,7 +4330,7 @@ static int cm_add_one(struct ib_device *ib_device)
        unsigned long flags;
        int ret;
        int count = 0;
-       unsigned int i;
+       u32 i;
 
        cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt),
                         GFP_KERNEL);
@@ -4432,7 +4428,7 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
                .clr_port_cap_mask = IB_PORT_CM_SUP
        };
        unsigned long flags;
-       unsigned int i;
+       u32 i;
 
        write_lock_irqsave(&cm.device_lock, flags);
        list_del(&cm_dev->list);
index 0cc4065..8462de7 100644 (file)
@@ -22,7 +22,7 @@
 static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
 {
        u8 transport_type = IBA_GET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg);
-       switch(transport_type) {
+       switch (transport_type) {
        case 0: return IB_QPT_RC;
        case 1: return IB_QPT_UC;
        case 3:
@@ -37,7 +37,7 @@ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
 static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
                                      enum ib_qp_type qp_type)
 {
-       switch(qp_type) {
+       switch (qp_type) {
        case IB_QPT_UC:
                IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 1);
                break;
index 9409651..2b9ffc2 100644 (file)
@@ -43,7 +43,6 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent");
 MODULE_LICENSE("Dual BSD/GPL");
 
 #define CMA_CM_RESPONSE_TIMEOUT 20
-#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
 #define CMA_IBOE_PACKET_LIFETIME 18
@@ -219,14 +218,6 @@ struct rdma_bind_list {
        unsigned short          port;
 };
 
-struct class_port_info_context {
-       struct ib_class_port_info       *class_port_info;
-       struct ib_device                *device;
-       struct completion               done;
-       struct ib_sa_query              *sa_query;
-       u8                              port_num;
-};
-
 static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
                        struct rdma_bind_list *bind_list, int snum)
 {
@@ -287,7 +278,7 @@ struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter      filter,
 }
 
 int cma_get_default_gid_type(struct cma_device *cma_dev,
-                            unsigned int port)
+                            u32 port)
 {
        if (!rdma_is_port_valid(cma_dev->device, port))
                return -EINVAL;
@@ -296,7 +287,7 @@ int cma_get_default_gid_type(struct cma_device *cma_dev,
 }
 
 int cma_set_default_gid_type(struct cma_device *cma_dev,
-                            unsigned int port,
+                            u32 port,
                             enum ib_gid_type default_gid_type)
 {
        unsigned long supported_gids;
@@ -319,7 +310,7 @@ int cma_set_default_gid_type(struct cma_device *cma_dev,
        return 0;
 }
 
-int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port)
+int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port)
 {
        if (!rdma_is_port_valid(cma_dev->device, port))
                return -EINVAL;
@@ -327,7 +318,7 @@ int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port)
        return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)];
 }
 
-int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port,
+int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port,
                             u8 default_roce_tos)
 {
        if (!rdma_is_port_valid(cma_dev->device, port))
@@ -463,7 +454,6 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
        id_priv->id.route.addr.dev_addr.transport =
                rdma_node_get_transport(cma_dev->device->node_type);
        list_add_tail(&id_priv->list, &cma_dev->id_list);
-       rdma_restrack_add(&id_priv->res);
 
        trace_cm_id_attach(id_priv, cma_dev->device);
 }
@@ -562,7 +552,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
 }
 
 static const struct ib_gid_attr *
-cma_validate_port(struct ib_device *device, u8 port,
+cma_validate_port(struct ib_device *device, u32 port,
                  enum ib_gid_type gid_type,
                  union ib_gid *gid,
                  struct rdma_id_private *id_priv)
@@ -620,7 +610,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
        struct cma_device *cma_dev;
        enum ib_gid_type gid_type;
        int ret = -ENODEV;
-       unsigned int port;
+       u32 port;
 
        if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
            id_priv->id.ps == RDMA_PS_IPOIB)
@@ -700,6 +690,7 @@ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv,
        mutex_lock(&lock);
        cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
        mutex_unlock(&lock);
+       rdma_restrack_add(&id_priv->res);
        return 0;
 }
 
@@ -711,8 +702,8 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
        struct cma_device *cma_dev;
        enum ib_gid_type gid_type;
        int ret = -ENODEV;
-       unsigned int port;
        union ib_gid gid;
+       u32 port;
 
        if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
            id_priv->id.ps == RDMA_PS_IPOIB)
@@ -754,8 +745,10 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
        }
 
 out:
-       if (!ret)
+       if (!ret) {
                cma_attach_to_dev(id_priv, cma_dev);
+               rdma_restrack_add(&id_priv->res);
+       }
 
        mutex_unlock(&lock);
        return ret;
@@ -816,6 +809,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 
 found:
        cma_attach_to_dev(id_priv, cma_dev);
+       rdma_restrack_add(&id_priv->res);
        mutex_unlock(&lock);
        addr = (struct sockaddr_ib *)cma_src_addr(id_priv);
        memcpy(&addr->sib_addr, &sgid, sizeof(sgid));
@@ -852,6 +846,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
        id_priv->id.qp_type = qp_type;
        id_priv->tos_set = false;
        id_priv->timeout_set = false;
+       id_priv->min_rnr_timer_set = false;
        id_priv->gid_type = IB_GID_TYPE_IB;
        spin_lock_init(&id_priv->lock);
        mutex_init(&id_priv->qp_mutex);
@@ -1135,12 +1130,16 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
                                                 qp_attr_mask);
                qp_attr->port_num = id_priv->id.port_num;
                *qp_attr_mask |= IB_QP_PORT;
-       } else
+       } else {
                ret = -ENOSYS;
+       }
 
        if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
                qp_attr->timeout = id_priv->timeout;
 
+       if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set)
+               qp_attr->min_rnr_timer = id_priv->min_rnr_timer;
+
        return ret;
 }
 EXPORT_SYMBOL(rdma_init_qp_attr);
@@ -1581,7 +1580,7 @@ static bool cma_match_private_data(struct rdma_id_private *id_priv,
 static bool cma_protocol_roce(const struct rdma_cm_id *id)
 {
        struct ib_device *device = id->device;
-       const int port_num = id->port_num ?: rdma_start_port(device);
+       const u32 port_num = id->port_num ?: rdma_start_port(device);
 
        return rdma_protocol_roce(device, port_num);
 }
@@ -2474,6 +2473,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
 
        id->tos = id_priv->tos;
        id->tos_set = id_priv->tos_set;
+       id->afonly = id_priv->afonly;
        id_priv->cm_id.iw = id;
 
        memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
@@ -2529,6 +2529,7 @@ static int cma_listen_on_dev(struct rdma_id_private *id_priv,
               rdma_addr_size(cma_src_addr(id_priv)));
 
        _cma_attach_to_dev(dev_id_priv, cma_dev);
+       rdma_restrack_add(&dev_id_priv->res);
        cma_id_get(id_priv);
        dev_id_priv->internal_id = 1;
        dev_id_priv->afonly = id_priv->afonly;
@@ -2615,6 +2616,43 @@ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
 }
 EXPORT_SYMBOL(rdma_set_ack_timeout);
 
+/**
+ * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the
+ *                           QP associated with a connection identifier.
+ * @id: Communication identifier whose minimum RNR timer is being set.
+ * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK
+ *                Timer Field" in the IBTA specification.
+ *
+ * This function should be called before rdma_connect() on active
+ * side, and on passive side before rdma_accept(). The timer value
+ * will be associated with the local QP. When it receives a send it is
+ * not ready to handle, typically if the receive queue is empty, an RNR
+ * Retry NAK is returned to the requester with the min_rnr_timer
+ * encoded. The requester will then wait at least the time specified
+ * in the NAK before retrying. The default is zero, which translates
+ * to a minimum RNR Timer value of 655 ms.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
+{
+       struct rdma_id_private *id_priv;
+
+       /* It is a five-bit value */
+       if (min_rnr_timer & 0xe0)
+               return -EINVAL;
+
+       if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
+               return -EINVAL;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       id_priv->min_rnr_timer = min_rnr_timer;
+       id_priv->min_rnr_timer_set = true;
+
+       return 0;
+}
+EXPORT_SYMBOL(rdma_set_min_rnr_timer);
+
 static void cma_query_handler(int status, struct sa_path_rec *path_rec,
                              void *context)
 {
@@ -3169,6 +3207,7 @@ port_found:
        ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
        id_priv->id.port_num = p;
        cma_attach_to_dev(id_priv, cma_dev);
+       rdma_restrack_add(&id_priv->res);
        cma_set_loopback(cma_src_addr(id_priv));
 out:
        mutex_unlock(&lock);
@@ -3201,6 +3240,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
                if (status)
                        pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
                                             status);
+               rdma_restrack_add(&id_priv->res);
        } else if (status) {
                pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
        }
@@ -3812,6 +3852,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
        if (ret)
                goto err2;
 
+       if (!cma_any_addr(addr))
+               rdma_restrack_add(&id_priv->res);
        return 0;
 err2:
        if (id_priv->cma_dev)
@@ -4124,10 +4166,11 @@ int rdma_connect_locked(struct rdma_cm_id *id,
                        ret = cma_resolve_ib_udp(id_priv, conn_param);
                else
                        ret = cma_connect_ib(id_priv, conn_param);
-       } else if (rdma_cap_iw_cm(id->device, id->port_num))
+       } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
                ret = cma_connect_iw(id_priv, conn_param);
-       else
+       } else {
                ret = -ENOSYS;
+       }
        if (ret)
                goto err_state;
        return 0;
@@ -4234,9 +4277,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
        iw_param.ird = conn_param->responder_resources;
        iw_param.private_data = conn_param->private_data;
        iw_param.private_data_len = conn_param->private_data_len;
-       if (id_priv->id.qp) {
+       if (id_priv->id.qp)
                iw_param.qpn = id_priv->qp_num;
-       } else
+       else
                iw_param.qpn = conn_param->qp_num;
 
        return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
@@ -4319,11 +4362,11 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
                        else
                                ret = cma_rep_recv(id_priv);
                }
-       } else if (rdma_cap_iw_cm(id->device, id->port_num))
+       } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
                ret = cma_accept_iw(id_priv, conn_param);
-       else
+       } else {
                ret = -ENOSYS;
-
+       }
        if (ret)
                goto reject;
 
@@ -4409,8 +4452,9 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
        } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
                ret = iw_cm_reject(id_priv->cm_id.iw,
                                   private_data, private_data_len);
-       } else
+       } else {
                ret = -ENOSYS;
+       }
 
        return ret;
 }
@@ -4864,14 +4908,28 @@ static void cma_process_remove(struct cma_device *cma_dev)
        wait_for_completion(&cma_dev->comp);
 }
 
+static bool cma_supported(struct ib_device *device)
+{
+       u32 i;
+
+       rdma_for_each_port(device, i) {
+               if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i))
+                       return true;
+       }
+       return false;
+}
+
 static int cma_add_one(struct ib_device *device)
 {
        struct rdma_id_private *to_destroy;
        struct cma_device *cma_dev;
        struct rdma_id_private *id_priv;
-       unsigned int i;
        unsigned long supported_gids = 0;
        int ret;
+       u32 i;
+
+       if (!cma_supported(device))
+               return -EOPNOTSUPP;
 
        cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL);
        if (!cma_dev)
index e0d5e3b..9ac16e0 100644 (file)
@@ -43,7 +43,7 @@ struct cma_device;
 struct cma_dev_group;
 
 struct cma_dev_port_group {
-       unsigned int            port_num;
+       u32                     port_num;
        struct cma_dev_group    *cma_dev_group;
        struct config_group     group;
 };
@@ -200,10 +200,10 @@ static const struct config_item_type cma_port_group_type = {
 static int make_cma_ports(struct cma_dev_group *cma_dev_group,
                          struct cma_device *cma_dev)
 {
-       struct ib_device *ibdev;
-       unsigned int i;
-       unsigned int ports_num;
        struct cma_dev_port_group *ports;
+       struct ib_device *ibdev;
+       u32 ports_num;
+       u32 i;
 
        ibdev = cma_get_ib_dev(cma_dev);
 
index caece96..5c463da 100644 (file)
@@ -86,9 +86,11 @@ struct rdma_id_private {
        u8                      tos;
        u8                      tos_set:1;
        u8                      timeout_set:1;
+       u8                      min_rnr_timer_set:1;
        u8                      reuseaddr;
        u8                      afonly;
        u8                      timeout;
+       u8                      min_rnr_timer;
        enum ib_gid_type        gid_type;
 
        /*
@@ -117,11 +119,11 @@ void cma_dev_put(struct cma_device *dev);
 typedef bool (*cma_device_filter)(struct ib_device *, void *);
 struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
                                             void *cookie);
-int cma_get_default_gid_type(struct cma_device *dev, unsigned int port);
-int cma_set_default_gid_type(struct cma_device *dev, unsigned int port,
+int cma_get_default_gid_type(struct cma_device *dev, u32 port);
+int cma_set_default_gid_type(struct cma_device *dev, u32 port,
                             enum ib_gid_type default_gid_type);
-int cma_get_default_roce_tos(struct cma_device *dev, unsigned int port);
-int cma_set_default_roce_tos(struct cma_device *dev, unsigned int port,
+int cma_get_default_roce_tos(struct cma_device *dev, u32 port);
+int cma_set_default_roce_tos(struct cma_device *dev, u32 port,
                             u8 default_roce_tos);
 struct ib_device *cma_get_ib_dev(struct cma_device *dev);
 
index 315f7a2..29809dd 100644 (file)
@@ -83,14 +83,14 @@ void ib_device_unregister_sysfs(struct ib_device *device);
 int ib_device_rename(struct ib_device *ibdev, const char *name);
 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
 
-typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+typedef void (*roce_netdev_callback)(struct ib_device *device, u32 port,
              struct net_device *idev, void *cookie);
 
-typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port,
+typedef bool (*roce_netdev_filter)(struct ib_device *device, u32 port,
                                   struct net_device *idev, void *cookie);
 
 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
-                                       unsigned int port);
+                                       u32 port);
 
 void ib_enum_roce_netdev(struct ib_device *ib_dev,
                         roce_netdev_filter filter,
@@ -113,7 +113,7 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 struct ib_client_nl_info {
        struct sk_buff *nl_msg;
        struct device *cdev;
-       unsigned int port;
+       u32 port;
        u64 abi;
 };
 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
@@ -128,24 +128,24 @@ int ib_cache_gid_parse_type_str(const char *buf);
 
 const char *ib_cache_gid_type_str(enum ib_gid_type gid_type);
 
-void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port,
                                  struct net_device *ndev,
                                  unsigned long gid_type_mask,
                                  enum ib_cache_gid_default_mode mode);
 
-int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
                     union ib_gid *gid, struct ib_gid_attr *attr);
 
-int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
                     union ib_gid *gid, struct ib_gid_attr *attr);
 
-int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
                                     struct net_device *ndev);
 
 int roce_gid_mgmt_init(void);
 void roce_gid_mgmt_cleanup(void);
 
-unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port);
 
 int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
@@ -215,14 +215,14 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
                             struct netlink_ext_ack *extack);
 
 int ib_get_cached_subnet_prefix(struct ib_device *device,
-                               u8                port_num,
-                               u64              *sn_pfx);
+                               u32 port_num,
+                               u64 *sn_pfx);
 
 #ifdef CONFIG_SECURITY_INFINIBAND
 void ib_security_release_port_pkey_list(struct ib_device *device);
 
 void ib_security_cache_change(struct ib_device *device,
-                             u8 port_num,
+                             u32 port_num,
                              u64 subnet_prefix);
 
 int ib_security_modify_qp(struct ib_qp *qp,
@@ -247,7 +247,7 @@ static inline void ib_security_release_port_pkey_list(struct ib_device *device)
 }
 
 static inline void ib_security_cache_change(struct ib_device *device,
-                                           u8 port_num,
+                                           u32 port_num,
                                            u64 subnet_prefix)
 {
 }
@@ -381,7 +381,7 @@ int ib_setup_port_attrs(struct ib_core_device *coredev);
 
 int rdma_compatdev_set(u8 enable);
 
-int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+int ib_port_register_module_stat(struct ib_device *device, u32 port_num,
                                 struct kobject *kobj, struct kobj_type *ktype,
                                 const char *name);
 void ib_port_unregister_module_stat(struct kobject *kobj);
index f3a7c1f..1549335 100644 (file)
@@ -14,10 +14,12 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
                              enum rdma_nl_counter_mode new_mode,
                              enum rdma_nl_counter_mask new_mask)
 {
-       if (new_mode == RDMA_COUNTER_MODE_AUTO && port_counter->num_counters)
-               if (new_mask & ~ALL_AUTO_MODE_MASKS ||
-                   port_counter->mode.mode != RDMA_COUNTER_MODE_NONE)
+       if (new_mode == RDMA_COUNTER_MODE_AUTO) {
+               if (new_mask & (~ALL_AUTO_MODE_MASKS))
                        return -EINVAL;
+               if (port_counter->num_counters)
+                       return -EBUSY;
+       }
 
        port_counter->mode.mode = new_mode;
        port_counter->mode.mask = new_mask;
@@ -32,14 +34,17 @@ static int __counter_set_mode(struct rdma_port_counter *port_counter,
  * @mask: Mask to configure
  * @extack: Message to the user
  *
- * Return 0 on success.
+ * Return 0 on success. If counter mode wasn't changed then it is considered
+ * as success as well.
+ * Return -EBUSY when changing to auto mode while there are bound counters.
+ *
  */
-int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
                               enum rdma_nl_counter_mask mask,
                               struct netlink_ext_ack *extack)
 {
-       enum rdma_nl_counter_mode mode = RDMA_COUNTER_MODE_AUTO;
        struct rdma_port_counter *port_counter;
+       enum rdma_nl_counter_mode mode;
        int ret;
 
        port_counter = &dev->port_data[port].port_counter;
@@ -47,25 +52,26 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
                return -EOPNOTSUPP;
 
        mutex_lock(&port_counter->lock);
-       if (mask) {
-               ret = __counter_set_mode(port_counter, mode, mask);
-               if (ret)
-                       NL_SET_ERR_MSG(
-                               extack,
-                               "Turning on auto mode is not allowed when there is bound QP");
+       if (mask)
+               mode = RDMA_COUNTER_MODE_AUTO;
+       else
+               mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL :
+                                                     RDMA_COUNTER_MODE_NONE;
+
+       if (port_counter->mode.mode == mode &&
+           port_counter->mode.mask == mask) {
+               ret = 0;
                goto out;
        }
 
-       if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) {
-               ret = -EINVAL;
-               goto out;
-       }
+       ret = __counter_set_mode(port_counter, mode, mask);
 
-       mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL :
-                                                   RDMA_COUNTER_MODE_NONE;
-       ret = __counter_set_mode(port_counter, mode, 0);
 out:
        mutex_unlock(&port_counter->lock);
+       if (ret == -EBUSY)
+               NL_SET_ERR_MSG(
+                       extack,
+                       "Modifying auto mode is not allowed when there is a bound QP");
        return ret;
 }
 
@@ -100,7 +106,7 @@ static int __rdma_counter_bind_qp(struct rdma_counter *counter,
        return ret;
 }
 
-static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u8 port,
+static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
                                           struct ib_qp *qp,
                                           enum rdma_nl_counter_mode mode)
 {
@@ -238,7 +244,7 @@ static void counter_history_stat_update(struct rdma_counter *counter)
  * Return: The counter (with ref-count increased) if found
  */
 static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
-                                                      u8 port)
+                                                      u32 port)
 {
        struct rdma_port_counter *port_counter;
        struct rdma_counter *counter = NULL;
@@ -282,7 +288,7 @@ static void counter_release(struct kref *kref)
  * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on
  *   the auto-mode rule
  */
-int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port)
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
 {
        struct rdma_port_counter *port_counter;
        struct ib_device *dev = qp->device;
@@ -352,7 +358,7 @@ int rdma_counter_query_stats(struct rdma_counter *counter)
 }
 
 static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
-                                          u8 port, u32 index)
+                                          u32 port, u32 index)
 {
        struct rdma_restrack_entry *res;
        struct rdma_restrack_root *rt;
@@ -388,7 +394,7 @@ next:
  * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
  *   specific port, including the running ones and history data
  */
-u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index)
 {
        struct rdma_port_counter *port_counter;
        u64 sum;
@@ -443,7 +449,7 @@ static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
 /*
  * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
  */
-int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
                          u32 qp_num, u32 counter_id)
 {
        struct rdma_port_counter *port_counter;
@@ -493,7 +499,7 @@ err:
  * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
  *   The id of new counter is returned in @counter_id
  */
-int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
                                u32 qp_num, u32 *counter_id)
 {
        struct rdma_port_counter *port_counter;
@@ -540,7 +546,7 @@ err:
 /*
  * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
  */
-int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
                            u32 qp_num, u32 counter_id)
 {
        struct rdma_port_counter *port_counter;
@@ -573,7 +579,7 @@ out:
        return ret;
 }
 
-int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+int rdma_counter_get_mode(struct ib_device *dev, u32 port,
                          enum rdma_nl_counter_mode *mode,
                          enum rdma_nl_counter_mask *mask)
 {
index aac0fe1..c660cef 100644 (file)
@@ -779,7 +779,7 @@ static void remove_client_context(struct ib_device *device,
 static int alloc_port_data(struct ib_device *device)
 {
        struct ib_port_data_rcu *pdata_rcu;
-       unsigned int port;
+       u32 port;
 
        if (device->port_data)
                return 0;
@@ -788,6 +788,10 @@ static int alloc_port_data(struct ib_device *device)
        if (WARN_ON(!device->phys_port_cnt))
                return -EINVAL;
 
+       /* Reserve U32_MAX so the logic to go over all the ports is sane */
+       if (WARN_ON(device->phys_port_cnt == U32_MAX))
+               return -EINVAL;
+
        /*
         * device->port_data is indexed directly by the port number to make
         * access to this data as efficient as possible.
@@ -819,7 +823,7 @@ static int alloc_port_data(struct ib_device *device)
        return 0;
 }
 
-static int verify_immutable(const struct ib_device *dev, u8 port)
+static int verify_immutable(const struct ib_device *dev, u32 port)
 {
        return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
                            rdma_max_mad_size(dev, port) != 0);
@@ -827,7 +831,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
 
 static int setup_port_data(struct ib_device *device)
 {
-       unsigned int port;
+       u32 port;
        int ret;
 
        ret = alloc_port_data(device);
@@ -2005,7 +2009,7 @@ void ib_dispatch_event_clients(struct ib_event *event)
 }
 
 static int iw_query_port(struct ib_device *device,
-                          u8 port_num,
+                          u32 port_num,
                           struct ib_port_attr *port_attr)
 {
        struct in_device *inetdev;
@@ -2044,7 +2048,7 @@ static int iw_query_port(struct ib_device *device,
 }
 
 static int __ib_query_port(struct ib_device *device,
-                          u8 port_num,
+                          u32 port_num,
                           struct ib_port_attr *port_attr)
 {
        union ib_gid gid = {};
@@ -2078,7 +2082,7 @@ static int __ib_query_port(struct ib_device *device,
  * @port_attr pointer.
  */
 int ib_query_port(struct ib_device *device,
-                 u8 port_num,
+                 u32 port_num,
                  struct ib_port_attr *port_attr)
 {
        if (!rdma_is_port_valid(device, port_num))
@@ -2130,7 +2134,7 @@ static void add_ndev_hash(struct ib_port_data *pdata)
  * NETDEV_UNREGISTER event.
  */
 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
-                        unsigned int port)
+                        u32 port)
 {
        struct net_device *old_ndev;
        struct ib_port_data *pdata;
@@ -2173,7 +2177,7 @@ EXPORT_SYMBOL(ib_device_set_netdev);
 static void free_netdevs(struct ib_device *ib_dev)
 {
        unsigned long flags;
-       unsigned int port;
+       u32 port;
 
        if (!ib_dev->port_data)
                return;
@@ -2204,7 +2208,7 @@ static void free_netdevs(struct ib_device *ib_dev)
 }
 
 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
-                                       unsigned int port)
+                                       u32 port)
 {
        struct ib_port_data *pdata;
        struct net_device *res;
@@ -2291,7 +2295,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev,
                         roce_netdev_callback cb,
                         void *cookie)
 {
-       unsigned int port;
+       u32 port;
 
        rdma_for_each_port (ib_dev, port)
                if (rdma_protocol_roce(ib_dev, port)) {
@@ -2369,7 +2373,7 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
  * ib_query_pkey() fetches the specified P_Key table entry.
  */
 int ib_query_pkey(struct ib_device *device,
-                 u8 port_num, u16 index, u16 *pkey)
+                 u32 port_num, u16 index, u16 *pkey)
 {
        if (!rdma_is_port_valid(device, port_num))
                return -EINVAL;
@@ -2414,7 +2418,7 @@ EXPORT_SYMBOL(ib_modify_device);
  * @port_modify_mask and @port_modify structure.
  */
 int ib_modify_port(struct ib_device *device,
-                  u8 port_num, int port_modify_mask,
+                  u32 port_num, int port_modify_mask,
                   struct ib_port_modify *port_modify)
 {
        int rc;
@@ -2446,10 +2450,10 @@ EXPORT_SYMBOL(ib_modify_port);
  *   parameter may be NULL.
  */
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-               u8 *port_num, u16 *index)
+               u32 *port_num, u16 *index)
 {
        union ib_gid tmp_gid;
-       unsigned int port;
+       u32 port;
        int ret, i;
 
        rdma_for_each_port (device, port) {
@@ -2483,7 +2487,7 @@ EXPORT_SYMBOL(ib_find_gid);
  * @index: The index into the PKey table where the PKey was found.
  */
 int ib_find_pkey(struct ib_device *device,
-                u8 port_num, u16 pkey, u16 *index)
+                u32 port_num, u16 pkey, u16 *index)
 {
        int ret, i;
        u16 tmp_pkey;
@@ -2526,7 +2530,7 @@ EXPORT_SYMBOL(ib_find_pkey);
  *
  */
 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
-                                           u8 port,
+                                           u32 port,
                                            u16 pkey,
                                            const union ib_gid *gid,
                                            const struct sockaddr *addr)
@@ -2696,7 +2700,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, reg_dm_mr);
        SET_DEVICE_OP(dev_ops, reg_user_mr);
        SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf);
-       SET_DEVICE_OP(dev_ops, req_ncomp_notif);
        SET_DEVICE_OP(dev_ops, req_notify_cq);
        SET_DEVICE_OP(dev_ops, rereg_user_mr);
        SET_DEVICE_OP(dev_ops, resize_cq);
index 30a0ff7..932b26f 100644 (file)
@@ -528,7 +528,8 @@ add_mapping_response_exit:
 }
 
 /* netlink attribute policy for the response to add and query mapping request
- * and response with remote address info */
+ * and response with remote address info
+ */
 static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
        [IWPM_NLA_RQUERY_MAPPING_SEQ]     = { .type = NLA_U32 },
        [IWPM_NLA_RQUERY_LOCAL_ADDR]      = {
index 9355e52..2081e48 100644 (file)
@@ -61,7 +61,7 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
 {
        u16 pkey;
        struct ib_device *dev = qp_info->port_priv->device;
-       u8 pnum = qp_info->port_priv->port_num;
+       u32 pnum = qp_info->port_priv->port_num;
        struct ib_ud_wr *wr = &mad_send_wr->send_wr;
        struct rdma_ah_attr attr = {};
 
@@ -118,7 +118,7 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc);
  * Assumes ib_mad_port_list_lock is being held
  */
 static inline struct ib_mad_port_private *
-__ib_get_mad_port(struct ib_device *device, int port_num)
+__ib_get_mad_port(struct ib_device *device, u32 port_num)
 {
        struct ib_mad_port_private *entry;
 
@@ -134,7 +134,7 @@ __ib_get_mad_port(struct ib_device *device, int port_num)
  * for a device/port
  */
 static inline struct ib_mad_port_private *
-ib_get_mad_port(struct ib_device *device, int port_num)
+ib_get_mad_port(struct ib_device *device, u32 port_num)
 {
        struct ib_mad_port_private *entry;
        unsigned long flags;
@@ -155,8 +155,7 @@ static inline u8 convert_mgmt_class(u8 mgmt_class)
 
 static int get_spl_qp_index(enum ib_qp_type qp_type)
 {
-       switch (qp_type)
-       {
+       switch (qp_type) {
        case IB_QPT_SMI:
                return 0;
        case IB_QPT_GSI:
@@ -222,7 +221,7 @@ EXPORT_SYMBOL(ib_response_mad);
  * Context: Process context.
  */
 struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
-                                          u8 port_num,
+                                          u32 port_num,
                                           enum ib_qp_type qp_type,
                                           struct ib_mad_reg_req *mad_reg_req,
                                           u8 rmpp_version,
@@ -549,7 +548,7 @@ static void dequeue_mad(struct ib_mad_list_head *mad_list)
 }
 
 static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
-               u16 pkey_index, u8 port_num, struct ib_wc *wc)
+               u16 pkey_index, u32 port_num, struct ib_wc *wc)
 {
        memset(wc, 0, sizeof *wc);
        wc->wr_cqe = cqe;
@@ -608,7 +607,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
        struct ib_mad_port_private *port_priv;
        struct ib_mad_agent_private *recv_mad_agent = NULL;
        struct ib_device *device = mad_agent_priv->agent.device;
-       u8 port_num;
+       u32 port_num;
        struct ib_wc mad_wc;
        struct ib_ud_wr *send_wr = &mad_send_wr->send_wr;
        size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
@@ -707,8 +706,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
                                      (const struct ib_mad *)smp,
                                      (struct ib_mad *)mad_priv->mad, &mad_size,
                                      &out_mad_pkey_index);
-       switch (ret)
-       {
+       switch (ret) {
        case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
                if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) &&
                    mad_agent_priv->agent.recv_handler) {
@@ -807,7 +805,7 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
 
        /* Allocate data segments. */
        for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
-               seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+               seg = kmalloc(sizeof(*seg) + seg_size, gfp_mask);
                if (!seg) {
                        free_send_rmpp_list(send_wr);
                        return -ENOMEM;
@@ -837,12 +835,11 @@ int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
 }
 EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
 
-struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
-                                           u32 remote_qpn, u16 pkey_index,
-                                           int rmpp_active,
-                                           int hdr_len, int data_len,
-                                           gfp_t gfp_mask,
-                                           u8 base_version)
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+                                          u32 remote_qpn, u16 pkey_index,
+                                          int rmpp_active, int hdr_len,
+                                          int data_len, gfp_t gfp_mask,
+                                          u8 base_version)
 {
        struct ib_mad_agent_private *mad_agent_priv;
        struct ib_mad_send_wr_private *mad_send_wr;
@@ -1275,11 +1272,9 @@ static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
        int i;
 
        /* Remove any methods for this mad agent */
-       for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
-               if (method->agent[i] == agent) {
+       for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+               if (method->agent[i] == agent)
                        method->agent[i] = NULL;
-               }
-       }
 }
 
 static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
@@ -1454,9 +1449,8 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
         * Was MAD registration request supplied
         * with original registration ?
         */
-       if (!agent_priv->reg_req) {
+       if (!agent_priv->reg_req)
                goto out;
-       }
 
        port_priv = agent_priv->qp_info->port_priv;
        mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
@@ -1613,7 +1607,7 @@ out:
 
        if (mad_agent && !mad_agent->agent.recv_handler) {
                dev_notice(&port_priv->device->dev,
-                          "No receive handler for client %p on port %d\n",
+                          "No receive handler for client %p on port %u\n",
                           &mad_agent->agent, port_priv->port_num);
                deref_mad_agent(mad_agent);
                mad_agent = NULL;
@@ -1677,15 +1671,16 @@ static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr,
                rwc->recv_buf.mad->mad_hdr.mgmt_class;
 }
 
-static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv,
-                                  const struct ib_mad_send_wr_private *wr,
-                                  const struct ib_mad_recv_wc *rwc )
+static inline int
+rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv,
+                const struct ib_mad_send_wr_private *wr,
+                const struct ib_mad_recv_wc *rwc)
 {
        struct rdma_ah_attr attr;
        u8 send_resp, rcv_resp;
        union ib_gid sgid;
        struct ib_device *device = mad_agent_priv->agent.device;
-       u8 port_num = mad_agent_priv->agent.port_num;
+       u32 port_num = mad_agent_priv->agent.port_num;
        u8 lmc;
        bool has_grh;
 
@@ -1834,7 +1829,8 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
                                deref_mad_agent(mad_agent_priv);
                        } else {
                                /* not user rmpp, revert to normal behavior and
-                                * drop the mad */
+                                * drop the mad
+                                */
                                ib_free_recv_mad(mad_recv_wc);
                                deref_mad_agent(mad_agent_priv);
                                return;
@@ -1860,14 +1856,12 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
                                                   mad_recv_wc);
                deref_mad_agent(mad_agent_priv);
        }
-
-       return;
 }
 
 static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv,
                                     const struct ib_mad_qp_info *qp_info,
                                     const struct ib_wc *wc,
-                                    int port_num,
+                                    u32 port_num,
                                     struct ib_mad_private *recv,
                                     struct ib_mad_private *response)
 {
@@ -1954,7 +1948,7 @@ static enum smi_action
 handle_opa_smi(struct ib_mad_port_private *port_priv,
               struct ib_mad_qp_info *qp_info,
               struct ib_wc *wc,
-              int port_num,
+              u32 port_num,
               struct ib_mad_private *recv,
               struct ib_mad_private *response)
 {
@@ -2010,7 +2004,7 @@ static enum smi_action
 handle_smi(struct ib_mad_port_private *port_priv,
           struct ib_mad_qp_info *qp_info,
           struct ib_wc *wc,
-          int port_num,
+          u32 port_num,
           struct ib_mad_private *recv,
           struct ib_mad_private *response,
           bool opa)
@@ -2034,7 +2028,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
        struct ib_mad_private_header *mad_priv_hdr;
        struct ib_mad_private *recv, *response = NULL;
        struct ib_mad_agent_private *mad_agent;
-       int port_num;
+       u32 port_num;
        int ret = IB_MAD_RESULT_SUCCESS;
        size_t mad_size;
        u16 resp_mad_pkey_index = 0;
@@ -2202,9 +2196,10 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
                                       temp_mad_send_wr->timeout))
                                break;
                }
-       }
-       else
+       } else {
                list_item = &mad_agent_priv->wait_list;
+       }
+
        list_add(&mad_send_wr->agent_list, list_item);
 
        /* Reschedule a work item if we have a shorter timeout */
@@ -2258,7 +2253,7 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
        adjust_timeout(mad_agent_priv);
        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
-       if (mad_send_wr->status != IB_WC_SUCCESS )
+       if (mad_send_wr->status != IB_WC_SUCCESS)
                mad_send_wc->status = mad_send_wr->status;
        if (ret == IB_RMPP_RESULT_INTERNAL)
                ib_rmpp_send_handler(mad_send_wc);
@@ -2947,7 +2942,7 @@ static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
  * Create the QP, PD, MR, and CQ if needed
  */
 static int ib_mad_port_open(struct ib_device *device,
-                           int port_num)
+                           u32 port_num)
 {
        int ret, cq_size;
        struct ib_mad_port_private *port_priv;
@@ -3002,7 +2997,7 @@ static int ib_mad_port_open(struct ib_device *device,
        if (ret)
                goto error7;
 
-       snprintf(name, sizeof name, "ib_mad%d", port_num);
+       snprintf(name, sizeof(name), "ib_mad%u", port_num);
        port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
        if (!port_priv->wq) {
                ret = -ENOMEM;
@@ -3048,7 +3043,7 @@ error3:
  * If there are no classes using the port, free the port
  * resources (CQ, MR, PD, QP) and remove the port's info structure
  */
-static int ib_mad_port_close(struct ib_device *device, int port_num)
+static int ib_mad_port_close(struct ib_device *device, u32 port_num)
 {
        struct ib_mad_port_private *port_priv;
        unsigned long flags;
@@ -3057,7 +3052,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
        port_priv = __ib_get_mad_port(device, port_num);
        if (port_priv == NULL) {
                spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
-               dev_err(&device->dev, "Port %d not found\n", port_num);
+               dev_err(&device->dev, "Port %u not found\n", port_num);
                return -ENODEV;
        }
        list_del_init(&port_priv->port_list);
index e0573e4..8af0619 100644 (file)
@@ -382,8 +382,8 @@ static inline int get_seg_num(struct ib_mad_recv_buf *seg)
        return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
 }
 
-static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
-                                                   struct ib_mad_recv_buf *seg)
+static inline struct ib_mad_recv_buf *get_next_seg(struct list_head *rmpp_list,
+                                                  struct ib_mad_recv_buf *seg)
 {
        if (seg->list.next == rmpp_list)
                return NULL;
@@ -396,8 +396,8 @@ static inline int window_size(struct ib_mad_agent_private *agent)
        return max(agent->qp_info->recv_queue.max_active >> 3, 1);
 }
 
-static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
-                                                 int seg_num)
+static struct ib_mad_recv_buf *find_seg_location(struct list_head *rmpp_list,
+                                                int seg_num)
 {
        struct ib_mad_recv_buf *seg_buf;
        int cur_seg_num;
@@ -449,7 +449,7 @@ static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
        return hdr_size + rmpp_recv->seg_num * data_size - pad;
 }
 
-static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+static struct ib_mad_recv_wc *complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
 {
        struct ib_mad_recv_wc *rmpp_wc;
 
index 57519ca..a5dd4b7 100644 (file)
@@ -63,7 +63,7 @@ struct mcast_port {
        struct rb_root          table;
        atomic_t                refcount;
        struct completion       comp;
-       u8                      port_num;
+       u32                     port_num;
 };
 
 struct mcast_device {
@@ -605,7 +605,7 @@ found:
  */
 struct ib_sa_multicast *
 ib_sa_join_multicast(struct ib_sa_client *client,
-                    struct ib_device *device, u8 port_num,
+                    struct ib_device *device, u32 port_num,
                     struct ib_sa_mcmember_rec *rec,
                     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
                     int (*callback)(int status,
@@ -690,7 +690,7 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
 }
 EXPORT_SYMBOL(ib_sa_free_multicast);
 
-int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num,
                           union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
 {
        struct mcast_device *dev;
@@ -732,7 +732,7 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
  * success or appropriate error code.
  *
  */
-int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num,
                             struct ib_sa_mcmember_rec *rec,
                             struct net_device *ndev,
                             enum ib_gid_type gid_type,
index d306049..34d0cc1 100644 (file)
@@ -92,7 +92,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
        [RDMA_NLDEV_ATTR_RES_CQE]               = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_RES_CQ_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_CTX]               = { .type = NLA_NESTED },
        [RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_CTX_ENTRY]         = { .type = NLA_NESTED },
        [RDMA_NLDEV_ATTR_RES_DST_ADDR]          = {
                        .len = sizeof(struct __kernel_sockaddr_storage) },
        [RDMA_NLDEV_ATTR_RES_IOVA]              = { .type = NLA_U64 },
@@ -130,6 +132,11 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
        [RDMA_NLDEV_ATTR_RES_TYPE]              = { .type = NLA_U8 },
        [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_RES_USECNT]            = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_SRQ]               = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_SRQN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY]         = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_MIN_RANGE]             = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_MAX_RANGE]             = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_SM_LID]                = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_SUBNET_PREFIX]         = { .type = NLA_U64 },
        [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]   = { .type = NLA_U32 },
@@ -146,6 +153,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
        [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]      = { .type = NLA_U32 },
        [RDMA_NLDEV_NET_NS_FD]                  = { .type = NLA_U32 },
        [RDMA_NLDEV_SYS_ATTR_NETNS_MODE]        = { .type = NLA_U8 },
+       [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK]      = { .type = NLA_U8 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -242,7 +250,7 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
 {
        char fw[IB_FW_VERSION_NAME_MAX];
        int ret = 0;
-       u8 port;
+       u32 port;
 
        if (fill_nldev_handle(msg, device))
                return -EMSGSIZE;
@@ -385,6 +393,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
                [RDMA_RESTRACK_CM_ID] = "cm_id",
                [RDMA_RESTRACK_MR] = "mr",
                [RDMA_RESTRACK_CTX] = "ctx",
+               [RDMA_RESTRACK_SRQ] = "srq",
        };
 
        struct nlattr *table_attr;
@@ -703,6 +712,135 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 err:   return -EMSGSIZE;
 }
 
+static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin,
+                             struct rdma_restrack_entry *res, uint32_t port)
+{
+       struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res);
+
+       if (rdma_is_kernel_res(res))
+               return 0;
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id))
+               return -EMSGSIZE;
+
+       return fill_res_name_pid(msg, res);
+}
+
+static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range,
+                                  uint32_t max_range)
+{
+       struct nlattr *entry_attr;
+
+       if (!min_range)
+               return 0;
+
+       entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+       if (!entry_attr)
+               return -EMSGSIZE;
+
+       if (min_range == max_range) {
+               if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range))
+                       goto err;
+       } else {
+               if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range))
+                       goto err;
+               if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range))
+                       goto err;
+       }
+       nla_nest_end(msg, entry_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, entry_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq)
+{
+       uint32_t min_range = 0, prev = 0;
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       struct nlattr *table_attr;
+       struct ib_qp *qp = NULL;
+       unsigned long id = 0;
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+       if (!table_attr)
+               return -EMSGSIZE;
+
+       rt = &srq->device->res[RDMA_RESTRACK_QP];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_restrack_get(res))
+                       continue;
+
+               qp = container_of(res, struct ib_qp, res);
+               if (!qp->srq || (qp->srq->res.id != srq->res.id)) {
+                       rdma_restrack_put(res);
+                       continue;
+               }
+
+               if (qp->qp_num < prev)
+                       /* qp_num should be ascending */
+                       goto err_loop;
+
+               if (min_range == 0) {
+                       min_range = qp->qp_num;
+               } else if (qp->qp_num > (prev + 1)) {
+                       if (fill_res_range_qp_entry(msg, min_range, prev))
+                               goto err_loop;
+
+                       min_range = qp->qp_num;
+               }
+               prev = qp->qp_num;
+               rdma_restrack_put(res);
+       }
+
+       xa_unlock(&rt->xa);
+
+       if (fill_res_range_qp_entry(msg, min_range, prev))
+               goto err;
+
+       nla_nest_end(msg, table_attr);
+       return 0;
+
+err_loop:
+       rdma_restrack_put(res);
+       xa_unlock(&rt->xa);
+err:
+       nla_nest_cancel(msg, table_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin,
+                             struct rdma_restrack_entry *res, uint32_t port)
+{
+       struct ib_srq *srq = container_of(res, struct ib_srq, res);
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id))
+               goto err;
+
+       if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type))
+               goto err;
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id))
+               goto err;
+
+       if (ib_srq_has_cq(srq->srq_type)) {
+               if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN,
+                               srq->ext.cq->res.id))
+                       goto err;
+       }
+
+       if (fill_res_srq_qps(msg, srq))
+               goto err;
+
+       return fill_res_name_pid(msg, res);
+
+err:
+       return -EMSGSIZE;
+}
+
 static int fill_stat_counter_mode(struct sk_buff *msg,
                                  struct rdma_counter *counter)
 {
@@ -1236,6 +1374,19 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
                .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
                .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
        },
+       [RDMA_RESTRACK_CTX] = {
+               .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX,
+               .flags = NLDEV_PER_DEV,
+               .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY,
+               .id = RDMA_NLDEV_ATTR_RES_CTXN,
+       },
+       [RDMA_RESTRACK_SRQ] = {
+               .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ,
+               .flags = NLDEV_PER_DEV,
+               .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY,
+               .id = RDMA_NLDEV_ATTR_RES_SRQN,
+       },
+
 };
 
 static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1476,6 +1627,8 @@ RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
 RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR);
 RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
+RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX);
+RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ);
 
 static LIST_HEAD(link_ops);
 static DECLARE_RWSEM(link_ops_rwsem);
@@ -1697,6 +1850,19 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                nlmsg_free(msg);
                return err;
        }
+
+       /*
+        * Copy-on-fork is supported.
+        * See commits:
+        * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes")
+        * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm")
+        * for more details. Don't backport this without them.
+        *
+        * Return value ignored on purpose, assume copy-on-fork is not
+        * supported in case of failure.
+        */
+       nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1);
+
        nlmsg_end(msg, nlh);
        return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
 }
@@ -2139,6 +2305,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
                .doit = nldev_res_get_pd_doit,
                .dump = nldev_res_get_pd_dumpit,
        },
+       [RDMA_NLDEV_CMD_RES_CTX_GET] = {
+               .doit = nldev_res_get_ctx_doit,
+               .dump = nldev_res_get_ctx_dumpit,
+       },
+       [RDMA_NLDEV_CMD_RES_SRQ_GET] = {
+               .doit = nldev_res_get_srq_doit,
+               .dump = nldev_res_get_srq_dumpit,
+       },
        [RDMA_NLDEV_CMD_SYS_GET] = {
                .doit = nldev_sys_get_doit,
        },
index af4879b..64e2822 100644 (file)
 #include "smi.h"
 
 enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
-                                      int port_num, int phys_port_cnt);
+                                      u32 port_num, int phys_port_cnt);
 int opa_smi_get_fwd_port(struct opa_smp *smp);
 extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp);
 extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
-                                             bool is_switch, int port_num);
+                                             bool is_switch, u32 port_num);
 
 /*
  * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
index 75eafd9..94d83b6 100644 (file)
@@ -112,7 +112,7 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj,
  * however the type's allocat_commit function cannot have been called and the
  * uobject cannot be on the uobjects_lists
  *
- * For RDMA_REMOVE_DESTROY the caller shold be holding a kref (eg via
+ * For RDMA_REMOVE_DESTROY the caller should be holding a kref (eg via
  * rdma_lookup_get_uobject) and the object is left in a state where the caller
  * needs to call rdma_lookup_put_uobject.
  *
@@ -916,7 +916,7 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
 }
 
 /*
- * Destroy the uncontext and every uobject associated with it.
+ * Destroy the ucontext and every uobject associated with it.
  *
  * This is internally locked and can be called in parallel from multiple
  * contexts.
index ffabaf3..0332078 100644 (file)
@@ -47,6 +47,7 @@ static const char *type2str(enum rdma_restrack_type type)
                [RDMA_RESTRACK_MR] = "MR",
                [RDMA_RESTRACK_CTX] = "CTX",
                [RDMA_RESTRACK_COUNTER] = "COUNTER",
+               [RDMA_RESTRACK_SRQ] = "SRQ",
        };
 
        return names[type];
@@ -141,6 +142,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
                return container_of(res, struct ib_ucontext, res)->device;
        case RDMA_RESTRACK_COUNTER:
                return container_of(res, struct rdma_counter, res)->device;
+       case RDMA_RESTRACK_SRQ:
+               return container_of(res, struct ib_srq, res)->device;
        default:
                WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
                return NULL;
index 34fff94..7b638d9 100644 (file)
@@ -70,7 +70,7 @@ struct netdev_event_work {
 };
 
 static const struct {
-       bool (*is_supported)(const struct ib_device *device, u8 port_num);
+       bool (*is_supported)(const struct ib_device *device, u32 port_num);
        enum ib_gid_type gid_type;
 } PORT_CAP_TO_GID_TYPE[] = {
        {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
@@ -79,7 +79,7 @@ static const struct {
 
 #define CAP_TO_GID_TABLE_SIZE  ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
 
-unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port)
 {
        int i;
        unsigned int ret_flags = 0;
@@ -96,7 +96,7 @@ unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
 EXPORT_SYMBOL(roce_gid_type_mask_support);
 
 static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
-                      u8 port, union ib_gid *gid,
+                      u32 port, union ib_gid *gid,
                       struct ib_gid_attr *gid_attr)
 {
        int i;
@@ -144,7 +144,7 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de
 #define REQUIRED_BOND_STATES           (BONDING_SLAVE_STATE_ACTIVE |   \
                                         BONDING_SLAVE_STATE_NA)
 static bool
-is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port,
+is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port,
                             struct net_device *rdma_ndev, void *cookie)
 {
        struct net_device *real_dev;
@@ -168,7 +168,7 @@ is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port,
 }
 
 static bool
-is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port,
+is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port,
                                  struct net_device *rdma_ndev, void *cookie)
 {
        struct net_device *master_dev;
@@ -197,7 +197,7 @@ is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port,
  * considered for deriving default RoCE GID, returns false otherwise.
  */
 static bool
-is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port,
+is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port,
                               struct net_device *rdma_ndev, void *cookie)
 {
        struct net_device *cookie_ndev = cookie;
@@ -223,13 +223,13 @@ is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port,
        return res;
 }
 
-static bool pass_all_filter(struct ib_device *ib_dev, u8 port,
+static bool pass_all_filter(struct ib_device *ib_dev, u32 port,
                            struct net_device *rdma_ndev, void *cookie)
 {
        return true;
 }
 
-static bool upper_device_filter(struct ib_device *ib_dev, u8 port,
+static bool upper_device_filter(struct ib_device *ib_dev, u32 port,
                                struct net_device *rdma_ndev, void *cookie)
 {
        bool res;
@@ -260,7 +260,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u8 port,
  * not have been established as slave device yet.
  */
 static bool
-is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port,
+is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port,
                                 struct net_device *rdma_ndev,
                                 void *cookie)
 {
@@ -280,7 +280,7 @@ is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port,
 
 static void update_gid_ip(enum gid_op_type gid_op,
                          struct ib_device *ib_dev,
-                         u8 port, struct net_device *ndev,
+                         u32 port, struct net_device *ndev,
                          struct sockaddr *addr)
 {
        union ib_gid gid;
@@ -294,7 +294,7 @@ static void update_gid_ip(enum gid_op_type gid_op,
 }
 
 static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
-                                           u8 port,
+                                           u32 port,
                                            struct net_device *rdma_ndev,
                                            struct net_device *event_ndev)
 {
@@ -328,7 +328,7 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
 }
 
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
-                                u8 port, struct net_device *ndev)
+                                u32 port, struct net_device *ndev)
 {
        const struct in_ifaddr *ifa;
        struct in_device *in_dev;
@@ -372,7 +372,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 }
 
 static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
-                                u8 port, struct net_device *ndev)
+                                u32 port, struct net_device *ndev)
 {
        struct inet6_ifaddr *ifp;
        struct inet6_dev *in6_dev;
@@ -417,7 +417,7 @@ static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
        }
 }
 
-static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+static void _add_netdev_ips(struct ib_device *ib_dev, u32 port,
                            struct net_device *ndev)
 {
        enum_netdev_ipv4_ips(ib_dev, port, ndev);
@@ -425,13 +425,13 @@ static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
                enum_netdev_ipv6_ips(ib_dev, port, ndev);
 }
 
-static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+static void add_netdev_ips(struct ib_device *ib_dev, u32 port,
                           struct net_device *rdma_ndev, void *cookie)
 {
        _add_netdev_ips(ib_dev, port, cookie);
 }
 
-static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+static void del_netdev_ips(struct ib_device *ib_dev, u32 port,
                           struct net_device *rdma_ndev, void *cookie)
 {
        ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie);
@@ -446,7 +446,7 @@ static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
  *
  * del_default_gids() deletes the default GIDs of the event/cookie netdevice.
  */
-static void del_default_gids(struct ib_device *ib_dev, u8 port,
+static void del_default_gids(struct ib_device *ib_dev, u32 port,
                             struct net_device *rdma_ndev, void *cookie)
 {
        struct net_device *cookie_ndev = cookie;
@@ -458,7 +458,7 @@ static void del_default_gids(struct ib_device *ib_dev, u8 port,
                                     IB_CACHE_GID_DEFAULT_MODE_DELETE);
 }
 
-static void add_default_gids(struct ib_device *ib_dev, u8 port,
+static void add_default_gids(struct ib_device *ib_dev, u32 port,
                             struct net_device *rdma_ndev, void *cookie)
 {
        struct net_device *event_ndev = cookie;
@@ -470,7 +470,7 @@ static void add_default_gids(struct ib_device *ib_dev, u8 port,
 }
 
 static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
-                                   u8 port,
+                                   u32 port,
                                    struct net_device *rdma_ndev,
                                    void *cookie)
 {
@@ -515,7 +515,7 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev)
 EXPORT_SYMBOL(rdma_roce_rescan_device);
 
 static void callback_for_addr_gid_device_scan(struct ib_device *device,
-                                             u8 port,
+                                             u32 port,
                                              struct net_device *rdma_ndev,
                                              void *cookie)
 {
@@ -547,10 +547,10 @@ static int netdev_upper_walk(struct net_device *upper,
        return 0;
 }
 
-static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+static void handle_netdev_upper(struct ib_device *ib_dev, u32 port,
                                void *cookie,
                                void (*handle_netdev)(struct ib_device *ib_dev,
-                                                     u8 port,
+                                                     u32 port,
                                                      struct net_device *ndev))
 {
        struct net_device *ndev = cookie;
@@ -574,25 +574,25 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
        }
 }
 
-static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
                                      struct net_device *event_ndev)
 {
        ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
 }
 
-static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
                                 struct net_device *rdma_ndev, void *cookie)
 {
        handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
 }
 
-static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
                                 struct net_device *rdma_ndev, void *cookie)
 {
        handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
 }
 
-static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port,
                                        struct net_device *rdma_ndev,
                                        void *cookie)
 {
index 31156e2..a588c20 100644 (file)
@@ -25,7 +25,7 @@ MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
  * registration is also enabled if registering memory might yield better
  * performance than using multiple SGE entries, see rdma_rw_io_needs_mr()
  */
-static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
+static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num)
 {
        if (rdma_protocol_iwarp(dev, port_num))
                return true;
@@ -42,7 +42,7 @@ static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
  * optimization otherwise.  Additionally we have a debug option to force usage
  * of MRs to help testing this code path.
  */
-static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
+static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num,
                enum dma_data_direction dir, int dma_nents)
 {
        if (dir == DMA_FROM_DEVICE) {
@@ -87,7 +87,7 @@ static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
 }
 
 /* Caller must have zero-initialized *reg. */
-static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
                struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
                u32 sg_cnt, u32 offset)
 {
@@ -121,7 +121,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
 }
 
 static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
+               u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
        struct rdma_rw_reg_ctx *prev = NULL;
@@ -308,7 +308,7 @@ static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg,
  * Returns the number of WQEs that will be needed on the workqueue if
  * successful, or a negative error code.
  */
-int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
                struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
@@ -377,7 +377,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_init);
  * successful, or a negative error code.
  */
 int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               u32 port_num, struct scatterlist *sg, u32 sg_cnt,
                struct scatterlist *prot_sg, u32 prot_sg_cnt,
                struct ib_sig_attrs *sig_attrs,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@@ -505,7 +505,7 @@ static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
  * completion notification.
  */
 struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+               u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
 {
        struct ib_send_wr *first_wr, *last_wr;
        int i;
@@ -562,7 +562,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_wrs);
  * is not set @cqe must be set so that the caller gets a completion
  * notification.
  */
-int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
                struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
 {
        struct ib_send_wr *first_wr;
@@ -581,8 +581,9 @@ EXPORT_SYMBOL(rdma_rw_ctx_post);
  * @sg_cnt:    number of entries in @sg
  * @dir:       %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
  */
-void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
-               struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+                        u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+                        enum dma_data_direction dir)
 {
        int i;
 
@@ -620,7 +621,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy);
  * @dir:       %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
  */
 void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               u32 port_num, struct scatterlist *sg, u32 sg_cnt,
                struct scatterlist *prot_sg, u32 prot_sg_cnt,
                enum dma_data_direction dir)
 {
@@ -647,7 +648,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
  * compute max_rdma_ctxts and the size of the transport's Send and
  * Send Completion Queues.
  */
-unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
+unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
                               unsigned int maxpages)
 {
        unsigned int mr_pages;
index cbaaaa9..143de37 100644 (file)
@@ -49,7 +49,7 @@ static inline void ib_sa_client_put(struct ib_sa_client *client)
 }
 
 int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
-                            struct ib_device *device, u8 port_num, u8 method,
+                            struct ib_device *device, u32 port_num, u8 method,
                             struct ib_sa_mcmember_rec *rec,
                             ib_sa_comp_mask comp_mask,
                             unsigned long timeout_ms, gfp_t gfp_mask,
index 9ef1a35..8f1705c 100644 (file)
@@ -95,7 +95,7 @@ struct ib_sa_port {
        struct delayed_work ib_cpi_work;
        spinlock_t                   classport_lock; /* protects class port info set */
        spinlock_t           ah_lock;
-       u8                  port_num;
+       u32                  port_num;
 };
 
 struct ib_sa_device {
@@ -1194,7 +1194,7 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
-static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+static u8 get_src_path_mask(struct ib_device *device, u32 port_num)
 {
        struct ib_sa_device *sa_dev;
        struct ib_sa_port   *port;
@@ -1213,7 +1213,7 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
        return src_path_mask;
 }
 
-static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num,
+static int init_ah_attr_grh_fields(struct ib_device *device, u32 port_num,
                                   struct sa_path_rec *rec,
                                   struct rdma_ah_attr *ah_attr,
                                   const struct ib_gid_attr *gid_attr)
@@ -1251,7 +1251,7 @@ static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num,
  * User must invoke rdma_destroy_ah_attr() to release reference to SGID
  * attributes which are initialized using ib_init_ah_attr_from_path().
  */
-int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num,
+int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num,
                              struct sa_path_rec *rec,
                              struct rdma_ah_attr *ah_attr,
                              const struct ib_gid_attr *gid_attr)
@@ -1409,7 +1409,7 @@ EXPORT_SYMBOL(ib_sa_pack_path);
 
 static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client,
                                         struct ib_sa_device *sa_dev,
-                                        u8 port_num)
+                                        u32 port_num)
 {
        struct ib_sa_port *port;
        unsigned long flags;
@@ -1444,7 +1444,7 @@ enum opa_pr_supported {
  */
 static int opa_pr_query_possible(struct ib_sa_client *client,
                                 struct ib_sa_device *sa_dev,
-                                struct ib_device *device, u8 port_num,
+                                struct ib_device *device, u32 port_num,
                                 struct sa_path_rec *rec)
 {
        struct ib_port_attr port_attr;
@@ -1533,7 +1533,7 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
  * the query.
  */
 int ib_sa_path_rec_get(struct ib_sa_client *client,
-                      struct ib_device *device, u8 port_num,
+                      struct ib_device *device, u32 port_num,
                       struct sa_path_rec *rec,
                       ib_sa_comp_mask comp_mask,
                       unsigned long timeout_ms, gfp_t gfp_mask,
@@ -1688,7 +1688,7 @@ static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
  * the query.
  */
 int ib_sa_service_rec_query(struct ib_sa_client *client,
-                           struct ib_device *device, u8 port_num, u8 method,
+                           struct ib_device *device, u32 port_num, u8 method,
                            struct ib_sa_service_rec *rec,
                            ib_sa_comp_mask comp_mask,
                            unsigned long timeout_ms, gfp_t gfp_mask,
@@ -1784,7 +1784,7 @@ static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
 }
 
 int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
-                            struct ib_device *device, u8 port_num,
+                            struct ib_device *device, u32 port_num,
                             u8 method,
                             struct ib_sa_mcmember_rec *rec,
                             ib_sa_comp_mask comp_mask,
@@ -1876,7 +1876,7 @@ static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
 }
 
 int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
-                             struct ib_device *device, u8 port_num,
+                             struct ib_device *device, u32 port_num,
                              struct ib_sa_guidinfo_rec *rec,
                              ib_sa_comp_mask comp_mask, u8 method,
                              unsigned long timeout_ms, gfp_t gfp_mask,
@@ -2265,7 +2265,7 @@ static void ib_sa_event(struct ib_event_handler *handler,
                unsigned long flags;
                struct ib_sa_device *sa_dev =
                        container_of(handler, typeof(*sa_dev), event_handler);
-               u8 port_num = event->element.port_num - sa_dev->start_port;
+               u32 port_num = event->element.port_num - sa_dev->start_port;
                struct ib_sa_port *port = &sa_dev->port[port_num];
 
                if (!rdma_cap_ib_sa(handler->device, port->port_num))
index 75e7ec0..e5a78d1 100644 (file)
@@ -193,7 +193,7 @@ static void qp_to_error(struct ib_qp_security *sec)
 
 static inline void check_pkey_qps(struct pkey_index_qp_list *pkey,
                                  struct ib_device *device,
-                                 u8 port_num,
+                                 u32 port_num,
                                  u64 subnet_prefix)
 {
        struct ib_port_pkey *pp, *tmp_pp;
@@ -245,7 +245,7 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp)
        struct pkey_index_qp_list *tmp_pkey;
        struct pkey_index_qp_list *pkey;
        struct ib_device *dev;
-       u8 port_num = pp->port_num;
+       u32 port_num = pp->port_num;
        int ret = 0;
 
        if (pp->state != IB_PORT_PKEY_VALID)
@@ -538,7 +538,7 @@ void ib_destroy_qp_security_end(struct ib_qp_security *sec)
 }
 
 void ib_security_cache_change(struct ib_device *device,
-                             u8 port_num,
+                             u32 port_num,
                              u64 subnet_prefix)
 {
        struct pkey_index_qp_list *pkey;
@@ -649,7 +649,7 @@ int ib_security_modify_qp(struct ib_qp *qp,
 }
 
 static int ib_security_pkey_access(struct ib_device *dev,
-                                  u8 port_num,
+                                  u32 port_num,
                                   u16 pkey_index,
                                   void *sec)
 {
index f19b238..45f09b7 100644 (file)
@@ -41,7 +41,7 @@
 #include "smi.h"
 #include "opa_smi.h"
 
-static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num,
+static enum smi_action __smi_handle_dr_smp_send(bool is_switch, u32 port_num,
                                                u8 *hop_ptr, u8 hop_cnt,
                                                const u8 *initial_path,
                                                const u8 *return_path,
@@ -127,7 +127,7 @@ static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num,
  * Return IB_SMI_DISCARD if the SMP should be discarded
  */
 enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
-                                      bool is_switch, int port_num)
+                                      bool is_switch, u32 port_num)
 {
        return __smi_handle_dr_smp_send(is_switch, port_num,
                                        &smp->hop_ptr, smp->hop_cnt,
@@ -139,7 +139,7 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
 }
 
 enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
-                                      bool is_switch, int port_num)
+                                      bool is_switch, u32 port_num)
 {
        return __smi_handle_dr_smp_send(is_switch, port_num,
                                        &smp->hop_ptr, smp->hop_cnt,
@@ -152,7 +152,7 @@ enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
                                        OPA_LID_PERMISSIVE);
 }
 
-static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num,
+static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, u32 port_num,
                                                int phys_port_cnt,
                                                u8 *hop_ptr, u8 hop_cnt,
                                                const u8 *initial_path,
@@ -238,7 +238,7 @@ static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num,
  * Return IB_SMI_DISCARD if the SMP should be dropped
  */
 enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
-                                      int port_num, int phys_port_cnt)
+                                      u32 port_num, int phys_port_cnt)
 {
        return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
                                        &smp->hop_ptr, smp->hop_cnt,
@@ -254,7 +254,7 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
  * Return IB_SMI_DISCARD if the SMP should be dropped
  */
 enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
-                                          int port_num, int phys_port_cnt)
+                                          u32 port_num, int phys_port_cnt)
 {
        return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
                                        &smp->hop_ptr, smp->hop_cnt,
index 91d9b35..e350ed6 100644 (file)
@@ -52,11 +52,11 @@ enum smi_forward_action {
 };
 
 enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
-                                      int port_num, int phys_port_cnt);
+                                      u32 port_num, int phys_port_cnt);
 int smi_get_fwd_port(struct ib_smp *smp);
 extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
 extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
-                                             bool is_switch, int port_num);
+                                             bool is_switch, u32 port_num);
 
 /*
  * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
index b8abb30..05b702d 100644 (file)
@@ -62,7 +62,7 @@ struct ib_port {
        const struct attribute_group *pma_table;
        struct attribute_group *hw_stats_ag;
        struct rdma_hw_stats   *hw_stats;
-       u8                     port_num;
+       u32                     port_num;
 };
 
 struct port_attribute {
@@ -94,7 +94,7 @@ struct hw_stats_attribute {
                                         const char *buf,
                                         size_t count);
        int                     index;
-       u8                     port_num;
+       u32                     port_num;
 };
 
 static ssize_t port_attr_show(struct kobject *kobj,
@@ -297,7 +297,7 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
 
 static const char *phys_state_to_str(enum ib_port_phys_state phys_state)
 {
-       static const char * phys_state_str[] = {
+       static const char *phys_state_str[] = {
                "<unknown>",
                "Sleep",
                "Polling",
@@ -470,14 +470,14 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
 struct port_table_attribute port_pma_attr_##_name = {                  \
        .attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),        \
        .index = (_offset) | ((_width) << 16) | ((_counter) << 24),     \
-       .attr_id = IB_PMA_PORT_COUNTERS ,                               \
+       .attr_id = IB_PMA_PORT_COUNTERS,                              \
 }
 
 #define PORT_PMA_ATTR_EXT(_name, _width, _offset)                      \
 struct port_table_attribute port_pma_attr_ext_##_name = {              \
        .attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),        \
        .index = (_offset) | ((_width) << 16),                          \
-       .attr_id = IB_PMA_PORT_COUNTERS_EXT ,                           \
+       .attr_id = IB_PMA_PORT_COUNTERS_EXT,                          \
 }
 
 /*
@@ -812,7 +812,7 @@ static const struct attribute_group *get_counter_table(struct ib_device *dev,
 }
 
 static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
-                          u8 port_num, int index)
+                          u32 port_num, int index)
 {
        int ret;
 
@@ -938,7 +938,7 @@ static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group)
        kfree(attr_group);
 }
 
-static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
+static struct attribute *alloc_hsa(int index, u32 port_num, const char *name)
 {
        struct hw_stats_attribute *hsa;
 
@@ -956,7 +956,7 @@ static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
        return &hsa->attr;
 }
 
-static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
+static struct attribute *alloc_hsa_lifespan(char *name, u32 port_num)
 {
        struct hw_stats_attribute *hsa;
 
@@ -975,7 +975,7 @@ static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
 }
 
 static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
-                          u8 port_num)
+                          u32 port_num)
 {
        struct attribute_group *hsag;
        struct rdma_hw_stats *stats;
@@ -1049,7 +1049,6 @@ err_free_hsag:
        kfree(hsag);
 err_free_stats:
        kfree(stats);
-       return;
 }
 
 static int add_port(struct ib_core_device *coredev, int port_num)
@@ -1075,9 +1074,8 @@ static int add_port(struct ib_core_device *coredev, int port_num)
        ret = kobject_init_and_add(&p->kobj, &port_type,
                                   coredev->ports_kobj,
                                   "%d", port_num);
-       if (ret) {
+       if (ret)
                goto err_put;
-       }
 
        p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL);
        if (!p->gid_attr_group) {
@@ -1088,9 +1086,8 @@ static int add_port(struct ib_core_device *coredev, int port_num)
        p->gid_attr_group->port = p;
        ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type,
                                   &p->kobj, "gid_attrs");
-       if (ret) {
+       if (ret)
                goto err_put_gid_attrs;
-       }
 
        if (device->ops.process_mad && is_full_dev) {
                p->pma_table = get_counter_table(device, port_num);
@@ -1383,7 +1380,7 @@ void ib_free_port_attrs(struct ib_core_device *coredev)
 int ib_setup_port_attrs(struct ib_core_device *coredev)
 {
        struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
-       unsigned int port;
+       u32 port;
        int ret;
 
        coredev->ports_kobj = kobject_create_and_add("ports",
@@ -1437,7 +1434,7 @@ void ib_device_unregister_sysfs(struct ib_device *device)
  * @ktype: pointer to the ktype for this kobject.
  * @name: the name of the kobject
  */
-int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+int ib_port_register_module_stat(struct ib_device *device, u32 port_num,
                                 struct kobject *kobj, struct kobj_type *ktype,
                                 const char *name)
 {
index da2512c..15d57ba 100644 (file)
@@ -231,7 +231,7 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
                memcpy(dst->private_data, src->private_data,
                       src->private_data_len);
        dst->private_data_len = src->private_data_len;
-       dst->responder_resources =src->responder_resources;
+       dst->responder_resources = src->responder_resources;
        dst->initiator_depth = src->initiator_depth;
        dst->flow_control = src->flow_control;
        dst->retry_count = src->retry_count;
@@ -1034,7 +1034,7 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id,
 {
        dst->private_data = src->private_data;
        dst->private_data_len = src->private_data_len;
-       dst->responder_resources =src->responder_resources;
+       dst->responder_resources = src->responder_resources;
        dst->initiator_depth = src->initiator_depth;
        dst->flow_control = src->flow_control;
        dst->retry_count = src->retry_count;
@@ -1708,8 +1708,8 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf,
        ssize_t ret;
 
        if (!ib_safe_file_access(filp)) {
-               pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
-                           task_tgid_vnr(current), current->comm);
+               pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
+                           __func__, task_tgid_vnr(current), current->comm);
                return -EACCES;
        }
 
index 9b60701..0eb4002 100644 (file)
@@ -100,10 +100,6 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
         */
        pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);
 
-       /* At minimum, drivers must support PAGE_SIZE or smaller */
-       if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
-               return 0;
-
        umem->iova = va = virt;
        /* The best result is the smallest page size that results in the minimum
         * number of required pages. Compute the largest page size that could
@@ -309,8 +305,8 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
        int ret;
 
        if (offset > umem->length || length > umem->length - offset) {
-               pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
-                      offset, umem->length, end);
+               pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n",
+                      __func__, offset, umem->length, end);
                return -EINVAL;
        }
 
index f9b5162..0d65ce1 100644 (file)
@@ -168,6 +168,10 @@ void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
 {
        struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
 
+       dma_resv_lock(dmabuf->resv, NULL);
+       ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+       dma_resv_unlock(dmabuf->resv);
+
        dma_buf_detach(dmabuf, umem_dmabuf->attach);
        dma_buf_put(dmabuf);
        kfree(umem_dmabuf);
index dd7f3b4..852efed 100644 (file)
@@ -101,7 +101,7 @@ struct ib_umad_port {
        struct ib_device      *ib_dev;
        struct ib_umad_device *umad_dev;
        int                    dev_num;
-       u8                     port_num;
+       u32                     port_num;
 };
 
 struct ib_umad_device {
@@ -165,8 +165,8 @@ static void ib_umad_dev_put(struct ib_umad_device *dev)
 
 static int hdr_size(struct ib_umad_file *file)
 {
-       return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
-               sizeof (struct ib_user_mad_hdr_old);
+       return file->use_pkey_index ? sizeof(struct ib_user_mad_hdr) :
+                                     sizeof(struct ib_user_mad_hdr_old);
 }
 
 /* caller must hold file->mutex */
@@ -688,8 +688,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
        mutex_lock(&file->mutex);
 
        if (!file->port->ib_dev) {
-               dev_notice(&file->port->dev,
-                          "ib_umad_reg_agent: invalid device\n");
+               dev_notice(&file->port->dev, "%s: invalid device\n", __func__);
                ret = -EPIPE;
                goto out;
        }
@@ -701,7 +700,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
 
        if (ureq.qpn != 0 && ureq.qpn != 1) {
                dev_notice(&file->port->dev,
-                          "ib_umad_reg_agent: invalid QPN %d specified\n",
+                          "%s: invalid QPN %d specified\n", __func__,
                           ureq.qpn);
                ret = -EINVAL;
                goto out;
@@ -711,9 +710,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
                if (!__get_agent(file, agent_id))
                        goto found;
 
-       dev_notice(&file->port->dev,
-                  "ib_umad_reg_agent: Max Agents (%u) reached\n",
+       dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__,
                   IB_UMAD_MAX_AGENTS);
+
        ret = -ENOMEM;
        goto out;
 
@@ -790,8 +789,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
        mutex_lock(&file->mutex);
 
        if (!file->port->ib_dev) {
-               dev_notice(&file->port->dev,
-                          "ib_umad_reg_agent2: invalid device\n");
+               dev_notice(&file->port->dev, "%s: invalid device\n", __func__);
                ret = -EPIPE;
                goto out;
        }
@@ -802,17 +800,16 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
        }
 
        if (ureq.qpn != 0 && ureq.qpn != 1) {
-               dev_notice(&file->port->dev,
-                          "ib_umad_reg_agent2: invalid QPN %d specified\n",
-                          ureq.qpn);
+               dev_notice(&file->port->dev, "%s: invalid QPN %d specified\n",
+                          __func__, ureq.qpn);
                ret = -EINVAL;
                goto out;
        }
 
        if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
                dev_notice(&file->port->dev,
-                          "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
-                          ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+                          "%s failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+                          __func__, ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
                ret = -EINVAL;
 
                if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
@@ -827,8 +824,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
                if (!__get_agent(file, agent_id))
                        goto found;
 
-       dev_notice(&file->port->dev,
-                  "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+       dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__,
                   IB_UMAD_MAX_AGENTS);
        ret = -ENOMEM;
        goto out;
@@ -840,7 +836,7 @@ found:
                req.mgmt_class_version = ureq.mgmt_class_version;
                if (ureq.oui & 0xff000000) {
                        dev_notice(&file->port->dev,
-                                  "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+                                  "%s failed: oui invalid 0x%08x\n", __func__,
                                   ureq.oui);
                        ret = -EINVAL;
                        goto out;
@@ -1145,7 +1141,7 @@ static const struct file_operations umad_sm_fops = {
 
 static struct ib_umad_port *get_port(struct ib_device *ibdev,
                                     struct ib_umad_device *umad_dev,
-                                    unsigned int port)
+                                    u32 port)
 {
        if (!umad_dev)
                return ERR_PTR(-EOPNOTSUPP);
index f5b8be3..d5e15a8 100644 (file)
@@ -364,7 +364,7 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext,
        resp->max_srq_sge               = attr->max_srq_sge;
        resp->max_pkeys                 = attr->max_pkeys;
        resp->local_ca_ack_delay        = attr->local_ca_ack_delay;
-       resp->phys_port_cnt             = ib_dev->phys_port_cnt;
+       resp->phys_port_cnt = min_t(u32, ib_dev->phys_port_cnt, U8_MAX);
 }
 
 static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs)
@@ -2002,12 +2002,13 @@ static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs)
 
 static void *alloc_wr(size_t wr_size, __u32 num_sge)
 {
-       if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof (struct ib_sge))) /
-                      sizeof (struct ib_sge))
+       if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof(struct ib_sge))) /
+                              sizeof(struct ib_sge))
                return NULL;
 
-       return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) +
-                        num_sge * sizeof (struct ib_sge), GFP_KERNEL);
+       return kmalloc(ALIGN(wr_size, sizeof(struct ib_sge)) +
+                              num_sge * sizeof(struct ib_sge),
+                      GFP_KERNEL);
 }
 
 static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
@@ -2216,7 +2217,7 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
        const struct ib_sge __user *sgls;
        const void __user *wqes;
 
-       if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+       if (wqe_size < sizeof(struct ib_uverbs_recv_wr))
                return ERR_PTR(-EINVAL);
 
        wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
@@ -2249,14 +2250,14 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
                }
 
                if (user_wr->num_sge >=
-                   (U32_MAX - ALIGN(sizeof *next, sizeof (struct ib_sge))) /
-                   sizeof (struct ib_sge)) {
+                   (U32_MAX - ALIGN(sizeof(*next), sizeof(struct ib_sge))) /
+                           sizeof(struct ib_sge)) {
                        ret = -EINVAL;
                        goto err;
                }
 
-               next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
-                              user_wr->num_sge * sizeof (struct ib_sge),
+               next = kmalloc(ALIGN(sizeof(*next), sizeof(struct ib_sge)) +
+                                      user_wr->num_sge * sizeof(struct ib_sge),
                               GFP_KERNEL);
                if (!next) {
                        ret = -ENOMEM;
@@ -2274,8 +2275,8 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
                next->num_sge    = user_wr->num_sge;
 
                if (next->num_sge) {
-                       next->sg_list = (void *) next +
-                               ALIGN(sizeof *next, sizeof (struct ib_sge));
+                       next->sg_list = (void *)next +
+                               ALIGN(sizeof(*next), sizeof(struct ib_sge));
                        if (copy_from_user(next->sg_list, sgls + sg_ind,
                                           next->num_sge *
                                                   sizeof(struct ib_sge))) {
index ff047eb..990f072 100644 (file)
@@ -752,9 +752,10 @@ int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx)
        return uverbs_set_output(bundle, attr);
 }
 
-int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
-                     size_t idx, s64 lower_bound, u64 upper_bound,
-                     s64  *def_val)
+int _uverbs_get_const_signed(s64 *to,
+                            const struct uverbs_attr_bundle *attrs_bundle,
+                            size_t idx, s64 lower_bound, u64 upper_bound,
+                            s64  *def_val)
 {
        const struct uverbs_attr *attr;
 
@@ -773,7 +774,30 @@ int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
 
        return 0;
 }
-EXPORT_SYMBOL(_uverbs_get_const);
+EXPORT_SYMBOL(_uverbs_get_const_signed);
+
+int _uverbs_get_const_unsigned(u64 *to,
+                              const struct uverbs_attr_bundle *attrs_bundle,
+                              size_t idx, u64 upper_bound, u64 *def_val)
+{
+       const struct uverbs_attr *attr;
+
+       attr = uverbs_attr_get(attrs_bundle, idx);
+       if (IS_ERR(attr)) {
+               if ((PTR_ERR(attr) != -ENOENT) || !def_val)
+                       return PTR_ERR(attr);
+
+               *to = *def_val;
+       } else {
+               *to = attr->ptr_attr.data;
+       }
+
+       if (*to > upper_bound)
+               return -EINVAL;
+
+       return 0;
+}
+EXPORT_SYMBOL(_uverbs_get_const_unsigned);
 
 int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
                                  size_t idx, const void *from, size_t size)
index 28464c5..2b07981 100644 (file)
@@ -96,10 +96,10 @@ static const char * const wc_statuses[] = {
        [IB_WC_LOC_EEC_OP_ERR]          = "local EE context operation error",
        [IB_WC_LOC_PROT_ERR]            = "local protection error",
        [IB_WC_WR_FLUSH_ERR]            = "WR flushed",
-       [IB_WC_MW_BIND_ERR]             = "memory management operation error",
+       [IB_WC_MW_BIND_ERR]             = "memory bind operation error",
        [IB_WC_BAD_RESP_ERR]            = "bad response error",
        [IB_WC_LOC_ACCESS_ERR]          = "local access error",
-       [IB_WC_REM_INV_REQ_ERR]         = "invalid request error",
+       [IB_WC_REM_INV_REQ_ERR]         = "remote invalid request error",
        [IB_WC_REM_ACCESS_ERR]          = "remote access error",
        [IB_WC_REM_OP_ERR]              = "remote operation error",
        [IB_WC_RETRY_EXC_ERR]           = "transport retry counter exceeded",
@@ -227,7 +227,8 @@ rdma_node_get_transport(unsigned int node_type)
 }
 EXPORT_SYMBOL(rdma_node_get_transport);
 
-enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
+                                             u32 port_num)
 {
        enum rdma_transport_type lt;
        if (device->ops.get_link_layer)
@@ -341,7 +342,8 @@ int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
        }
 
        /* uverbs manipulates usecnt with proper locking, while the kabi
-          requires the caller to guarantee we can't race here. */
+        * requires the caller to guarantee we can't race here.
+        */
        WARN_ON(atomic_read(&pd->usecnt));
 
        ret = pd->device->ops.dealloc_pd(pd, udata);
@@ -658,7 +660,7 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr)
 EXPORT_SYMBOL(ib_get_rdma_header_version);
 
 static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
-                                                    u8 port_num,
+                                                    u32 port_num,
                                                     const struct ib_grh *grh)
 {
        int grh_version;
@@ -701,7 +703,7 @@ static bool find_gid_index(const union ib_gid *gid,
 }
 
 static const struct ib_gid_attr *
-get_sgid_attr_from_eth(struct ib_device *device, u8 port_num,
+get_sgid_attr_from_eth(struct ib_device *device, u32 port_num,
                       u16 vlan_id, const union ib_gid *sgid,
                       enum ib_gid_type gid_type)
 {
@@ -788,7 +790,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
  * On success the caller is responsible to call rdma_destroy_ah_attr on the
  * attr.
  */
-int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num,
+int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num,
                            const struct ib_wc *wc, const struct ib_grh *grh,
                            struct rdma_ah_attr *ah_attr)
 {
@@ -919,7 +921,7 @@ void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr)
 EXPORT_SYMBOL(rdma_destroy_ah_attr);
 
 struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
-                                  const struct ib_grh *grh, u8 port_num)
+                                  const struct ib_grh *grh, u32 port_num)
 {
        struct rdma_ah_attr ah_attr;
        struct ib_ah *ah;
@@ -1037,8 +1039,12 @@ struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
        }
        atomic_inc(&pd->usecnt);
 
+       rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ);
+       rdma_restrack_parent_name(&srq->res, &pd->res);
+
        ret = pd->device->ops.create_srq(srq, srq_init_attr, udata);
        if (ret) {
+               rdma_restrack_put(&srq->res);
                atomic_dec(&srq->pd->usecnt);
                if (srq->srq_type == IB_SRQT_XRC)
                        atomic_dec(&srq->ext.xrc.xrcd->usecnt);
@@ -1048,6 +1054,8 @@ struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
                return ERR_PTR(ret);
        }
 
+       rdma_restrack_add(&srq->res);
+
        return srq;
 }
 EXPORT_SYMBOL(ib_create_srq_user);
@@ -1086,6 +1094,7 @@ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
                atomic_dec(&srq->ext.xrc.xrcd->usecnt);
        if (ib_srq_has_cq(srq->srq_type))
                atomic_dec(&srq->ext.cq->usecnt);
+       rdma_restrack_del(&srq->res);
        kfree(srq);
 
        return ret;
@@ -1673,7 +1682,7 @@ static bool is_qp_type_connected(const struct ib_qp *qp)
 static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
                         int attr_mask, struct ib_udata *udata)
 {
-       u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+       u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
        const struct ib_gid_attr *old_sgid_attr_av;
        const struct ib_gid_attr *old_sgid_attr_alt_av;
        int ret;
@@ -1801,7 +1810,7 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
 }
 EXPORT_SYMBOL(ib_modify_qp_with_udata);
 
-int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u16 *speed, u8 *width)
+int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width)
 {
        int rc;
        u32 netdev_speed;
@@ -2467,7 +2476,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
 }
 EXPORT_SYMBOL(ib_check_mr_status);
 
-int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
+int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port,
                         int state)
 {
        if (!device->ops.set_vf_link_state)
@@ -2477,7 +2486,7 @@ int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
 }
 EXPORT_SYMBOL(ib_set_vf_link_state);
 
-int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+int ib_get_vf_config(struct ib_device *device, int vf, u32 port,
                     struct ifla_vf_info *info)
 {
        if (!device->ops.get_vf_config)
@@ -2487,7 +2496,7 @@ int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
 }
 EXPORT_SYMBOL(ib_get_vf_config);
 
-int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
+int ib_get_vf_stats(struct ib_device *device, int vf, u32 port,
                    struct ifla_vf_stats *stats)
 {
        if (!device->ops.get_vf_stats)
@@ -2497,7 +2506,7 @@ int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
 }
 EXPORT_SYMBOL(ib_get_vf_stats);
 
-int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
+int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid,
                   int type)
 {
        if (!device->ops.set_vf_guid)
@@ -2507,7 +2516,7 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
 }
 EXPORT_SYMBOL(ib_set_vf_guid);
 
-int ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+int ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
                   struct ifla_vf_guid *node_guid,
                   struct ifla_vf_guid *port_guid)
 {
@@ -2849,7 +2858,7 @@ void ib_drain_qp(struct ib_qp *qp)
 }
 EXPORT_SYMBOL(ib_drain_qp);
 
-struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
                                     enum rdma_netdev_t type, const char *name,
                                     unsigned char name_assign_type,
                                     void (*setup)(struct net_device *))
@@ -2875,7 +2884,7 @@ struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
 }
 EXPORT_SYMBOL(rdma_alloc_netdev);
 
-int rdma_init_netdev(struct ib_device *device, u8 port_num,
+int rdma_init_netdev(struct ib_device *device, u32 port_num,
                     enum rdma_netdev_t type, const char *name,
                     unsigned char name_assign_type,
                     void (*setup)(struct net_device *),
index 0feac51..6a17f5c 100644 (file)
@@ -2,9 +2,7 @@
 config INFINIBAND_BNXT_RE
        tristate "Broadcom Netxtreme HCA support"
        depends on 64BIT
-       depends on ETHERNET && NETDEVICES && PCI && INET && DCB
-       select NET_VENDOR_BROADCOM
-       select BNXT
+       depends on INET && DCB && BNXT
        help
          This driver supports Broadcom NetXtreme-E 10/25/40/50 gigabit
          RoCE HCAs.  To compile this driver as a module, choose M here:
index b930ea3..ba26d8e 100644 (file)
@@ -138,6 +138,7 @@ struct bnxt_re_dev {
 #define BNXT_RE_FLAG_QOS_WORK_REG              5
 #define BNXT_RE_FLAG_RESOURCES_ALLOCATED       7
 #define BNXT_RE_FLAG_RESOURCES_INITIALIZED     8
+#define BNXT_RE_FLAG_ERR_DEVICE_DETACHED       17
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
        struct net_device               *netdev;
        unsigned int                    version, major, minor;
index 5f5408c..3e54e1a 100644 (file)
@@ -114,7 +114,7 @@ static const char * const bnxt_re_stat_name[] = {
 
 int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
                            struct rdma_hw_stats *stats,
-                           u8 port, int index)
+                           u32 port, int index)
 {
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
        struct ctx_hw_stats *bnxt_re_stats = rdev->qplib_ctx.stats.dma;
@@ -235,7 +235,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
 }
 
 struct rdma_hw_stats *bnxt_re_ib_alloc_hw_stats(struct ib_device *ibdev,
-                                               u8 port_num)
+                                               u32 port_num)
 {
        BUILD_BUG_ON(ARRAY_SIZE(bnxt_re_stat_name) != BNXT_RE_NUM_COUNTERS);
        /* We support only per port stats */
index 76399f4..ede0486 100644 (file)
@@ -97,8 +97,8 @@ enum bnxt_re_hw_stats {
 };
 
 struct rdma_hw_stats *bnxt_re_ib_alloc_hw_stats(struct ib_device *ibdev,
-                                               u8 port_num);
+                                               u32 port_num);
 int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
                            struct rdma_hw_stats *stats,
-                           u8 port, int index);
+                           u32 port, int index);
 #endif /* __BNXT_RE_HW_STATS_H__ */
index ba515ef..2efaa80 100644 (file)
@@ -189,7 +189,7 @@ int bnxt_re_query_device(struct ib_device *ibdev,
 }
 
 /* Port */
-int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num,
                       struct ib_port_attr *port_attr)
 {
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
@@ -229,7 +229,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
-int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num,
                               struct ib_port_immutable *immutable)
 {
        struct ib_port_attr port_attr;
@@ -254,7 +254,7 @@ void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str)
                 rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]);
 }
 
-int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_query_pkey(struct ib_device *ibdev, u32 port_num,
                       u16 index, u16 *pkey)
 {
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
@@ -266,7 +266,7 @@ int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
                                   &rdev->qplib_res.pkey_tbl, index, pkey);
 }
 
-int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_query_gid(struct ib_device *ibdev, u32 port_num,
                      int index, union ib_gid *gid)
 {
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
@@ -374,7 +374,7 @@ int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context)
 }
 
 enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
-                                           u8 port_num)
+                                           u32 port_num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
index 9a8130b..d68671c 100644 (file)
@@ -149,19 +149,19 @@ static inline u16 bnxt_re_get_rwqe_size(int nsge)
 int bnxt_re_query_device(struct ib_device *ibdev,
                         struct ib_device_attr *ib_attr,
                         struct ib_udata *udata);
-int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num,
                       struct ib_port_attr *port_attr);
-int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num,
                               struct ib_port_immutable *immutable);
 void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str);
-int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_query_pkey(struct ib_device *ibdev, u32 port_num,
                       u16 index, u16 *pkey);
 int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context);
 int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context);
-int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
+int bnxt_re_query_gid(struct ib_device *ibdev, u32 port_num,
                      int index, union ib_gid *gid);
 enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
-                                           u8 port_num);
+                                           u32 port_num);
 int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 int bnxt_re_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
index fdb8c24..8bfbf02 100644 (file)
@@ -81,6 +81,7 @@ static struct workqueue_struct *bnxt_re_wq;
 static void bnxt_re_remove_device(struct bnxt_re_dev *rdev);
 static void bnxt_re_dealloc_driver(struct ib_device *ib_dev);
 static void bnxt_re_stop_irq(void *handle);
+static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev);
 
 static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode)
 {
@@ -221,6 +222,37 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
 /* for handling bnxt_en callbacks later */
 static void bnxt_re_stop(void *p)
 {
+       struct bnxt_re_dev *rdev = p;
+       struct bnxt *bp;
+
+       if (!rdev)
+               return;
+       ASSERT_RTNL();
+
+       /* L2 driver invokes this callback during device error/crash or device
+        * reset. Current RoCE driver doesn't recover the device in case of
+        * error. Handle the error by dispatching fatal events to all qps
+        * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as
+        * L2 driver want to modify the MSIx table.
+        */
+       bp = netdev_priv(rdev->netdev);
+
+       ibdev_info(&rdev->ibdev, "Handle device stop call from L2 driver");
+       /* Check the current device state from L2 structure and move the
+        * device to detached state if FW_FATAL_COND is set.
+        * This prevents more commands to HW during clean-up,
+        * in case the device is already in error.
+        */
+       if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state))
+               set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
+
+       bnxt_re_dev_stop(rdev);
+       bnxt_re_stop_irq(rdev);
+       /* Move the device states to detached and  avoid sending any more
+        * commands to HW
+        */
+       set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags);
+       set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
 }
 
 static void bnxt_re_start(void *p)
@@ -234,6 +266,8 @@ static void bnxt_re_sriov_config(void *p, int num_vfs)
        if (!rdev)
                return;
 
+       if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+               return;
        rdev->num_vfs = num_vfs;
        if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
                bnxt_re_set_resource_limits(rdev);
@@ -427,6 +461,9 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
        if (!en_dev)
                return rc;
 
+       if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+               return 0;
+
        memset(&fw_msg, 0, sizeof(fw_msg));
 
        bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1);
@@ -489,6 +526,9 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev,
        if (!en_dev)
                return rc;
 
+       if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags))
+               return 0;
+
        memset(&fw_msg, 0, sizeof(fw_msg));
 
        bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1);
@@ -561,24 +601,12 @@ static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev)
        return container_of(ibdev, struct bnxt_re_dev, ibdev);
 }
 
-static void bnxt_re_dev_unprobe(struct net_device *netdev,
-                               struct bnxt_en_dev *en_dev)
-{
-       dev_put(netdev);
-       module_put(en_dev->pdev->driver->driver.owner);
-}
-
 static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
 {
-       struct bnxt *bp = netdev_priv(netdev);
        struct bnxt_en_dev *en_dev;
        struct pci_dev *pdev;
 
-       /* Call bnxt_en's RoCE probe via indirect API */
-       if (!bp->ulp_probe)
-               return ERR_PTR(-EINVAL);
-
-       en_dev = bp->ulp_probe(netdev);
+       en_dev = bnxt_ulp_probe(netdev);
        if (IS_ERR(en_dev))
                return en_dev;
 
@@ -593,10 +621,6 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
                return ERR_PTR(-ENODEV);
        }
 
-       /* Bump net device reference count */
-       if (!try_module_get(pdev->driver->driver.owner))
-               return ERR_PTR(-ENODEV);
-
        dev_hold(netdev);
 
        return en_dev;
@@ -1523,13 +1547,12 @@ fail:
 
 static void bnxt_re_dev_unreg(struct bnxt_re_dev *rdev)
 {
-       struct bnxt_en_dev *en_dev = rdev->en_dev;
        struct net_device *netdev = rdev->netdev;
 
        bnxt_re_dev_remove(rdev);
 
        if (netdev)
-               bnxt_re_dev_unprobe(netdev, en_dev);
+               dev_put(netdev);
 }
 
 static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev)
@@ -1551,7 +1574,7 @@ static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev)
        *rdev = bnxt_re_dev_add(netdev, en_dev);
        if (!*rdev) {
                rc = -ENOMEM;
-               bnxt_re_dev_unprobe(netdev, en_dev);
+               dev_put(netdev);
                goto exit;
        }
 exit:
index 995d463..d4d4959 100644 (file)
@@ -2784,6 +2784,7 @@ do_rq:
                dev_err(&cq->hwq.pdev->dev,
                        "FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n",
                        cqe_cons, rq->max_wqe);
+               rc = -EINVAL;
                goto done;
        }
 
index 441eb42..5d384de 100644 (file)
@@ -212,6 +212,10 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
        u8 opcode, retry_cnt = 0xFF;
        int rc = 0;
 
+       /* Prevent posting if f/w is not in a state to process */
+       if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags))
+               return 0;
+
        do {
                opcode = req->opcode;
                rc = __send_message(rcfw, req, resp, sb, is_block);
index 5f2f0a5..9474c00 100644 (file)
@@ -138,6 +138,8 @@ struct bnxt_qplib_qp_node {
 #define FIRMWARE_INITIALIZED_FLAG      (0)
 #define FIRMWARE_FIRST_FLAG            (31)
 #define FIRMWARE_TIMED_OUT             (3)
+#define ERR_DEVICE_DETACHED             (4)
+
 struct bnxt_qplib_cmdq_mbox {
        struct bnxt_qplib_reg_desc      reg;
        void __iomem                    *prod;
index fa78783..3ca4700 100644 (file)
@@ -854,6 +854,7 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res     *res,
 
 unmap_io:
        pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem);
+       dpit->dbr_bar_reg_iomem = NULL;
        return -ENOMEM;
 }
 
index e42c812..291471d 100644 (file)
@@ -145,7 +145,7 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status);
 static int sched(struct c4iw_dev *dev, struct sk_buff *skb);
 
 static LIST_HEAD(timeout_list);
-static spinlock_t timeout_lock;
+static DEFINE_SPINLOCK(timeout_lock);
 
 static void deref_cm_id(struct c4iw_ep_common *epc)
 {
@@ -4452,7 +4452,6 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
 
 int __init c4iw_cm_init(void)
 {
-       spin_lock_init(&timeout_lock);
        skb_queue_head_init(&rxq);
 
        workq = alloc_ordered_workqueue("iw_cxgb4", WQ_MEM_RECLAIM);
index f85477f..cdec5de 100644 (file)
@@ -341,11 +341,6 @@ static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
        return container_of(ibdev, struct c4iw_dev, ibdev);
 }
 
-static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev)
-{
-       return container_of(rdev, struct c4iw_dev, rdev);
-}
-
 static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid)
 {
        return xa_load(&rhp->cqs, cqid);
@@ -659,12 +654,6 @@ static inline u32 c4iw_ib_to_tpt_access(int a)
               FW_RI_MEM_ACCESS_LOCAL_READ;
 }
 
-static inline u32 c4iw_ib_to_tpt_bind_access(int acc)
-{
-       return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
-              (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0);
-}
-
 enum c4iw_mmid_state {
        C4IW_STAG_STATE_VALID,
        C4IW_STAG_STATE_INVALID
index 1f1f856..3f1893e 100644 (file)
@@ -237,12 +237,12 @@ static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
        return 0;
 }
 
-static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
+static int c4iw_query_gid(struct ib_device *ibdev, u32 port, int index,
                          union ib_gid *gid)
 {
        struct c4iw_dev *dev;
 
-       pr_debug("ibdev %p, port %d, index %d, gid %p\n",
+       pr_debug("ibdev %p, port %u, index %d, gid %p\n",
                 ibdev, port, index, gid);
        if (!port)
                return -EINVAL;
@@ -295,7 +295,7 @@ static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *pro
        return 0;
 }
 
-static int c4iw_query_port(struct ib_device *ibdev, u8 port,
+static int c4iw_query_port(struct ib_device *ibdev, u32 port,
                           struct ib_port_attr *props)
 {
        int ret = 0;
@@ -378,7 +378,7 @@ static const char * const names[] = {
 };
 
 static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
-                                             u8 port_num)
+                                             u32 port_num)
 {
        BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
 
@@ -391,7 +391,7 @@ static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
 
 static int c4iw_get_mib(struct ib_device *ibdev,
                        struct rdma_hw_stats *stats,
-                       u8 port, int index)
+                       u32 port, int index)
 {
        struct tp_tcp_stats v4, v6;
        struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
@@ -420,7 +420,7 @@ static const struct attribute_group c4iw_attr_group = {
        .attrs = c4iw_class_attributes,
 };
 
-static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int c4iw_port_immutable(struct ib_device *ibdev, u32 port_num,
                               struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
index 5c95c78..e800e8e 100644 (file)
@@ -216,7 +216,7 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
                        goto out;
                entry->qid = qid;
                list_add_tail(&entry->entry, &uctx->cqids);
-               for (i = qid; i & rdev->qpmask; i++) {
+               for (i = qid + 1; i & rdev->qpmask; i++) {
                        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                        if (!entry)
                                goto out;
index b170817..c3b0e28 100644 (file)
@@ -487,11 +487,6 @@ static inline int t4_rq_empty(struct t4_wq *wq)
        return wq->rq.in_use == 0;
 }
 
-static inline int t4_rq_full(struct t4_wq *wq)
-{
-       return wq->rq.in_use == (wq->rq.size - 1);
-}
-
 static inline u32 t4_rq_avail(struct t4_wq *wq)
 {
        return wq->rq.size - 1 - wq->rq.in_use;
@@ -534,11 +529,6 @@ static inline int t4_sq_empty(struct t4_wq *wq)
        return wq->sq.in_use == 0;
 }
 
-static inline int t4_sq_full(struct t4_wq *wq)
-{
-       return wq->sq.in_use == (wq->sq.size - 1);
-}
-
 static inline u32 t4_sq_avail(struct t4_wq *wq)
 {
        return wq->sq.size - 1 - wq->sq.in_use;
@@ -679,11 +669,6 @@ static inline void t4_enable_wq_db(struct t4_wq *wq)
        wq->rq.queue[wq->rq.size].status.db_off = 0;
 }
 
-static inline int t4_wq_db_enabled(struct t4_wq *wq)
-{
-       return !wq->rq.queue[wq->rq.size].status.db_off;
-}
-
 enum t4_cq_flags {
        CQ_ARMED        = 1,
 };
@@ -817,19 +802,6 @@ static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
        return ret;
 }
 
-static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq)
-{
-       if (cq->sw_in_use == cq->size) {
-               pr_warn("%s cxgb4 sw cq overflow cqid %u\n",
-                       __func__, cq->cqid);
-               cq->error = 1;
-               return NULL;
-       }
-       if (cq->sw_in_use)
-               return &cq->sw_queue[cq->sw_cidx];
-       return NULL;
-}
-
 static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
 {
        int ret = 0;
@@ -843,11 +815,6 @@ static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
        return ret;
 }
 
-static inline int t4_cq_in_error(struct t4_cq *cq)
-{
-       return *cq->qp_errp;
-}
-
 static inline void t4_set_cq_in_error(struct t4_cq *cq)
 {
        *cq->qp_errp = 1;
index e5d9712..ea322ce 100644 (file)
@@ -120,14 +120,14 @@ struct efa_ah {
 int efa_query_device(struct ib_device *ibdev,
                     struct ib_device_attr *props,
                     struct ib_udata *udata);
-int efa_query_port(struct ib_device *ibdev, u8 port,
+int efa_query_port(struct ib_device *ibdev, u32 port,
                   struct ib_port_attr *props);
 int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                 int qp_attr_mask,
                 struct ib_qp_init_attr *qp_init_attr);
-int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
+int efa_query_gid(struct ib_device *ibdev, u32 port, int index,
                  union ib_gid *gid);
-int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int efa_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                   u16 *pkey);
 int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
@@ -142,7 +142,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
                         u64 virt_addr, int access_flags,
                         struct ib_udata *udata);
 int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
-int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
                           struct ib_port_immutable *immutable);
 int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata);
 void efa_dealloc_ucontext(struct ib_ucontext *ibucontext);
@@ -156,9 +156,9 @@ int efa_destroy_ah(struct ib_ah *ibah, u32 flags);
 int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                  int qp_attr_mask, struct ib_udata *udata);
 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
-                                        u8 port_num);
-struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num);
+                                        u32 port_num);
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u32 port_num);
 int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
-                    u8 port_num, int index);
+                    u32 port_num, int index);
 
 #endif /* _EFA_H_ */
index 0f57873..816cfd6 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
 /*
- * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include <linux/module.h>
@@ -209,11 +209,11 @@ static void efa_set_host_info(struct efa_dev *dev)
        if (!hinf)
                return;
 
-       strlcpy(hinf->os_dist_str, utsname()->release,
-               min(sizeof(hinf->os_dist_str), sizeof(utsname()->release)));
+       strscpy(hinf->os_dist_str, utsname()->release,
+               sizeof(hinf->os_dist_str));
        hinf->os_type = EFA_ADMIN_OS_LINUX;
-       strlcpy(hinf->kernel_ver_str, utsname()->version,
-               min(sizeof(hinf->kernel_ver_str), sizeof(utsname()->version)));
+       strscpy(hinf->kernel_ver_str, utsname()->version,
+               sizeof(hinf->kernel_ver_str));
        hinf->kernel_ver = LINUX_VERSION_CODE;
        EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, 0);
        EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, 0);
index 479b604..51572f1 100644 (file)
@@ -247,7 +247,7 @@ int efa_query_device(struct ib_device *ibdev,
        return 0;
 }
 
-int efa_query_port(struct ib_device *ibdev, u8 port,
+int efa_query_port(struct ib_device *ibdev, u32 port,
                   struct ib_port_attr *props)
 {
        struct efa_dev *dev = to_edev(ibdev);
@@ -319,7 +319,7 @@ int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
        return 0;
 }
 
-int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
+int efa_query_gid(struct ib_device *ibdev, u32 port, int index,
                  union ib_gid *gid)
 {
        struct efa_dev *dev = to_edev(ibdev);
@@ -329,7 +329,7 @@ int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
        return 0;
 }
 
-int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int efa_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                   u16 *pkey)
 {
        if (index > 0)
@@ -1619,7 +1619,7 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
        return 0;
 }
 
-int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
                           struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
@@ -1904,7 +1904,7 @@ int efa_destroy_ah(struct ib_ah *ibah, u32 flags)
        return 0;
 }
 
-struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num)
+struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u32 port_num)
 {
        return rdma_alloc_hw_stats_struct(efa_stats_names,
                                          ARRAY_SIZE(efa_stats_names),
@@ -1912,7 +1912,7 @@ struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num)
 }
 
 int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
-                    u8 port_num, int index)
+                    u32 port_num, int index)
 {
        struct efa_com_get_stats_params params = {};
        union efa_com_get_stats_result result;
@@ -1981,7 +1981,7 @@ int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 }
 
 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
-                                        u8 port_num)
+                                        u32 port_num)
 {
        return IB_LINK_LAYER_UNSPECIFIED;
 }
index 04b1e8f..16543f7 100644 (file)
@@ -962,7 +962,6 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
                           struct hfi1_msix_entry *msix)
 {
        struct cpu_mask_set *set = NULL;
-       struct hfi1_ctxtdata *rcd;
        struct hfi1_affinity_node *entry;
 
        mutex_lock(&node_affinity.lock);
@@ -976,14 +975,15 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
        case IRQ_GENERAL:
                /* Don't do accounting for general contexts */
                break;
-       case IRQ_RCVCTXT:
-               rcd = (struct hfi1_ctxtdata *)msix->arg;
+       case IRQ_RCVCTXT: {
+               struct hfi1_ctxtdata *rcd = msix->arg;
+
                /* Don't do accounting for control contexts */
                if (rcd->ctxt != HFI1_CTRL_CTXT)
                        set = &entry->rcv_intr;
                break;
+       }
        case IRQ_NETDEVCTXT:
-               rcd = (struct hfi1_ctxtdata *)msix->arg;
                set = &entry->def_intr;
                break;
        default:
index 993cbf3..5eeae8d 100644 (file)
@@ -1322,7 +1322,7 @@ CNTR_ELEM(#name, \
          access_ibp_##cntr)
 
 /**
- * hfi_addr_from_offset - return addr for readq/writeq
+ * hfi1_addr_from_offset - return addr for readq/writeq
  * @dd: the dd device
  * @offset: the offset of the CSR within bar0
  *
@@ -8316,7 +8316,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
 }
 
 /**
- * gerneral_interrupt() -  General interrupt handler
+ * general_interrupt -  General interrupt handler
  * @irq: MSIx IRQ vector
  * @data: hfi1 devdata
  *
@@ -15243,8 +15243,8 @@ int hfi1_init_dd(struct hfi1_devdata *dd)
                 (dd->revision >> CCE_REVISION_SW_SHIFT)
                    & CCE_REVISION_SW_MASK);
 
-       /* alloc netdev data */
-       ret = hfi1_netdev_alloc(dd);
+       /* alloc VNIC/AIP rx data */
+       ret = hfi1_alloc_rx(dd);
        if (ret)
                goto bail_cleanup;
 
@@ -15348,7 +15348,7 @@ bail_clear_intr:
        hfi1_comp_vectors_clean_up(dd);
        msix_clean_up_interrupts(dd);
 bail_cleanup:
-       hfi1_netdev_free(dd);
+       hfi1_free_rx(dd);
        hfi1_pcie_ddcleanup(dd);
 bail_free:
        hfi1_free_devdata(dd);
index 2c6f2de..ac26649 100644 (file)
@@ -822,11 +822,6 @@ int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
 int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
 #define LCB_START DC_LCB_CSRS
 #define LCB_END   DC_8051_CSRS /* next block is 8051 */
-static inline int is_lcb_offset(u32 offset)
-{
-       return (offset >= LCB_START && offset < LCB_END);
-}
-
 extern uint num_vls;
 
 extern uint disable_integrity;
index 0b64aa8..f88bb4a 100644 (file)
@@ -1026,7 +1026,7 @@ static bool __set_armed_to_active(struct hfi1_packet *packet)
 }
 
 /**
- * armed to active - the fast path for armed to active
+ * set_armed_to_active  - the fast path for armed to active
  * @packet: the packet structure
  *
  * Return true if packet processing needs to bail.
index 91f1314..a414214 100644 (file)
@@ -49,7 +49,7 @@
 #include "trace.h"
 
 /**
- * exp_tid_group_init - initialize exp_tid_set
+ * hfi1_exp_tid_set_init - initialize exp_tid_set
  * @set: the set
  */
 static void hfi1_exp_tid_set_init(struct exp_tid_set *set)
@@ -70,7 +70,7 @@ void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd)
 }
 
 /**
- * alloc_ctxt_rcv_groups - initialize expected receive groups
+ * hfi1_alloc_ctxt_rcv_groups - initialize expected receive groups
  * @rcd: the context to add the groupings to
  */
 int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
@@ -100,7 +100,7 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
 }
 
 /**
- * free_ctxt_rcv_groups - free  expected receive groups
+ * hfi1_free_ctxt_rcv_groups - free  expected receive groups
  * @rcd: the context to free
  *
  * The routine dismantles the expect receive linked
index 0e83d4b..2cf102b 100644 (file)
@@ -1916,6 +1916,7 @@ int parse_platform_config(struct hfi1_devdata *dd)
                        dd_dev_err(dd, "%s: Failed CRC check at offset %ld\n",
                                   __func__, (ptr -
                                   (u32 *)dd->platform_config.data));
+                       ret = -EINVAL;
                        goto bail;
                }
                /* Jump the CRC DWORD */
index 2a9a040..867ae0b 100644 (file)
@@ -69,7 +69,6 @@
 #include <rdma/ib_hdrs.h>
 #include <rdma/opa_addr.h>
 #include <linux/rhashtable.h>
-#include <linux/netdevice.h>
 #include <rdma/rdma_vt.h>
 
 #include "chip_registers.h"
@@ -717,12 +716,6 @@ static inline void incr_cntr64(u64 *cntr)
                (*cntr)++;
 }
 
-static inline void incr_cntr32(u32 *cntr)
-{
-       if (*cntr < (u32)-1LL)
-               (*cntr)++;
-}
-
 #define MAX_NAME_SIZE 64
 struct hfi1_msix_entry {
        enum irq_type type;
@@ -864,7 +857,7 @@ struct hfi1_pportdata {
        u8 rx_pol_inv;
 
        u8 hw_pidx;     /* physical port index */
-       u8 port;        /* IB port number and index into dd->pports - 1 */
+       u32 port;        /* IB port number and index into dd->pports - 1 */
        /* type of neighbor node */
        u8 neighbor_type;
        u8 neighbor_normal;
@@ -1066,6 +1059,7 @@ struct sdma_vl_map;
 #define SERIAL_MAX 16 /* length of the serial number */
 
 typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
+struct hfi1_netdev_rx;
 struct hfi1_devdata {
        struct hfi1_ibdev verbs_dev;     /* must be first */
        /* pointers to related structs for this device */
@@ -1408,7 +1402,7 @@ struct hfi1_devdata {
        /* Lock to protect IRQ SRC register access */
        spinlock_t irq_src_lock;
        int vnic_num_vports;
-       struct net_device *dummy_netdev;
+       struct hfi1_netdev_rx *netdev_rx;
        struct hfi1_affinity_node *affinity_entry;
 
        /* Keeps track of IPoIB RSM rule users */
@@ -1480,7 +1474,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
                         struct hfi1_ctxtdata **rcd);
 void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd);
 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
-                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
+                        struct hfi1_devdata *dd, u8 hw_pidx, u32 port);
 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
 int hfi1_rcd_put(struct hfi1_ctxtdata *rcd);
 int hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
@@ -1976,10 +1970,10 @@ static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi)
        return container_of(rdi, struct hfi1_ibdev, rdi);
 }
 
-static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u32 port)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+       u32 pidx = port - 1; /* IB number port from 1, hdw from 0 */
 
        WARN_ON(pidx >= dd->num_pports);
        return &dd->pport[pidx].ibport_data;
@@ -2198,7 +2192,7 @@ extern const struct attribute_group ib_hfi1_attr_group;
 int hfi1_device_create(struct hfi1_devdata *dd);
 void hfi1_device_remove(struct hfi1_devdata *dd);
 
-int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+int hfi1_create_port_files(struct ib_device *ibdev, u32 port_num,
                           struct kobject *kobj);
 int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd);
 void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd);
index 786c631..e3a8a42 100644 (file)
@@ -627,7 +627,7 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
  * Common code for initializing the physical port structure.
  */
 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
-                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+                        struct hfi1_devdata *dd, u8 hw_pidx, u32 port)
 {
        int i;
        uint default_pkey_idx;
@@ -1775,7 +1775,7 @@ static void remove_one(struct pci_dev *pdev)
        hfi1_unregister_ib_device(dd);
 
        /* free netdev data */
-       hfi1_netdev_free(dd);
+       hfi1_free_rx(dd);
 
        /*
         * Disable the IB link, disable interrupts on the device,
@@ -1860,7 +1860,8 @@ bail:
 }
 
 /**
- * allocate eager buffers, both kernel and user contexts.
+ * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user
+ * contexts.
  * @rcd: the context we are setting up.
  *
  * Allocate the eager TID buffers and program them into hip.
index d580aa1..cda81a7 100644 (file)
@@ -321,7 +321,7 @@ static inline void iowait_drain_wakeup(struct iowait *wait)
 /**
  * iowait_get_txhead() - get packet off of iowait list
  *
- * @wait iowait_work struture
+ * @wait: iowait_work structure
  */
 static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait)
 {
index f650cac..2cff38b 100644 (file)
@@ -52,8 +52,9 @@ union hfi1_ipoib_flow {
  * @producer_lock: producer sync lock
  * @consumer_lock: consumer sync lock
  */
+struct ipoib_txreq;
 struct hfi1_ipoib_circ_buf {
-       void **items;
+       struct ipoib_txreq **items;
        unsigned long head;
        unsigned long tail;
        unsigned long max_items;
@@ -125,10 +126,10 @@ hfi1_ipoib_priv(const struct net_device *dev)
        return &((struct hfi1_ipoib_rdma_netdev *)netdev_priv(dev))->dev_priv;
 }
 
-int hfi1_ipoib_send_dma(struct net_device *dev,
-                       struct sk_buff *skb,
-                       struct ib_ah *address,
-                       u32 dqpn);
+int hfi1_ipoib_send(struct net_device *dev,
+                   struct sk_buff *skb,
+                   struct ib_ah *address,
+                   u32 dqpn);
 
 int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv);
 void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv);
@@ -143,8 +144,10 @@ struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq,
                                       int size, void *data);
 
 int hfi1_ipoib_rn_get_params(struct ib_device *device,
-                            u8 port_num,
+                            u32 port_num,
                             enum rdma_netdev_t type,
                             struct rdma_netdev_alloc_params *params);
 
+void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q);
+
 #endif /* _IPOIB_H */
index 3242290..e594a96 100644 (file)
@@ -101,14 +101,6 @@ static const struct net_device_ops hfi1_ipoib_netdev_ops = {
        .ndo_get_stats64  = dev_get_tstats64,
 };
 
-static int hfi1_ipoib_send(struct net_device *dev,
-                          struct sk_buff *skb,
-                          struct ib_ah *address,
-                          u32 dqpn)
-{
-       return hfi1_ipoib_send_dma(dev, skb, address, dqpn);
-}
-
 static int hfi1_ipoib_mcast_attach(struct net_device *dev,
                                   struct ib_device *device,
                                   union ib_gid *mgid,
@@ -194,7 +186,7 @@ static void hfi1_ipoib_set_id(struct net_device *dev, int id)
 }
 
 static int hfi1_ipoib_setup_rn(struct ib_device *device,
-                              u8 port_num,
+                              u32 port_num,
                               struct net_device *netdev,
                               void *param)
 {
@@ -204,6 +196,7 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device,
        int rc;
 
        rn->send = hfi1_ipoib_send;
+       rn->tx_timeout = hfi1_ipoib_tx_timeout;
        rn->attach_mcast = hfi1_ipoib_mcast_attach;
        rn->detach_mcast = hfi1_ipoib_mcast_detach;
        rn->set_id = hfi1_ipoib_set_id;
@@ -243,7 +236,7 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device,
 }
 
 int hfi1_ipoib_rn_get_params(struct ib_device *device,
-                            u8 port_num,
+                            u32 port_num,
                             enum rdma_netdev_t type,
                             struct rdma_netdev_alloc_params *params)
 {
index edd4eea..993f983 100644 (file)
@@ -15,6 +15,7 @@
 #include "verbs.h"
 #include "trace_ibhdrs.h"
 #include "ipoib.h"
+#include "trace_tx.h"
 
 /* Add a convenience helper */
 #define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
@@ -63,12 +64,14 @@ static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq)
 
 static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq)
 {
+       trace_hfi1_txq_stop(txq);
        if (atomic_inc_return(&txq->stops) == 1)
                netif_stop_subqueue(txq->priv->netdev, txq->q_idx);
 }
 
 static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq)
 {
+       trace_hfi1_txq_wake(txq);
        if (atomic_dec_and_test(&txq->stops))
                netif_wake_subqueue(txq->priv->netdev, txq->q_idx);
 }
@@ -89,8 +92,10 @@ static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq)
 {
        ++txq->sent_txreqs;
        if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) &&
-           !atomic_xchg(&txq->ring_full, 1))
+           !atomic_xchg(&txq->ring_full, 1)) {
+               trace_hfi1_txq_full(txq);
                hfi1_ipoib_stop_txq(txq);
+       }
 }
 
 static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq)
@@ -112,8 +117,10 @@ static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq)
         * to protect against ring overflow.
         */
        if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) &&
-           atomic_xchg(&txq->ring_full, 0))
+           atomic_xchg(&txq->ring_full, 0)) {
+               trace_hfi1_txq_xmit_unstopped(txq);
                hfi1_ipoib_wake_txq(txq);
+       }
 }
 
 static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget)
@@ -202,7 +209,7 @@ static void hfi1_ipoib_add_tx(struct ipoib_txreq *tx)
 
                /* Finish storing txreq before incrementing head. */
                smp_store_release(&tx_ring->head, CIRC_ADD(head, 1, max_tx));
-               napi_schedule(tx->txq->napi);
+               napi_schedule_irqoff(tx->txq->napi);
        } else {
                struct hfi1_ipoib_txq *txq = tx->txq;
                struct hfi1_ipoib_dev_priv *priv = tx->priv;
@@ -405,6 +412,7 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev,
                                sdma_select_engine_sc(priv->dd,
                                                      txp->flow.tx_queue,
                                                      txp->flow.sc5);
+                       trace_hfi1_flow_switch(txp->txq);
                }
 
                return tx;
@@ -525,6 +533,7 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev,
        if (txq->flow.as_int != txp->flow.as_int) {
                int ret;
 
+               trace_hfi1_flow_flush(txq);
                ret = hfi1_ipoib_flush_tx_list(dev, txq);
                if (unlikely(ret)) {
                        if (ret == -EBUSY)
@@ -572,10 +581,10 @@ static u8 hfi1_ipoib_calc_entropy(struct sk_buff *skb)
        return (u8)skb_get_queue_mapping(skb);
 }
 
-int hfi1_ipoib_send_dma(struct net_device *dev,
-                       struct sk_buff *skb,
-                       struct ib_ah *address,
-                       u32 dqpn)
+int hfi1_ipoib_send(struct net_device *dev,
+                   struct sk_buff *skb,
+                   struct ib_ah *address,
+                   u32 dqpn)
 {
        struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
        struct ipoib_txparms txp;
@@ -635,8 +644,10 @@ static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde,
                        /* came from non-list submit */
                        list_add_tail(&txreq->list, &txq->tx_list);
                if (list_empty(&txq->wait.list)) {
-                       if (!atomic_xchg(&txq->no_desc, 1))
+                       if (!atomic_xchg(&txq->no_desc, 1)) {
+                               trace_hfi1_txq_queued(txq);
                                hfi1_ipoib_stop_txq(txq);
+                       }
                        iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
                }
 
@@ -659,6 +670,7 @@ static void hfi1_ipoib_sdma_wakeup(struct iowait *wait, int reason)
        struct hfi1_ipoib_txq *txq =
                container_of(wait, struct hfi1_ipoib_txq, wait);
 
+       trace_hfi1_txq_wakeup(txq);
        if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED))
                iowait_schedule(wait, system_highpri_wq, WORK_CPU_UNBOUND);
 }
@@ -702,14 +714,14 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
 
        priv->tx_napis = kcalloc_node(dev->num_tx_queues,
                                      sizeof(struct napi_struct),
-                                     GFP_ATOMIC,
+                                     GFP_KERNEL,
                                      priv->dd->node);
        if (!priv->tx_napis)
                goto free_txreq_cache;
 
        priv->txqs = kcalloc_node(dev->num_tx_queues,
                                  sizeof(struct hfi1_ipoib_txq),
-                                 GFP_ATOMIC,
+                                 GFP_KERNEL,
                                  priv->dd->node);
        if (!priv->txqs)
                goto free_tx_napis;
@@ -741,9 +753,9 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
                                             priv->dd->node);
 
                txq->tx_ring.items =
-                       vzalloc_node(array_size(tx_ring_size,
-                                               sizeof(struct ipoib_txreq)),
-                                    priv->dd->node);
+                       kcalloc_node(tx_ring_size,
+                                    sizeof(struct ipoib_txreq *),
+                                    GFP_KERNEL, priv->dd->node);
                if (!txq->tx_ring.items)
                        goto free_txqs;
 
@@ -764,7 +776,7 @@ free_txqs:
                struct hfi1_ipoib_txq *txq = &priv->txqs[i];
 
                netif_napi_del(txq->napi);
-               vfree(txq->tx_ring.items);
+               kfree(txq->tx_ring.items);
        }
 
        kfree(priv->txqs);
@@ -817,7 +829,7 @@ void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv)
                hfi1_ipoib_drain_tx_list(txq);
                netif_napi_del(txq->napi);
                (void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items);
-               vfree(txq->tx_ring.items);
+               kfree(txq->tx_ring.items);
        }
 
        kfree(priv->txqs);
@@ -854,3 +866,32 @@ void hfi1_ipoib_napi_tx_disable(struct net_device *dev)
                (void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items);
        }
 }
+
+void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q)
+{
+       struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+       struct hfi1_ipoib_txq *txq = &priv->txqs[q];
+       u64 completed = atomic64_read(&txq->complete_txreqs);
+
+       dd_dev_info(priv->dd, "timeout txq %llx q %u stopped %u stops %d no_desc %d ring_full %d\n",
+                   (unsigned long long)txq, q,
+                   __netif_subqueue_stopped(dev, txq->q_idx),
+                   atomic_read(&txq->stops),
+                   atomic_read(&txq->no_desc),
+                   atomic_read(&txq->ring_full));
+       dd_dev_info(priv->dd, "sde %llx engine %u\n",
+                   (unsigned long long)txq->sde,
+                   txq->sde ? txq->sde->this_idx : 0);
+       dd_dev_info(priv->dd, "flow %x\n", txq->flow.as_int);
+       dd_dev_info(priv->dd, "sent %llu completed %llu used %llu\n",
+                   txq->sent_txreqs, completed, hfi1_ipoib_used(txq));
+       dd_dev_info(priv->dd, "tx_queue_len %u max_items %lu\n",
+                   dev->tx_queue_len, txq->tx_ring.max_items);
+       dd_dev_info(priv->dd, "head %lu tail %lu\n",
+                   txq->tx_ring.head, txq->tx_ring.tail);
+       dd_dev_info(priv->dd, "wait queued %u\n",
+                   !list_empty(&txq->wait.list));
+       dd_dev_info(priv->dd, "tx_list empty %u\n",
+                   list_empty(&txq->tx_list));
+}
+
index e2f2f78..1fe5e70 100644 (file)
@@ -108,7 +108,7 @@ static u16 hfi1_lookup_pkey_value(struct hfi1_ibport *ibp, int pkey_idx)
        return 0;
 }
 
-void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port)
+void hfi1_event_pkey_change(struct hfi1_devdata *dd, u32 port)
 {
        struct ib_event event;
 
@@ -297,7 +297,7 @@ static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid)
        struct rvt_qp *qp0;
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        struct hfi1_devdata *dd = dd_from_ppd(ppd);
-       u8 port_num = ppd->port;
+       u32 port_num = ppd->port;
 
        memset(&attr, 0, sizeof(attr));
        attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num);
@@ -515,7 +515,7 @@ static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
 /*
  * Send a Port Capability Mask Changed trap (ch. 14.3.11).
  */
-void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u32 port_num)
 {
        struct trap_node *trap;
        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
@@ -581,7 +581,7 @@ void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
 
 static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
                                   u8 *data, struct ib_device *ibdev,
-                                  u8 port, u32 *resp_len, u32 max_len)
+                                  u32 port, u32 *resp_len, u32 max_len)
 {
        struct opa_node_description *nd;
 
@@ -601,12 +601,12 @@ static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
 }
 
 static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct opa_node_info *ni;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+       u32 pidx = port - 1; /* IB number port from 1, hw from 0 */
 
        ni = (struct opa_node_info *)data;
 
@@ -641,11 +641,11 @@ static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
-                            u8 port)
+                            u32 port)
 {
        struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+       u32 pidx = port - 1; /* IB number port from 1, hw from 0 */
 
        /* GUID 0 is illegal */
        if (smp->attr_mod || pidx >= dd->num_pports ||
@@ -794,7 +794,7 @@ void read_ltp_rtt(struct hfi1_devdata *dd)
 }
 
 static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        int i;
@@ -1009,7 +1009,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
  * @port: the IB port number
  * @pkeys: the pkey table is placed here
  */
-static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+static int get_pkeys(struct hfi1_devdata *dd, u32 port, u16 *pkeys)
 {
        struct hfi1_pportdata *ppd = dd->pport + port - 1;
 
@@ -1019,7 +1019,7 @@ static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
 }
 
 static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
+                                   struct ib_device *ibdev, u32 port,
                                    u32 *resp_len, u32 max_len)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -1349,7 +1349,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
  *
  */
 static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len, int local_mad)
 {
        struct opa_port_info *pi = (struct opa_port_info *)data;
@@ -1667,7 +1667,7 @@ get_only:
  * @port: the IB port number
  * @pkeys: the PKEY table
  */
-static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+static int set_pkeys(struct hfi1_devdata *dd, u32 port, u16 *pkeys)
 {
        struct hfi1_pportdata *ppd;
        int i;
@@ -1718,7 +1718,7 @@ static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
 }
 
 static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
+                                   struct ib_device *ibdev, u32 port,
                                    u32 *resp_len, u32 max_len)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -1732,7 +1732,7 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
        u32 size = 0;
 
        if (n_blocks_sent == 0) {
-               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+               pr_warn("OPA Get PKey AM Invalid : P = %u; B = 0x%x; N = 0x%x\n",
                        port, start_block, n_blocks_sent);
                smp->status |= IB_SMP_INVALID_FIELD;
                return reply((struct ib_mad_hdr *)smp);
@@ -1825,7 +1825,7 @@ static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
 }
 
 static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -1848,7 +1848,7 @@ static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -1877,7 +1877,7 @@ static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -1900,7 +1900,7 @@ static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -1921,7 +1921,7 @@ static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
+                                   struct ib_device *ibdev, u32 port,
                                    u32 *resp_len, u32 max_len)
 {
        u32 n_blocks = OPA_AM_NBLK(am);
@@ -1943,7 +1943,7 @@ static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
+                                   struct ib_device *ibdev, u32 port,
                                    u32 *resp_len, u32 max_len)
 {
        u32 n_blocks = OPA_AM_NBLK(am);
@@ -1985,7 +1985,7 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
+                                    struct ib_device *ibdev, u32 port,
                                     u32 *resp_len, u32 max_len)
 {
        u32 n_blocks = OPA_AM_NPORT(am);
@@ -2010,7 +2010,7 @@ static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
+                                    struct ib_device *ibdev, u32 port,
                                     u32 *resp_len, u32 max_len)
 {
        u32 n_blocks = OPA_AM_NPORT(am);
@@ -2042,7 +2042,7 @@ static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port,
+                             struct ib_device *ibdev, u32 port,
                              u32 *resp_len, u32 max_len)
 {
        u32 nports = OPA_AM_NPORT(am);
@@ -2084,7 +2084,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port,
+                             struct ib_device *ibdev, u32 port,
                              u32 *resp_len, u32 max_len, int local_mad)
 {
        u32 nports = OPA_AM_NPORT(am);
@@ -2132,7 +2132,7 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
+                                    struct ib_device *ibdev, u32 port,
                                     u32 *resp_len, u32 max_len)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -2184,7 +2184,7 @@ static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port, u32 *resp_len,
+                             struct ib_device *ibdev, u32 port, u32 *resp_len,
                              u32 max_len)
 {
        u32 num_ports = OPA_AM_NPORT(am);
@@ -2208,7 +2208,7 @@ static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port, u32 *resp_len,
+                             struct ib_device *ibdev, u32 port, u32 *resp_len,
                              u32 max_len)
 {
        u32 num_ports = OPA_AM_NPORT(am);
@@ -2232,7 +2232,7 @@ static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
-                                struct ib_device *ibdev, u8 port,
+                                struct ib_device *ibdev, u32 port,
                                 u32 *resp_len, u32 max_len)
 {
        struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
@@ -2274,7 +2274,7 @@ static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
-                                struct ib_device *ibdev, u8 port,
+                                struct ib_device *ibdev, u32 port,
                                 u32 *resp_len, u32 max_len)
 {
        struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
@@ -2722,7 +2722,7 @@ u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd,
 
 static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
                                  struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
+                                 u32 port, u32 *resp_len)
 {
        struct opa_port_status_req *req =
                (struct opa_port_status_req *)pmp->data;
@@ -2732,7 +2732,7 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
        unsigned long vl;
        size_t response_data_size;
        u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       u8 port_num = req->port_num;
+       u32 port_num = req->port_num;
        u8 num_vls = hweight64(vl_select_mask);
        struct _vls_pctrs *vlinfo;
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -2888,7 +2888,7 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
        return reply((struct ib_mad_hdr *)pmp);
 }
 
-static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
+static u64 get_error_counter_summary(struct ib_device *ibdev, u32 port,
                                     u8 res_lli, u8 res_ler)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -2973,7 +2973,7 @@ static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
 
 static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
                                    struct ib_device *ibdev,
-                                   u8 port, u32 *resp_len)
+                                   u32 port, u32 *resp_len)
 {
        struct opa_port_data_counters_msg *req =
                (struct opa_port_data_counters_msg *)pmp->data;
@@ -2987,7 +2987,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
        u8 lq, num_vls;
        u8 res_lli, res_ler;
        u64 port_mask;
-       u8 port_num;
+       u32 port_num;
        unsigned long vl;
        unsigned long vl_select_mask;
        int vfi;
@@ -3123,7 +3123,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
 }
 
 static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
-                                      struct ib_device *ibdev, u8 port)
+                                      struct ib_device *ibdev, u32 port)
 {
        struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *)
                                                pmp->data;
@@ -3151,7 +3151,7 @@ bail:
 }
 
 static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
-                                  struct _port_ectrs *rsp, u8 port)
+                                  struct _port_ectrs *rsp, u32 port)
 {
        u64 tmp, tmp2;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -3194,11 +3194,11 @@ static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
 
 static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
                                  struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
+                                 u32 port, u32 *resp_len)
 {
        size_t response_data_size;
        struct _port_ectrs *rsp;
-       u8 port_num;
+       u32 port_num;
        struct opa_port_error_counters64_msg *req;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u32 num_ports;
@@ -3283,7 +3283,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
 }
 
 static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
-                                  struct ib_device *ibdev, u8 port)
+                                  struct ib_device *ibdev, u32 port)
 {
        struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
                pmp->data;
@@ -3369,7 +3369,7 @@ bail:
 
 static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
                                 struct ib_device *ibdev,
-                                u8 port, u32 *resp_len)
+                                u32 port, u32 *resp_len)
 {
        size_t response_data_size;
        struct _port_ei *rsp;
@@ -3377,7 +3377,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u64 port_mask;
        u32 num_ports;
-       u8 port_num;
+       u32 port_num;
        u8 num_pslm;
        u64 reg;
 
@@ -3468,7 +3468,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
 
 static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
                                  struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
+                                 u32 port, u32 *resp_len)
 {
        struct opa_clear_port_status *req =
                (struct opa_clear_port_status *)pmp->data;
@@ -3620,14 +3620,14 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
 
 static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
                                 struct ib_device *ibdev,
-                                u8 port, u32 *resp_len)
+                                u32 port, u32 *resp_len)
 {
        struct _port_ei *rsp;
        struct opa_port_error_info_msg *req;
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
        u64 port_mask;
        u32 num_ports;
-       u8 port_num;
+       u32 port_num;
        u8 num_pslm;
        u32 error_info_select;
 
@@ -3702,7 +3702,7 @@ struct opa_congestion_info_attr {
 } __packed;
 
 static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
+                                   struct ib_device *ibdev, u32 port,
                                    u32 *resp_len, u32 max_len)
 {
        struct opa_congestion_info_attr *p =
@@ -3727,7 +3727,7 @@ static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
 
 static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
                                       u8 *data, struct ib_device *ibdev,
-                                      u8 port, u32 *resp_len, u32 max_len)
+                                      u32 port, u32 *resp_len, u32 max_len)
 {
        int i;
        struct opa_congestion_setting_attr *p =
@@ -3819,7 +3819,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd)
 }
 
 static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
-                                      struct ib_device *ibdev, u8 port,
+                                      struct ib_device *ibdev, u32 port,
                                       u32 *resp_len, u32 max_len)
 {
        struct opa_congestion_setting_attr *p =
@@ -3860,7 +3860,7 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
 
 static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
                                        u8 *data, struct ib_device *ibdev,
-                                       u8 port, u32 *resp_len, u32 max_len)
+                                       u32 port, u32 *resp_len, u32 max_len)
 {
        struct hfi1_ibport *ibp = to_iport(ibdev, port);
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
@@ -3925,7 +3925,7 @@ static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
 }
 
 static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct ib_cc_table_attr *cc_table_attr =
@@ -3977,7 +3977,7 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
@@ -4036,7 +4036,7 @@ struct opa_led_info {
 #define OPA_LED_MASK   BIT(OPA_LED_SHIFT)
 
 static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -4066,7 +4066,7 @@ static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
+                                  struct ib_device *ibdev, u32 port,
                                   u32 *resp_len, u32 max_len)
 {
        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -4089,7 +4089,7 @@ static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 }
 
 static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
-                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u8 *data, struct ib_device *ibdev, u32 port,
                            u32 *resp_len, u32 max_len)
 {
        int ret;
@@ -4179,7 +4179,7 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 }
 
 static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
-                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u8 *data, struct ib_device *ibdev, u32 port,
                            u32 *resp_len, u32 max_len, int local_mad)
 {
        int ret;
@@ -4254,7 +4254,7 @@ static inline void set_aggr_error(struct opa_aggregate *ag)
 }
 
 static int subn_get_opa_aggregate(struct opa_smp *smp,
-                                 struct ib_device *ibdev, u8 port,
+                                 struct ib_device *ibdev, u32 port,
                                  u32 *resp_len)
 {
        int i;
@@ -4303,7 +4303,7 @@ static int subn_get_opa_aggregate(struct opa_smp *smp,
 }
 
 static int subn_set_opa_aggregate(struct opa_smp *smp,
-                                 struct ib_device *ibdev, u8 port,
+                                 struct ib_device *ibdev, u32 port,
                                  u32 *resp_len, int local_mad)
 {
        int i;
@@ -4509,7 +4509,7 @@ static int hfi1_pkey_validation_pma(struct hfi1_ibport *ibp,
 }
 
 static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
-                           u8 port, const struct opa_mad *in_mad,
+                           u32 port, const struct opa_mad *in_mad,
                            struct opa_mad *out_mad,
                            u32 *resp_len, int local_mad)
 {
@@ -4614,7 +4614,7 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
 }
 
 static int process_subn(struct ib_device *ibdev, int mad_flags,
-                       u8 port, const struct ib_mad *in_mad,
+                       u32 port, const struct ib_mad *in_mad,
                        struct ib_mad *out_mad)
 {
        struct ib_smp *smp = (struct ib_smp *)out_mad;
@@ -4672,7 +4672,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
        return ret;
 }
 
-static int process_perf(struct ib_device *ibdev, u8 port,
+static int process_perf(struct ib_device *ibdev, u32 port,
                        const struct ib_mad *in_mad,
                        struct ib_mad *out_mad)
 {
@@ -4734,7 +4734,7 @@ static int process_perf(struct ib_device *ibdev, u8 port,
        return ret;
 }
 
-static int process_perf_opa(struct ib_device *ibdev, u8 port,
+static int process_perf_opa(struct ib_device *ibdev, u32 port,
                            const struct opa_mad *in_mad,
                            struct opa_mad *out_mad, u32 *resp_len)
 {
@@ -4816,7 +4816,7 @@ static int process_perf_opa(struct ib_device *ibdev, u8 port,
 }
 
 static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
-                               u8 port, const struct ib_wc *in_wc,
+                               u32 port, const struct ib_wc *in_wc,
                                const struct ib_grh *in_grh,
                                const struct opa_mad *in_mad,
                                struct opa_mad *out_mad, size_t *out_mad_size,
@@ -4869,7 +4869,7 @@ bail:
        return ret;
 }
 
-static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u32 port,
                               const struct ib_wc *in_wc,
                               const struct ib_grh *in_grh,
                               const struct ib_mad *in_mad,
@@ -4914,7 +4914,7 @@ static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
  *
  * This is called by the ib_mad module.
  */
-int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u32 port,
                     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                     const struct ib_mad *in_mad, struct ib_mad *out_mad,
                     size_t *out_mad_size, u16 *out_mad_pkey_index)
index 889e63d..0205d30 100644 (file)
@@ -436,7 +436,7 @@ struct sc2vlnt {
                    COUNTER_MASK(1, 3) | \
                    COUNTER_MASK(1, 4))
 
-void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port);
+void hfi1_event_pkey_change(struct hfi1_devdata *dd, u32 port);
 void hfi1_handle_trap_timer(struct timer_list *t);
 u16 tx_link_width(u16 link_width);
 u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, u16 link_width,
index f3fb28e..d213f65 100644 (file)
@@ -89,7 +89,7 @@ int hfi1_mmu_rb_register(void *ops_arg,
        struct mmu_rb_handler *h;
        int ret;
 
-       h = kmalloc(sizeof(*h), GFP_KERNEL);
+       h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
                return -ENOMEM;
 
index cf3040b..57a5f02 100644 (file)
@@ -206,7 +206,7 @@ int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd)
 }
 
 /**
- * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs
+ * msix_netdev_request_rcd_irq  - Helper function for RCVAVAIL IRQs
  * for netdev context
  * @rcd: valid netdev contexti
  */
@@ -221,7 +221,7 @@ int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd)
 }
 
 /**
- * msix_request_smda_ira() - Helper for getting SDMA IRQ resources
+ * msix_request_sdma_irq  - Helper for getting SDMA IRQ resources
  * @sde: valid sdma engine
  *
  */
@@ -243,7 +243,7 @@ int msix_request_sdma_irq(struct sdma_engine *sde)
 }
 
 /**
- * msix_request_general_irq(void) - Helper for getting general IRQ
+ * msix_request_general_irq - Helper for getting general IRQ
  * resources
  * @dd: valid device data
  */
@@ -269,7 +269,7 @@ int msix_request_general_irq(struct hfi1_devdata *dd)
 }
 
 /**
- * enable_sdma_src() - Helper to enable SDMA IRQ srcs
+ * enable_sdma_srcs - Helper to enable SDMA IRQ srcs
  * @dd: valid devdata structure
  * @i: index of SDMA engine
  */
@@ -349,7 +349,7 @@ void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr)
 }
 
 /**
- * hfi1_clean_up_msix_interrupts() - Free all MSIx IRQ resources
+ * msix_clean_up_interrupts  - Free all MSIx IRQ resources
  * @dd: valid device data data structure
  *
  * Free the MSIx and associated PCI resources, if they have been allocated.
@@ -372,7 +372,7 @@ void msix_clean_up_interrupts(struct hfi1_devdata *dd)
 }
 
 /**
- * msix_netdev_syncrhonize_irq() - netdev IRQ synchronize
+ * msix_netdev_synchronize_irq - netdev IRQ synchronize
  * @dd: valid devdata
  */
 void msix_netdev_synchronize_irq(struct hfi1_devdata *dd)
index 947543a..8aa0746 100644 (file)
 
 /**
  * struct hfi1_netdev_rxq - Receive Queue for HFI
- * dummy netdev. Both IPoIB and VNIC netdevices will be working on
- * top of this device.
+ * Both IPoIB and VNIC netdevices will be working on the rx abstraction.
  * @napi: napi object
- * @priv: ptr to netdev_priv
+ * @rx: ptr to netdev_rx
  * @rcd:  ptr to receive context data
  */
 struct hfi1_netdev_rxq {
        struct napi_struct napi;
-       struct hfi1_netdev_priv *priv;
+       struct hfi1_netdev_rx *rx;
        struct hfi1_ctxtdata *rcd;
 };
 
@@ -36,7 +35,8 @@ struct hfi1_netdev_rxq {
 #define NUM_NETDEV_MAP_ENTRIES HFI1_MAX_NETDEV_CTXTS
 
 /**
- * struct hfi1_netdev_priv: data required to setup and run HFI netdev.
+ * struct hfi1_netdev_rx: data required to setup and run HFI netdev.
+ * @rx_napi:   the dummy netdevice to support "polling" the receive contexts
  * @dd:                hfi1_devdata
  * @rxq:       pointer to dummy netdev receive queues.
  * @num_rx_q:  number of receive queues
@@ -48,7 +48,8 @@ struct hfi1_netdev_rxq {
  * @netdevs:   atomic counter of netdevs using dummy netdev.
  *             When 0 receive queues will be freed.
  */
-struct hfi1_netdev_priv {
+struct hfi1_netdev_rx {
+       struct net_device rx_napi;
        struct hfi1_devdata *dd;
        struct hfi1_netdev_rxq *rxq;
        int num_rx_q;
@@ -60,42 +61,28 @@ struct hfi1_netdev_priv {
        atomic_t netdevs;
 };
 
-static inline
-struct hfi1_netdev_priv *hfi1_netdev_priv(struct net_device *dev)
-{
-       return (struct hfi1_netdev_priv *)&dev[1];
-}
-
 static inline
 int hfi1_netdev_ctxt_count(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
-
-       return priv->num_rx_q;
+       return dd->netdev_rx->num_rx_q;
 }
 
 static inline
 struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
-
-       return priv->rxq[ctxt].rcd;
+       return dd->netdev_rx->rxq[ctxt].rcd;
 }
 
 static inline
 int hfi1_netdev_get_free_rmt_idx(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
-
-       return priv->rmt_start;
+       return dd->netdev_rx->rmt_start;
 }
 
 static inline
 void hfi1_netdev_set_free_rmt_idx(struct hfi1_devdata *dd, int rmt_idx)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
-
-       priv->rmt_start = rmt_idx;
+       dd->netdev_rx->rmt_start = rmt_idx;
 }
 
 u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts,
@@ -105,8 +92,8 @@ void hfi1_netdev_enable_queues(struct hfi1_devdata *dd);
 void hfi1_netdev_disable_queues(struct hfi1_devdata *dd);
 int hfi1_netdev_rx_init(struct hfi1_devdata *dd);
 int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd);
-int hfi1_netdev_alloc(struct hfi1_devdata *dd);
-void hfi1_netdev_free(struct hfi1_devdata *dd);
+int hfi1_alloc_rx(struct hfi1_devdata *dd);
+void hfi1_free_rx(struct hfi1_devdata *dd);
 int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data);
 void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id);
 void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id);
index 1bcab99..03b098a 100644 (file)
 #include <linux/etherdevice.h>
 #include <rdma/ib_verbs.h>
 
-static int hfi1_netdev_setup_ctxt(struct hfi1_netdev_priv *priv,
+static int hfi1_netdev_setup_ctxt(struct hfi1_netdev_rx *rx,
                                  struct hfi1_ctxtdata *uctxt)
 {
        unsigned int rcvctrl_ops;
-       struct hfi1_devdata *dd = priv->dd;
+       struct hfi1_devdata *dd = rx->dd;
        int ret;
 
        uctxt->rhf_rcv_function_map = netdev_rhf_rcv_functions;
@@ -118,11 +118,11 @@ static void hfi1_netdev_deallocate_ctxt(struct hfi1_devdata *dd,
        hfi1_free_ctxt(uctxt);
 }
 
-static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_priv *priv,
+static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_rx *rx,
                                  struct hfi1_ctxtdata **ctxt)
 {
        int rc;
-       struct hfi1_devdata *dd = priv->dd;
+       struct hfi1_devdata *dd = rx->dd;
 
        rc = hfi1_netdev_allocate_ctxt(dd, ctxt);
        if (rc) {
@@ -130,7 +130,7 @@ static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_priv *priv,
                return rc;
        }
 
-       rc = hfi1_netdev_setup_ctxt(priv, *ctxt);
+       rc = hfi1_netdev_setup_ctxt(rx, *ctxt);
        if (rc) {
                dd_dev_err(dd, "netdev ctxt setup failed %d\n", rc);
                hfi1_netdev_deallocate_ctxt(dd, *ctxt);
@@ -183,31 +183,31 @@ u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts,
                    (u32)HFI1_MAX_NETDEV_CTXTS);
 }
 
-static int hfi1_netdev_rxq_init(struct net_device *dev)
+static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx)
 {
        int i;
        int rc;
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dev);
-       struct hfi1_devdata *dd = priv->dd;
+       struct hfi1_devdata *dd = rx->dd;
+       struct net_device *dev = &rx->rx_napi;
 
-       priv->num_rx_q = dd->num_netdev_contexts;
-       priv->rxq = kcalloc_node(priv->num_rx_q, sizeof(struct hfi1_netdev_rxq),
-                                GFP_KERNEL, dd->node);
+       rx->num_rx_q = dd->num_netdev_contexts;
+       rx->rxq = kcalloc_node(rx->num_rx_q, sizeof(*rx->rxq),
+                              GFP_KERNEL, dd->node);
 
-       if (!priv->rxq) {
+       if (!rx->rxq) {
                dd_dev_err(dd, "Unable to allocate netdev queue data\n");
                return (-ENOMEM);
        }
 
-       for (i = 0; i < priv->num_rx_q; i++) {
-               struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+       for (i = 0; i < rx->num_rx_q; i++) {
+               struct hfi1_netdev_rxq *rxq = &rx->rxq[i];
 
-               rc = hfi1_netdev_allot_ctxt(priv, &rxq->rcd);
+               rc = hfi1_netdev_allot_ctxt(rx, &rxq->rcd);
                if (rc)
                        goto bail_context_irq_failure;
 
                hfi1_rcd_get(rxq->rcd);
-               rxq->priv = priv;
+               rxq->rx = rx;
                rxq->rcd->napi = &rxq->napi;
                dd_dev_info(dd, "Setting rcv queue %d napi to context %d\n",
                            i, rxq->rcd->ctxt);
@@ -227,7 +227,7 @@ static int hfi1_netdev_rxq_init(struct net_device *dev)
 bail_context_irq_failure:
        dd_dev_err(dd, "Unable to allot receive context\n");
        for (; i >= 0; i--) {
-               struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+               struct hfi1_netdev_rxq *rxq = &rx->rxq[i];
 
                if (rxq->rcd) {
                        hfi1_netdev_deallocate_ctxt(dd, rxq->rcd);
@@ -235,20 +235,19 @@ bail_context_irq_failure:
                        rxq->rcd = NULL;
                }
        }
-       kfree(priv->rxq);
-       priv->rxq = NULL;
+       kfree(rx->rxq);
+       rx->rxq = NULL;
 
        return rc;
 }
 
-static void hfi1_netdev_rxq_deinit(struct net_device *dev)
+static void hfi1_netdev_rxq_deinit(struct hfi1_netdev_rx *rx)
 {
        int i;
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dev);
-       struct hfi1_devdata *dd = priv->dd;
+       struct hfi1_devdata *dd = rx->dd;
 
-       for (i = 0; i < priv->num_rx_q; i++) {
-               struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+       for (i = 0; i < rx->num_rx_q; i++) {
+               struct hfi1_netdev_rxq *rxq = &rx->rxq[i];
 
                netif_napi_del(&rxq->napi);
                hfi1_netdev_deallocate_ctxt(dd, rxq->rcd);
@@ -256,41 +255,41 @@ static void hfi1_netdev_rxq_deinit(struct net_device *dev)
                rxq->rcd = NULL;
        }
 
-       kfree(priv->rxq);
-       priv->rxq = NULL;
-       priv->num_rx_q = 0;
+       kfree(rx->rxq);
+       rx->rxq = NULL;
+       rx->num_rx_q = 0;
 }
 
-static void enable_queues(struct hfi1_netdev_priv *priv)
+static void enable_queues(struct hfi1_netdev_rx *rx)
 {
        int i;
 
-       for (i = 0; i < priv->num_rx_q; i++) {
-               struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+       for (i = 0; i < rx->num_rx_q; i++) {
+               struct hfi1_netdev_rxq *rxq = &rx->rxq[i];
 
-               dd_dev_info(priv->dd, "enabling queue %d on context %d\n", i,
+               dd_dev_info(rx->dd, "enabling queue %d on context %d\n", i,
                            rxq->rcd->ctxt);
                napi_enable(&rxq->napi);
-               hfi1_rcvctrl(priv->dd,
+               hfi1_rcvctrl(rx->dd,
                             HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB,
                             rxq->rcd);
        }
 }
 
-static void disable_queues(struct hfi1_netdev_priv *priv)
+static void disable_queues(struct hfi1_netdev_rx *rx)
 {
        int i;
 
-       msix_netdev_synchronize_irq(priv->dd);
+       msix_netdev_synchronize_irq(rx->dd);
 
-       for (i = 0; i < priv->num_rx_q; i++) {
-               struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+       for (i = 0; i < rx->num_rx_q; i++) {
+               struct hfi1_netdev_rxq *rxq = &rx->rxq[i];
 
-               dd_dev_info(priv->dd, "disabling queue %d on context %d\n", i,
+               dd_dev_info(rx->dd, "disabling queue %d on context %d\n", i,
                            rxq->rcd->ctxt);
 
                /* wait for napi if it was scheduled */
-               hfi1_rcvctrl(priv->dd,
+               hfi1_rcvctrl(rx->dd,
                             HFI1_RCVCTRL_CTXT_DIS | HFI1_RCVCTRL_INTRAVAIL_DIS,
                             rxq->rcd);
                napi_synchronize(&rxq->napi);
@@ -307,15 +306,14 @@ static void disable_queues(struct hfi1_netdev_priv *priv)
  */
 int hfi1_netdev_rx_init(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+       struct hfi1_netdev_rx *rx = dd->netdev_rx;
        int res;
 
-       if (atomic_fetch_inc(&priv->netdevs))
+       if (atomic_fetch_inc(&rx->netdevs))
                return 0;
 
        mutex_lock(&hfi1_mutex);
-       init_dummy_netdev(dd->dummy_netdev);
-       res = hfi1_netdev_rxq_init(dd->dummy_netdev);
+       res = hfi1_netdev_rxq_init(rx);
        mutex_unlock(&hfi1_mutex);
        return res;
 }
@@ -328,12 +326,12 @@ int hfi1_netdev_rx_init(struct hfi1_devdata *dd)
  */
 int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+       struct hfi1_netdev_rx *rx = dd->netdev_rx;
 
        /* destroy the RX queues only if it is the last netdev going away */
-       if (atomic_fetch_add_unless(&priv->netdevs, -1, 0) == 1) {
+       if (atomic_fetch_add_unless(&rx->netdevs, -1, 0) == 1) {
                mutex_lock(&hfi1_mutex);
-               hfi1_netdev_rxq_deinit(dd->dummy_netdev);
+               hfi1_netdev_rxq_deinit(rx);
                mutex_unlock(&hfi1_mutex);
        }
 
@@ -341,39 +339,43 @@ int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd)
 }
 
 /**
- * hfi1_netdev_alloc - Allocates netdev and private data. It is required
- * because RMT index and MSI-X interrupt can be set only
- * during driver initialization.
- *
+ * hfi1_alloc_rx - Allocates the rx support structure
  * @dd: hfi1 dev data
+ *
+ * Allocate the rx structure to support gathering the receive
+ * resources and the dummy netdev.
+ *
+ * Updates dd struct pointer upon success.
+ *
+ * Return: 0 (success) -error on failure
+ *
  */
-int hfi1_netdev_alloc(struct hfi1_devdata *dd)
+int hfi1_alloc_rx(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv;
-       const int netdev_size = sizeof(*dd->dummy_netdev) +
-               sizeof(struct hfi1_netdev_priv);
+       struct hfi1_netdev_rx *rx;
 
-       dd_dev_info(dd, "allocating netdev size %d\n", netdev_size);
-       dd->dummy_netdev = kcalloc_node(1, netdev_size, GFP_KERNEL, dd->node);
+       dd_dev_info(dd, "allocating rx size %ld\n", sizeof(*rx));
+       rx = kzalloc_node(sizeof(*rx), GFP_KERNEL, dd->node);
 
-       if (!dd->dummy_netdev)
+       if (!rx)
                return -ENOMEM;
+       rx->dd = dd;
+       init_dummy_netdev(&rx->rx_napi);
 
-       priv = hfi1_netdev_priv(dd->dummy_netdev);
-       priv->dd = dd;
-       xa_init(&priv->dev_tbl);
-       atomic_set(&priv->enabled, 0);
-       atomic_set(&priv->netdevs, 0);
+       xa_init(&rx->dev_tbl);
+       atomic_set(&rx->enabled, 0);
+       atomic_set(&rx->netdevs, 0);
+       dd->netdev_rx = rx;
 
        return 0;
 }
 
-void hfi1_netdev_free(struct hfi1_devdata *dd)
+void hfi1_free_rx(struct hfi1_devdata *dd)
 {
-       if (dd->dummy_netdev) {
-               dd_dev_info(dd, "hfi1 netdev freed\n");
-               kfree(dd->dummy_netdev);
-               dd->dummy_netdev = NULL;
+       if (dd->netdev_rx) {
+               dd_dev_info(dd, "hfi1 rx freed\n");
+               kfree(dd->netdev_rx);
+               dd->netdev_rx = NULL;
        }
 }
 
@@ -388,33 +390,33 @@ void hfi1_netdev_free(struct hfi1_devdata *dd)
  */
 void hfi1_netdev_enable_queues(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv;
+       struct hfi1_netdev_rx *rx;
 
-       if (!dd->dummy_netdev)
+       if (!dd->netdev_rx)
                return;
 
-       priv = hfi1_netdev_priv(dd->dummy_netdev);
-       if (atomic_fetch_inc(&priv->enabled))
+       rx = dd->netdev_rx;
+       if (atomic_fetch_inc(&rx->enabled))
                return;
 
        mutex_lock(&hfi1_mutex);
-       enable_queues(priv);
+       enable_queues(rx);
        mutex_unlock(&hfi1_mutex);
 }
 
 void hfi1_netdev_disable_queues(struct hfi1_devdata *dd)
 {
-       struct hfi1_netdev_priv *priv;
+       struct hfi1_netdev_rx *rx;
 
-       if (!dd->dummy_netdev)
+       if (!dd->netdev_rx)
                return;
 
-       priv = hfi1_netdev_priv(dd->dummy_netdev);
-       if (atomic_dec_if_positive(&priv->enabled))
+       rx = dd->netdev_rx;
+       if (atomic_dec_if_positive(&rx->enabled))
                return;
 
        mutex_lock(&hfi1_mutex);
-       disable_queues(priv);
+       disable_queues(rx);
        mutex_unlock(&hfi1_mutex);
 }
 
@@ -430,9 +432,9 @@ void hfi1_netdev_disable_queues(struct hfi1_devdata *dd)
  */
 int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+       struct hfi1_netdev_rx *rx = dd->netdev_rx;
 
-       return xa_insert(&priv->dev_tbl, id, data, GFP_NOWAIT);
+       return xa_insert(&rx->dev_tbl, id, data, GFP_NOWAIT);
 }
 
 /**
@@ -444,9 +446,9 @@ int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data)
  */
 void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+       struct hfi1_netdev_rx *rx = dd->netdev_rx;
 
-       return xa_erase(&priv->dev_tbl, id);
+       return xa_erase(&rx->dev_tbl, id);
 }
 
 /**
@@ -457,24 +459,24 @@ void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id)
  */
 void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+       struct hfi1_netdev_rx *rx = dd->netdev_rx;
 
-       return xa_load(&priv->dev_tbl, id);
+       return xa_load(&rx->dev_tbl, id);
 }
 
 /**
- * hfi1_netdev_get_first_dat - Gets first entry with greater or equal id.
+ * hfi1_netdev_get_first_data - Gets first entry with greater or equal id.
  *
  * @dd: hfi1 dev data
  * @start_id: requested integer id up to INT_MAX
  */
 void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id)
 {
-       struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+       struct hfi1_netdev_rx *rx = dd->netdev_rx;
        unsigned long index = *start_id;
        void *ret;
 
-       ret = xa_find(&priv->dev_tbl, &index, UINT_MAX, XA_PRESENT);
+       ret = xa_find(&rx->dev_tbl, &index, UINT_MAX, XA_PRESENT);
        *start_id = (int)index;
        return ret;
 }
index 46b5290..1fcc6e9 100644 (file)
@@ -1285,7 +1285,7 @@ bail:
 }
 
 /**
- * sdma_clean()  Clean up allocated memory
+ * sdma_clean - Clean up allocated memory
  * @dd:          struct hfi1_devdata
  * @num_engines: num sdma engines
  *
index 7a85119..f57d552 100644 (file)
@@ -907,24 +907,6 @@ static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
        return 0;
 }
 
-/**
- * sdma_iowait_schedule() - initialize wait structure
- * @sde: sdma_engine to schedule
- * @wait: wait struct to schedule
- *
- * This function initializes the iowait
- * structure embedded in the QP or PQ.
- *
- */
-static inline void sdma_iowait_schedule(
-       struct sdma_engine *sde,
-       struct iowait *wait)
-{
-       struct hfi1_pportdata *ppd = sde->dd->pport;
-
-       iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
-}
-
 /* for use by interrupt handling */
 void sdma_engine_error(struct sdma_engine *sde, u64 status);
 void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
index 5650130..eaf441e 100644 (file)
@@ -649,7 +649,7 @@ const struct attribute_group ib_hfi1_attr_group = {
        .attrs = hfi1_attributes,
 };
 
-int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+int hfi1_create_port_files(struct ib_device *ibdev, u32 port_num,
                           struct kobject *kobj)
 {
        struct hfi1_pportdata *ppd;
index 769e5e4..d44fc54 100644 (file)
@@ -53,6 +53,8 @@
 #include "hfi.h"
 #include "mad.h"
 #include "sdma.h"
+#include "ipoib.h"
+#include "user_sdma.h"
 
 const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
 
@@ -653,6 +655,80 @@ TRACE_EVENT(hfi1_sdma_user_completion,
                      __entry->code)
 );
 
+TRACE_EVENT(hfi1_usdma_defer,
+           TP_PROTO(struct hfi1_user_sdma_pkt_q *pq,
+                    struct sdma_engine *sde,
+                    struct iowait *wait),
+           TP_ARGS(pq, sde, wait),
+           TP_STRUCT__entry(DD_DEV_ENTRY(pq->dd)
+                            __field(struct hfi1_user_sdma_pkt_q *, pq)
+                            __field(struct sdma_engine *, sde)
+                            __field(struct iowait *, wait)
+                            __field(int, engine)
+                            __field(int, empty)
+                            ),
+            TP_fast_assign(DD_DEV_ASSIGN(pq->dd);
+                           __entry->pq = pq;
+                           __entry->sde = sde;
+                           __entry->wait = wait;
+                           __entry->engine = sde->this_idx;
+                           __entry->empty = list_empty(&__entry->wait->list);
+                           ),
+            TP_printk("[%s] pq %llx sde %llx wait %llx engine %d empty %d",
+                      __get_str(dev),
+                      (unsigned long long)__entry->pq,
+                      (unsigned long long)__entry->sde,
+                      (unsigned long long)__entry->wait,
+                      __entry->engine,
+                      __entry->empty
+               )
+);
+
+TRACE_EVENT(hfi1_usdma_activate,
+           TP_PROTO(struct hfi1_user_sdma_pkt_q *pq,
+                    struct iowait *wait,
+                    int reason),
+           TP_ARGS(pq, wait, reason),
+           TP_STRUCT__entry(DD_DEV_ENTRY(pq->dd)
+                            __field(struct hfi1_user_sdma_pkt_q *, pq)
+                            __field(struct iowait *, wait)
+                            __field(int, reason)
+                            ),
+            TP_fast_assign(DD_DEV_ASSIGN(pq->dd);
+                           __entry->pq = pq;
+                           __entry->wait = wait;
+                           __entry->reason = reason;
+                           ),
+            TP_printk("[%s] pq %llx wait %llx reason %d",
+                      __get_str(dev),
+                      (unsigned long long)__entry->pq,
+                      (unsigned long long)__entry->wait,
+                      __entry->reason
+               )
+);
+
+TRACE_EVENT(hfi1_usdma_we,
+           TP_PROTO(struct hfi1_user_sdma_pkt_q *pq,
+                    int we_ret),
+           TP_ARGS(pq, we_ret),
+           TP_STRUCT__entry(DD_DEV_ENTRY(pq->dd)
+                            __field(struct hfi1_user_sdma_pkt_q *, pq)
+                            __field(int, state)
+                            __field(int, we_ret)
+                            ),
+            TP_fast_assign(DD_DEV_ASSIGN(pq->dd);
+                           __entry->pq = pq;
+                           __entry->state = pq->state;
+                           __entry->we_ret = we_ret;
+                           ),
+            TP_printk("[%s] pq %llx state %d we_ret %d",
+                      __get_str(dev),
+                      (unsigned long long)__entry->pq,
+                      __entry->state,
+                      __entry->we_ret
+               )
+);
+
 const char *print_u32_array(struct trace_seq *, u32 *, int);
 #define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
 
@@ -858,6 +934,109 @@ DEFINE_EVENT(
        TP_ARGS(qp, flag)
 );
 
+DECLARE_EVENT_CLASS(/* AIP  */
+       hfi1_ipoib_txq_template,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq),
+       TP_STRUCT__entry(/* entry */
+               DD_DEV_ENTRY(txq->priv->dd)
+               __field(struct hfi1_ipoib_txq *, txq)
+               __field(struct sdma_engine *, sde)
+               __field(ulong, head)
+               __field(ulong, tail)
+               __field(uint, used)
+               __field(uint, flow)
+               __field(int, stops)
+               __field(int, no_desc)
+               __field(u8, idx)
+               __field(u8, stopped)
+       ),
+       TP_fast_assign(/* assign */
+               DD_DEV_ASSIGN(txq->priv->dd)
+               __entry->txq = txq;
+               __entry->sde = txq->sde;
+               __entry->head = txq->tx_ring.head;
+               __entry->tail = txq->tx_ring.tail;
+               __entry->idx = txq->q_idx;
+               __entry->used =
+                       txq->sent_txreqs -
+                       atomic64_read(&txq->complete_txreqs);
+               __entry->flow = txq->flow.as_int;
+               __entry->stops = atomic_read(&txq->stops);
+               __entry->no_desc = atomic_read(&txq->no_desc);
+               __entry->stopped =
+                __netif_subqueue_stopped(txq->priv->netdev, txq->q_idx);
+       ),
+       TP_printk(/* print  */
+               "[%s] txq %llx idx %u sde %llx head %lx tail %lx flow %x used %u stops %d no_desc %d stopped %u",
+               __get_str(dev),
+               (unsigned long long)__entry->txq,
+               __entry->idx,
+               (unsigned long long)__entry->sde,
+               __entry->head,
+               __entry->tail,
+               __entry->flow,
+               __entry->used,
+               __entry->stops,
+               __entry->no_desc,
+               __entry->stopped
+       )
+);
+
+DEFINE_EVENT(/* queue stop */
+       hfi1_ipoib_txq_template, hfi1_txq_stop,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* queue wake */
+       hfi1_ipoib_txq_template, hfi1_txq_wake,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* flow flush */
+       hfi1_ipoib_txq_template, hfi1_flow_flush,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* flow switch */
+       hfi1_ipoib_txq_template, hfi1_flow_switch,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* wakeup */
+       hfi1_ipoib_txq_template, hfi1_txq_wakeup,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* full */
+       hfi1_ipoib_txq_template, hfi1_txq_full,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* queued */
+       hfi1_ipoib_txq_template, hfi1_txq_queued,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* xmit_stopped */
+       hfi1_ipoib_txq_template, hfi1_txq_xmit_stopped,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
+DEFINE_EVENT(/* xmit_unstopped */
+       hfi1_ipoib_txq_template, hfi1_txq_xmit_unstopped,
+       TP_PROTO(struct hfi1_ipoib_txq *txq),
+       TP_ARGS(txq)
+);
+
 #endif /* __HFI1_TRACE_TX_H */
 
 #undef TRACE_INCLUDE_PATH
index 4a4956f..da5b2e3 100644 (file)
@@ -133,6 +133,7 @@ static int defer_packet_queue(
                container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
 
        write_seqlock(&sde->waitlock);
+       trace_hfi1_usdma_defer(pq, sde, &pq->busy);
        if (sdma_progress(sde, seq, txreq))
                goto eagain;
        /*
@@ -157,7 +158,8 @@ static void activate_packet_queue(struct iowait *wait, int reason)
 {
        struct hfi1_user_sdma_pkt_q *pq =
                container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
-       pq->busy.lock = NULL;
+
+       trace_hfi1_usdma_activate(pq, wait, reason);
        xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
        wake_up(&wait->wait_dma);
 };
@@ -599,13 +601,17 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
        while (req->seqsubmitted != req->info.npkts) {
                ret = user_sdma_send_pkts(req, pcount);
                if (ret < 0) {
+                       int we_ret;
+
                        if (ret != -EBUSY)
                                goto free_req;
-                       if (wait_event_interruptible_timeout(
+                       we_ret = wait_event_interruptible_timeout(
                                pq->busy.wait_dma,
                                pq->state == SDMA_PKT_Q_ACTIVE,
                                msecs_to_jiffies(
-                                       SDMA_IOWAIT_TIMEOUT)) <= 0)
+                                       SDMA_IOWAIT_TIMEOUT));
+                       trace_hfi1_usdma_we(pq, we_ret);
+                       if (we_ret <= 0)
                                flush_pq_iowait(pq);
                }
        }
index 1e8c02f..fabe581 100644 (file)
@@ -53,6 +53,7 @@
 #include "common.h"
 #include "iowait.h"
 #include "user_exp_rcv.h"
+#include "mmu_rb.h"
 
 /* The maximum number of Data io vectors per message/request */
 #define MAX_VECTORS_PER_REQ 8
index 0dd4bb0..5542943 100644 (file)
@@ -1407,7 +1407,7 @@ static inline u16 opa_width_to_ib(u16 in)
        }
 }
 
-static int query_port(struct rvt_dev_info *rdi, u8 port_num,
+static int query_port(struct rvt_dev_info *rdi, u32 port_num,
                      struct ib_port_attr *props)
 {
        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
@@ -1485,7 +1485,7 @@ bail:
        return ret;
 }
 
-static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
+static int shut_down_port(struct rvt_dev_info *rdi, u32 port_num)
 {
        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
@@ -1694,7 +1694,7 @@ static int init_cntr_names(const char *names_in,
 }
 
 static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev,
-                                           u8 port_num)
+                                           u32 port_num)
 {
        int i, err;
 
@@ -1758,7 +1758,7 @@ static u64 hfi1_sps_ints(void)
 }
 
 static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
-                       u8 port, int index)
+                       u32 port, int index)
 {
        u64 *values;
        int count;
index d36e3e1..420df17 100644 (file)
@@ -325,10 +325,10 @@ static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
  */
 void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl,
                   u32 qp1, u32 qp2, u32 lid1, u32 lid2);
-void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u32 port_num);
 void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
 void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
-int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u32 port,
                     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                     const struct ib_mad *in_mad, struct ib_mad *out_mad,
                     size_t *out_mad_size, u16 *out_mad_pkey_index);
index d2d526c..4bdfc79 100644 (file)
@@ -99,11 +99,6 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
        return tx;
 }
 
-static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
-{
-       return &tx->txreq;
-}
-
 static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w)
 {
        struct sdma_txreq *stx;
index 66150a1..a7a450e 100644 (file)
@@ -156,7 +156,7 @@ bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo,
 
 /* vnic rdma netdev operations */
 struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device,
-                                     u8 port_num,
+                                     u32 port_num,
                                      enum rdma_netdev_t type,
                                      const char *name,
                                      unsigned char name_assign_type,
index a90824d..7e79c05 100644 (file)
@@ -593,7 +593,7 @@ static void hfi1_vnic_free_rn(struct net_device *netdev)
 }
 
 struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device,
-                                     u8 port_num,
+                                     u32 port_num,
                                      enum rdma_netdev_t type,
                                      const char *name,
                                      unsigned char name_assign_type,
index 4bcaaa0..5d389ed 100644 (file)
@@ -304,6 +304,9 @@ done:
 
 void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev)
 {
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC)
+               hns_roce_cleanup_xrcd_table(hr_dev);
+
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
                hns_roce_cleanup_srq_table(hr_dev);
        hns_roce_cleanup_qp_table(hr_dev);
index 339e3fd..8f68cc3 100644 (file)
 
 #define CMD_POLL_TOKEN 0xffff
 #define CMD_MAX_NUM 32
-#define CMD_TOKEN_MASK 0x1f
 
 static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param,
                                     u64 out_param, u32 in_modifier,
                                     u8 op_modifier, u16 op, u16 token,
                                     int event)
 {
-       struct hns_roce_cmdq *cmd = &hr_dev->cmd;
-       int ret;
-
-       mutex_lock(&cmd->hcr_mutex);
-       ret = hr_dev->hw->post_mbox(hr_dev, in_param, out_param, in_modifier,
-                                   op_modifier, op, token, event);
-       mutex_unlock(&cmd->hcr_mutex);
-
-       return ret;
+       return hr_dev->hw->post_mbox(hr_dev, in_param, out_param, in_modifier,
+                                    op_modifier, op, token, event);
 }
 
 /* this should be called with "poll_sem" */
@@ -62,18 +54,19 @@ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param,
                                    u8 op_modifier, u16 op,
                                    unsigned int timeout)
 {
-       struct device *dev = hr_dev->dev;
        int ret;
 
        ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param,
                                        in_modifier, op_modifier, op,
                                        CMD_POLL_TOKEN, 0);
        if (ret) {
-               dev_err(dev, "[cmd_poll]hns_roce_cmd_mbox_post_hw failed\n");
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to post mailbox %x in poll mode, ret = %d.\n",
+                                   op, ret);
                return ret;
        }
 
-       return hr_dev->hw->chk_mbox(hr_dev, timeout);
+       return hr_dev->hw->poll_mbox_done(hr_dev, timeout);
 }
 
 static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param,
@@ -96,15 +89,18 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status,
        struct hns_roce_cmd_context *context =
                &hr_dev->cmd.context[token % hr_dev->cmd.max_cmds];
 
-       if (token != context->token)
+       if (unlikely(token != context->token)) {
+               dev_err_ratelimited(hr_dev->dev,
+                                   "[cmd] invalid ae token %x,context token is %x!\n",
+                                   token, context->token);
                return;
+       }
 
        context->result = (status == HNS_ROCE_CMD_SUCCESS) ? 0 : (-EIO);
        context->out_param = out_param;
        complete(&context->done);
 }
 
-/* this should be called with "use_events" */
 static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param,
                                    u64 out_param, unsigned long in_modifier,
                                    u8 op_modifier, u16 op,
@@ -116,44 +112,44 @@ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param,
        int ret;
 
        spin_lock(&cmd->context_lock);
-       WARN_ON(cmd->free_head < 0);
-       context = &cmd->context[cmd->free_head];
-       context->token += cmd->token_mask + 1;
-       cmd->free_head = context->next;
+
+       do {
+               context = &cmd->context[cmd->free_head];
+               cmd->free_head = context->next;
+       } while (context->busy);
+
+       context->busy = 1;
+       context->token += cmd->max_cmds;
+
        spin_unlock(&cmd->context_lock);
 
-       init_completion(&context->done);
+       reinit_completion(&context->done);
 
        ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param,
                                        in_modifier, op_modifier, op,
                                        context->token, 1);
-       if (ret)
+       if (ret) {
+               dev_err_ratelimited(dev,
+                                   "failed to post mailbox %x in event mode, ret = %d.\n",
+                                   op, ret);
                goto out;
+       }
 
-       /*
-        * It is timeout when wait_for_completion_timeout return 0
-        * The return value is the time limit set in advance
-        * how many seconds showing
-        */
        if (!wait_for_completion_timeout(&context->done,
                                         msecs_to_jiffies(timeout))) {
-               dev_err(dev, "[cmd]wait_for_completion_timeout timeout\n");
+               dev_err_ratelimited(dev, "[cmd] token %x mailbox %x timeout.\n",
+                                   context->token, op);
                ret = -EBUSY;
                goto out;
        }
 
        ret = context->result;
-       if (ret) {
-               dev_err(dev, "[cmd]event mod cmd process error!err=%d\n", ret);
-               goto out;
-       }
+       if (ret)
+               dev_err_ratelimited(dev, "[cmd] token %x mailbox %x error %d\n",
+                                   context->token, op, ret);
 
 out:
-       spin_lock(&cmd->context_lock);
-       context->next = cmd->free_head;
-       cmd->free_head = context - cmd->context;
-       spin_unlock(&cmd->context_lock);
-
+       context->busy = 0;
        return ret;
 }
 
@@ -175,44 +171,28 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
                      unsigned long in_modifier, u8 op_modifier, u16 op,
                      unsigned int timeout)
 {
-       int ret;
+       bool is_busy;
 
-       if (hr_dev->hw->rst_prc_mbox) {
-               ret = hr_dev->hw->rst_prc_mbox(hr_dev);
-               if (ret == CMD_RST_PRC_SUCCESS)
-                       return 0;
-               else if (ret == CMD_RST_PRC_EBUSY)
-                       return -EBUSY;
-       }
+       if (hr_dev->hw->chk_mbox_avail)
+               if (!hr_dev->hw->chk_mbox_avail(hr_dev, &is_busy))
+                       return is_busy ? -EBUSY : 0;
 
        if (hr_dev->cmd.use_events)
-               ret = hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
-                                            in_modifier, op_modifier, op,
-                                            timeout);
+               return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
+                                             in_modifier, op_modifier, op,
+                                             timeout);
        else
-               ret = hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
-                                            in_modifier, op_modifier, op,
-                                            timeout);
-
-       if (ret == CMD_RST_PRC_EBUSY)
-               return -EBUSY;
-
-       if (ret && (hr_dev->hw->rst_prc_mbox &&
-                   hr_dev->hw->rst_prc_mbox(hr_dev) == CMD_RST_PRC_SUCCESS))
-               return 0;
-
-       return ret;
+               return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
+                                             in_modifier, op_modifier, op,
+                                             timeout);
 }
 
 int hns_roce_cmd_init(struct hns_roce_dev *hr_dev)
 {
-       struct device *dev = hr_dev->dev;
-
-       mutex_init(&hr_dev->cmd.hcr_mutex);
        sema_init(&hr_dev->cmd.poll_sem, 1);
        hr_dev->cmd.use_events = 0;
        hr_dev->cmd.max_cmds = CMD_MAX_NUM;
-       hr_dev->cmd.pool = dma_pool_create("hns_roce_cmd", dev,
+       hr_dev->cmd.pool = dma_pool_create("hns_roce_cmd", hr_dev->dev,
                                           HNS_ROCE_MAILBOX_SIZE,
                                           HNS_ROCE_MAILBOX_SIZE, 0);
        if (!hr_dev->cmd.pool)
@@ -239,16 +219,16 @@ int hns_roce_cmd_use_events(struct hns_roce_dev *hr_dev)
        for (i = 0; i < hr_cmd->max_cmds; ++i) {
                hr_cmd->context[i].token = i;
                hr_cmd->context[i].next = i + 1;
+               init_completion(&hr_cmd->context[i].done);
        }
-
-       hr_cmd->context[hr_cmd->max_cmds - 1].next = -1;
+       hr_cmd->context[hr_cmd->max_cmds - 1].next = 0;
        hr_cmd->free_head = 0;
 
        sema_init(&hr_cmd->event_sem, hr_cmd->max_cmds);
        spin_lock_init(&hr_cmd->context_lock);
 
-       hr_cmd->token_mask = CMD_TOKEN_MASK;
        hr_cmd->use_events = 1;
+       down(&hr_cmd->poll_sem);
 
        return 0;
 }
@@ -259,6 +239,8 @@ void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev)
 
        kfree(hr_cmd->context);
        hr_cmd->use_events = 0;
+
+       up(&hr_cmd->poll_sem);
 }
 
 struct hns_roce_cmd_mailbox *
index 23c438c..d5fe56c 100644 (file)
@@ -48,7 +48,8 @@
 #define roce_set_field(origin, mask, shift, val)                               \
        do {                                                                   \
                (origin) &= ~cpu_to_le32(mask);                                \
-               (origin) |= cpu_to_le32(((u32)(val) << (u32)(shift)) & (mask));     \
+               (origin) |=                                                    \
+                       cpu_to_le32(((u32)(val) << (u32)(shift)) & (mask));    \
        } while (0)
 
 #define roce_set_bit(origin, shift, val)                                       \
@@ -59,9 +60,9 @@
 #define _hr_reg_enable(ptr, field_type, field_h, field_l)                      \
        ({                                                                     \
                const field_type *_ptr = ptr;                                  \
-               *((__le32 *)_ptr + (field_h) / 32) |=                          \
-                       cpu_to_le32(BIT((field_l) % 32)) +                     \
-                       BUILD_BUG_ON_ZERO((field_h) != (field_l))            \
+               *((__le32 *)_ptr + (field_h) / 32) |= cpu_to_le32(             \
+                       BIT((field_l) % 32) +                                  \
+                       BUILD_BUG_ON_ZERO((field_h) != (field_l)));            \
        })
 
 #define hr_reg_enable(ptr, field) _hr_reg_enable(ptr, field)
 #define _hr_reg_clear(ptr, field_type, field_h, field_l)                       \
        ({                                                                     \
                const field_type *_ptr = ptr;                                  \
+               BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
                *((__le32 *)_ptr + (field_h) / 32) &=                          \
-                       cpu_to_le32(                                           \
-                               ~GENMASK((field_h) % 32, (field_l) % 32)) +    \
-                       BUILD_BUG_ON_ZERO(((field_h) / 32) !=                  \
-                                         ((field_l) / 32));                   \
+                       ~cpu_to_le32(GENMASK((field_h) % 32, (field_l) % 32)); \
        })
 
 #define hr_reg_clear(ptr, field) _hr_reg_clear(ptr, field)
 
 #define hr_reg_write(ptr, field, val) _hr_reg_write(ptr, field, val)
 
+#define _hr_reg_read(ptr, field_type, field_h, field_l)                        \
+       ({                                                                     \
+               const field_type *_ptr = ptr;                                  \
+               BUILD_BUG_ON(((field_h) / 32) != ((field_l) / 32));            \
+               FIELD_GET(GENMASK((field_h) % 32, (field_l) % 32),             \
+                         le32_to_cpu(*((__le32 *)_ptr + (field_h) / 32)));    \
+       })
+
+#define hr_reg_read(ptr, field) _hr_reg_read(ptr, field)
+
 #define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3
 #define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4
 
index 74fc494..800884b 100644 (file)
@@ -225,7 +225,7 @@ static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
                       struct ib_udata *udata, unsigned long addr,
                       struct hns_roce_ib_create_cq_resp *resp)
 {
-       bool has_db = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB;
+       bool has_db = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB;
        struct hns_roce_ucontext *uctx;
        int err;
 
@@ -250,8 +250,8 @@ static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
                        *hr_cq->set_ci_db = 0;
                        hr_cq->flags |= HNS_ROCE_CQ_FLAG_RECORD_DB;
                }
-               hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
-                                DB_REG_OFFSET * hr_dev->priv_uar.index;
+               hr_cq->db_reg = hr_dev->reg_base + hr_dev->odb_offset +
+                               DB_REG_OFFSET * hr_dev->priv_uar.index;
        }
 
        return 0;
@@ -276,6 +276,57 @@ static void free_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
        }
 }
 
+static int verify_cq_create_attr(struct hns_roce_dev *hr_dev,
+                                const struct ib_cq_init_attr *attr)
+{
+       struct ib_device *ibdev = &hr_dev->ib_dev;
+
+       if (!attr->cqe || attr->cqe > hr_dev->caps.max_cqes) {
+               ibdev_err(ibdev, "failed to check CQ count %u, max = %u.\n",
+                         attr->cqe, hr_dev->caps.max_cqes);
+               return -EINVAL;
+       }
+
+       if (attr->comp_vector >= hr_dev->caps.num_comp_vectors) {
+               ibdev_err(ibdev, "failed to check CQ vector = %u, max = %d.\n",
+                         attr->comp_vector, hr_dev->caps.num_comp_vectors);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int get_cq_ucmd(struct hns_roce_cq *hr_cq, struct ib_udata *udata,
+                      struct hns_roce_ib_create_cq *ucmd)
+{
+       struct ib_device *ibdev = hr_cq->ib_cq.device;
+       int ret;
+
+       ret = ib_copy_from_udata(ucmd, udata, min(udata->inlen, sizeof(*ucmd)));
+       if (ret) {
+               ibdev_err(ibdev, "failed to copy CQ udata, ret = %d.\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void set_cq_param(struct hns_roce_cq *hr_cq, u32 cq_entries, int vector,
+                        struct hns_roce_ib_create_cq *ucmd)
+{
+       struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
+
+       cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
+       cq_entries = roundup_pow_of_two(cq_entries);
+       hr_cq->ib_cq.cqe = cq_entries - 1; /* used as cqe index */
+       hr_cq->cq_depth = cq_entries;
+       hr_cq->vector = vector;
+
+       spin_lock_init(&hr_cq->lock);
+       INIT_LIST_HEAD(&hr_cq->sq_list);
+       INIT_LIST_HEAD(&hr_cq->rq_list);
+}
+
 static void set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata,
                         struct hns_roce_ib_create_cq *ucmd)
 {
@@ -299,44 +350,23 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
        struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
        struct ib_device *ibdev = &hr_dev->ib_dev;
        struct hns_roce_ib_create_cq ucmd = {};
-       int vector = attr->comp_vector;
-       u32 cq_entries = attr->cqe;
        int ret;
 
        if (attr->flags)
                return -EOPNOTSUPP;
 
-       if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
-               ibdev_err(ibdev, "failed to check CQ count %u, max = %u.\n",
-                         cq_entries, hr_dev->caps.max_cqes);
-               return -EINVAL;
-       }
-
-       if (vector >= hr_dev->caps.num_comp_vectors) {
-               ibdev_err(ibdev, "failed to check CQ vector = %d, max = %d.\n",
-                         vector, hr_dev->caps.num_comp_vectors);
-               return -EINVAL;
-       }
-
-       cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
-       cq_entries = roundup_pow_of_two(cq_entries);
-       hr_cq->ib_cq.cqe = cq_entries - 1; /* used as cqe index */
-       hr_cq->cq_depth = cq_entries;
-       hr_cq->vector = vector;
-       spin_lock_init(&hr_cq->lock);
-       INIT_LIST_HEAD(&hr_cq->sq_list);
-       INIT_LIST_HEAD(&hr_cq->rq_list);
+       ret = verify_cq_create_attr(hr_dev, attr);
+       if (ret)
+               return ret;
 
        if (udata) {
-               ret = ib_copy_from_udata(&ucmd, udata,
-                                        min(udata->inlen, sizeof(ucmd)));
-               if (ret) {
-                       ibdev_err(ibdev, "failed to copy CQ udata, ret = %d.\n",
-                                 ret);
+               ret = get_cq_ucmd(hr_cq, udata, &ucmd);
+               if (ret)
                        return ret;
-               }
        }
 
+       set_cq_param(hr_cq, attr->cqe, attr->comp_vector, &ucmd);
+
        set_cqe_size(hr_cq, udata, &ucmd);
 
        ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr);
index 3d6b7a2..97800d2 100644 (file)
@@ -137,6 +137,7 @@ enum {
        SERV_TYPE_UC,
        SERV_TYPE_RD,
        SERV_TYPE_UD,
+       SERV_TYPE_XRC = 5,
 };
 
 enum hns_roce_qp_state {
@@ -168,6 +169,8 @@ enum hns_roce_event {
        HNS_ROCE_EVENT_TYPE_DB_OVERFLOW               = 0x12,
        HNS_ROCE_EVENT_TYPE_MB                        = 0x13,
        HNS_ROCE_EVENT_TYPE_FLR                       = 0x15,
+       HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION            = 0x16,
+       HNS_ROCE_EVENT_TYPE_INVALID_XRCETH            = 0x17,
 };
 
 #define HNS_ROCE_CAP_FLAGS_EX_SHIFT 12
@@ -176,9 +179,10 @@ enum {
        HNS_ROCE_CAP_FLAG_REREG_MR              = BIT(0),
        HNS_ROCE_CAP_FLAG_ROCE_V1_V2            = BIT(1),
        HNS_ROCE_CAP_FLAG_RQ_INLINE             = BIT(2),
-       HNS_ROCE_CAP_FLAG_RECORD_DB             = BIT(3),
-       HNS_ROCE_CAP_FLAG_SQ_RECORD_DB          = BIT(4),
+       HNS_ROCE_CAP_FLAG_CQ_RECORD_DB          = BIT(3),
+       HNS_ROCE_CAP_FLAG_QP_RECORD_DB          = BIT(4),
        HNS_ROCE_CAP_FLAG_SRQ                   = BIT(5),
+       HNS_ROCE_CAP_FLAG_XRC                   = BIT(6),
        HNS_ROCE_CAP_FLAG_MW                    = BIT(7),
        HNS_ROCE_CAP_FLAG_FRMR                  = BIT(8),
        HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL          = BIT(9),
@@ -214,12 +218,6 @@ enum {
        HNS_ROCE_RST_DIRECT_RETURN              = 0,
 };
 
-enum {
-       CMD_RST_PRC_OTHERS,
-       CMD_RST_PRC_SUCCESS,
-       CMD_RST_PRC_EBUSY,
-};
-
 #define HNS_ROCE_CMD_SUCCESS                   1
 
 /* The minimum page size is 4K for hardware */
@@ -244,6 +242,11 @@ struct hns_roce_pd {
        unsigned long           pdn;
 };
 
+struct hns_roce_xrcd {
+       struct ib_xrcd ibxrcd;
+       u32 xrcdn;
+};
+
 struct hns_roce_bitmap {
        /* Bitmap Traversal last a bit which is 1 */
        unsigned long           last;
@@ -363,7 +366,7 @@ struct hns_roce_wq {
        int             wqe_shift;      /* WQE size */
        u32             head;
        u32             tail;
-       void __iomem    *db_reg_l;
+       void __iomem    *db_reg;
 };
 
 struct hns_roce_sge {
@@ -437,7 +440,7 @@ struct hns_roce_cq {
        u32                             cq_depth;
        u32                             cons_index;
        u32                             *set_ci_db;
-       void __iomem                    *cq_db_l;
+       void __iomem                    *db_reg;
        u16                             *tptr_addr;
        int                             arm_sn;
        int                             cqe_size;
@@ -467,7 +470,8 @@ struct hns_roce_srq {
        u32                     rsv_sge;
        int                     wqe_shift;
        u32                     cqn;
-       void __iomem            *db_reg_l;
+       u32                     xrcdn;
+       void __iomem            *db_reg;
 
        atomic_t                refcount;
        struct completion       free;
@@ -546,6 +550,7 @@ struct hns_roce_cmd_context {
        int                     next;
        u64                     out_param;
        u16                     token;
+       u16                     busy;
 };
 
 struct hns_roce_cmdq {
@@ -561,11 +566,6 @@ struct hns_roce_cmdq {
        spinlock_t              context_lock;
        int                     free_head;
        struct hns_roce_cmd_context *context;
-       /*
-        * Result of get integer part
-        * which max_comds compute according a power of 2
-        */
-       u16                     token_mask;
        /*
         * Process whether use event mode, init default non-zero
         * After the event queue of cmd event ready,
@@ -640,6 +640,8 @@ struct hns_roce_qp {
                                         enum hns_roce_event event_type);
        unsigned long           qpn;
 
+       u32                     xrcdn;
+
        atomic_t                refcount;
        struct completion       free;
 
@@ -695,7 +697,7 @@ struct hns_roce_aeqe {
 
 struct hns_roce_eq {
        struct hns_roce_dev             *hr_dev;
-       void __iomem                    *doorbell;
+       void __iomem                    *db_reg;
 
        int                             type_flag; /* Aeq:1 ceq:0 */
        int                             eqn;
@@ -723,6 +725,13 @@ struct hns_roce_eq_table {
        void __iomem            **eqc_base; /* only for hw v1 */
 };
 
+enum cong_type {
+       CONG_TYPE_DCQCN,
+       CONG_TYPE_LDCP,
+       CONG_TYPE_HC3,
+       CONG_TYPE_DIP,
+};
+
 struct hns_roce_caps {
        u64             fw_ver;
        u8              num_ports;
@@ -759,13 +768,14 @@ struct hns_roce_caps {
        int             num_other_vectors;
        u32             num_mtpts;
        u32             num_mtt_segs;
-       u32             num_cqe_segs;
        u32             num_srqwqe_segs;
        u32             num_idx_segs;
        int             reserved_mrws;
        int             reserved_uars;
        int             num_pds;
        int             reserved_pds;
+       u32             num_xrcds;
+       u32             reserved_xrcds;
        u32             mtt_entry_sz;
        u32             cqe_sz;
        u32             page_size_cap;
@@ -794,6 +804,9 @@ struct hns_roce_caps {
        u32             cqc_bt_num;
        u32             cqc_timer_bt_num;
        u32             mpt_bt_num;
+       u32             eqc_bt_num;
+       u32             smac_bt_num;
+       u32             sgid_bt_num;
        u32             sccc_bt_num;
        u32             gmv_bt_num;
        u32             qpc_ba_pg_sz;
@@ -851,6 +864,7 @@ struct hns_roce_caps {
        u16             default_aeq_period;
        u16             default_aeq_arm_st;
        u16             default_ceq_arm_st;
+       enum cong_type  cong_type;
 };
 
 struct hns_roce_dfx_hw {
@@ -874,9 +888,10 @@ struct hns_roce_hw {
        int (*post_mbox)(struct hns_roce_dev *hr_dev, u64 in_param,
                         u64 out_param, u32 in_modifier, u8 op_modifier, u16 op,
                         u16 token, int event);
-       int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned int timeout);
-       int (*rst_prc_mbox)(struct hns_roce_dev *hr_dev);
-       int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index,
+       int (*poll_mbox_done)(struct hns_roce_dev *hr_dev,
+                             unsigned int timeout);
+       bool (*chk_mbox_avail)(struct hns_roce_dev *hr_dev, bool *is_busy);
+       int (*set_gid)(struct hns_roce_dev *hr_dev, u32 port, int gid_index,
                       const union ib_gid *gid, const struct ib_gid_attr *attr);
        int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);
        void (*set_mtu)(struct hns_roce_dev *hr_dev, u8 phy_port,
@@ -897,33 +912,17 @@ struct hns_roce_hw {
        int (*clear_hem)(struct hns_roce_dev *hr_dev,
                         struct hns_roce_hem_table *table, int obj,
                         int step_idx);
-       int (*query_qp)(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
-                       int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
        int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
                         int attr_mask, enum ib_qp_state cur_state,
                         enum ib_qp_state new_state);
-       int (*destroy_qp)(struct ib_qp *ibqp, struct ib_udata *udata);
        int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev,
                         struct hns_roce_qp *hr_qp);
-       int (*post_send)(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-                        const struct ib_send_wr **bad_wr);
-       int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
-                        const struct ib_recv_wr **bad_recv_wr);
-       int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
-       int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
        int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
                        struct ib_udata *udata);
        int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
-       int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
        int (*init_eq)(struct hns_roce_dev *hr_dev);
        void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
        int (*write_srqc)(struct hns_roce_srq *srq, void *mb_buf);
-       int (*modify_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
-                      enum ib_srq_attr_mask srq_attr_mask,
-                      struct ib_udata *udata);
-       int (*query_srq)(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-       int (*post_srq_recv)(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
-                            const struct ib_recv_wr **bad_wr);
        const struct ib_device_ops *hns_roce_dev_ops;
        const struct ib_device_ops *hns_roce_dev_srq_ops;
 };
@@ -945,6 +944,8 @@ struct hns_roce_dev {
        enum hns_roce_device_state state;
        struct list_head        qp_list; /* list of all qps on this dev */
        spinlock_t              qp_list_lock; /* protect qp_list */
+       struct list_head        dip_list; /* list of all dest ips on this dev */
+       spinlock_t              dip_list_lock; /* protect dip_list */
 
        struct list_head        pgdir_list;
        struct mutex            pgdir_mutex;
@@ -963,6 +964,7 @@ struct hns_roce_dev {
 
        struct hns_roce_cmdq    cmd;
        struct hns_roce_bitmap    pd_bitmap;
+       struct hns_roce_bitmap xrcd_bitmap;
        struct hns_roce_uar_table uar_table;
        struct hns_roce_mr_table  mr_table;
        struct hns_roce_cq_table  cq_table;
@@ -986,6 +988,9 @@ struct hns_roce_dev {
        void                    *priv;
        struct workqueue_struct *irq_workq;
        const struct hns_roce_dfx_hw *dfx;
+       u32 func_num;
+       u32 is_vf;
+       u32 cong_algo_tmpl_id;
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
@@ -1004,6 +1009,11 @@ static inline struct hns_roce_pd *to_hr_pd(struct ib_pd *ibpd)
        return container_of(ibpd, struct hns_roce_pd, ibpd);
 }
 
+static inline struct hns_roce_xrcd *to_hr_xrcd(struct ib_xrcd *ibxrcd)
+{
+       return container_of(ibxrcd, struct hns_roce_xrcd, ibxrcd);
+}
+
 static inline struct hns_roce_ah *to_hr_ah(struct ib_ah *ibah)
 {
        return container_of(ibah, struct hns_roce_ah, ibah);
@@ -1136,6 +1146,7 @@ int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev);
 void hns_roce_init_cq_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev);
+int hns_roce_init_xrcd_table(struct hns_roce_dev *hr_dev);
 
 void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev);
@@ -1143,6 +1154,7 @@ void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev);
 void hns_roce_cleanup_srq_table(struct hns_roce_dev *hr_dev);
+void hns_roce_cleanup_xrcd_table(struct hns_roce_dev *hr_dev);
 
 int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj);
 void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj,
@@ -1207,6 +1219,9 @@ int hns_roce_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
                        struct ib_udata *udata);
 int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 
+int hns_roce_alloc_xrcd(struct ib_xrcd *ib_xrcd, struct ib_udata *udata);
+int hns_roce_dealloc_xrcd(struct ib_xrcd *ib_xrcd, struct ib_udata *udata);
+
 struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
                                 struct ib_qp_init_attr *init_attr,
                                 struct ib_udata *udata);
@@ -1246,7 +1261,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
 void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
-u8 hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index);
+u8 hns_get_gid_index(struct hns_roce_dev *hr_dev, u32 port, int gid_index);
 void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
 int hns_roce_init(struct hns_roce_dev *hr_dev);
 void hns_roce_exit(struct hns_roce_dev *hr_dev);
index 5346fdc..620acf6 100644 (file)
@@ -54,7 +54,7 @@
  *             GID[0][0], GID[1][0],.....GID[N - 1][0],
  *             And so on
  */
-u8 hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index)
+u8 hns_get_gid_index(struct hns_roce_dev *hr_dev, u32 port, int gid_index)
 {
        return gid_index * hr_dev->caps.num_ports + port;
 }
@@ -345,7 +345,7 @@ out:
                doorbell[0] = sq_db.u32_4;
                doorbell[1] = sq_db.u32_8;
 
-               hns_roce_write64_k(doorbell, qp->sq.db_reg_l);
+               hns_roce_write64_k(doorbell, qp->sq.db_reg);
        }
 
        spin_unlock_irqrestore(&qp->sq.lock, flags);
@@ -440,7 +440,7 @@ out:
                        doorbell[0] = rq_db.u32_4;
                        doorbell[1] = rq_db.u32_8;
 
-                       hns_roce_write64_k(doorbell, hr_qp->rq.db_reg_l);
+                       hns_roce_write64_k(doorbell, hr_qp->rq.db_reg);
                }
        }
        spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
@@ -538,7 +538,7 @@ static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept,
        /*
         * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of
         * using 4K page, and shift more 32 because of
-        * caculating the high 32 bit value evaluated to hardware.
+        * calculating the high 32 bit value evaluated to hardware.
         */
        roce_set_field(tmp, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_M,
                       ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_S, sdb_dma_addr >> 44);
@@ -711,7 +711,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
        int i, j;
        u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 };
        u8 phy_port;
-       u8 port = 0;
+       u32 port = 0;
        u8 sl;
 
        /* Reserved cq for loop qp */
@@ -1189,7 +1189,7 @@ static int hns_roce_raq_init(struct hns_roce_dev *hr_dev)
        /*
         * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of
         * using 4K page, and shift more 32 because of
-        * caculating the high 32 bit value evaluated to hardware.
+        * calculating the high 32 bit value evaluated to hardware.
         */
        roce_set_field(tmp, ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_M,
                       ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_S,
@@ -1382,7 +1382,6 @@ static int hns_roce_free_mr_init(struct hns_roce_dev *hr_dev)
        ret = hns_roce_v1_rsv_lp_qp(hr_dev);
        if (ret) {
                dev_err(dev, "Reserved loop qp failed(%d)!\n", ret);
-               flush_workqueue(free_mr->free_mr_wq);
                destroy_workqueue(free_mr->free_mr_wq);
        }
 
@@ -1394,7 +1393,6 @@ static void hns_roce_free_mr_free(struct hns_roce_dev *hr_dev)
        struct hns_roce_v1_priv *priv = hr_dev->priv;
        struct hns_roce_free_mr *free_mr = &priv->free_mr;
 
-       flush_workqueue(free_mr->free_mr_wq);
        destroy_workqueue(free_mr->free_mr_wq);
 
        hns_roce_v1_release_lp_qp(hr_dev);
@@ -1676,7 +1674,7 @@ static int hns_roce_v1_chk_mbox(struct hns_roce_dev *hr_dev,
        return 0;
 }
 
-static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port,
+static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u32 port,
                               int gid_index, const union ib_gid *gid,
                               const struct ib_gid_attr *attr)
 {
@@ -1939,7 +1937,7 @@ static void hns_roce_v1_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index)
        roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_M,
                       ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_S, hr_cq->cqn);
 
-       hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
+       hns_roce_write64_k(doorbell, hr_cq->db_reg);
 }
 
 static void __hns_roce_v1_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
@@ -2041,7 +2039,7 @@ static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev,
        /**
         * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of
         * using 4K page, and shift more 32 because of
-        * caculating the high 32 bit value evaluated to hardware.
+        * calculating the high 32 bit value evaluated to hardware.
         */
        roce_set_field(cq_context->cqc_byte_20,
                       CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_M,
@@ -2092,7 +2090,7 @@ static int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq,
                       ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_S,
                       hr_cq->cqn | notification_flag);
 
-       hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
+       hns_roce_write64_k(doorbell, hr_cq->db_reg);
 
        return 0;
 }
@@ -2673,8 +2671,8 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
        int ret = -EINVAL;
        u64 sq_ba = 0;
        u64 rq_ba = 0;
-       int port;
-       u8 port_num;
+       u32 port;
+       u32 port_num;
        u8 *dmac;
        u8 *smac;
 
@@ -3217,12 +3215,12 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
                roce_set_bit(doorbell[1], RQ_DOORBELL_U32_8_HW_SYNC_S, 1);
 
                if (ibqp->uobject) {
-                       hr_qp->rq.db_reg_l = hr_dev->reg_base +
+                       hr_qp->rq.db_reg = hr_dev->reg_base +
                                     hr_dev->odb_offset +
                                     DB_REG_OFFSET * hr_dev->priv_uar.index;
                }
 
-               hns_roce_write64_k(doorbell, hr_qp->rq.db_reg_l);
+               hns_roce_write64_k(doorbell, hr_qp->rq.db_reg);
        }
 
        hr_qp->state = new_state;
@@ -3449,8 +3447,7 @@ static int hns_roce_v1_q_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                                   ((roce_get_bit(context->qpc_bytes_4,
                        QP_CONTEXT_QPC_BYTE_4_ATOMIC_OPERATION_ENABLE_S)) << 3);
 
-       if (hr_qp->ibqp.qp_type == IB_QPT_RC ||
-           hr_qp->ibqp.qp_type == IB_QPT_UC) {
+       if (hr_qp->ibqp.qp_type == IB_QPT_RC) {
                struct ib_global_route *grh =
                        rdma_ah_retrieve_grh(&qp_attr->ah_attr);
 
@@ -3604,7 +3601,7 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 static void set_eq_cons_index_v1(struct hns_roce_eq *eq, u32 req_not)
 {
        roce_raw_write((eq->cons_index & HNS_ROCE_V1_CONS_IDX_M) |
-                      (req_not << eq->log_entries), eq->doorbell);
+                      (req_not << eq->log_entries), eq->db_reg);
 }
 
 static void hns_roce_v1_wq_catas_err_handle(struct hns_roce_dev *hr_dev,
@@ -4170,7 +4167,7 @@ static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev,
         * Configure eq extended address 45~49 bit.
         * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of
         * using 4K page, and shift more 32 because of
-        * caculating the high 32 bit value evaluated to hardware.
+        * calculating the high 32 bit value evaluated to hardware.
         */
        roce_set_field(tmp1, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M,
                       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S,
@@ -4234,9 +4231,9 @@ static int hns_roce_v1_init_eq_table(struct hns_roce_dev *hr_dev)
                                                ROCEE_CAEP_CEQC_SHIFT_0_REG +
                                                CEQ_REG_OFFSET * i;
                        eq->type_flag = HNS_ROCE_CEQ;
-                       eq->doorbell = hr_dev->reg_base +
-                                      ROCEE_CAEP_CEQC_CONS_IDX_0_REG +
-                                      CEQ_REG_OFFSET * i;
+                       eq->db_reg = hr_dev->reg_base +
+                                    ROCEE_CAEP_CEQC_CONS_IDX_0_REG +
+                                    CEQ_REG_OFFSET * i;
                        eq->entries = hr_dev->caps.ceqe_depth;
                        eq->log_entries = ilog2(eq->entries);
                        eq->eqe_size = HNS_ROCE_CEQE_SIZE;
@@ -4245,8 +4242,8 @@ static int hns_roce_v1_init_eq_table(struct hns_roce_dev *hr_dev)
                        eq_table->eqc_base[i] = hr_dev->reg_base +
                                                ROCEE_CAEP_AEQC_AEQE_SHIFT_REG;
                        eq->type_flag = HNS_ROCE_AEQ;
-                       eq->doorbell = hr_dev->reg_base +
-                                      ROCEE_CAEP_AEQE_CONS_IDX_REG;
+                       eq->db_reg = hr_dev->reg_base +
+                                    ROCEE_CAEP_AEQE_CONS_IDX_REG;
                        eq->entries = hr_dev->caps.aeqe_depth;
                        eq->log_entries = ilog2(eq->entries);
                        eq->eqe_size = HNS_ROCE_AEQE_SIZE;
@@ -4349,7 +4346,7 @@ static const struct hns_roce_hw hns_roce_hw_v1 = {
        .hw_init = hns_roce_v1_init,
        .hw_exit = hns_roce_v1_exit,
        .post_mbox = hns_roce_v1_post_mbox,
-       .chk_mbox = hns_roce_v1_chk_mbox,
+       .poll_mbox_done = hns_roce_v1_chk_mbox,
        .set_gid = hns_roce_v1_set_gid,
        .set_mac = hns_roce_v1_set_mac,
        .set_mtu = hns_roce_v1_set_mtu,
@@ -4357,12 +4354,6 @@ static const struct hns_roce_hw hns_roce_hw_v1 = {
        .write_cqc = hns_roce_v1_write_cqc,
        .clear_hem = hns_roce_v1_clear_hem,
        .modify_qp = hns_roce_v1_modify_qp,
-       .query_qp = hns_roce_v1_query_qp,
-       .destroy_qp = hns_roce_v1_destroy_qp,
-       .post_send = hns_roce_v1_post_send,
-       .post_recv = hns_roce_v1_post_recv,
-       .req_notify_cq = hns_roce_v1_req_notify_cq,
-       .poll_cq = hns_roce_v1_poll_cq,
        .dereg_mr = hns_roce_v1_dereg_mr,
        .destroy_cq = hns_roce_v1_destroy_cq,
        .init_eq = hns_roce_v1_init_eq_table,
index ce26f97..7652daf 100644 (file)
 #include "hns_roce_hem.h"
 #include "hns_roce_hw_v2.h"
 
+enum {
+       CMD_RST_PRC_OTHERS,
+       CMD_RST_PRC_SUCCESS,
+       CMD_RST_PRC_EBUSY,
+};
+
 static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
                                   struct ib_sge *sg)
 {
@@ -632,24 +638,60 @@ static inline void update_sq_db(struct hns_roce_dev *hr_dev,
         * around the mailbox calls. Hence, use the deferred flush for
         * now.
         */
-       if (qp->state == IB_QPS_ERR) {
+       if (unlikely(qp->state == IB_QPS_ERR)) {
                if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
                        init_flush_work(hr_dev, qp);
        } else {
                struct hns_roce_v2_db sq_db = {};
 
-               roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M,
-                              V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn);
-               roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M,
-                              V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB);
+               roce_set_field(sq_db.byte_4, V2_DB_TAG_M, V2_DB_TAG_S,
+                              qp->doorbell_qpn);
+               roce_set_field(sq_db.byte_4, V2_DB_CMD_M, V2_DB_CMD_S,
+                              HNS_ROCE_V2_SQ_DB);
+
                /* indicates data on new BAR, 0 : SQ doorbell, 1 : DWQE */
                roce_set_bit(sq_db.byte_4, V2_DB_FLAG_S, 0);
-               roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M,
-                              V2_DB_PARAMETER_IDX_S, qp->sq.head);
-               roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
-                              V2_DB_PARAMETER_SL_S, qp->sl);
+               roce_set_field(sq_db.parameter, V2_DB_PRODUCER_IDX_M,
+                              V2_DB_PRODUCER_IDX_S, qp->sq.head);
+               roce_set_field(sq_db.parameter, V2_DB_SL_M, V2_DB_SL_S,
+                              qp->sl);
+
+               hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg);
+       }
+}
+
+static inline void update_rq_db(struct hns_roce_dev *hr_dev,
+                               struct hns_roce_qp *qp)
+{
+       /*
+        * Hip08 hardware cannot flush the WQEs in RQ if the QP state
+        * gets into errored mode. Hence, as a workaround to this
+        * hardware limitation, driver needs to assist in flushing. But
+        * the flushing operation uses mailbox to convey the QP state to
+        * the hardware and which can sleep due to the mutex protection
+        * around the mailbox calls. Hence, use the deferred flush for
+        * now.
+        */
+       if (unlikely(qp->state == IB_QPS_ERR)) {
+               if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
+                       init_flush_work(hr_dev, qp);
+       } else {
+               if (likely(qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)) {
+                       *qp->rdb.db_record =
+                                       qp->rq.head & V2_DB_PRODUCER_IDX_M;
+               } else {
+                       struct hns_roce_v2_db rq_db = {};
+
+                       roce_set_field(rq_db.byte_4, V2_DB_TAG_M, V2_DB_TAG_S,
+                                      qp->qpn);
+                       roce_set_field(rq_db.byte_4, V2_DB_CMD_M, V2_DB_CMD_S,
+                                      HNS_ROCE_V2_RQ_DB);
+                       roce_set_field(rq_db.parameter, V2_DB_PRODUCER_IDX_M,
+                                      V2_DB_PRODUCER_IDX_S, qp->rq.head);
 
-               hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
+                       hns_roce_write64(hr_dev, (__le32 *)&rq_db,
+                                        qp->rq.db_reg);
+               }
        }
 }
 
@@ -681,8 +723,7 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp,
        roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_WQE_INDEX_M,
                       V2_RC_SEND_WQE_BYTE_4_WQE_INDEX_S, qp->sq.head);
 
-       hns_roce_write512(hr_dev, wqe, hr_dev->mem_base +
-                         HNS_ROCE_DWQE_SIZE * qp->ibqp.qp_num);
+       hns_roce_write512(hr_dev, wqe, qp->sq.db_reg);
 }
 
 static int hns_roce_v2_post_send(struct ib_qp *ibqp,
@@ -879,22 +920,7 @@ out:
        if (likely(nreq)) {
                hr_qp->rq.head += nreq;
 
-               /*
-                * Hip08 hardware cannot flush the WQEs in RQ if the QP state
-                * gets into errored mode. Hence, as a workaround to this
-                * hardware limitation, driver needs to assist in flushing. But
-                * the flushing operation uses mailbox to convey the QP state to
-                * the hardware and which can sleep due to the mutex protection
-                * around the mailbox calls. Hence, use the deferred flush for
-                * now.
-                */
-               if (hr_qp->state == IB_QPS_ERR) {
-                       if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
-                                             &hr_qp->flush_flag))
-                               init_flush_work(hr_dev, hr_qp);
-               } else {
-                       *hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
-               }
+               update_rq_db(hr_dev, hr_qp);
        }
        spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
 
@@ -1016,13 +1042,14 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
        }
 
        if (likely(nreq)) {
-               srq_db.byte_4 =
-                       cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S |
-                                   (srq->srqn & V2_DB_BYTE_4_TAG_M));
-               srq_db.parameter =
-                       cpu_to_le32(srq->idx_que.head & V2_DB_PARAMETER_IDX_M);
+               roce_set_field(srq_db.byte_4, V2_DB_TAG_M, V2_DB_TAG_S,
+                              srq->srqn);
+               roce_set_field(srq_db.byte_4, V2_DB_CMD_M, V2_DB_CMD_S,
+                              HNS_ROCE_V2_SRQ_DB);
+               roce_set_field(srq_db.parameter, V2_DB_PRODUCER_IDX_M,
+                              V2_DB_PRODUCER_IDX_S, srq->idx_que.head);
 
-               hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
+               hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg);
        }
 
        spin_unlock_irqrestore(&srq->lock, flags);
@@ -1030,7 +1057,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
        return ret;
 }
 
-static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
+static u32 hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
                                      unsigned long instance_stage,
                                      unsigned long reset_stage)
 {
@@ -1053,7 +1080,7 @@ static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
        return CMD_RST_PRC_SUCCESS;
 }
 
-static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
+static u32 hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
                                        unsigned long instance_stage,
                                        unsigned long reset_stage)
 {
@@ -1081,7 +1108,7 @@ static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
        return CMD_RST_PRC_SUCCESS;
 }
 
-static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
+static u32 hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_v2_priv *priv = hr_dev->priv;
        struct hnae3_handle *handle = priv->handle;
@@ -1098,10 +1125,9 @@ static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
        return CMD_RST_PRC_EBUSY;
 }
 
-static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
+static u32 check_aedev_reset_status(struct hns_roce_dev *hr_dev,
+                                   struct hnae3_handle *handle)
 {
-       struct hns_roce_v2_priv *priv = hr_dev->priv;
-       struct hnae3_handle *handle = priv->handle;
        const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
        unsigned long instance_stage; /* the current instance stage */
        unsigned long reset_stage; /* the current reset stage */
@@ -1109,9 +1135,6 @@ static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
        bool sw_resetting;
        bool hw_resetting;
 
-       if (hr_dev->is_reset)
-               return CMD_RST_PRC_SUCCESS;
-
        /* Get information about reset from NIC driver or RoCE driver itself,
         * the meaning of the following variables from NIC driver are described
         * as below:
@@ -1122,19 +1145,53 @@ static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
        instance_stage = handle->rinfo.instance_state;
        reset_stage = handle->rinfo.reset_state;
        reset_cnt = ops->ae_dev_reset_cnt(handle);
-       hw_resetting = ops->get_cmdq_stat(handle);
-       sw_resetting = ops->ae_dev_resetting(handle);
-
        if (reset_cnt != hr_dev->reset_cnt)
                return hns_roce_v2_cmd_hw_reseted(hr_dev, instance_stage,
                                                  reset_stage);
-       else if (hw_resetting)
+
+       hw_resetting = ops->get_cmdq_stat(handle);
+       if (hw_resetting)
                return hns_roce_v2_cmd_hw_resetting(hr_dev, instance_stage,
                                                    reset_stage);
-       else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
+
+       sw_resetting = ops->ae_dev_resetting(handle);
+       if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
                return hns_roce_v2_cmd_sw_resetting(hr_dev);
 
-       return 0;
+       return CMD_RST_PRC_OTHERS;
+}
+
+static bool check_device_is_in_reset(struct hns_roce_dev *hr_dev)
+{
+       struct hns_roce_v2_priv *priv = hr_dev->priv;
+       struct hnae3_handle *handle = priv->handle;
+       const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+       if (hr_dev->reset_cnt != ops->ae_dev_reset_cnt(handle))
+               return true;
+
+       if (ops->get_hw_reset_stat(handle))
+               return true;
+
+       if (ops->ae_dev_resetting(handle))
+               return true;
+
+       return false;
+}
+
+static bool v2_chk_mbox_is_avail(struct hns_roce_dev *hr_dev, bool *busy)
+{
+       struct hns_roce_v2_priv *priv = hr_dev->priv;
+       u32 status;
+
+       if (hr_dev->is_reset)
+               status = CMD_RST_PRC_SUCCESS;
+       else
+               status = check_aedev_reset_status(hr_dev, priv->handle);
+
+       *busy = (status == CMD_RST_PRC_EBUSY);
+
+       return status == CMD_RST_PRC_OTHERS;
 }
 
 static int hns_roce_alloc_cmq_desc(struct hns_roce_dev *hr_dev,
@@ -1152,6 +1209,9 @@ static int hns_roce_alloc_cmq_desc(struct hns_roce_dev *hr_dev,
                ring->desc_dma_addr = 0;
                kfree(ring->desc);
                ring->desc = NULL;
+
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to map cmq desc addr.\n");
                return -ENOMEM;
        }
 
@@ -1228,14 +1288,16 @@ static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev)
        /* Init CSQ */
        ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CSQ);
        if (ret) {
-               dev_err(hr_dev->dev, "Init CSQ error, ret = %d.\n", ret);
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to init CSQ, ret = %d.\n", ret);
                return ret;
        }
 
        /* Init CRQ */
        ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CRQ);
        if (ret) {
-               dev_err(hr_dev->dev, "Init CRQ error, ret = %d.\n", ret);
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to init CRQ, ret = %d.\n", ret);
                goto err_crq;
        }
 
@@ -1352,27 +1414,36 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
 static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
                             struct hns_roce_cmq_desc *desc, int num)
 {
-       int retval;
+       bool busy;
        int ret;
 
-       ret = hns_roce_v2_rst_process_cmd(hr_dev);
-       if (ret == CMD_RST_PRC_SUCCESS)
-               return 0;
-       if (ret == CMD_RST_PRC_EBUSY)
-               return -EBUSY;
+       if (!v2_chk_mbox_is_avail(hr_dev, &busy))
+               return busy ? -EBUSY : 0;
 
        ret = __hns_roce_cmq_send(hr_dev, desc, num);
        if (ret) {
-               retval = hns_roce_v2_rst_process_cmd(hr_dev);
-               if (retval == CMD_RST_PRC_SUCCESS)
-                       return 0;
-               else if (retval == CMD_RST_PRC_EBUSY)
-                       return -EBUSY;
+               if (!v2_chk_mbox_is_avail(hr_dev, &busy))
+                       return busy ? -EBUSY : 0;
        }
 
        return ret;
 }
 
+static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj,
+                              dma_addr_t base_addr, u16 op)
+{
+       struct hns_roce_cmd_mailbox *mbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+       int ret;
+
+       if (IS_ERR(mbox))
+               return PTR_ERR(mbox);
+
+       ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, obj, 0, op,
+                               HNS_ROCE_CMD_TIMEOUT_MSECS);
+       hns_roce_free_cmd_mailbox(hr_dev, mbox);
+       return ret;
+}
+
 static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_query_version *resp;
@@ -1391,92 +1462,90 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
        return 0;
 }
 
-static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev)
+static void func_clr_hw_resetting_state(struct hns_roce_dev *hr_dev,
+                                       struct hnae3_handle *handle)
 {
-       struct hns_roce_v2_priv *priv = hr_dev->priv;
-       struct hnae3_handle *handle = priv->handle;
        const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
-       unsigned long reset_cnt;
-       bool sw_resetting;
-       bool hw_resetting;
+       unsigned long end;
 
-       reset_cnt = ops->ae_dev_reset_cnt(handle);
-       hw_resetting = ops->get_hw_reset_stat(handle);
-       sw_resetting = ops->ae_dev_resetting(handle);
+       hr_dev->dis_db = true;
 
-       if (reset_cnt != hr_dev->reset_cnt || hw_resetting || sw_resetting)
-               return true;
+       dev_warn(hr_dev->dev,
+                "Func clear is pending, device in resetting state.\n");
+       end = HNS_ROCE_V2_HW_RST_TIMEOUT;
+       while (end) {
+               if (!ops->get_hw_reset_stat(handle)) {
+                       hr_dev->is_reset = true;
+                       dev_info(hr_dev->dev,
+                                "Func clear success after reset.\n");
+                       return;
+               }
+               msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+               end -= HNS_ROCE_V2_HW_RST_COMPLETION_WAIT;
+       }
 
-       return false;
+       dev_warn(hr_dev->dev, "Func clear failed.\n");
 }
 
-static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval,
-                                     int flag)
+static void func_clr_sw_resetting_state(struct hns_roce_dev *hr_dev,
+                                       struct hnae3_handle *handle)
 {
-       struct hns_roce_v2_priv *priv = hr_dev->priv;
-       struct hnae3_handle *handle = priv->handle;
        const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
-       unsigned long instance_stage;
-       unsigned long reset_cnt;
        unsigned long end;
-       bool sw_resetting;
-       bool hw_resetting;
 
-       instance_stage = handle->rinfo.instance_state;
-       reset_cnt = ops->ae_dev_reset_cnt(handle);
-       hw_resetting = ops->get_hw_reset_stat(handle);
-       sw_resetting = ops->ae_dev_resetting(handle);
+       hr_dev->dis_db = true;
+
+       dev_warn(hr_dev->dev,
+                "Func clear is pending, device in resetting state.\n");
+       end = HNS_ROCE_V2_HW_RST_TIMEOUT;
+       while (end) {
+               if (ops->ae_dev_reset_cnt(handle) !=
+                   hr_dev->reset_cnt) {
+                       hr_dev->is_reset = true;
+                       dev_info(hr_dev->dev,
+                                "Func clear success after sw reset\n");
+                       return;
+               }
+               msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
+               end -= HNS_ROCE_V2_HW_RST_COMPLETION_WAIT;
+       }
+
+       dev_warn(hr_dev->dev, "Func clear failed because of unfinished sw reset\n");
+}
 
-       if (reset_cnt != hr_dev->reset_cnt) {
+static void hns_roce_func_clr_rst_proc(struct hns_roce_dev *hr_dev, int retval,
+                                      int flag)
+{
+       struct hns_roce_v2_priv *priv = hr_dev->priv;
+       struct hnae3_handle *handle = priv->handle;
+       const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+       if (ops->ae_dev_reset_cnt(handle) != hr_dev->reset_cnt) {
                hr_dev->dis_db = true;
                hr_dev->is_reset = true;
                dev_info(hr_dev->dev, "Func clear success after reset.\n");
-       } else if (hw_resetting) {
-               hr_dev->dis_db = true;
+               return;
+       }
 
-               dev_warn(hr_dev->dev,
-                        "Func clear is pending, device in resetting state.\n");
-               end = HNS_ROCE_V2_HW_RST_TIMEOUT;
-               while (end) {
-                       if (!ops->get_hw_reset_stat(handle)) {
-                               hr_dev->is_reset = true;
-                               dev_info(hr_dev->dev,
-                                        "Func clear success after reset.\n");
-                               return;
-                       }
-                       msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
-                       end -= HNS_ROCE_V2_HW_RST_COMPLETION_WAIT;
-               }
+       if (ops->get_hw_reset_stat(handle)) {
+               func_clr_hw_resetting_state(hr_dev, handle);
+               return;
+       }
 
-               dev_warn(hr_dev->dev, "Func clear failed.\n");
-       } else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT) {
-               hr_dev->dis_db = true;
+       if (ops->ae_dev_resetting(handle) &&
+           handle->rinfo.instance_state == HNS_ROCE_STATE_INIT) {
+               func_clr_sw_resetting_state(hr_dev, handle);
+               return;
+       }
 
+       if (retval && !flag)
                dev_warn(hr_dev->dev,
-                        "Func clear is pending, device in resetting state.\n");
-               end = HNS_ROCE_V2_HW_RST_TIMEOUT;
-               while (end) {
-                       if (ops->ae_dev_reset_cnt(handle) !=
-                           hr_dev->reset_cnt) {
-                               hr_dev->is_reset = true;
-                               dev_info(hr_dev->dev,
-                                        "Func clear success after sw reset\n");
-                               return;
-                       }
-                       msleep(HNS_ROCE_V2_HW_RST_COMPLETION_WAIT);
-                       end -= HNS_ROCE_V2_HW_RST_COMPLETION_WAIT;
-               }
-
-               dev_warn(hr_dev->dev, "Func clear failed because of unfinished sw reset\n");
-       } else {
-               if (retval && !flag)
-                       dev_warn(hr_dev->dev,
-                                "Func clear read failed, ret = %d.\n", retval);
+                        "Func clear read failed, ret = %d.\n", retval);
 
-               dev_warn(hr_dev->dev, "Func clear failed.\n");
-       }
+       dev_warn(hr_dev->dev, "Func clear failed.\n");
 }
-static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+
+static void __hns_roce_function_clear(struct hns_roce_dev *hr_dev, int vf_id)
 {
        bool fclr_write_fail_flag = false;
        struct hns_roce_func_clear *resp;
@@ -1484,11 +1553,12 @@ static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
        unsigned long end;
        int ret = 0;
 
-       if (hns_roce_func_clr_chk_rst(hr_dev))
+       if (check_device_is_in_reset(hr_dev))
                goto out;
 
        hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false);
        resp = (struct hns_roce_func_clear *)desc.data;
+       resp->rst_funcid_en = cpu_to_le32(vf_id);
 
        ret = hns_roce_cmq_send(hr_dev, &desc, 1);
        if (ret) {
@@ -1501,7 +1571,7 @@ static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
        msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
        end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
        while (end) {
-               if (hns_roce_func_clr_chk_rst(hr_dev))
+               if (check_device_is_in_reset(hr_dev))
                        goto out;
                msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
                end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;
@@ -1509,18 +1579,45 @@ static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
                hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR,
                                              true);
 
+               resp->rst_funcid_en = cpu_to_le32(vf_id);
                ret = hns_roce_cmq_send(hr_dev, &desc, 1);
                if (ret)
                        continue;
 
                if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) {
-                       hr_dev->is_reset = true;
+                       if (vf_id == 0)
+                               hr_dev->is_reset = true;
                        return;
                }
        }
 
 out:
-       hns_roce_func_clr_rst_prc(hr_dev, ret, fclr_write_fail_flag);
+       hns_roce_func_clr_rst_proc(hr_dev, ret, fclr_write_fail_flag);
+}
+
+static void hns_roce_free_vf_resource(struct hns_roce_dev *hr_dev, int vf_id)
+{
+       enum hns_roce_opcode_type opcode = HNS_ROCE_OPC_ALLOC_VF_RES;
+       struct hns_roce_cmq_desc desc[2];
+       struct hns_roce_cmq_req *req_a;
+
+       req_a = (struct hns_roce_cmq_req *)desc[0].data;
+       hns_roce_cmq_setup_basic_desc(&desc[0], opcode, false);
+       desc[0].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+       hns_roce_cmq_setup_basic_desc(&desc[1], opcode, false);
+       hr_reg_write(req_a, FUNC_RES_A_VF_ID, vf_id);
+       hns_roce_cmq_send(hr_dev, desc, 2);
+}
+
+static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+{
+       int i;
+
+       for (i = hr_dev->func_num - 1; i >= 0; i--) {
+               __hns_roce_function_clear(hr_dev, i);
+               if (i != 0)
+                       hns_roce_free_vf_resource(hr_dev, i);
+       }
 }
 
 static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
@@ -1540,79 +1637,107 @@ static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
        return 0;
 }
 
+static int hns_roce_query_func_info(struct hns_roce_dev *hr_dev)
+{
+       struct hns_roce_cmq_desc desc;
+       int ret;
+
+       if (hr_dev->pci_dev->revision < PCI_REVISION_ID_HIP09) {
+               hr_dev->func_num = 1;
+               return 0;
+       }
+
+       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_FUNC_INFO,
+                                     true);
+       ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+       if (ret) {
+               hr_dev->func_num = 1;
+               return ret;
+       }
+
+       hr_dev->func_num = le32_to_cpu(desc.func_info.own_func_num);
+       hr_dev->cong_algo_tmpl_id = le32_to_cpu(desc.func_info.own_mac_id);
+
+       return 0;
+}
+
 static int hns_roce_config_global_param(struct hns_roce_dev *hr_dev)
 {
-       struct hns_roce_cfg_global_param *req;
        struct hns_roce_cmq_desc desc;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
 
        hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GLOBAL_PARAM,
                                      false);
 
-       req = (struct hns_roce_cfg_global_param *)desc.data;
-       memset(req, 0, sizeof(*req));
-       roce_set_field(req->time_cfg_udp_port,
-                      CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_M,
-                      CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_S, 0x3e8);
-       roce_set_field(req->time_cfg_udp_port,
-                      CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_M,
-                      CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_S,
-                      ROCE_V2_UDP_DPORT);
+       hr_reg_write(req, CFG_GLOBAL_PARAM_1US_CYCLES, 0x3e8);
+       hr_reg_write(req, CFG_GLOBAL_PARAM_UDP_PORT, ROCE_V2_UDP_DPORT);
 
        return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
-static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev)
+static int load_func_res_caps(struct hns_roce_dev *hr_dev, bool is_vf)
 {
        struct hns_roce_cmq_desc desc[2];
-       struct hns_roce_pf_res_a *req_a;
-       struct hns_roce_pf_res_b *req_b;
+       struct hns_roce_cmq_req *r_a = (struct hns_roce_cmq_req *)desc[0].data;
+       struct hns_roce_cmq_req *r_b = (struct hns_roce_cmq_req *)desc[1].data;
+       struct hns_roce_caps *caps = &hr_dev->caps;
+       enum hns_roce_opcode_type opcode;
+       u32 func_num;
        int ret;
 
-       hns_roce_cmq_setup_basic_desc(&desc[0], HNS_ROCE_OPC_QUERY_PF_RES,
-                                     true);
-       desc[0].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+       if (is_vf) {
+               opcode = HNS_ROCE_OPC_QUERY_VF_RES;
+               func_num = 1;
+       } else {
+               opcode = HNS_ROCE_OPC_QUERY_PF_RES;
+               func_num = hr_dev->func_num;
+       }
 
-       hns_roce_cmq_setup_basic_desc(&desc[1], HNS_ROCE_OPC_QUERY_PF_RES,
-                                     true);
+       hns_roce_cmq_setup_basic_desc(&desc[0], opcode, true);
+       desc[0].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+       hns_roce_cmq_setup_basic_desc(&desc[1], opcode, true);
 
        ret = hns_roce_cmq_send(hr_dev, desc, 2);
        if (ret)
                return ret;
 
-       req_a = (struct hns_roce_pf_res_a *)desc[0].data;
-       req_b = (struct hns_roce_pf_res_b *)desc[1].data;
-
-       hr_dev->caps.qpc_bt_num = roce_get_field(req_a->qpc_bt_idx_num,
-                                                PF_RES_DATA_1_PF_QPC_BT_NUM_M,
-                                                PF_RES_DATA_1_PF_QPC_BT_NUM_S);
-       hr_dev->caps.srqc_bt_num = roce_get_field(req_a->srqc_bt_idx_num,
-                                               PF_RES_DATA_2_PF_SRQC_BT_NUM_M,
-                                               PF_RES_DATA_2_PF_SRQC_BT_NUM_S);
-       hr_dev->caps.cqc_bt_num = roce_get_field(req_a->cqc_bt_idx_num,
-                                                PF_RES_DATA_3_PF_CQC_BT_NUM_M,
-                                                PF_RES_DATA_3_PF_CQC_BT_NUM_S);
-       hr_dev->caps.mpt_bt_num = roce_get_field(req_a->mpt_bt_idx_num,
-                                                PF_RES_DATA_4_PF_MPT_BT_NUM_M,
-                                                PF_RES_DATA_4_PF_MPT_BT_NUM_S);
-
-       hr_dev->caps.sl_num = roce_get_field(req_b->qid_idx_sl_num,
-                                            PF_RES_DATA_3_PF_SL_NUM_M,
-                                            PF_RES_DATA_3_PF_SL_NUM_S);
-       hr_dev->caps.sccc_bt_num = roce_get_field(req_b->sccc_bt_idx_num,
-                                            PF_RES_DATA_4_PF_SCCC_BT_NUM_M,
-                                            PF_RES_DATA_4_PF_SCCC_BT_NUM_S);
-
-       hr_dev->caps.gmv_bt_num = roce_get_field(req_b->gmv_idx_num,
-                                                PF_RES_DATA_5_PF_GMV_BT_NUM_M,
-                                                PF_RES_DATA_5_PF_GMV_BT_NUM_S);
+       caps->qpc_bt_num = hr_reg_read(r_a, FUNC_RES_A_QPC_BT_NUM) / func_num;
+       caps->srqc_bt_num = hr_reg_read(r_a, FUNC_RES_A_SRQC_BT_NUM) / func_num;
+       caps->cqc_bt_num = hr_reg_read(r_a, FUNC_RES_A_CQC_BT_NUM) / func_num;
+       caps->mpt_bt_num = hr_reg_read(r_a, FUNC_RES_A_MPT_BT_NUM) / func_num;
+       caps->eqc_bt_num = hr_reg_read(r_a, FUNC_RES_A_EQC_BT_NUM) / func_num;
+       caps->smac_bt_num = hr_reg_read(r_b, FUNC_RES_B_SMAC_NUM) / func_num;
+       caps->sgid_bt_num = hr_reg_read(r_b, FUNC_RES_B_SGID_NUM) / func_num;
+       caps->sccc_bt_num = hr_reg_read(r_b, FUNC_RES_B_SCCC_BT_NUM) / func_num;
+
+       if (is_vf) {
+               caps->sl_num = hr_reg_read(r_b, FUNC_RES_V_QID_NUM) / func_num;
+               caps->gmv_bt_num = hr_reg_read(r_b, FUNC_RES_V_GMV_BT_NUM) /
+                                              func_num;
+       } else {
+               caps->sl_num = hr_reg_read(r_b, FUNC_RES_B_QID_NUM) / func_num;
+               caps->gmv_bt_num = hr_reg_read(r_b, FUNC_RES_B_GMV_BT_NUM) /
+                                              func_num;
+       }
 
        return 0;
 }
 
+static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev)
+{
+       return load_func_res_caps(hr_dev, false);
+}
+
+static int hns_roce_query_vf_resource(struct hns_roce_dev *hr_dev)
+{
+       return load_func_res_caps(hr_dev, true);
+}
+
 static int hns_roce_query_pf_timer_resource(struct hns_roce_dev *hr_dev)
 {
-       struct hns_roce_pf_timer_res_a *req_a;
        struct hns_roce_cmq_desc desc;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+       struct hns_roce_caps *caps = &hr_dev->caps;
        int ret;
 
        hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_PF_TIMER_RES,
@@ -1622,24 +1747,17 @@ static int hns_roce_query_pf_timer_resource(struct hns_roce_dev *hr_dev)
        if (ret)
                return ret;
 
-       req_a = (struct hns_roce_pf_timer_res_a *)desc.data;
-
-       hr_dev->caps.qpc_timer_bt_num =
-               roce_get_field(req_a->qpc_timer_bt_idx_num,
-                              PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M,
-                              PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S);
-       hr_dev->caps.cqc_timer_bt_num =
-               roce_get_field(req_a->cqc_timer_bt_idx_num,
-                              PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M,
-                              PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S);
+       caps->qpc_timer_bt_num = hr_reg_read(req, PF_TIMER_RES_QPC_ITEM_NUM);
+       caps->cqc_timer_bt_num = hr_reg_read(req, PF_TIMER_RES_CQC_ITEM_NUM);
 
        return 0;
 }
 
-static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev, int vf_id)
+static int __hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev,
+                                         u32 vf_id)
 {
-       struct hns_roce_cmq_desc desc;
        struct hns_roce_vf_switch *swt;
+       struct hns_roce_cmq_desc desc;
        int ret;
 
        swt = (struct hns_roce_vf_switch *)desc.data;
@@ -1661,153 +1779,127 @@ static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev, int vf_id)
        return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
-static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev)
+static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev)
 {
-       struct hns_roce_cmq_desc desc[2];
-       struct hns_roce_vf_res_a *req_a;
-       struct hns_roce_vf_res_b *req_b;
+       u32 vf_id;
+       int ret;
+
+       for (vf_id = 0; vf_id < hr_dev->func_num; vf_id++) {
+               ret = __hns_roce_set_vf_switch_param(hr_dev, vf_id);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
 
-       req_a = (struct hns_roce_vf_res_a *)desc[0].data;
-       req_b = (struct hns_roce_vf_res_b *)desc[1].data;
+static int __hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev, int vf_id)
+{
+       struct hns_roce_cmq_desc desc[2];
+       struct hns_roce_cmq_req *r_a = (struct hns_roce_cmq_req *)desc[0].data;
+       struct hns_roce_cmq_req *r_b = (struct hns_roce_cmq_req *)desc[1].data;
+       enum hns_roce_opcode_type opcode = HNS_ROCE_OPC_ALLOC_VF_RES;
+       struct hns_roce_caps *caps = &hr_dev->caps;
 
-       hns_roce_cmq_setup_basic_desc(&desc[0], HNS_ROCE_OPC_ALLOC_VF_RES,
-                                     false);
+       hns_roce_cmq_setup_basic_desc(&desc[0], opcode, false);
        desc[0].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+       hns_roce_cmq_setup_basic_desc(&desc[1], opcode, false);
 
-       hns_roce_cmq_setup_basic_desc(&desc[1], HNS_ROCE_OPC_ALLOC_VF_RES,
-                                     false);
+       hr_reg_write(r_a, FUNC_RES_A_VF_ID, vf_id);
+
+       hr_reg_write(r_a, FUNC_RES_A_QPC_BT_NUM, caps->qpc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_QPC_BT_IDX, vf_id * caps->qpc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_SRQC_BT_NUM, caps->srqc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_SRQC_BT_IDX, vf_id * caps->srqc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_CQC_BT_NUM, caps->cqc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_CQC_BT_IDX, vf_id * caps->cqc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_MPT_BT_NUM, caps->mpt_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_MPT_BT_IDX, vf_id * caps->mpt_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_EQC_BT_NUM, caps->eqc_bt_num);
+       hr_reg_write(r_a, FUNC_RES_A_EQC_BT_IDX, vf_id * caps->eqc_bt_num);
+       hr_reg_write(r_b, FUNC_RES_V_QID_NUM, caps->sl_num);
+       hr_reg_write(r_b, FUNC_RES_B_QID_IDX, vf_id * caps->sl_num);
+       hr_reg_write(r_b, FUNC_RES_B_SCCC_BT_NUM, caps->sccc_bt_num);
+       hr_reg_write(r_b, FUNC_RES_B_SCCC_BT_IDX, vf_id * caps->sccc_bt_num);
 
-       roce_set_field(req_a->vf_qpc_bt_idx_num,
-                      VF_RES_A_DATA_1_VF_QPC_BT_IDX_M,
-                      VF_RES_A_DATA_1_VF_QPC_BT_IDX_S, 0);
-       roce_set_field(req_a->vf_qpc_bt_idx_num,
-                      VF_RES_A_DATA_1_VF_QPC_BT_NUM_M,
-                      VF_RES_A_DATA_1_VF_QPC_BT_NUM_S, HNS_ROCE_VF_QPC_BT_NUM);
-
-       roce_set_field(req_a->vf_srqc_bt_idx_num,
-                      VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M,
-                      VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S, 0);
-       roce_set_field(req_a->vf_srqc_bt_idx_num,
-                      VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M,
-                      VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S,
-                      HNS_ROCE_VF_SRQC_BT_NUM);
-
-       roce_set_field(req_a->vf_cqc_bt_idx_num,
-                      VF_RES_A_DATA_3_VF_CQC_BT_IDX_M,
-                      VF_RES_A_DATA_3_VF_CQC_BT_IDX_S, 0);
-       roce_set_field(req_a->vf_cqc_bt_idx_num,
-                      VF_RES_A_DATA_3_VF_CQC_BT_NUM_M,
-                      VF_RES_A_DATA_3_VF_CQC_BT_NUM_S, HNS_ROCE_VF_CQC_BT_NUM);
-
-       roce_set_field(req_a->vf_mpt_bt_idx_num,
-                      VF_RES_A_DATA_4_VF_MPT_BT_IDX_M,
-                      VF_RES_A_DATA_4_VF_MPT_BT_IDX_S, 0);
-       roce_set_field(req_a->vf_mpt_bt_idx_num,
-                      VF_RES_A_DATA_4_VF_MPT_BT_NUM_M,
-                      VF_RES_A_DATA_4_VF_MPT_BT_NUM_S, HNS_ROCE_VF_MPT_BT_NUM);
-
-       roce_set_field(req_a->vf_eqc_bt_idx_num, VF_RES_A_DATA_5_VF_EQC_IDX_M,
-                      VF_RES_A_DATA_5_VF_EQC_IDX_S, 0);
-       roce_set_field(req_a->vf_eqc_bt_idx_num, VF_RES_A_DATA_5_VF_EQC_NUM_M,
-                      VF_RES_A_DATA_5_VF_EQC_NUM_S, HNS_ROCE_VF_EQC_NUM);
-
-       roce_set_field(req_b->vf_smac_idx_num, VF_RES_B_DATA_1_VF_SMAC_IDX_M,
-                      VF_RES_B_DATA_1_VF_SMAC_IDX_S, 0);
-       roce_set_field(req_b->vf_smac_idx_num, VF_RES_B_DATA_1_VF_SMAC_NUM_M,
-                      VF_RES_B_DATA_1_VF_SMAC_NUM_S, HNS_ROCE_VF_SMAC_NUM);
-
-       roce_set_field(req_b->vf_sgid_idx_num, VF_RES_B_DATA_2_VF_SGID_IDX_M,
-                      VF_RES_B_DATA_2_VF_SGID_IDX_S, 0);
-       roce_set_field(req_b->vf_sgid_idx_num, VF_RES_B_DATA_2_VF_SGID_NUM_M,
-                      VF_RES_B_DATA_2_VF_SGID_NUM_S, HNS_ROCE_VF_SGID_NUM);
-
-       roce_set_field(req_b->vf_qid_idx_sl_num, VF_RES_B_DATA_3_VF_QID_IDX_M,
-                      VF_RES_B_DATA_3_VF_QID_IDX_S, 0);
-       roce_set_field(req_b->vf_qid_idx_sl_num, VF_RES_B_DATA_3_VF_SL_NUM_M,
-                      VF_RES_B_DATA_3_VF_SL_NUM_S, HNS_ROCE_VF_SL_NUM);
-
-       roce_set_field(req_b->vf_sccc_idx_num, VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
-                      VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
-       roce_set_field(req_b->vf_sccc_idx_num, VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
-                      VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
-                      HNS_ROCE_VF_SCCC_BT_NUM);
+       if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
+               hr_reg_write(r_b, FUNC_RES_V_GMV_BT_NUM, caps->gmv_bt_num);
+               hr_reg_write(r_b, FUNC_RES_B_GMV_BT_IDX,
+                            vf_id * caps->gmv_bt_num);
+       } else {
+               hr_reg_write(r_b, FUNC_RES_B_SGID_NUM, caps->sgid_bt_num);
+               hr_reg_write(r_b, FUNC_RES_B_SGID_IDX,
+                            vf_id * caps->sgid_bt_num);
+               hr_reg_write(r_b, FUNC_RES_B_SMAC_NUM, caps->smac_bt_num);
+               hr_reg_write(r_b, FUNC_RES_B_SMAC_IDX,
+                            vf_id * caps->smac_bt_num);
+       }
 
        return hns_roce_cmq_send(hr_dev, desc, 2);
 }
 
+/*
+ * Allocate the per-function resources for every function of this device.
+ *
+ * Calls __hns_roce_alloc_vf_resource() once for each function index in
+ * [0, hr_dev->func_num) and stops at the first failure.
+ *
+ * Return: 0 on success, otherwise the error code returned by the first
+ * failing __hns_roce_alloc_vf_resource() call.
+ */
+static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev)
+{
+       /*
+        * Same index type as the loop in hns_roce_set_vf_switch_param(),
+        * which walks the same hr_dev->func_num bound; keeps the loop
+        * condition free of a mixed signed/unsigned comparison.
+        */
+       u32 vf_id;
+       int ret;
+
+       for (vf_id = 0; vf_id < hr_dev->func_num; vf_id++) {
+               ret = __hns_roce_alloc_vf_resource(hr_dev, vf_id);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev)
 {
-       u8 srqc_hop_num = hr_dev->caps.srqc_hop_num;
-       u8 qpc_hop_num = hr_dev->caps.qpc_hop_num;
-       u8 cqc_hop_num = hr_dev->caps.cqc_hop_num;
-       u8 mpt_hop_num = hr_dev->caps.mpt_hop_num;
-       u8 sccc_hop_num = hr_dev->caps.sccc_hop_num;
-       struct hns_roce_cfg_bt_attr *req;
        struct hns_roce_cmq_desc desc;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+       struct hns_roce_caps *caps = &hr_dev->caps;
 
        hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_BT_ATTR, false);
-       req = (struct hns_roce_cfg_bt_attr *)desc.data;
-       memset(req, 0, sizeof(*req));
-
-       roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_M,
-                      CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S,
-                      hr_dev->caps.qpc_ba_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_M,
-                      CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_S,
-                      hr_dev->caps.qpc_buf_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_M,
-                      CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_S,
-                      qpc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : qpc_hop_num);
-
-       roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_M,
-                      CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_S,
-                      hr_dev->caps.srqc_ba_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_M,
-                      CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_S,
-                      hr_dev->caps.srqc_buf_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_M,
-                      CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_S,
-                      srqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : srqc_hop_num);
-
-       roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_M,
-                      CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_S,
-                      hr_dev->caps.cqc_ba_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_M,
-                      CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_S,
-                      hr_dev->caps.cqc_buf_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_M,
-                      CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_S,
-                      cqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : cqc_hop_num);
-
-       roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_M,
-                      CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_S,
-                      hr_dev->caps.mpt_ba_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_M,
-                      CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_S,
-                      hr_dev->caps.mpt_buf_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M,
-                      CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S,
-                      mpt_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : mpt_hop_num);
-
-       roce_set_field(req->vf_sccc_cfg,
-                      CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_M,
-                      CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_S,
-                      hr_dev->caps.sccc_ba_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_sccc_cfg,
-                      CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_M,
-                      CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_S,
-                      hr_dev->caps.sccc_buf_pg_sz + PG_SHIFT_OFFSET);
-       roce_set_field(req->vf_sccc_cfg,
-                      CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_M,
-                      CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_S,
-                      sccc_hop_num ==
-                             HNS_ROCE_HOP_NUM_0 ? 0 : sccc_hop_num);
+
+       hr_reg_write(req, CFG_BT_ATTR_QPC_BA_PGSZ,
+                    caps->qpc_ba_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_QPC_BUF_PGSZ,
+                    caps->qpc_buf_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_QPC_HOPNUM,
+                    to_hr_hem_hopnum(caps->qpc_hop_num, caps->num_qps));
+
+       hr_reg_write(req, CFG_BT_ATTR_SRQC_BA_PGSZ,
+                    caps->srqc_ba_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_SRQC_BUF_PGSZ,
+                    caps->srqc_buf_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_SRQC_HOPNUM,
+                    to_hr_hem_hopnum(caps->srqc_hop_num, caps->num_srqs));
+
+       hr_reg_write(req, CFG_BT_ATTR_CQC_BA_PGSZ,
+                    caps->cqc_ba_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_CQC_BUF_PGSZ,
+                    caps->cqc_buf_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_CQC_HOPNUM,
+                    to_hr_hem_hopnum(caps->cqc_hop_num, caps->num_cqs));
+
+       hr_reg_write(req, CFG_BT_ATTR_MPT_BA_PGSZ,
+                    caps->mpt_ba_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_MPT_BUF_PGSZ,
+                    caps->mpt_buf_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_MPT_HOPNUM,
+                    to_hr_hem_hopnum(caps->mpt_hop_num, caps->num_mtpts));
+
+       hr_reg_write(req, CFG_BT_ATTR_SCCC_BA_PGSZ,
+                    caps->sccc_ba_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_SCCC_BUF_PGSZ,
+                    caps->sccc_buf_pg_sz + PG_SHIFT_OFFSET);
+       hr_reg_write(req, CFG_BT_ATTR_SCCC_HOPNUM,
+                    to_hr_hem_hopnum(caps->sccc_hop_num, caps->num_qps));
 
        return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
 static void set_default_caps(struct hns_roce_dev *hr_dev)
 {
+       struct hns_roce_v2_priv *priv = hr_dev->priv;
        struct hns_roce_caps *caps = &hr_dev->caps;
 
        caps->num_qps           = HNS_ROCE_V2_MAX_QP_NUM;
@@ -1819,24 +1911,24 @@ static void set_default_caps(struct hns_roce_dev *hr_dev)
        caps->max_sq_sg         = HNS_ROCE_V2_MAX_SQ_SGE_NUM;
        caps->max_extend_sg     = HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
        caps->max_rq_sg         = HNS_ROCE_V2_MAX_RQ_SGE_NUM;
-       caps->max_sq_inline     = HNS_ROCE_V2_MAX_SQ_INLINE;
        caps->num_uars          = HNS_ROCE_V2_UAR_NUM;
        caps->phy_num_uars      = HNS_ROCE_V2_PHY_UAR_NUM;
        caps->num_aeq_vectors   = HNS_ROCE_V2_AEQE_VEC_NUM;
-       caps->num_comp_vectors  = HNS_ROCE_V2_COMP_VEC_NUM;
+       caps->num_comp_vectors  =
+                       min_t(u32, caps->eqc_bt_num - 1,
+                             (u32)priv->handle->rinfo.num_vectors - 2);
        caps->num_other_vectors = HNS_ROCE_V2_ABNORMAL_VEC_NUM;
        caps->num_mtpts         = HNS_ROCE_V2_MAX_MTPT_NUM;
        caps->num_mtt_segs      = HNS_ROCE_V2_MAX_MTT_SEGS;
-       caps->num_cqe_segs      = HNS_ROCE_V2_MAX_CQE_SEGS;
        caps->num_srqwqe_segs   = HNS_ROCE_V2_MAX_SRQWQE_SEGS;
        caps->num_idx_segs      = HNS_ROCE_V2_MAX_IDX_SEGS;
        caps->num_pds           = HNS_ROCE_V2_MAX_PD_NUM;
+       caps->num_xrcds         = HNS_ROCE_V2_MAX_XRCD_NUM;
        caps->max_qp_init_rdma  = HNS_ROCE_V2_MAX_QP_INIT_RDMA;
        caps->max_qp_dest_rdma  = HNS_ROCE_V2_MAX_QP_DEST_RDMA;
        caps->max_sq_desc_sz    = HNS_ROCE_V2_MAX_SQ_DESC_SZ;
        caps->max_rq_desc_sz    = HNS_ROCE_V2_MAX_RQ_DESC_SZ;
        caps->max_srq_desc_sz   = HNS_ROCE_V2_MAX_SRQ_DESC_SZ;
-       caps->qpc_sz            = HNS_ROCE_V2_QPC_SZ;
        caps->irrl_entry_sz     = HNS_ROCE_V2_IRRL_ENTRY_SZ;
        caps->trrl_entry_sz     = HNS_ROCE_V2_EXT_ATOMIC_TRRL_ENTRY_SZ;
        caps->cqc_entry_sz      = HNS_ROCE_V2_CQC_ENTRY_SZ;
@@ -1844,56 +1936,39 @@ static void set_default_caps(struct hns_roce_dev *hr_dev)
        caps->mtpt_entry_sz     = HNS_ROCE_V2_MTPT_ENTRY_SZ;
        caps->mtt_entry_sz      = HNS_ROCE_V2_MTT_ENTRY_SZ;
        caps->idx_entry_sz      = HNS_ROCE_V2_IDX_ENTRY_SZ;
-       caps->cqe_sz            = HNS_ROCE_V2_CQE_SIZE;
        caps->page_size_cap     = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
        caps->reserved_lkey     = 0;
        caps->reserved_pds      = 0;
+       caps->reserved_xrcds    = HNS_ROCE_V2_RSV_XRCD_NUM;
        caps->reserved_mrws     = 1;
        caps->reserved_uars     = 0;
        caps->reserved_cqs      = 0;
        caps->reserved_srqs     = 0;
        caps->reserved_qps      = HNS_ROCE_V2_RSV_QPS;
 
-       caps->qpc_ba_pg_sz      = 0;
-       caps->qpc_buf_pg_sz     = 0;
        caps->qpc_hop_num       = HNS_ROCE_CONTEXT_HOP_NUM;
-       caps->srqc_ba_pg_sz     = 0;
-       caps->srqc_buf_pg_sz    = 0;
        caps->srqc_hop_num      = HNS_ROCE_CONTEXT_HOP_NUM;
-       caps->cqc_ba_pg_sz      = 0;
-       caps->cqc_buf_pg_sz     = 0;
        caps->cqc_hop_num       = HNS_ROCE_CONTEXT_HOP_NUM;
-       caps->mpt_ba_pg_sz      = 0;
-       caps->mpt_buf_pg_sz     = 0;
        caps->mpt_hop_num       = HNS_ROCE_CONTEXT_HOP_NUM;
-       caps->mtt_ba_pg_sz      = 0;
-       caps->mtt_buf_pg_sz     = 0;
        caps->mtt_hop_num       = HNS_ROCE_MTT_HOP_NUM;
+       caps->pbl_hop_num       = HNS_ROCE_PBL_HOP_NUM;
        caps->wqe_sq_hop_num    = HNS_ROCE_SQWQE_HOP_NUM;
        caps->wqe_sge_hop_num   = HNS_ROCE_EXT_SGE_HOP_NUM;
        caps->wqe_rq_hop_num    = HNS_ROCE_RQWQE_HOP_NUM;
-       caps->cqe_ba_pg_sz      = HNS_ROCE_BA_PG_SZ_SUPPORTED_256K;
-       caps->cqe_buf_pg_sz     = 0;
        caps->cqe_hop_num       = HNS_ROCE_CQE_HOP_NUM;
-       caps->srqwqe_ba_pg_sz   = 0;
-       caps->srqwqe_buf_pg_sz  = 0;
        caps->srqwqe_hop_num    = HNS_ROCE_SRQWQE_HOP_NUM;
-       caps->idx_ba_pg_sz      = 0;
-       caps->idx_buf_pg_sz     = 0;
        caps->idx_hop_num       = HNS_ROCE_IDX_HOP_NUM;
-       caps->chunk_sz          = HNS_ROCE_V2_TABLE_CHUNK_SIZE;
+       caps->eqe_hop_num       = HNS_ROCE_EQE_HOP_NUM;
+       caps->chunk_sz          = HNS_ROCE_V2_TABLE_CHUNK_SIZE;
 
        caps->flags             = HNS_ROCE_CAP_FLAG_REREG_MR |
                                  HNS_ROCE_CAP_FLAG_ROCE_V1_V2 |
-                                 HNS_ROCE_CAP_FLAG_RECORD_DB |
-                                 HNS_ROCE_CAP_FLAG_SQ_RECORD_DB;
+                                 HNS_ROCE_CAP_FLAG_CQ_RECORD_DB |
+                                 HNS_ROCE_CAP_FLAG_QP_RECORD_DB;
 
        caps->pkey_table_len[0] = 1;
-       caps->gid_table_len[0]  = HNS_ROCE_V2_GID_INDEX_NUM;
        caps->ceqe_depth        = HNS_ROCE_V2_COMP_EQE_NUM;
        caps->aeqe_depth        = HNS_ROCE_V2_ASYNC_EQE_NUM;
-       caps->aeqe_size         = HNS_ROCE_AEQE_SIZE;
-       caps->ceqe_size         = HNS_ROCE_CEQE_SIZE;
        caps->local_ca_ack_delay = 0;
        caps->max_mtu = IB_MTU_4096;
 
@@ -1902,22 +1977,15 @@ static void set_default_caps(struct hns_roce_dev *hr_dev)
 
        caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW |
                       HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR |
-                      HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
+                      HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL | HNS_ROCE_CAP_FLAG_XRC;
 
        caps->num_qpc_timer       = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
        caps->qpc_timer_entry_sz  = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
-       caps->qpc_timer_ba_pg_sz  = 0;
-       caps->qpc_timer_buf_pg_sz = 0;
        caps->qpc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
        caps->num_cqc_timer       = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
        caps->cqc_timer_entry_sz  = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
-       caps->cqc_timer_ba_pg_sz  = 0;
-       caps->cqc_timer_buf_pg_sz = 0;
        caps->cqc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
 
-       caps->sccc_sz = HNS_ROCE_V2_SCCC_SZ;
-       caps->sccc_ba_pg_sz       = 0;
-       caps->sccc_buf_pg_sz      = 0;
        caps->sccc_hop_num        = HNS_ROCE_SCCC_HOP_NUM;
 
        if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
@@ -1930,10 +1998,17 @@ static void set_default_caps(struct hns_roce_dev *hr_dev)
                caps->gmv_entry_num = caps->gmv_bt_num * (PAGE_SIZE /
                                                          caps->gmv_entry_sz);
                caps->gmv_hop_num = HNS_ROCE_HOP_NUM_0;
-               caps->gmv_ba_pg_sz = 0;
-               caps->gmv_buf_pg_sz = 0;
                caps->gid_table_len[0] = caps->gmv_bt_num * (HNS_HW_PAGE_SIZE /
                                         caps->gmv_entry_sz);
+               caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INL_EXT;
+       } else {
+               caps->aeqe_size = HNS_ROCE_AEQE_SIZE;
+               caps->ceqe_size = HNS_ROCE_CEQE_SIZE;
+               caps->cqe_sz = HNS_ROCE_V2_CQE_SIZE;
+               caps->qpc_sz = HNS_ROCE_V2_QPC_SZ;
+               caps->sccc_sz = HNS_ROCE_V2_SCCC_SZ;
+               caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM;
+               caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE;
        }
 }
 
@@ -1979,6 +2054,70 @@ static void calc_pg_sz(u32 obj_num, u32 obj_size, u32 hop_num, u32 ctx_bt_num,
                *buf_page_size = ilog2(DIV_ROUND_UP(obj_num, obj_per_chunk));
 }
 
+/*
+ * Fill in the *_ba_pg_sz and *_buf_pg_sz fields of hr_dev->caps.
+ *
+ * Some of them are assigned fixed values; the others are produced by
+ * calc_pg_sz() from the object counts, entry sizes, hop numbers and
+ * *_bt_num values that are already stored in caps when this runs.
+ */
+static void set_hem_page_size(struct hns_roce_dev *hr_dev)
+{
+       struct hns_roce_caps *caps = &hr_dev->caps;
+
+       /* Fixed values: EQ, Link Table, PBL (MR), MTT, QPC timer and GMV */
+       caps->eqe_ba_pg_sz = 0;
+       caps->eqe_buf_pg_sz = 0;
+       caps->tsq_buf_pg_sz = 0;
+       caps->pbl_ba_pg_sz = HNS_ROCE_BA_PG_SZ_SUPPORTED_16K;
+       caps->pbl_buf_pg_sz = 0;
+       caps->mtt_ba_pg_sz = 0;
+       caps->mtt_buf_pg_sz = 0;
+       caps->qpc_timer_ba_pg_sz = 0;
+       caps->qpc_timer_buf_pg_sz = 0;
+       caps->gmv_ba_pg_sz = 0;
+       caps->gmv_buf_pg_sz = 0;
+
+       /* Calculated unconditionally: QPC, MPT, CQC and CQE */
+       calc_pg_sz(caps->num_qps, caps->qpc_sz, caps->qpc_hop_num,
+                  caps->qpc_bt_num,
+                  &caps->qpc_buf_pg_sz, &caps->qpc_ba_pg_sz, HEM_TYPE_QPC);
+       calc_pg_sz(caps->num_mtpts, caps->mtpt_entry_sz, caps->mpt_hop_num,
+                  caps->mpt_bt_num,
+                  &caps->mpt_buf_pg_sz, &caps->mpt_ba_pg_sz, HEM_TYPE_MTPT);
+       calc_pg_sz(caps->num_cqs, caps->cqc_entry_sz, caps->cqc_hop_num,
+                  caps->cqc_bt_num,
+                  &caps->cqc_buf_pg_sz, &caps->cqc_ba_pg_sz, HEM_TYPE_CQC);
+       calc_pg_sz(caps->max_cqes, caps->cqe_sz, caps->cqe_hop_num, 1,
+                  &caps->cqe_buf_pg_sz, &caps->cqe_ba_pg_sz, HEM_TYPE_CQE);
+
+       /* SCCC is only sized when QP flow control is enabled */
+       if (caps->flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL)
+               calc_pg_sz(caps->num_qps, caps->sccc_sz, caps->sccc_hop_num,
+                          caps->sccc_bt_num,
+                          &caps->sccc_buf_pg_sz, &caps->sccc_ba_pg_sz,
+                          HEM_TYPE_SCCC);
+
+       /* CQC timer is only sized when its entry size is non-zero */
+       if (caps->cqc_timer_entry_sz)
+               calc_pg_sz(caps->num_cqc_timer, caps->cqc_timer_entry_sz,
+                          caps->cqc_timer_hop_num, caps->cqc_timer_bt_num,
+                          &caps->cqc_timer_buf_pg_sz,
+                          &caps->cqc_timer_ba_pg_sz, HEM_TYPE_CQC_TIMER);
+
+       /* SRQ context, SRQ WQE and index tables need the SRQ capability */
+       if (caps->flags & HNS_ROCE_CAP_FLAG_SRQ) {
+               calc_pg_sz(caps->num_srqs, caps->srqc_entry_sz,
+                          caps->srqc_hop_num, caps->srqc_bt_num,
+                          &caps->srqc_buf_pg_sz, &caps->srqc_ba_pg_sz,
+                          HEM_TYPE_SRQC);
+               calc_pg_sz(caps->num_srqwqe_segs, caps->mtt_entry_sz,
+                          caps->srqwqe_hop_num, 1,
+                          &caps->srqwqe_buf_pg_sz, &caps->srqwqe_ba_pg_sz,
+                          HEM_TYPE_SRQWQE);
+               calc_pg_sz(caps->num_idx_segs, caps->idx_entry_sz,
+                          caps->idx_hop_num, 1,
+                          &caps->idx_buf_pg_sz, &caps->idx_ba_pg_sz,
+                          HEM_TYPE_IDX);
+       }
+}
+
 static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_cmq_desc desc[HNS_ROCE_QUERY_PF_CAPS_CMD_NUM];
@@ -2062,6 +2201,9 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
        caps->gid_table_len[0] = roce_get_field(resp_c->max_gid_num_cqs,
                                                V2_QUERY_PF_CAPS_C_MAX_GID_M,
                                                V2_QUERY_PF_CAPS_C_MAX_GID_S);
+
+       caps->gid_table_len[0] /= hr_dev->func_num;
+
        caps->max_cqes = 1 << roce_get_field(resp_c->cq_depth,
                                             V2_QUERY_PF_CAPS_C_CQ_DEPTH_M,
                                             V2_QUERY_PF_CAPS_C_CQ_DEPTH_S);
@@ -2079,13 +2221,18 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
        caps->num_srqs = 1 << roce_get_field(resp_d->wq_hop_num_max_srqs,
                                             V2_QUERY_PF_CAPS_D_NUM_SRQS_M,
                                             V2_QUERY_PF_CAPS_D_NUM_SRQS_S);
+       caps->cong_type = roce_get_field(resp_d->wq_hop_num_max_srqs,
+                                        V2_QUERY_PF_CAPS_D_CONG_TYPE_M,
+                                        V2_QUERY_PF_CAPS_D_CONG_TYPE_S);
        caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
+
        caps->ceqe_depth = 1 << roce_get_field(resp_d->num_ceqs_ceq_depth,
                                               V2_QUERY_PF_CAPS_D_CEQ_DEPTH_M,
                                               V2_QUERY_PF_CAPS_D_CEQ_DEPTH_S);
        caps->num_comp_vectors = roce_get_field(resp_d->num_ceqs_ceq_depth,
                                                V2_QUERY_PF_CAPS_D_NUM_CEQS_M,
                                                V2_QUERY_PF_CAPS_D_NUM_CEQS_S);
+
        caps->aeqe_depth = 1 << roce_get_field(resp_d->arm_st_aeq_depth,
                                               V2_QUERY_PF_CAPS_D_AEQ_DEPTH_M,
                                               V2_QUERY_PF_CAPS_D_AEQ_DEPTH_S);
@@ -2133,8 +2280,8 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
        caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS;
        caps->ceqe_size = HNS_ROCE_CEQE_SIZE;
        caps->aeqe_size = HNS_ROCE_AEQE_SIZE;
-       caps->mtt_ba_pg_sz = 0;
-       caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS;
+       caps->num_xrcds = HNS_ROCE_V2_MAX_XRCD_NUM;
+       caps->reserved_xrcds = HNS_ROCE_V2_RSV_XRCD_NUM;
        caps->num_srqwqe_segs = HNS_ROCE_V2_MAX_SRQWQE_SEGS;
        caps->num_idx_segs = HNS_ROCE_V2_MAX_IDX_SEGS;
 
@@ -2166,99 +2313,82 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
                caps->gmv_entry_num = caps->gmv_bt_num * (PAGE_SIZE /
                                                    caps->gmv_entry_sz);
                caps->gmv_hop_num = HNS_ROCE_HOP_NUM_0;
-               caps->gmv_ba_pg_sz = 0;
-               caps->gmv_buf_pg_sz = 0;
                caps->gid_table_len[0] = caps->gmv_bt_num *
                                (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz);
        }
 
-       calc_pg_sz(caps->num_qps, caps->qpc_sz, caps->qpc_hop_num,
-                  caps->qpc_bt_num, &caps->qpc_buf_pg_sz, &caps->qpc_ba_pg_sz,
-                  HEM_TYPE_QPC);
-       calc_pg_sz(caps->num_mtpts, caps->mtpt_entry_sz, caps->mpt_hop_num,
-                  caps->mpt_bt_num, &caps->mpt_buf_pg_sz, &caps->mpt_ba_pg_sz,
-                  HEM_TYPE_MTPT);
-       calc_pg_sz(caps->num_cqs, caps->cqc_entry_sz, caps->cqc_hop_num,
-                  caps->cqc_bt_num, &caps->cqc_buf_pg_sz, &caps->cqc_ba_pg_sz,
-                  HEM_TYPE_CQC);
-       calc_pg_sz(caps->num_srqs, caps->srqc_entry_sz, caps->srqc_hop_num,
-                  caps->srqc_bt_num, &caps->srqc_buf_pg_sz,
-                  &caps->srqc_ba_pg_sz, HEM_TYPE_SRQC);
-
-       caps->sccc_hop_num = ctx_hop_num;
        caps->qpc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
        caps->cqc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
 
-       calc_pg_sz(caps->num_qps, caps->sccc_sz,
-                  caps->sccc_hop_num, caps->sccc_bt_num,
-                  &caps->sccc_buf_pg_sz, &caps->sccc_ba_pg_sz,
-                  HEM_TYPE_SCCC);
-       calc_pg_sz(caps->num_cqc_timer, caps->cqc_timer_entry_sz,
-                  caps->cqc_timer_hop_num, caps->cqc_timer_bt_num,
-                  &caps->cqc_timer_buf_pg_sz,
-                  &caps->cqc_timer_ba_pg_sz, HEM_TYPE_CQC_TIMER);
-
-       calc_pg_sz(caps->num_cqe_segs, caps->mtt_entry_sz, caps->cqe_hop_num,
-                  1, &caps->cqe_buf_pg_sz, &caps->cqe_ba_pg_sz, HEM_TYPE_CQE);
-       calc_pg_sz(caps->num_srqwqe_segs, caps->mtt_entry_sz,
-                  caps->srqwqe_hop_num, 1, &caps->srqwqe_buf_pg_sz,
-                  &caps->srqwqe_ba_pg_sz, HEM_TYPE_SRQWQE);
-       calc_pg_sz(caps->num_idx_segs, caps->idx_entry_sz, caps->idx_hop_num,
-                  1, &caps->idx_buf_pg_sz, &caps->idx_ba_pg_sz, HEM_TYPE_IDX);
-
        return 0;
 }
 
-static int hns_roce_config_qpc_size(struct hns_roce_dev *hr_dev)
+static int config_hem_entry_size(struct hns_roce_dev *hr_dev, u32 type, u32 val)
 {
        struct hns_roce_cmq_desc desc;
-       struct hns_roce_cfg_entry_size *cfg_size =
-                                 (struct hns_roce_cfg_entry_size *)desc.data;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
 
        hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_ENTRY_SIZE,
                                      false);
 
-       cfg_size->type = cpu_to_le32(HNS_ROCE_CFG_QPC_SIZE);
-       cfg_size->size = cpu_to_le32(hr_dev->caps.qpc_sz);
-
-       return hns_roce_cmq_send(hr_dev, &desc, 1);
-}
-
-static int hns_roce_config_sccc_size(struct hns_roce_dev *hr_dev)
-{
-       struct hns_roce_cmq_desc desc;
-       struct hns_roce_cfg_entry_size *cfg_size =
-                                 (struct hns_roce_cfg_entry_size *)desc.data;
-
-       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_ENTRY_SIZE,
-                                     false);
-
-       cfg_size->type = cpu_to_le32(HNS_ROCE_CFG_SCCC_SIZE);
-       cfg_size->size = cpu_to_le32(hr_dev->caps.sccc_sz);
+       hr_reg_write(req, CFG_HEM_ENTRY_SIZE_TYPE, type);
+       hr_reg_write(req, CFG_HEM_ENTRY_SIZE_VALUE, val);
 
        return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
 static int hns_roce_config_entry_size(struct hns_roce_dev *hr_dev)
 {
+       struct hns_roce_caps *caps = &hr_dev->caps;
        int ret;
 
        if (hr_dev->pci_dev->revision < PCI_REVISION_ID_HIP09)
                return 0;
 
-       ret = hns_roce_config_qpc_size(hr_dev);
+       ret = config_hem_entry_size(hr_dev, HNS_ROCE_CFG_QPC_SIZE,
+                                   caps->qpc_sz);
        if (ret) {
                dev_err(hr_dev->dev, "failed to cfg qpc sz, ret = %d.\n", ret);
                return ret;
        }
 
-       ret = hns_roce_config_sccc_size(hr_dev);
+       ret = config_hem_entry_size(hr_dev, HNS_ROCE_CFG_SCCC_SIZE,
+                                   caps->sccc_sz);
        if (ret)
                dev_err(hr_dev->dev, "failed to cfg sccc sz, ret = %d.\n", ret);
 
        return ret;
 }
 
+/*
+ * Set up the capability profile on the VF path of hns_roce_v2_profile().
+ *
+ * Records the identification fields, fixes func_num at 1, queries the VF
+ * resources, fills in the default caps and HEM page sizes, and finally
+ * configures the BT attributes.
+ *
+ * Return: 0 on success, otherwise the error code of the failing step.
+ */
+static int hns_roce_v2_vf_profile(struct hns_roce_dev *hr_dev)
+{
+       int ret;
+
+       /* func_num is set before the resource query is issued */
+       hr_dev->func_num = 1;
+       hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
+       hr_dev->vendor_part_id = hr_dev->pci_dev->device;
+
+       ret = hns_roce_query_vf_resource(hr_dev);
+       if (ret) {
+               dev_err(hr_dev->dev,
+                       "Query the VF resource fail, ret = %d.\n", ret);
+               return ret;
+       }
+
+       /* Defaults and page sizes are derived after the query succeeded */
+       set_default_caps(hr_dev);
+       set_hem_page_size(hr_dev);
+
+       ret = hns_roce_v2_set_bt(hr_dev);
+       if (ret)
+               dev_err(hr_dev->dev,
+                       "Configure the VF bt attribute fail, ret = %d.\n",
+                       ret);
+
+       return ret;
+}
+
 static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_caps *caps = &hr_dev->caps;
@@ -2278,6 +2408,16 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
                return ret;
        }
 
+       if (hr_dev->is_vf)
+               return hns_roce_v2_vf_profile(hr_dev);
+
+       ret = hns_roce_query_func_info(hr_dev);
+       if (ret) {
+               dev_err(hr_dev->dev, "Query function info fail, ret = %d.\n",
+                       ret);
+               return ret;
+       }
+
        ret = hns_roce_config_global_param(hr_dev);
        if (ret) {
                dev_err(hr_dev->dev, "Configure global param fail, ret = %d.\n",
@@ -2300,7 +2440,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
                return ret;
        }
 
-       ret = hns_roce_set_vf_switch_param(hr_dev, 0);
+       ret = hns_roce_set_vf_switch_param(hr_dev);
        if (ret) {
                dev_err(hr_dev->dev,
                        "failed to set function switch param, ret = %d.\n",
@@ -2311,13 +2451,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
        hr_dev->vendor_part_id = hr_dev->pci_dev->device;
        hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
 
-       caps->pbl_ba_pg_sz      = HNS_ROCE_BA_PG_SZ_SUPPORTED_16K;
-       caps->pbl_buf_pg_sz     = 0;
        caps->pbl_hop_num       = HNS_ROCE_PBL_HOP_NUM;
-       caps->eqe_ba_pg_sz      = 0;
-       caps->eqe_buf_pg_sz     = 0;
        caps->eqe_hop_num       = HNS_ROCE_EQE_HOP_NUM;
-       caps->tsq_buf_pg_sz     = 0;
 
        ret = hns_roce_query_pf_caps(hr_dev);
        if (ret)
@@ -2330,6 +2465,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
                return ret;
        }
 
+       set_hem_page_size(hr_dev);
        ret = hns_roce_v2_set_bt(hr_dev);
        if (ret) {
                dev_err(hr_dev->dev,
@@ -2507,6 +2643,22 @@ static void hns_roce_free_link_table(struct hns_roce_dev *hr_dev,
                          link_tbl->table.map);
 }
 
+static void free_dip_list(struct hns_roce_dev *hr_dev)
+{
+       struct hns_roce_dip *hr_dip;
+       struct hns_roce_dip *tmp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&hr_dev->dip_list_lock, flags);
+
+       list_for_each_entry_safe(hr_dip, tmp, &hr_dev->dip_list, node) {
+               list_del(&hr_dip->node);
+               kfree(hr_dip);
+       }
+
+       spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags);
+}
+
 static int get_hem_table(struct hns_roce_dev *hr_dev)
 {
        unsigned int qpc_count;
@@ -2515,6 +2667,17 @@ static int get_hem_table(struct hns_roce_dev *hr_dev)
        int ret;
        int i;
 
+       /* Alloc memory for GMV (source address) table buffer space chunk */
+       for (gmv_count = 0; gmv_count < hr_dev->caps.gmv_entry_num;
+            gmv_count++) {
+               ret = hns_roce_table_get(hr_dev, &hr_dev->gmv_table, gmv_count);
+               if (ret)
+                       goto err_gmv_failed;
+       }
+
+       if (hr_dev->is_vf)
+               return 0;
+
        /* Alloc memory for QPC Timer buffer space chunk */
        for (qpc_count = 0; qpc_count < hr_dev->caps.qpc_timer_bt_num;
             qpc_count++) {
@@ -2537,23 +2700,8 @@ static int get_hem_table(struct hns_roce_dev *hr_dev)
                }
        }
 
-       /* Alloc memory for GMV(GID/MAC/VLAN) table buffer space chunk */
-       for (gmv_count = 0; gmv_count < hr_dev->caps.gmv_entry_num;
-            gmv_count++) {
-               ret = hns_roce_table_get(hr_dev, &hr_dev->gmv_table, gmv_count);
-               if (ret) {
-                       dev_err(hr_dev->dev,
-                               "failed to get gmv table, ret = %d.\n", ret);
-                       goto err_gmv_failed;
-               }
-       }
-
        return 0;
 
-err_gmv_failed:
-       for (i = 0; i < gmv_count; i++)
-               hns_roce_table_put(hr_dev, &hr_dev->gmv_table, i);
-
 err_cqc_timer_failed:
        for (i = 0; i < cqc_count; i++)
                hns_roce_table_put(hr_dev, &hr_dev->cqc_timer_table, i);
@@ -2562,19 +2710,47 @@ err_qpc_timer_failed:
        for (i = 0; i < qpc_count; i++)
                hns_roce_table_put(hr_dev, &hr_dev->qpc_timer_table, i);
 
+err_gmv_failed:
+       for (i = 0; i < gmv_count; i++)
+               hns_roce_table_put(hr_dev, &hr_dev->gmv_table, i);
+
        return ret;
 }
 
+static void put_hem_table(struct hns_roce_dev *hr_dev)
+{
+       int i;
+
+       for (i = 0; i < hr_dev->caps.gmv_entry_num; i++)
+               hns_roce_table_put(hr_dev, &hr_dev->gmv_table, i);
+
+       if (hr_dev->is_vf)
+               return;
+
+       for (i = 0; i < hr_dev->caps.qpc_timer_bt_num; i++)
+               hns_roce_table_put(hr_dev, &hr_dev->qpc_timer_table, i);
+
+       for (i = 0; i < hr_dev->caps.cqc_timer_bt_num; i++)
+               hns_roce_table_put(hr_dev, &hr_dev->cqc_timer_table, i);
+}
+
 static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_v2_priv *priv = hr_dev->priv;
        int ret;
 
+       ret = get_hem_table(hr_dev);
+       if (ret)
+               return ret;
+
+       if (hr_dev->is_vf)
+               return 0;
+
        /* TSQ includes SQ doorbell and ack doorbell */
        ret = hns_roce_init_link_table(hr_dev, TSQ_LINK_TABLE);
        if (ret) {
                dev_err(hr_dev->dev, "failed to init TSQ, ret = %d.\n", ret);
-               return ret;
+               goto err_tsq_init_failed;
        }
 
        ret = hns_roce_init_link_table(hr_dev, TPQ_LINK_TABLE);
@@ -2583,17 +2759,13 @@ static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
                goto err_tpq_init_failed;
        }
 
-       ret = get_hem_table(hr_dev);
-       if (ret)
-               goto err_get_hem_table_failed;
-
        return 0;
 
-err_get_hem_table_failed:
-       hns_roce_free_link_table(hr_dev, &priv->tpq);
+err_tsq_init_failed:
+       put_hem_table(hr_dev);
 
 err_tpq_init_failed:
-       hns_roce_free_link_table(hr_dev, &priv->tsq);
+       hns_roce_free_link_table(hr_dev, &priv->tpq);
 
        return ret;
 }
@@ -2604,38 +2776,13 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 
        hns_roce_function_clear(hr_dev);
 
-       hns_roce_free_link_table(hr_dev, &priv->tpq);
-       hns_roce_free_link_table(hr_dev, &priv->tsq);
-}
-
-static int hns_roce_query_mbox_status(struct hns_roce_dev *hr_dev)
-{
-       struct hns_roce_cmq_desc desc;
-       struct hns_roce_mbox_status *mb_st =
-                                      (struct hns_roce_mbox_status *)desc.data;
-       int status;
-
-       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST, true);
-
-       status = hns_roce_cmq_send(hr_dev, &desc, 1);
-       if (status)
-               return status;
-
-       return le32_to_cpu(mb_st->mb_status_hw_run);
-}
-
-static int hns_roce_v2_cmd_pending(struct hns_roce_dev *hr_dev)
-{
-       u32 status = hns_roce_query_mbox_status(hr_dev);
-
-       return status >> HNS_ROCE_HW_RUN_BIT_SHIFT;
-}
-
-static int hns_roce_v2_cmd_complete(struct hns_roce_dev *hr_dev)
-{
-       u32 status = hns_roce_query_mbox_status(hr_dev);
+       if (!hr_dev->is_vf) {
+               hns_roce_free_link_table(hr_dev, &priv->tpq);
+               hns_roce_free_link_table(hr_dev, &priv->tsq);
+       }
 
-       return status & HNS_ROCE_HW_MB_STATUS_MASK;
+       if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP09)
+               free_dip_list(hr_dev);
 }
 
 static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param,
@@ -2657,58 +2804,97 @@ static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param,
        return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
-static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param,
-                                u64 out_param, u32 in_modifier, u8 op_modifier,
-                                u16 op, u16 token, int event)
+static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout,
+                                u8 *complete_status)
 {
-       struct device *dev = hr_dev->dev;
+       struct hns_roce_mbox_status *mb_st;
+       struct hns_roce_cmq_desc desc;
        unsigned long end;
-       int ret;
+       int ret = -EBUSY;
+       u32 status;
+       bool busy;
+
+       mb_st = (struct hns_roce_mbox_status *)desc.data;
+       end = msecs_to_jiffies(timeout) + jiffies;
+       while (v2_chk_mbox_is_avail(hr_dev, &busy)) {
+               status = 0;
+               hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST,
+                                             true);
+               ret = __hns_roce_cmq_send(hr_dev, &desc, 1);
+               if (!ret) {
+                       status = le32_to_cpu(mb_st->mb_status_hw_run);
+                       /* No pending message exists in ROCEE mbox. */
+                       if (!(status & MB_ST_HW_RUN_M))
+                               break;
+               } else if (!v2_chk_mbox_is_avail(hr_dev, &busy)) {
+                       break;
+               }
 
-       end = msecs_to_jiffies(HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS) + jiffies;
-       while (hns_roce_v2_cmd_pending(hr_dev)) {
                if (time_after(jiffies, end)) {
-                       dev_dbg(dev, "jiffies=%d end=%d\n", (int)jiffies,
-                               (int)end);
-                       return -EAGAIN;
+                       dev_err_ratelimited(hr_dev->dev,
+                                           "failed to wait mbox status 0x%x\n",
+                                           status);
+                       return -ETIMEDOUT;
                }
+
                cond_resched();
+               ret = -EBUSY;
        }
 
-       ret = hns_roce_mbox_post(hr_dev, in_param, out_param, in_modifier,
-                                op_modifier, op, token, event);
-       if (ret)
-               dev_err(dev, "Post mailbox fail(%d)\n", ret);
+       if (!ret) {
+               *complete_status = (u8)(status & MB_ST_COMPLETE_M);
+       } else if (!v2_chk_mbox_is_avail(hr_dev, &busy)) {
+               /* Ignore all errors if the mbox is unavailable. */
+               ret = 0;
+               *complete_status = MB_ST_COMPLETE_M;
+       }
 
        return ret;
 }
 
-static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,
-                               unsigned int timeout)
+static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param,
+                       u64 out_param, u32 in_modifier, u8 op_modifier,
+                       u16 op, u16 token, int event)
 {
-       struct device *dev = hr_dev->dev;
-       unsigned long end;
-       u32 status;
-
-       end = msecs_to_jiffies(timeout) + jiffies;
-       while (hns_roce_v2_cmd_pending(hr_dev) && time_before(jiffies, end))
-               cond_resched();
+       u8 status = 0;
+       int ret;
 
-       if (hns_roce_v2_cmd_pending(hr_dev)) {
-               dev_err(dev, "[cmd_poll]hw run cmd TIMEDOUT!\n");
-               return -ETIMEDOUT;
+       /* Waiting for the mbox to be idle */
+       ret = v2_wait_mbox_complete(hr_dev, HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS,
+                                   &status);
+       if (unlikely(ret)) {
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to check post mbox status = 0x%x, ret = %d.\n",
+                                   status, ret);
+               return ret;
        }
 
-       status = hns_roce_v2_cmd_complete(hr_dev);
-       if (status != 0x1) {
-               if (status == CMD_RST_PRC_EBUSY)
-                       return status;
+       /* Post new message to mbox */
+       ret = hns_roce_mbox_post(hr_dev, in_param, out_param, in_modifier,
+                                op_modifier, op, token, event);
+       if (ret)
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to post mailbox, ret = %d.\n", ret);
+
+       return ret;
+}
+
+static int v2_poll_mbox_done(struct hns_roce_dev *hr_dev, unsigned int timeout)
+{
+       u8 status = 0;
+       int ret;
 
-               dev_err(dev, "mailbox status 0x%x!\n", status);
-               return -EBUSY;
+       ret = v2_wait_mbox_complete(hr_dev, timeout, &status);
+       if (!ret) {
+               if (status != MB_ST_COMPLETE_SUCC)
+                       return -EBUSY;
+       } else {
+               dev_err_ratelimited(hr_dev->dev,
+                                   "failed to check mbox status = 0x%x, ret = %d.\n",
+                                   status, ret);
        }
 
-       return 0;
+       return ret;
 }
 
 static void copy_gid(void *dest, const union ib_gid *gid)
@@ -2790,7 +2976,7 @@ static int config_gmv_table(struct hns_roce_dev *hr_dev,
        return hns_roce_cmq_send(hr_dev, desc, 2);
 }
 
-static int hns_roce_v2_set_gid(struct hns_roce_dev *hr_dev, u8 port,
+static int hns_roce_v2_set_gid(struct hns_roce_dev *hr_dev, u32 port,
                               int gid_index, const union ib_gid *gid,
                               const struct ib_gid_attr *attr)
 {
@@ -3079,14 +3265,31 @@ static void *get_sw_cqe_v2(struct hns_roce_cq *hr_cq, unsigned int n)
                !!(n & hr_cq->cq_depth)) ? cqe : NULL;
 }
 
-static inline void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 ci)
+static inline void update_cq_db(struct hns_roce_dev *hr_dev,
+                               struct hns_roce_cq *hr_cq)
 {
-       *hr_cq->set_ci_db = ci & V2_CQ_DB_PARAMETER_CONS_IDX_M;
+       if (likely(hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB)) {
+               *hr_cq->set_ci_db = hr_cq->cons_index & V2_CQ_DB_CONS_IDX_M;
+       } else {
+               struct hns_roce_v2_db cq_db = {};
+
+               roce_set_field(cq_db.byte_4, V2_DB_TAG_M, V2_DB_TAG_S,
+                              hr_cq->cqn);
+               roce_set_field(cq_db.byte_4, V2_DB_CMD_M, V2_DB_CMD_S,
+                              HNS_ROCE_V2_CQ_DB);
+               roce_set_field(cq_db.parameter, V2_CQ_DB_CONS_IDX_M,
+                              V2_CQ_DB_CONS_IDX_S, hr_cq->cons_index);
+               roce_set_field(cq_db.parameter, V2_CQ_DB_CMD_SN_M,
+                              V2_CQ_DB_CMD_SN_S, 1);
+
+               hns_roce_write64(hr_dev, (__le32 *)&cq_db, hr_cq->db_reg);
+       }
 }
 
 static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
                                   struct hns_roce_srq *srq)
 {
+       struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
        struct hns_roce_v2_cqe *cqe, *dest;
        u32 prod_index;
        int nfreed = 0;
@@ -3129,7 +3332,7 @@ static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
 
        if (nfreed) {
                hr_cq->cons_index += nfreed;
-               hns_roce_v2_cq_set_ci(hr_cq, hr_cq->cons_index);
+               update_cq_db(hr_dev, hr_cq);
        }
 }
 
@@ -3224,37 +3427,33 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
        struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
-       u32 notification_flag;
-       __le32 doorbell[2];
+       struct hns_roce_v2_db cq_db = {};
+       u32 notify_flag;
 
-       doorbell[0] = 0;
-       doorbell[1] = 0;
-
-       notification_flag = (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
-                            V2_CQ_DB_REQ_NOT : V2_CQ_DB_REQ_NOT_SOL;
        /*
-        * flags = 0; Notification Flag = 1, next
-        * flags = 1; Notification Flag = 0, solocited
+        * flags = 0, then notify_flag : next
+        * flags = 1, then notify_flag : solicited
         */
-       roce_set_field(doorbell[0], V2_CQ_DB_BYTE_4_TAG_M, V2_DB_BYTE_4_TAG_S,
-                      hr_cq->cqn);
-       roce_set_field(doorbell[0], V2_CQ_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S,
-                      HNS_ROCE_V2_CQ_DB_NTR);
-       roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CONS_IDX_M,
-                      V2_CQ_DB_PARAMETER_CONS_IDX_S, hr_cq->cons_index);
-       roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CMD_SN_M,
-                      V2_CQ_DB_PARAMETER_CMD_SN_S, hr_cq->arm_sn & 0x3);
-       roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S,
-                    notification_flag);
-
-       hns_roce_write64(hr_dev, doorbell, hr_cq->cq_db_l);
+       notify_flag = (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+                     V2_CQ_DB_REQ_NOT : V2_CQ_DB_REQ_NOT_SOL;
+
+       roce_set_field(cq_db.byte_4, V2_DB_TAG_M, V2_DB_TAG_S, hr_cq->cqn);
+       roce_set_field(cq_db.byte_4, V2_DB_CMD_M, V2_DB_CMD_S,
+                      HNS_ROCE_V2_CQ_DB_NOTIFY);
+       roce_set_field(cq_db.parameter, V2_CQ_DB_CONS_IDX_M,
+                      V2_CQ_DB_CONS_IDX_S, hr_cq->cons_index);
+       roce_set_field(cq_db.parameter, V2_CQ_DB_CMD_SN_M,
+                      V2_CQ_DB_CMD_SN_S, hr_cq->arm_sn);
+       roce_set_bit(cq_db.parameter, V2_CQ_DB_NOTIFY_TYPE_S, notify_flag);
+
+       hns_roce_write64(hr_dev, (__le32 *)&cq_db, hr_cq->db_reg);
 
        return 0;
 }
 
 static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe,
-                                                   struct hns_roce_qp **cur_qp,
-                                                   struct ib_wc *wc)
+                                       struct hns_roce_qp *qp,
+                                       struct ib_wc *wc)
 {
        struct hns_roce_rinl_sge *sge_list;
        u32 wr_num, wr_cnt, sge_num;
@@ -3263,11 +3462,11 @@ static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe,
 
        wr_num = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_WQE_INDX_M,
                                V2_CQE_BYTE_4_WQE_INDX_S) & 0xffff;
-       wr_cnt = wr_num & ((*cur_qp)->rq.wqe_cnt - 1);
+       wr_cnt = wr_num & (qp->rq.wqe_cnt - 1);
 
-       sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list;
-       sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt;
-       wqe_buf = hns_roce_get_recv_wqe(*cur_qp, wr_cnt);
+       sge_list = qp->rq_inl_buf.wqe_list[wr_cnt].sg_list;
+       sge_num = qp->rq_inl_buf.wqe_list[wr_cnt].sge_cnt;
+       wqe_buf = hns_roce_get_recv_wqe(qp, wr_cnt);
        data_len = wc->byte_len;
 
        for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) {
@@ -3401,21 +3600,205 @@ static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp,
                init_flush_work(hr_dev, qp);
 }
 
+static int get_cur_qp(struct hns_roce_cq *hr_cq, struct hns_roce_v2_cqe *cqe,
+                     struct hns_roce_qp **cur_qp)
+{
+       struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
+       struct hns_roce_qp *hr_qp = *cur_qp;
+       u32 qpn;
+
+       qpn = roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M,
+                            V2_CQE_BYTE_16_LCL_QPN_S) &
+             HNS_ROCE_V2_CQE_QPN_MASK;
+
+       if (!hr_qp || qpn != hr_qp->qpn) {
+               hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
+               if (unlikely(!hr_qp)) {
+                       ibdev_err(&hr_dev->ib_dev,
+                                 "CQ %06lx with entry for unknown QPN %06x\n",
+                                 hr_cq->cqn, qpn);
+                       return -EINVAL;
+               }
+               *cur_qp = hr_qp;
+       }
+
+       return 0;
+}
+
+/*
+ * mapped-value = 1 + real-value
+ * The real values of the ib wc opcodes start from 0. In order to distinguish
+ * between initialized and uninitialized map values, we add 1 to the actual
+ * value when defining the mapping, so that the validity can be identified by
+ * checking whether the mapped value is greater than 0.
+ */
+#define HR_WC_OP_MAP(hr_key, ib_key) \
+               [HNS_ROCE_V2_WQE_OP_ ## hr_key] = 1 + IB_WC_ ## ib_key
+
+static const u32 wc_send_op_map[] = {
+       HR_WC_OP_MAP(SEND,                      SEND),
+       HR_WC_OP_MAP(SEND_WITH_INV,             SEND),
+       HR_WC_OP_MAP(SEND_WITH_IMM,             SEND),
+       HR_WC_OP_MAP(RDMA_READ,                 RDMA_READ),
+       HR_WC_OP_MAP(RDMA_WRITE,                RDMA_WRITE),
+       HR_WC_OP_MAP(RDMA_WRITE_WITH_IMM,       RDMA_WRITE),
+       HR_WC_OP_MAP(LOCAL_INV,                 LOCAL_INV),
+       HR_WC_OP_MAP(ATOM_CMP_AND_SWAP,         COMP_SWAP),
+       HR_WC_OP_MAP(ATOM_FETCH_AND_ADD,        FETCH_ADD),
+       HR_WC_OP_MAP(ATOM_MSK_CMP_AND_SWAP,     MASKED_COMP_SWAP),
+       HR_WC_OP_MAP(ATOM_MSK_FETCH_AND_ADD,    MASKED_FETCH_ADD),
+       HR_WC_OP_MAP(FAST_REG_PMR,              REG_MR),
+       HR_WC_OP_MAP(BIND_MW,                   REG_MR),
+};
+
+static int to_ib_wc_send_op(u32 hr_opcode)
+{
+       if (hr_opcode >= ARRAY_SIZE(wc_send_op_map))
+               return -EINVAL;
+
+       return wc_send_op_map[hr_opcode] ? wc_send_op_map[hr_opcode] - 1 :
+                                          -EINVAL;
+}
+
+static const u32 wc_recv_op_map[] = {
+       HR_WC_OP_MAP(RDMA_WRITE_WITH_IMM,               WITH_IMM),
+       HR_WC_OP_MAP(SEND,                              RECV),
+       HR_WC_OP_MAP(SEND_WITH_IMM,                     WITH_IMM),
+       HR_WC_OP_MAP(SEND_WITH_INV,                     RECV),
+};
+
+static int to_ib_wc_recv_op(u32 hr_opcode)
+{
+       if (hr_opcode >= ARRAY_SIZE(wc_recv_op_map))
+               return -EINVAL;
+
+       return wc_recv_op_map[hr_opcode] ? wc_recv_op_map[hr_opcode] - 1 :
+                                          -EINVAL;
+}
+
+static void fill_send_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe)
+{
+       u32 hr_opcode;
+       int ib_opcode;
+
+       wc->wc_flags = 0;
+
+       hr_opcode = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M,
+                                  V2_CQE_BYTE_4_OPCODE_S) & 0x1f;
+       switch (hr_opcode) {
+       case HNS_ROCE_V2_WQE_OP_RDMA_READ:
+               wc->byte_len = le32_to_cpu(cqe->byte_cnt);
+               break;
+       case HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM:
+       case HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM:
+               wc->wc_flags |= IB_WC_WITH_IMM;
+               break;
+       case HNS_ROCE_V2_WQE_OP_LOCAL_INV:
+               wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+               break;
+       case HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP:
+       case HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD:
+       case HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP:
+       case HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD:
+               wc->byte_len  = 8;
+               break;
+       default:
+               break;
+       }
+
+       ib_opcode = to_ib_wc_send_op(hr_opcode);
+       if (ib_opcode < 0)
+               wc->status = IB_WC_GENERAL_ERR;
+       else
+               wc->opcode = ib_opcode;
+}
+
+static inline bool is_rq_inl_enabled(struct ib_wc *wc, u32 hr_opcode,
+                                    struct hns_roce_v2_cqe *cqe)
+{
+       return wc->qp->qp_type != IB_QPT_UD &&
+              wc->qp->qp_type != IB_QPT_GSI &&
+              (hr_opcode == HNS_ROCE_V2_OPCODE_SEND ||
+               hr_opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM ||
+               hr_opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) &&
+              roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S);
+}
+
+static int fill_recv_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe)
+{
+       struct hns_roce_qp *qp = to_hr_qp(wc->qp);
+       u32 hr_opcode;
+       int ib_opcode;
+       int ret;
+
+       wc->byte_len = le32_to_cpu(cqe->byte_cnt);
+
+       hr_opcode = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M,
+                                  V2_CQE_BYTE_4_OPCODE_S) & 0x1f;
+       switch (hr_opcode) {
+       case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM:
+       case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM:
+               wc->wc_flags = IB_WC_WITH_IMM;
+               wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->immtdata));
+               break;
+       case HNS_ROCE_V2_OPCODE_SEND_WITH_INV:
+               wc->wc_flags = IB_WC_WITH_INVALIDATE;
+               wc->ex.invalidate_rkey = le32_to_cpu(cqe->rkey);
+               break;
+       default:
+               wc->wc_flags = 0;
+       }
+
+       ib_opcode = to_ib_wc_recv_op(hr_opcode);
+       if (ib_opcode < 0)
+               wc->status = IB_WC_GENERAL_ERR;
+       else
+               wc->opcode = ib_opcode;
+
+       if (is_rq_inl_enabled(wc, hr_opcode, cqe)) {
+               ret = hns_roce_handle_recv_inl_wqe(cqe, qp, wc);
+               if (unlikely(ret))
+                       return ret;
+       }
+
+       wc->sl = roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_SL_M,
+                               V2_CQE_BYTE_32_SL_S);
+       wc->src_qp = roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_RMT_QPN_M,
+                                   V2_CQE_BYTE_32_RMT_QPN_S);
+       wc->slid = 0;
+       wc->wc_flags |= roce_get_bit(cqe->byte_32, V2_CQE_BYTE_32_GRH_S) ?
+                                    IB_WC_GRH : 0;
+       wc->port_num = roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_PORTN_M,
+                                     V2_CQE_BYTE_32_PORTN_S);
+       wc->pkey_index = 0;
+
+       if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) {
+               wc->vlan_id = roce_get_field(cqe->byte_28, V2_CQE_BYTE_28_VID_M,
+                                            V2_CQE_BYTE_28_VID_S);
+               wc->wc_flags |= IB_WC_WITH_VLAN;
+       } else {
+               wc->vlan_id = 0xffff;
+       }
+
+       wc->network_hdr_type = roce_get_field(cqe->byte_28,
+                                             V2_CQE_BYTE_28_PORT_TYPE_M,
+                                             V2_CQE_BYTE_28_PORT_TYPE_S);
+
+       return 0;
+}
+
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
                                struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
+       struct hns_roce_qp *qp = *cur_qp;
        struct hns_roce_srq *srq = NULL;
        struct hns_roce_v2_cqe *cqe;
-       struct hns_roce_qp *hr_qp;
        struct hns_roce_wq *wq;
        int is_send;
-       u16 wqe_ctr;
-       u32 opcode;
-       u32 qpn;
+       u16 wqe_idx;
        int ret;
 
-       /* Find cqe according to consumer index */
        cqe = get_sw_cqe_v2(hr_cq, hr_cq->cons_index);
        if (!cqe)
                return -EAGAIN;
@@ -3424,189 +3807,50 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
        /* Memory barrier */
        rmb();
 
-       /* 0->SQ, 1->RQ */
-       is_send = !roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_S_R_S);
-
-       qpn = roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M,
-                               V2_CQE_BYTE_16_LCL_QPN_S);
-
-       if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) {
-               hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
-               if (unlikely(!hr_qp)) {
-                       ibdev_err(&hr_dev->ib_dev,
-                                 "CQ %06lx with entry for unknown QPN %06x\n",
-                                 hr_cq->cqn, qpn & HNS_ROCE_V2_CQE_QPN_MASK);
-                       return -EINVAL;
-               }
-               *cur_qp = hr_qp;
-       }
+       ret = get_cur_qp(hr_cq, cqe, &qp);
+       if (ret)
+               return ret;
 
-       wc->qp = &(*cur_qp)->ibqp;
+       wc->qp = &qp->ibqp;
        wc->vendor_err = 0;
 
+       wqe_idx = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_WQE_INDX_M,
+                                V2_CQE_BYTE_4_WQE_INDX_S);
+
+       is_send = !roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_S_R_S);
        if (is_send) {
-               wq = &(*cur_qp)->sq;
-               if ((*cur_qp)->sq_signal_bits) {
-                       /*
-                        * If sg_signal_bit is 1,
-                        * firstly tail pointer updated to wqe
-                        * which current cqe correspond to
-                        */
-                       wqe_ctr = (u16)roce_get_field(cqe->byte_4,
-                                                     V2_CQE_BYTE_4_WQE_INDX_M,
-                                                     V2_CQE_BYTE_4_WQE_INDX_S);
-                       wq->tail += (wqe_ctr - (u16)wq->tail) &
+               wq = &qp->sq;
+
+               /* If sq_signal_bits is set, tail pointer will be updated to
+                * the WQE corresponding to the current CQE.
+                */
+               if (qp->sq_signal_bits)
+                       wq->tail += (wqe_idx - (u16)wq->tail) &
                                    (wq->wqe_cnt - 1);
-               }
 
                wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
                ++wq->tail;
-       } else if ((*cur_qp)->ibqp.srq) {
-               srq = to_hr_srq((*cur_qp)->ibqp.srq);
-               wqe_ctr = (u16)roce_get_field(cqe->byte_4,
-                                             V2_CQE_BYTE_4_WQE_INDX_M,
-                                             V2_CQE_BYTE_4_WQE_INDX_S);
-               wc->wr_id = srq->wrid[wqe_ctr];
-               hns_roce_free_srq_wqe(srq, wqe_ctr);
-       } else {
-               /* Update tail pointer, record wr_id */
-               wq = &(*cur_qp)->rq;
-               wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
-               ++wq->tail;
-       }
-
-       get_cqe_status(hr_dev, *cur_qp, hr_cq, cqe, wc);
-       if (unlikely(wc->status != IB_WC_SUCCESS))
-               return 0;
 
-       if (is_send) {
-               wc->wc_flags = 0;
-               /* SQ corresponding to CQE */
-               switch (roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M,
-                                      V2_CQE_BYTE_4_OPCODE_S) & 0x1f) {
-               case HNS_ROCE_V2_WQE_OP_SEND:
-                       wc->opcode = IB_WC_SEND;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_SEND_WITH_INV:
-                       wc->opcode = IB_WC_SEND;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM:
-                       wc->opcode = IB_WC_SEND;
-                       wc->wc_flags |= IB_WC_WITH_IMM;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_RDMA_READ:
-                       wc->opcode = IB_WC_RDMA_READ;
-                       wc->byte_len = le32_to_cpu(cqe->byte_cnt);
-                       break;
-               case HNS_ROCE_V2_WQE_OP_RDMA_WRITE:
-                       wc->opcode = IB_WC_RDMA_WRITE;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM:
-                       wc->opcode = IB_WC_RDMA_WRITE;
-                       wc->wc_flags |= IB_WC_WITH_IMM;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_LOCAL_INV:
-                       wc->opcode = IB_WC_LOCAL_INV;
-                       wc->wc_flags |= IB_WC_WITH_INVALIDATE;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP:
-                       wc->opcode = IB_WC_COMP_SWAP;
-                       wc->byte_len  = 8;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD:
-                       wc->opcode = IB_WC_FETCH_ADD;
-                       wc->byte_len  = 8;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP:
-                       wc->opcode = IB_WC_MASKED_COMP_SWAP;
-                       wc->byte_len  = 8;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD:
-                       wc->opcode = IB_WC_MASKED_FETCH_ADD;
-                       wc->byte_len  = 8;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_FAST_REG_PMR:
-                       wc->opcode = IB_WC_REG_MR;
-                       break;
-               case HNS_ROCE_V2_WQE_OP_BIND_MW:
-                       wc->opcode = IB_WC_REG_MR;
-                       break;
-               default:
-                       wc->status = IB_WC_GENERAL_ERR;
-                       break;
-               }
+               fill_send_wc(wc, cqe);
        } else {
-               /* RQ correspond to CQE */
-               wc->byte_len = le32_to_cpu(cqe->byte_cnt);
-
-               opcode = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M,
-                                       V2_CQE_BYTE_4_OPCODE_S);
-               switch (opcode & 0x1f) {
-               case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM:
-                       wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
-                       wc->wc_flags = IB_WC_WITH_IMM;
-                       wc->ex.imm_data =
-                               cpu_to_be32(le32_to_cpu(cqe->immtdata));
-                       break;
-               case HNS_ROCE_V2_OPCODE_SEND:
-                       wc->opcode = IB_WC_RECV;
-                       wc->wc_flags = 0;
-                       break;
-               case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM:
-                       wc->opcode = IB_WC_RECV;
-                       wc->wc_flags = IB_WC_WITH_IMM;
-                       wc->ex.imm_data =
-                               cpu_to_be32(le32_to_cpu(cqe->immtdata));
-                       break;
-               case HNS_ROCE_V2_OPCODE_SEND_WITH_INV:
-                       wc->opcode = IB_WC_RECV;
-                       wc->wc_flags = IB_WC_WITH_INVALIDATE;
-                       wc->ex.invalidate_rkey = le32_to_cpu(cqe->rkey);
-                       break;
-               default:
-                       wc->status = IB_WC_GENERAL_ERR;
-                       break;
-               }
-
-               if ((wc->qp->qp_type == IB_QPT_RC ||
-                    wc->qp->qp_type == IB_QPT_UC) &&
-                   (opcode == HNS_ROCE_V2_OPCODE_SEND ||
-                   opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM ||
-                   opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) &&
-                   (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) {
-                       ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc);
-                       if (unlikely(ret))
-                               return -EAGAIN;
-               }
-
-               wc->sl = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_SL_M,
-                                           V2_CQE_BYTE_32_SL_S);
-               wc->src_qp = (u8)roce_get_field(cqe->byte_32,
-                                               V2_CQE_BYTE_32_RMT_QPN_M,
-                                               V2_CQE_BYTE_32_RMT_QPN_S);
-               wc->slid = 0;
-               wc->wc_flags |= (roce_get_bit(cqe->byte_32,
-                                             V2_CQE_BYTE_32_GRH_S) ?
-                                             IB_WC_GRH : 0);
-               wc->port_num = roce_get_field(cqe->byte_32,
-                               V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S);
-               wc->pkey_index = 0;
-
-               if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) {
-                       wc->vlan_id = (u16)roce_get_field(cqe->byte_28,
-                                                         V2_CQE_BYTE_28_VID_M,
-                                                         V2_CQE_BYTE_28_VID_S);
-                       wc->wc_flags |= IB_WC_WITH_VLAN;
+               if (qp->ibqp.srq) {
+                       srq = to_hr_srq(qp->ibqp.srq);
+                       wc->wr_id = srq->wrid[wqe_idx];
+                       hns_roce_free_srq_wqe(srq, wqe_idx);
                } else {
-                       wc->vlan_id = 0xffff;
+                       wq = &qp->rq;
+                       wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+                       ++wq->tail;
                }
 
-               wc->network_hdr_type = roce_get_field(cqe->byte_28,
-                                                   V2_CQE_BYTE_28_PORT_TYPE_M,
-                                                   V2_CQE_BYTE_28_PORT_TYPE_S);
+               ret = fill_recv_wc(wc, cqe);
        }
 
-       return 0;
+       get_cqe_status(hr_dev, qp, hr_cq, cqe, wc);
+       if (unlikely(wc->status != IB_WC_SUCCESS))
+               return 0;
+
+       return ret;
 }
 
 static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries,
@@ -3638,7 +3882,7 @@ static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries,
        }
 
        if (npolled)
-               hns_roce_v2_cq_set_ci(hr_cq, hr_cq->cons_index);
+               update_cq_db(hr_dev, hr_cq);
 
 out:
        spin_unlock_irqrestore(&hr_cq->lock, flags);
@@ -3647,12 +3891,9 @@ out:
 }
 
 static int get_op_for_set_hem(struct hns_roce_dev *hr_dev, u32 type,
-                             int step_idx)
+                             int step_idx, u16 *mbox_op)
 {
-       int op;
-
-       if (type == HEM_TYPE_SCCC && step_idx)
-               return -EINVAL;
+       u16 op;
 
        switch (type) {
        case HEM_TYPE_QPC:
@@ -3677,51 +3918,49 @@ static int get_op_for_set_hem(struct hns_roce_dev *hr_dev, u32 type,
                op = HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0;
                break;
        default:
-               dev_warn(hr_dev->dev,
-                        "table %u not to be written by mailbox!\n", type);
+               dev_warn(hr_dev->dev, "failed to check hem type %u.\n", type);
                return -EINVAL;
        }
 
-       return op + step_idx;
+       *mbox_op = op + step_idx;
+
+       return 0;
 }
 
-static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, u64 bt_ba,
-                        u32 hem_type, int step_idx)
+static int config_gmv_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj,
+                              dma_addr_t base_addr)
 {
-       struct hns_roce_cmd_mailbox *mailbox;
        struct hns_roce_cmq_desc desc;
-       struct hns_roce_cfg_gmv_bt *gmv_bt =
-                               (struct hns_roce_cfg_gmv_bt *)desc.data;
-       int ret;
-       int op;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+       u32 idx = obj / (HNS_HW_PAGE_SIZE / hr_dev->caps.gmv_entry_sz);
+       u64 addr = to_hr_hw_page_addr(base_addr);
 
-       if (hem_type == HEM_TYPE_GMV) {
-               hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT,
-                                             false);
+       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false);
 
-               gmv_bt->gmv_ba_l = cpu_to_le32(bt_ba >> HNS_HW_PAGE_SHIFT);
-               gmv_bt->gmv_ba_h = cpu_to_le32(bt_ba >> (HNS_HW_PAGE_SHIFT +
-                                                        32));
-               gmv_bt->gmv_bt_idx = cpu_to_le32(obj /
-                       (HNS_HW_PAGE_SIZE / hr_dev->caps.gmv_entry_sz));
+       hr_reg_write(req, CFG_GMV_BT_BA_L, lower_32_bits(addr));
+       hr_reg_write(req, CFG_GMV_BT_BA_H, upper_32_bits(addr));
+       hr_reg_write(req, CFG_GMV_BT_IDX, idx);
 
-               return hns_roce_cmq_send(hr_dev, &desc, 1);
-       }
+       return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
 
-       op = get_op_for_set_hem(hr_dev, hem_type, step_idx);
-       if (op < 0)
-               return 0;
+static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj,
+                        dma_addr_t base_addr, u32 hem_type, int step_idx)
+{
+       int ret;
+       u16 op;
 
-       mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-       if (IS_ERR(mailbox))
-               return PTR_ERR(mailbox);
+       if (unlikely(hem_type == HEM_TYPE_GMV))
+               return config_gmv_ba_to_hw(hr_dev, obj, base_addr);
 
-       ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, obj,
-                               0, op, HNS_ROCE_CMD_TIMEOUT_MSECS);
+       if (unlikely(hem_type == HEM_TYPE_SCCC && step_idx))
+               return 0;
 
-       hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+       ret = get_op_for_set_hem(hr_dev, hem_type, step_idx, &op);
+       if (ret < 0)
+               return ret;
 
-       return ret;
+       return config_hem_ba_to_hw(hr_dev, obj, base_addr, op);
 }
 
 static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,
@@ -3911,6 +4150,16 @@ static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp,
                       ilog2(hr_qp->rq.wqe_cnt));
 }
 
+static inline int get_cqn(struct ib_cq *ib_cq)
+{
+       return ib_cq ? to_hr_cq(ib_cq)->cqn : 0;
+}
+
+static inline int get_pdn(struct ib_pd *ib_pd)
+{
+       return ib_pd ? to_hr_pd(ib_pd)->pdn : 0;
+}
+
 static void modify_qp_reset_to_init(struct ib_qp *ibqp,
                                    const struct ib_qp_attr *attr,
                                    int attr_mask,
@@ -3927,13 +4176,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
         * 0 at the same time, else set them to 0x1.
         */
        roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
-                      V2_QPC_BYTE_4_TST_S, to_hr_qp_type(hr_qp->ibqp.qp_type));
+                      V2_QPC_BYTE_4_TST_S, to_hr_qp_type(ibqp->qp_type));
 
        roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
                       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
 
        roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
-                      V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn);
+                      V2_QPC_BYTE_16_PD_S, get_pdn(ibqp->pd));
 
        roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
                       V2_QPC_BYTE_20_RQWS_S, ilog2(hr_qp->rq.max_gs));
@@ -3944,6 +4193,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
        roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
                       V2_QPC_BYTE_24_VLAN_ID_S, 0xfff);
 
+       if (ibqp->qp_type == IB_QPT_XRC_TGT) {
+               context->qkey_xrcd = cpu_to_le32(hr_qp->xrcdn);
+
+               roce_set_bit(context->byte_80_rnr_rx_cqn,
+                            V2_QPC_BYTE_80_XRC_QP_TYPE_S, 1);
+       }
+
        if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
                roce_set_bit(context->byte_68_rq_db,
                             V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1);
@@ -3954,23 +4210,27 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
                       ((u32)hr_qp->rdb.dma) >> 1);
        context->rq_db_record_addr = cpu_to_le32(hr_qp->rdb.dma >> 32);
 
-       roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S,
-                   (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) ? 1 : 0);
+       if (ibqp->qp_type != IB_QPT_UD && ibqp->qp_type != IB_QPT_GSI)
+               roce_set_bit(context->byte_76_srqn_op_en,
+                            V2_QPC_BYTE_76_RQIE_S,
+                            !!(hr_dev->caps.flags &
+                               HNS_ROCE_CAP_FLAG_RQ_INLINE));
 
        roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
-                      V2_QPC_BYTE_80_RX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn);
+                      V2_QPC_BYTE_80_RX_CQN_S, get_cqn(ibqp->recv_cq));
+
        if (ibqp->srq) {
+               roce_set_bit(context->byte_76_srqn_op_en,
+                            V2_QPC_BYTE_76_SRQ_EN_S, 1);
                roce_set_field(context->byte_76_srqn_op_en,
                               V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S,
                               to_hr_srq(ibqp->srq)->srqn);
-               roce_set_bit(context->byte_76_srqn_op_en,
-                            V2_QPC_BYTE_76_SRQ_EN_S, 1);
        }
 
        roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1);
 
        roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
-                      V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
+                      V2_QPC_BYTE_252_TX_CQN_S, get_cqn(ibqp->send_cq));
 
        if (hr_dev->caps.qpc_sz < HNS_ROCE_V3_QPC_SZ)
                return;
@@ -3993,22 +4253,23 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
         * 0 at the same time, else set them to 0x1.
         */
        roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
-                      V2_QPC_BYTE_4_TST_S, to_hr_qp_type(hr_qp->ibqp.qp_type));
+                      V2_QPC_BYTE_4_TST_S, to_hr_qp_type(ibqp->qp_type));
        roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
                       V2_QPC_BYTE_4_TST_S, 0);
 
        roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
-                      V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn);
+                      V2_QPC_BYTE_16_PD_S, get_pdn(ibqp->pd));
+
        roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
                       V2_QPC_BYTE_16_PD_S, 0);
 
        roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
-                      V2_QPC_BYTE_80_RX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn);
+                      V2_QPC_BYTE_80_RX_CQN_S, get_cqn(ibqp->recv_cq));
        roce_set_field(qpc_mask->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
                       V2_QPC_BYTE_80_RX_CQN_S, 0);
 
        roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
-                      V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
+                      V2_QPC_BYTE_252_TX_CQN_S, get_cqn(ibqp->send_cq));
        roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
                       V2_QPC_BYTE_252_TX_CQN_S, 0);
 
@@ -4133,17 +4394,6 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0);
 
-       roce_set_field(context->byte_84_rq_ci_pi,
-                      V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-                      V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head);
-       roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-                      V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-                      V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
-
-       roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-                      V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M,
-                      V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0);
-
        return 0;
 }
 
@@ -4240,7 +4490,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        u64 *mtts;
        u8 *dmac;
        u8 *smac;
-       int port;
+       u32 port;
        int ret;
 
        ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask);
@@ -4454,6 +4704,143 @@ static inline u16 get_udp_sport(u32 fl, u32 lqpn, u32 rqpn)
        return rdma_flow_label_to_udp_sport(fl);
 }
 
+static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
+                          u32 *dip_idx)
+{
+       const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
+       struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+       struct hns_roce_dip *hr_dip;
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&hr_dev->dip_list_lock, flags);
+
+       list_for_each_entry(hr_dip, &hr_dev->dip_list, node) {
+               if (!memcmp(grh->dgid.raw, hr_dip->dgid, 16))
+                       goto out;
+       }
+
+       /* If no dgid is found, a new dip and a mapping between dgid and
+        * dip_idx will be created.
+        */
+       hr_dip = kzalloc(sizeof(*hr_dip), GFP_ATOMIC);
+       if (!hr_dip) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
+       hr_dip->dip_idx = *dip_idx = ibqp->qp_num;
+       list_add_tail(&hr_dip->node, &hr_dev->dip_list);
+
+out:
+       spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags);
+       return ret;
+}
+
+enum {
+       CONG_DCQCN,
+       CONG_WINDOW,
+};
+
+enum {
+       UNSUPPORT_CONG_LEVEL,
+       SUPPORT_CONG_LEVEL,
+};
+
+enum {
+       CONG_LDCP,
+       CONG_HC3,
+};
+
+enum {
+       DIP_INVALID,
+       DIP_VALID,
+};
+
+static int check_cong_type(struct ib_qp *ibqp,
+                          struct hns_roce_congestion_algorithm *cong_alg)
+{
+       struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+
+       /* different congestion types match different configurations */
+       switch (hr_dev->caps.cong_type) {
+       case CONG_TYPE_DCQCN:
+               cong_alg->alg_sel = CONG_DCQCN;
+               cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
+               cong_alg->dip_vld = DIP_INVALID;
+               break;
+       case CONG_TYPE_LDCP:
+               cong_alg->alg_sel = CONG_WINDOW;
+               cong_alg->alg_sub_sel = CONG_LDCP;
+               cong_alg->dip_vld = DIP_INVALID;
+               break;
+       case CONG_TYPE_HC3:
+               cong_alg->alg_sel = CONG_WINDOW;
+               cong_alg->alg_sub_sel = CONG_HC3;
+               cong_alg->dip_vld = DIP_INVALID;
+               break;
+       case CONG_TYPE_DIP:
+               cong_alg->alg_sel = CONG_DCQCN;
+               cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
+               cong_alg->dip_vld = DIP_VALID;
+               break;
+       default:
+               ibdev_err(&hr_dev->ib_dev,
+                         "error type(%u) for congestion selection.\n",
+                         hr_dev->caps.cong_type);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
+                          struct hns_roce_v2_qp_context *context,
+                          struct hns_roce_v2_qp_context *qpc_mask)
+{
+       const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
+       struct hns_roce_congestion_algorithm cong_field;
+       struct ib_device *ibdev = ibqp->device;
+       struct hns_roce_dev *hr_dev = to_hr_dev(ibdev);
+       u32 dip_idx = 0;
+       int ret;
+
+       if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 ||
+           grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE)
+               return 0;
+
+       ret = check_cong_type(ibqp, &cong_field);
+       if (ret)
+               return ret;
+
+       hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
+                    hr_dev->caps.cong_type * HNS_ROCE_CONG_SIZE);
+       hr_reg_write(qpc_mask, QPC_CONG_ALGO_TMPL_ID, 0);
+       hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
+       hr_reg_write(&qpc_mask->ext, QPCEX_CONG_ALG_SEL, 0);
+       hr_reg_write(&context->ext, QPCEX_CONG_ALG_SUB_SEL,
+                    cong_field.alg_sub_sel);
+       hr_reg_write(&qpc_mask->ext, QPCEX_CONG_ALG_SUB_SEL, 0);
+       hr_reg_write(&context->ext, QPCEX_DIP_CTX_IDX_VLD, cong_field.dip_vld);
+       hr_reg_write(&qpc_mask->ext, QPCEX_DIP_CTX_IDX_VLD, 0);
+
+       /* if dip is disabled, there is no need to set dip idx */
+       if (cong_field.dip_vld == 0)
+               return 0;
+
+       ret = get_dip_ctx_idx(ibqp, attr, &dip_idx);
+       if (ret) {
+               ibdev_err(ibdev, "failed to fill cong field, ret = %d.\n", ret);
+               return ret;
+       }
+
+       hr_reg_write(&context->ext, QPCEX_DIP_CTX_IDX, dip_idx);
+       hr_reg_write(&qpc_mask->ext, QPCEX_DIP_CTX_IDX, 0);
+
+       return 0;
+}
+
 static int hns_roce_v2_set_path(struct ib_qp *ibqp,
                                const struct ib_qp_attr *attr,
                                int attr_mask,
@@ -4537,6 +4924,10 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
        roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
                       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
 
+       ret = fill_cong_field(ibqp, attr, context, qpc_mask);
+       if (ret)
+               return ret;
+
        roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
                       V2_QPC_BYTE_24_TC_S, get_tclass(&attr->ah_attr.grh));
        roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
@@ -4687,7 +5078,6 @@ static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp,
                               V2_QPC_BYTE_244_RNR_CNT_S, 0);
        }
 
-       /* RC&UC&UD required attr */
        if (attr_mask & IB_QP_SQ_PSN) {
                roce_set_field(context->byte_172_sq_psn,
                               V2_QPC_BYTE_172_SQ_CUR_PSN_M,
@@ -4765,7 +5155,6 @@ static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp,
                               V2_QPC_BYTE_80_MIN_RNR_TIME_S, 0);
        }
 
-       /* RC&UC required attr */
        if (attr_mask & IB_QP_RQ_PSN) {
                roce_set_field(context->byte_108_rx_reqepsn,
                               V2_QPC_BYTE_108_RX_REQ_EPSN_M,
@@ -4808,6 +5197,29 @@ static void hns_roce_v2_record_opt_fields(struct ib_qp *ibqp,
        }
 }
 
+static void clear_qp(struct hns_roce_qp *hr_qp)
+{
+       struct ib_qp *ibqp = &hr_qp->ibqp;
+
+       if (ibqp->send_cq)
+               hns_roce_v2_cq_clean(to_hr_cq(ibqp->send_cq),
+                                    hr_qp->qpn, NULL);
+
+       if (ibqp->recv_cq  && ibqp->recv_cq != ibqp->send_cq)
+               hns_roce_v2_cq_clean(to_hr_cq(ibqp->recv_cq),
+                                    hr_qp->qpn, ibqp->srq ?
+                                    to_hr_srq(ibqp->srq) : NULL);
+
+       if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
+               *hr_qp->rdb.db_record = 0;
+
+       hr_qp->rq.head = 0;
+       hr_qp->rq.tail = 0;
+       hr_qp->sq.head = 0;
+       hr_qp->sq.tail = 0;
+       hr_qp->next_sge = 0;
+}
+
 static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
                                 const struct ib_qp_attr *attr,
                                 int attr_mask, enum ib_qp_state cur_state,
@@ -4842,19 +5254,23 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
        /* When QP state is err, SQ and RQ WQE should be flushed */
        if (new_state == IB_QPS_ERR) {
-               spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
-               hr_qp->state = IB_QPS_ERR;
-               roce_set_field(context->byte_160_sq_ci_pi,
-                              V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-                              V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
-                              hr_qp->sq.head);
-               roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-                              V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-                              V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
-               spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
-
-               if (!ibqp->srq) {
+               if (ibqp->qp_type != IB_QPT_XRC_TGT) {
+                       spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
+                       hr_qp->state = IB_QPS_ERR;
+                       roce_set_field(context->byte_160_sq_ci_pi,
+                                      V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
+                                      V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
+                                      hr_qp->sq.head);
+                       roce_set_field(qpc_mask->byte_160_sq_ci_pi,
+                                      V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
+                                      V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
+                       spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
+               }
+
+               if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC_INI &&
+                   ibqp->qp_type != IB_QPT_XRC_TGT) {
                        spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
+                       hr_qp->state = IB_QPS_ERR;
                        roce_set_field(context->byte_84_rq_ci_pi,
                               V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
                               V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
@@ -4873,7 +5289,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
                goto out;
 
        roce_set_bit(context->byte_108_rx_reqepsn, V2_QPC_BYTE_108_INV_CREDIT_S,
-                    ibqp->srq ? 1 : 0);
+                    ((to_hr_qp_type(hr_qp->ibqp.qp_type) == SERV_TYPE_XRC) ||
+                    ibqp->srq) ? 1 : 0);
        roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
                     V2_QPC_BYTE_108_INV_CREDIT_S, 0);
 
@@ -4894,21 +5311,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
        hns_roce_v2_record_opt_fields(ibqp, attr, attr_mask);
 
-       if (new_state == IB_QPS_RESET && !ibqp->uobject) {
-               hns_roce_v2_cq_clean(to_hr_cq(ibqp->recv_cq), hr_qp->qpn,
-                                    ibqp->srq ? to_hr_srq(ibqp->srq) : NULL);
-               if (ibqp->send_cq != ibqp->recv_cq)
-                       hns_roce_v2_cq_clean(to_hr_cq(ibqp->send_cq),
-                                            hr_qp->qpn, NULL);
-
-               hr_qp->rq.head = 0;
-               hr_qp->rq.tail = 0;
-               hr_qp->sq.head = 0;
-               hr_qp->sq.tail = 0;
-               hr_qp->next_sge = 0;
-               if (hr_qp->rq.wqe_cnt)
-                       *hr_qp->rdb.db_record = 0;
-       }
+       if (new_state == IB_QPS_RESET && !ibqp->uobject)
+               clear_qp(hr_qp);
 
 out:
        return ret;
@@ -5019,7 +5423,8 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                                    V2_QPC_BYTE_76_ATE_S)) << V2_QP_ATE_S);
 
        if (hr_qp->ibqp.qp_type == IB_QPT_RC ||
-           hr_qp->ibqp.qp_type == IB_QPT_UC) {
+           hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+           hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT) {
                struct ib_global_route *grh =
                                rdma_ah_retrieve_grh(&qp_attr->ah_attr);
 
@@ -5051,6 +5456,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
        qp_attr->max_dest_rd_atomic = 1 << roce_get_field(context.byte_140_raq,
                                                     V2_QPC_BYTE_140_RR_MAX_M,
                                                     V2_QPC_BYTE_140_RR_MAX_S);
+
        qp_attr->min_rnr_timer = (u8)roce_get_field(context.byte_80_rnr_rx_cqn,
                                                 V2_QPC_BYTE_80_MIN_RNR_TIME_M,
                                                 V2_QPC_BYTE_80_MIN_RNR_TIME_S);
@@ -5068,6 +5474,7 @@ done:
        qp_attr->cur_qp_state = qp_attr->qp_state;
        qp_attr->cap.max_recv_wr = hr_qp->rq.wqe_cnt;
        qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge;
+       qp_attr->cap.max_inline_data = hr_qp->max_inline_data;
 
        if (!ibqp->uobject) {
                qp_attr->cap.max_send_wr = hr_qp->sq.wqe_cnt;
@@ -5085,6 +5492,15 @@ out:
        return ret;
 }
 
+static inline int modify_qp_is_ok(struct hns_roce_qp *hr_qp)
+{
+       return ((hr_qp->ibqp.qp_type == IB_QPT_RC ||
+                hr_qp->ibqp.qp_type == IB_QPT_UD ||
+                hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+                hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT) &&
+               hr_qp->state != IB_QPS_RESET);
+}
+
 static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
                                         struct hns_roce_qp *hr_qp,
                                         struct ib_udata *udata)
@@ -5094,9 +5510,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
        unsigned long flags;
        int ret = 0;
 
-       if ((hr_qp->ibqp.qp_type == IB_QPT_RC ||
-            hr_qp->ibqp.qp_type == IB_QPT_UD) &&
-          hr_qp->state != IB_QPS_RESET) {
+       if (modify_qp_is_ok(hr_qp)) {
                /* Modify qp to reset before destroying qp */
                ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
                                            hr_qp->state, IB_QPS_RESET);
@@ -5275,9 +5689,11 @@ static int hns_roce_v2_write_srqc(struct hns_roce_srq *srq, void *mb_buf)
        }
 
        hr_reg_write(ctx, SRQC_SRQ_ST, 1);
+       hr_reg_write(ctx, SRQC_SRQ_TYPE,
+                    !!(srq->ibsrq.srq_type == IB_SRQT_XRC));
        hr_reg_write(ctx, SRQC_PD, to_hr_pd(srq->ibsrq.pd)->pdn);
        hr_reg_write(ctx, SRQC_SRQN, srq->srqn);
-       hr_reg_write(ctx, SRQC_XRCD, 0);
+       hr_reg_write(ctx, SRQC_XRCD, srq->xrcdn);
        hr_reg_write(ctx, SRQC_XRC_CQN, srq->cqn);
        hr_reg_write(ctx, SRQC_SHIFT, ilog2(srq->wqe_cnt));
        hr_reg_write(ctx, SRQC_RQWS,
@@ -5481,6 +5897,12 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
        case HNS_ROCE_EVENT_TYPE_FLR:
                ibdev_warn(ibdev, "Function level reset.\n");
                break;
+       case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
+               ibdev_err(ibdev, "xrc domain violation error.\n");
+               break;
+       case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
+               ibdev_err(ibdev, "invalid xrceth error.\n");
+               break;
        default:
                break;
        }
@@ -5505,33 +5927,30 @@ static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev,
        queue_work(hr_dev->irq_workq, &(irq_work->work));
 }
 
-static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
+static void update_eq_db(struct hns_roce_eq *eq)
 {
        struct hns_roce_dev *hr_dev = eq->hr_dev;
-       __le32 doorbell[2] = {};
+       struct hns_roce_v2_db eq_db = {};
 
        if (eq->type_flag == HNS_ROCE_AEQ) {
-               roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M,
-                              HNS_ROCE_V2_EQ_DB_CMD_S,
+               roce_set_field(eq_db.byte_4, V2_EQ_DB_CMD_M, V2_EQ_DB_CMD_S,
                               eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ?
                               HNS_ROCE_EQ_DB_CMD_AEQ :
                               HNS_ROCE_EQ_DB_CMD_AEQ_ARMED);
        } else {
-               roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_TAG_M,
-                              HNS_ROCE_V2_EQ_DB_TAG_S, eq->eqn);
+               roce_set_field(eq_db.byte_4, V2_EQ_DB_TAG_M, V2_EQ_DB_TAG_S,
+                              eq->eqn);
 
-               roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M,
-                              HNS_ROCE_V2_EQ_DB_CMD_S,
+               roce_set_field(eq_db.byte_4, V2_EQ_DB_CMD_M, V2_EQ_DB_CMD_S,
                               eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ?
                               HNS_ROCE_EQ_DB_CMD_CEQ :
                               HNS_ROCE_EQ_DB_CMD_CEQ_ARMED);
        }
 
-       roce_set_field(doorbell[1], HNS_ROCE_V2_EQ_DB_PARA_M,
-                      HNS_ROCE_V2_EQ_DB_PARA_S,
-                      (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M));
+       roce_set_field(eq_db.parameter, V2_EQ_DB_CONS_IDX_M,
+                      V2_EQ_DB_CONS_IDX_S, eq->cons_index);
 
-       hns_roce_write64(hr_dev, doorbell, eq->doorbell);
+       hns_roce_write64(hr_dev, (__le32 *)&eq_db, eq->db_reg);
 }
 
 static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
@@ -5581,6 +6000,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
                case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
                case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
                case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+               case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
+               case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
                        hns_roce_qp_event(hr_dev, queue_num, event_type);
                        break;
                case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
@@ -5616,7 +6037,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
                aeqe = next_aeqe_sw_v2(eq);
        }
 
-       set_eq_cons_index_v2(eq);
+       update_eq_db(eq);
        return aeqe_found;
 }
 
@@ -5656,7 +6077,7 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
                ceqe = next_ceqe_sw_v2(eq);
        }
 
-       set_eq_cons_index_v2(eq);
+       update_eq_db(eq);
 
        return ceqe_found;
 }
@@ -5710,58 +6131,34 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
                roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
 
                int_work = 1;
-       } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) {
-               dev_err(dev, "BUS ERR!\n");
-
-               int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S;
-               roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
-
-               int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
-               roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
-
-               int_work = 1;
-       } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) {
-               dev_err(dev, "OTHER ERR!\n");
+       } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) {
+               dev_err(dev, "RAS interrupt!\n");
 
-               int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S;
+               int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S;
                roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
 
                int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S;
                roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
 
                int_work = 1;
-       } else
+       } else {
                dev_err(dev, "There is no abnormal irq found!\n");
+       }
 
        return IRQ_RETVAL(int_work);
 }
 
 static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev,
-                                       int eq_num, int enable_flag)
+                                       int eq_num, u32 enable_flag)
 {
        int i;
 
-       if (enable_flag == EQ_ENABLE) {
-               for (i = 0; i < eq_num; i++)
-                       roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG +
-                                  i * EQ_REG_OFFSET,
-                                  HNS_ROCE_V2_VF_EVENT_INT_EN_M);
-
-               roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG,
-                          HNS_ROCE_V2_VF_ABN_INT_EN_M);
-               roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG,
-                          HNS_ROCE_V2_VF_ABN_INT_CFG_M);
-       } else {
-               for (i = 0; i < eq_num; i++)
-                       roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG +
-                                  i * EQ_REG_OFFSET,
-                                  HNS_ROCE_V2_VF_EVENT_INT_EN_M & 0x0);
+       for (i = 0; i < eq_num; i++)
+               roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG +
+                          i * EQ_REG_OFFSET, enable_flag);
 
-               roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG,
-                          HNS_ROCE_V2_VF_ABN_INT_EN_M & 0x0);
-               roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG,
-                          HNS_ROCE_V2_VF_ABN_INT_CFG_M & 0x0);
-       }
+       roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, enable_flag);
+       roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, enable_flag);
 }
 
 static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn)
@@ -5786,6 +6183,16 @@ static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
        hns_roce_mtr_destroy(hr_dev, &eq->mtr);
 }
 
+static void init_eq_config(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
+{
+       eq->db_reg = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG;
+       eq->cons_index = 0;
+       eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0;
+       eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0;
+       eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED;
+       eq->shift = ilog2((unsigned int)eq->entries);
+}
+
 static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
                      void *mb_buf)
 {
@@ -5797,13 +6204,7 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
        eqc = mb_buf;
        memset(eqc, 0, sizeof(struct hns_roce_eq_context));
 
-       /* init eqc */
-       eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG;
-       eq->cons_index = 0;
-       eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0;
-       eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0;
-       eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED;
-       eq->shift = ilog2((unsigned int)eq->entries);
+       init_eq_config(hr_dev, eq);
 
        /* if not multi-hop, eqe buffer only use one trunk */
        count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba, MTT_MIN_COUNT,
@@ -5813,102 +6214,34 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
                return -ENOBUFS;
        }
 
-       /* set eqc state */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQ_ST_M, HNS_ROCE_EQC_EQ_ST_S,
-                      HNS_ROCE_V2_EQ_STATE_VALID);
-
-       /* set eqe hop num */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_HOP_NUM_M,
-                      HNS_ROCE_EQC_HOP_NUM_S, eq->hop_num);
-
-       /* set eqc over_ignore */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_OVER_IGNORE_M,
-                      HNS_ROCE_EQC_OVER_IGNORE_S, eq->over_ignore);
-
-       /* set eqc coalesce */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_COALESCE_M,
-                      HNS_ROCE_EQC_COALESCE_S, eq->coalesce);
-
-       /* set eqc arm_state */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_ARM_ST_M,
-                      HNS_ROCE_EQC_ARM_ST_S, eq->arm_st);
-
-       /* set eqn */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQN_M, HNS_ROCE_EQC_EQN_S,
-                      eq->eqn);
-
-       /* set eqe_cnt */
-       roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQE_CNT_M,
-                      HNS_ROCE_EQC_EQE_CNT_S, HNS_ROCE_EQ_INIT_EQE_CNT);
-
-       /* set eqe_ba_pg_sz */
-       roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BA_PG_SZ_M,
-                      HNS_ROCE_EQC_BA_PG_SZ_S,
-                      to_hr_hw_page_shift(eq->mtr.hem_cfg.ba_pg_shift));
-
-       /* set eqe_buf_pg_sz */
-       roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BUF_PG_SZ_M,
-                      HNS_ROCE_EQC_BUF_PG_SZ_S,
-                      to_hr_hw_page_shift(eq->mtr.hem_cfg.buf_pg_shift));
-
-       /* set eq_producer_idx */
-       roce_set_field(eqc->byte_8, HNS_ROCE_EQC_PROD_INDX_M,
-                      HNS_ROCE_EQC_PROD_INDX_S, HNS_ROCE_EQ_INIT_PROD_IDX);
-
-       /* set eq_max_cnt */
-       roce_set_field(eqc->byte_12, HNS_ROCE_EQC_MAX_CNT_M,
-                      HNS_ROCE_EQC_MAX_CNT_S, eq->eq_max_cnt);
-
-       /* set eq_period */
-       roce_set_field(eqc->byte_12, HNS_ROCE_EQC_PERIOD_M,
-                      HNS_ROCE_EQC_PERIOD_S, eq->eq_period);
-
-       /* set eqe_report_timer */
-       roce_set_field(eqc->eqe_report_timer, HNS_ROCE_EQC_REPORT_TIMER_M,
-                      HNS_ROCE_EQC_REPORT_TIMER_S,
-                      HNS_ROCE_EQ_INIT_REPORT_TIMER);
-
-       /* set bt_ba [34:3] */
-       roce_set_field(eqc->eqe_ba0, HNS_ROCE_EQC_EQE_BA_L_M,
-                      HNS_ROCE_EQC_EQE_BA_L_S, bt_ba >> 3);
-
-       /* set bt_ba [64:35] */
-       roce_set_field(eqc->eqe_ba1, HNS_ROCE_EQC_EQE_BA_H_M,
-                      HNS_ROCE_EQC_EQE_BA_H_S, bt_ba >> 35);
-
-       /* set eq shift */
-       roce_set_field(eqc->byte_28, HNS_ROCE_EQC_SHIFT_M, HNS_ROCE_EQC_SHIFT_S,
-                      eq->shift);
-
-       /* set eq MSI_IDX */
-       roce_set_field(eqc->byte_28, HNS_ROCE_EQC_MSI_INDX_M,
-                      HNS_ROCE_EQC_MSI_INDX_S, HNS_ROCE_EQ_INIT_MSI_IDX);
-
-       /* set cur_eqe_ba [27:12] */
-       roce_set_field(eqc->byte_28, HNS_ROCE_EQC_CUR_EQE_BA_L_M,
-                      HNS_ROCE_EQC_CUR_EQE_BA_L_S, eqe_ba[0] >> 12);
-
-       /* set cur_eqe_ba [59:28] */
-       roce_set_field(eqc->byte_32, HNS_ROCE_EQC_CUR_EQE_BA_M_M,
-                      HNS_ROCE_EQC_CUR_EQE_BA_M_S, eqe_ba[0] >> 28);
-
-       /* set cur_eqe_ba [63:60] */
-       roce_set_field(eqc->byte_36, HNS_ROCE_EQC_CUR_EQE_BA_H_M,
-                      HNS_ROCE_EQC_CUR_EQE_BA_H_S, eqe_ba[0] >> 60);
-
-       /* set eq consumer idx */
-       roce_set_field(eqc->byte_36, HNS_ROCE_EQC_CONS_INDX_M,
-                      HNS_ROCE_EQC_CONS_INDX_S, HNS_ROCE_EQ_INIT_CONS_IDX);
-
-       roce_set_field(eqc->byte_40, HNS_ROCE_EQC_NXT_EQE_BA_L_M,
-                      HNS_ROCE_EQC_NXT_EQE_BA_L_S, eqe_ba[1] >> 12);
-
-       roce_set_field(eqc->byte_44, HNS_ROCE_EQC_NXT_EQE_BA_H_M,
-                      HNS_ROCE_EQC_NXT_EQE_BA_H_S, eqe_ba[1] >> 44);
-
-       roce_set_field(eqc->byte_44, HNS_ROCE_EQC_EQE_SIZE_M,
-                      HNS_ROCE_EQC_EQE_SIZE_S,
-                      eq->eqe_size == HNS_ROCE_V3_EQE_SIZE ? 1 : 0);
+       hr_reg_write(eqc, EQC_EQ_ST, HNS_ROCE_V2_EQ_STATE_VALID);
+       hr_reg_write(eqc, EQC_EQE_HOP_NUM, eq->hop_num);
+       hr_reg_write(eqc, EQC_OVER_IGNORE, eq->over_ignore);
+       hr_reg_write(eqc, EQC_COALESCE, eq->coalesce);
+       hr_reg_write(eqc, EQC_ARM_ST, eq->arm_st);
+       hr_reg_write(eqc, EQC_EQN, eq->eqn);
+       hr_reg_write(eqc, EQC_EQE_CNT, HNS_ROCE_EQ_INIT_EQE_CNT);
+       hr_reg_write(eqc, EQC_EQE_BA_PG_SZ,
+                    to_hr_hw_page_shift(eq->mtr.hem_cfg.ba_pg_shift));
+       hr_reg_write(eqc, EQC_EQE_BUF_PG_SZ,
+                    to_hr_hw_page_shift(eq->mtr.hem_cfg.buf_pg_shift));
+       hr_reg_write(eqc, EQC_EQ_PROD_INDX, HNS_ROCE_EQ_INIT_PROD_IDX);
+       hr_reg_write(eqc, EQC_EQ_MAX_CNT, eq->eq_max_cnt);
+
+       hr_reg_write(eqc, EQC_EQ_PERIOD, eq->eq_period);
+       hr_reg_write(eqc, EQC_EQE_REPORT_TIMER, HNS_ROCE_EQ_INIT_REPORT_TIMER);
+       hr_reg_write(eqc, EQC_EQE_BA_L, bt_ba >> 3);
+       hr_reg_write(eqc, EQC_EQE_BA_H, bt_ba >> 35);
+       hr_reg_write(eqc, EQC_SHIFT, eq->shift);
+       hr_reg_write(eqc, EQC_MSI_INDX, HNS_ROCE_EQ_INIT_MSI_IDX);
+       hr_reg_write(eqc, EQC_CUR_EQE_BA_L, eqe_ba[0] >> 12);
+       hr_reg_write(eqc, EQC_CUR_EQE_BA_M, eqe_ba[0] >> 28);
+       hr_reg_write(eqc, EQC_CUR_EQE_BA_H, eqe_ba[0] >> 60);
+       hr_reg_write(eqc, EQC_EQ_CONS_INDX, HNS_ROCE_EQ_INIT_CONS_IDX);
+       hr_reg_write(eqc, EQC_NEX_EQE_BA_L, eqe_ba[1] >> 12);
+       hr_reg_write(eqc, EQC_NEX_EQE_BA_H, eqe_ba[1] >> 44);
+       hr_reg_write(eqc, EQC_EQE_SIZE,
+                    !!(eq->eqe_size == HNS_ROCE_V3_EQE_SIZE));
 
        return 0;
 }
@@ -6166,6 +6499,7 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
        hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE);
 
        __hns_roce_free_irq(hr_dev);
+       destroy_workqueue(hr_dev->irq_workq);
 
        for (i = 0; i < eq_num; i++) {
                hns_roce_v2_destroy_eqc(hr_dev, i);
@@ -6174,9 +6508,6 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
        }
 
        kfree(eq_table->eq);
-
-       flush_workqueue(hr_dev->irq_workq);
-       destroy_workqueue(hr_dev->irq_workq);
 }
 
 static const struct hns_roce_dfx_hw hns_roce_dfx_hw_v2 = {
@@ -6205,9 +6536,9 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
        .hw_profile = hns_roce_v2_profile,
        .hw_init = hns_roce_v2_init,
        .hw_exit = hns_roce_v2_exit,
-       .post_mbox = hns_roce_v2_post_mbox,
-       .chk_mbox = hns_roce_v2_chk_mbox,
-       .rst_prc_mbox = hns_roce_v2_rst_process_cmd,
+       .post_mbox = v2_post_mbox,
+       .poll_mbox_done = v2_poll_mbox_done,
+       .chk_mbox_avail = v2_chk_mbox_is_avail,
        .set_gid = hns_roce_v2_set_gid,
        .set_mac = hns_roce_v2_set_mac,
        .write_mtpt = hns_roce_v2_write_mtpt,
@@ -6218,20 +6549,10 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
        .set_hem = hns_roce_v2_set_hem,
        .clear_hem = hns_roce_v2_clear_hem,
        .modify_qp = hns_roce_v2_modify_qp,
-       .query_qp = hns_roce_v2_query_qp,
-       .destroy_qp = hns_roce_v2_destroy_qp,
        .qp_flow_control_init = hns_roce_v2_qp_flow_control_init,
-       .modify_cq = hns_roce_v2_modify_cq,
-       .post_send = hns_roce_v2_post_send,
-       .post_recv = hns_roce_v2_post_recv,
-       .req_notify_cq = hns_roce_v2_req_notify_cq,
-       .poll_cq = hns_roce_v2_poll_cq,
        .init_eq = hns_roce_v2_init_eq_table,
        .cleanup_eq = hns_roce_v2_cleanup_eq_table,
        .write_srqc = hns_roce_v2_write_srqc,
-       .modify_srq = hns_roce_v2_modify_srq,
-       .query_srq = hns_roce_v2_query_srq,
-       .post_srq_recv = hns_roce_v2_post_srq_recv,
        .hns_roce_dev_ops = &hns_roce_v2_dev_ops,
        .hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops,
 };
@@ -6243,6 +6564,8 @@ static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
        {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0},
        {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0},
        {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_200G_RDMA), 0},
+       {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_RDMA_DCB_PFC_VF),
+        HNAE3_DEV_SUPPORT_ROCE_DCB_BITS},
        /* required last entry */
        {0, }
 };
@@ -6253,9 +6576,12 @@ static void hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
                                  struct hnae3_handle *handle)
 {
        struct hns_roce_v2_priv *priv = hr_dev->priv;
+       const struct pci_device_id *id;
        int i;
 
        hr_dev->pci_dev = handle->pdev;
+       id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev);
+       hr_dev->is_vf = id->driver_data;
        hr_dev->dev = &handle->pdev->dev;
        hr_dev->hw = &hns_roce_hw_v2;
        hr_dev->dfx = &hns_roce_dfx_hw_v2;
@@ -6272,7 +6598,7 @@ static void hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
        addrconf_addr_eui48((u8 *)&hr_dev->ib_dev.node_guid,
                            hr_dev->iboe.netdevs[0]->dev_addr);
 
-       for (i = 0; i < HNS_ROCE_V2_MAX_IRQ_NUM; i++)
+       for (i = 0; i < handle->rinfo.num_vectors; i++)
                hr_dev->irq[i] = pci_irq_vector(handle->pdev,
                                                i + handle->rinfo.base_vector);
 
@@ -6356,6 +6682,9 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
        if (!id)
                return 0;
 
+       if (id->driver_data && handle->pdev->revision < PCI_REVISION_ID_HIP09)
+               return 0;
+
        ret = __hns_roce_hw_v2_init_instance(handle);
        if (ret) {
                handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
index 39621fb..a2100a6 100644 (file)
 #define HNS_ROCE_VF_SRQC_BT_NUM                        64
 #define HNS_ROCE_VF_CQC_BT_NUM                 64
 #define HNS_ROCE_VF_MPT_BT_NUM                 64
-#define HNS_ROCE_VF_EQC_NUM                    64
 #define HNS_ROCE_VF_SMAC_NUM                   32
-#define HNS_ROCE_VF_SGID_NUM                   32
 #define HNS_ROCE_VF_SL_NUM                     8
 #define HNS_ROCE_VF_GMV_BT_NUM                 256
 
-#define HNS_ROCE_V2_MAX_QP_NUM                 0x100000
+#define HNS_ROCE_V2_MAX_QP_NUM                 0x1000
 #define HNS_ROCE_V2_MAX_QPC_TIMER_NUM          0x200
 #define HNS_ROCE_V2_MAX_WQE_NUM                        0x8000
 #define        HNS_ROCE_V2_MAX_SRQ                     0x100000
@@ -61,6 +59,7 @@
 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM             64
 #define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM         0x200000
 #define HNS_ROCE_V2_MAX_SQ_INLINE              0x20
+#define HNS_ROCE_V2_MAX_SQ_INL_EXT             0x400
 #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ          32
 #define HNS_ROCE_V2_UAR_NUM                    256
 #define HNS_ROCE_V2_PHY_UAR_NUM                        1
@@ -74,6 +73,8 @@
 #define HNS_ROCE_V2_MAX_SRQWQE_SEGS            0x1000000
 #define HNS_ROCE_V2_MAX_IDX_SEGS               0x1000000
 #define HNS_ROCE_V2_MAX_PD_NUM                 0x1000000
+#define HNS_ROCE_V2_MAX_XRCD_NUM               0x1000000
+#define HNS_ROCE_V2_RSV_XRCD_NUM               0
 #define HNS_ROCE_V2_MAX_QP_INIT_RDMA           128
 #define HNS_ROCE_V2_MAX_QP_DEST_RDMA           128
 #define HNS_ROCE_V2_MAX_SQ_DESC_SZ             64
 
 #define HNS_ROCE_BA_PG_SZ_SUPPORTED_256K       6
 #define HNS_ROCE_BA_PG_SZ_SUPPORTED_16K                2
-#define HNS_ROCE_V2_GID_INDEX_NUM              256
+#define HNS_ROCE_V2_GID_INDEX_NUM              16
 
 #define HNS_ROCE_V2_TABLE_CHUNK_SIZE           (1 << 18)
 
 
 #define HNS_ROCE_CMQ_SCC_CLR_DONE_CNT          5
 
+#define HNS_ROCE_CONG_SIZE 64
+
 #define check_whether_last_step(hop_num, step_idx) \
        ((step_idx == 0 && hop_num == HNS_ROCE_HOP_NUM_0) || \
        (step_idx == 1 && hop_num == 1) || \
@@ -195,11 +198,11 @@ enum {
 };
 
 enum {
-       HNS_ROCE_V2_SQ_DB       = 0x0,
-       HNS_ROCE_V2_RQ_DB       = 0x1,
-       HNS_ROCE_V2_SRQ_DB      = 0x2,
-       HNS_ROCE_V2_CQ_DB_PTR   = 0x3,
-       HNS_ROCE_V2_CQ_DB_NTR   = 0x4,
+       HNS_ROCE_V2_SQ_DB, /* 0: implicit numbering keeps the previous explicit values */
+       HNS_ROCE_V2_RQ_DB, /* 1 */
+       HNS_ROCE_V2_SRQ_DB, /* 2 */
+       HNS_ROCE_V2_CQ_DB, /* 3: renamed from HNS_ROCE_V2_CQ_DB_PTR */
+       HNS_ROCE_V2_CQ_DB_NOTIFY /* 4: renamed from HNS_ROCE_V2_CQ_DB_NTR */
 };
 
 enum {
@@ -233,6 +236,7 @@ enum hns_roce_opcode_type {
        HNS_ROCE_OPC_CFG_EXT_LLM                        = 0x8403,
        HNS_ROCE_OPC_CFG_TMOUT_LLM                      = 0x8404,
        HNS_ROCE_OPC_QUERY_PF_TIMER_RES                 = 0x8406,
+       HNS_ROCE_OPC_QUERY_FUNC_INFO                    = 0x8407,
        HNS_ROCE_OPC_QUERY_PF_CAPS_NUM                  = 0x8408,
        HNS_ROCE_OPC_CFG_ENTRY_SIZE                     = 0x8409,
        HNS_ROCE_OPC_CFG_SGID_TB                        = 0x8500,
@@ -244,6 +248,7 @@ enum hns_roce_opcode_type {
        HNS_ROCE_OPC_CLR_SCCC                           = 0x8509,
        HNS_ROCE_OPC_QUERY_SCCC                         = 0x850a,
        HNS_ROCE_OPC_RESET_SCCC                         = 0x850b,
+       HNS_ROCE_OPC_QUERY_VF_RES                       = 0x850e,
        HNS_ROCE_OPC_CFG_GMV_TBL                        = 0x850f,
        HNS_ROCE_OPC_CFG_GMV_BT                         = 0x8510,
        HNS_SWITCH_PARAMETER_CFG                        = 0x1033,
@@ -255,10 +260,20 @@ enum {
 };
 
 enum hns_roce_cmd_return_status {
-       CMD_EXEC_SUCCESS        = 0,
-       CMD_NO_AUTH             = 1,
-       CMD_NOT_EXEC            = 2,
-       CMD_QUEUE_FULL          = 3,
+       CMD_EXEC_SUCCESS, /* 0 */
+       CMD_NO_AUTH, /* 1 */
+       CMD_NOT_EXIST, /* 2: this value was CMD_NOT_EXEC before the change */
+       CMD_CRQ_FULL, /* 3: takes the slot of the removed CMD_QUEUE_FULL */
+       CMD_NEXT_ERR, /* 4 */
+       CMD_NOT_EXEC, /* 5: moved from 2 */
+       CMD_PARA_ERR, /* 6 */
+       CMD_RESULT_ERR, /* 7 */
+       CMD_TIMEOUT, /* 8 */
+       CMD_HILINK_ERR, /* 9 */
+       CMD_INFO_ILLEGAL, /* 10 */
+       CMD_INVALID, /* 11 */
+       CMD_ROH_CHECK_FAIL, /* 12 */
+       CMD_OTHER_ERR = 0xff /* explicit value, outside the sequential range; NOTE(review): presumably mirrors the codes reported in hns_roce_cmq_desc.retval - confirm */
 };
 
 enum hns_roce_sgid_type {
@@ -399,7 +414,8 @@ struct hns_roce_srq_context {
 #define SRQC_CONSUMER_IDX SRQC_FIELD_LOC(127, 112)
 #define SRQC_WQE_BT_BA_L SRQC_FIELD_LOC(159, 128)
 #define SRQC_WQE_BT_BA_H SRQC_FIELD_LOC(188, 160)
-#define SRQC_RSV2 SRQC_FIELD_LOC(191, 189)
+#define SRQC_RSV2 SRQC_FIELD_LOC(190, 189)
+#define SRQC_SRQ_TYPE SRQC_FIELD_LOC(191, 191)
 #define SRQC_PD SRQC_FIELD_LOC(215, 192)
 #define SRQC_RQWS SRQC_FIELD_LOC(219, 216)
 #define SRQC_RSV3 SRQC_FIELD_LOC(223, 220)
@@ -572,6 +588,10 @@ struct hns_roce_v2_qp_context {
        struct hns_roce_v2_qp_context_ex ext;
 };
 
+#define QPC_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_qp_context, h, l)
+
+#define QPC_CONG_ALGO_TMPL_ID QPC_FIELD_LOC(455, 448)
+
 #define        V2_QPC_BYTE_4_TST_S 0
 #define V2_QPC_BYTE_4_TST_M GENMASK(2, 0)
 
@@ -663,9 +683,6 @@ struct hns_roce_v2_qp_context {
 #define        V2_QPC_BYTE_56_LP_PKTN_INI_S 28
 #define V2_QPC_BYTE_56_LP_PKTN_INI_M GENMASK(31, 28)
 
-#define        V2_QPC_BYTE_60_TEMPID_S 0
-#define V2_QPC_BYTE_60_TEMPID_M GENMASK(7, 0)
-
 #define V2_QPC_BYTE_60_SCC_TOKEN_S 8
 #define V2_QPC_BYTE_60_SCC_TOKEN_M GENMASK(26, 8)
 
@@ -698,6 +715,8 @@ struct hns_roce_v2_qp_context {
 #define        V2_QPC_BYTE_80_RX_CQN_S 0
 #define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0)
 
+#define V2_QPC_BYTE_80_XRC_QP_TYPE_S 24
+
 #define        V2_QPC_BYTE_80_MIN_RNR_TIME_S 27
 #define V2_QPC_BYTE_80_MIN_RNR_TIME_M GENMASK(31, 27)
 
@@ -940,6 +959,10 @@ struct hns_roce_v2_qp_context {
 
 #define QPCEX_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_qp_context_ex, h, l)
 
+#define QPCEX_CONG_ALG_SEL QPCEX_FIELD_LOC(0, 0)
+#define QPCEX_CONG_ALG_SUB_SEL QPCEX_FIELD_LOC(1, 1)
+#define QPCEX_DIP_CTX_IDX_VLD QPCEX_FIELD_LOC(2, 2)
+#define QPCEX_DIP_CTX_IDX QPCEX_FIELD_LOC(22, 3)
 #define QPCEX_STASH QPCEX_FIELD_LOC(82, 82)
 
 #define        V2_QP_RWE_S 1 /* rdma write enable */
@@ -1130,33 +1153,27 @@ struct hns_roce_v2_mpt_entry {
 #define V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S 28
 #define V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M GENMASK(31, 28)
 
-#define        V2_DB_BYTE_4_TAG_S 0
-#define V2_DB_BYTE_4_TAG_M GENMASK(23, 0)
+#define V2_DB_TAG_S 0
+#define V2_DB_TAG_M GENMASK(23, 0)
 
-#define        V2_DB_BYTE_4_CMD_S 24
-#define V2_DB_BYTE_4_CMD_M GENMASK(27, 24)
+#define V2_DB_CMD_S 24
+#define V2_DB_CMD_M GENMASK(27, 24)
 
 #define V2_DB_FLAG_S 31
 
-#define V2_DB_PARAMETER_IDX_S 0
-#define V2_DB_PARAMETER_IDX_M GENMASK(15, 0)
+#define V2_DB_PRODUCER_IDX_S 0
+#define V2_DB_PRODUCER_IDX_M GENMASK(15, 0)
 
-#define V2_DB_PARAMETER_SL_S 16
-#define V2_DB_PARAMETER_SL_M GENMASK(18, 16)
+#define V2_DB_SL_S 16
+#define V2_DB_SL_M GENMASK(18, 16)
 
-#define        V2_CQ_DB_BYTE_4_TAG_S 0
-#define V2_CQ_DB_BYTE_4_TAG_M GENMASK(23, 0)
+#define V2_CQ_DB_CONS_IDX_S 0
+#define V2_CQ_DB_CONS_IDX_M GENMASK(23, 0)
 
-#define        V2_CQ_DB_BYTE_4_CMD_S 24
-#define V2_CQ_DB_BYTE_4_CMD_M GENMASK(27, 24)
+#define V2_CQ_DB_NOTIFY_TYPE_S 24
 
-#define V2_CQ_DB_PARAMETER_CONS_IDX_S 0
-#define V2_CQ_DB_PARAMETER_CONS_IDX_M GENMASK(23, 0)
-
-#define V2_CQ_DB_PARAMETER_CMD_SN_S 25
-#define V2_CQ_DB_PARAMETER_CMD_SN_M GENMASK(26, 25)
-
-#define V2_CQ_DB_PARAMETER_NOTIFY_S 24
+#define V2_CQ_DB_CMD_SN_S 25
+#define V2_CQ_DB_CMD_SN_M GENMASK(26, 25)
 
 struct hns_roce_v2_ud_send_wqe {
        __le32  byte_4;
@@ -1359,194 +1376,44 @@ struct hns_roce_cfg_llm_b {
 #define CFG_LLM_TAIL_PTR_S 0
 #define CFG_LLM_TAIL_PTR_M GENMASK(11, 0)
 
-struct hns_roce_cfg_global_param {
-       __le32 time_cfg_udp_port;
-       __le32 rsv[5];
-};
-
-#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_S 0
-#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_M GENMASK(9, 0)
-
-#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_S 16
-#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_M GENMASK(31, 16)
-
-struct hns_roce_pf_res_a {
-       __le32  rsv;
-       __le32  qpc_bt_idx_num;
-       __le32  srqc_bt_idx_num;
-       __le32  cqc_bt_idx_num;
-       __le32  mpt_bt_idx_num;
-       __le32  eqc_bt_idx_num;
-};
-
-#define PF_RES_DATA_1_PF_QPC_BT_IDX_S 0
-#define PF_RES_DATA_1_PF_QPC_BT_IDX_M GENMASK(10, 0)
-
-#define PF_RES_DATA_1_PF_QPC_BT_NUM_S 16
-#define PF_RES_DATA_1_PF_QPC_BT_NUM_M GENMASK(27, 16)
-
-#define PF_RES_DATA_2_PF_SRQC_BT_IDX_S 0
-#define PF_RES_DATA_2_PF_SRQC_BT_IDX_M GENMASK(8, 0)
-
-#define PF_RES_DATA_2_PF_SRQC_BT_NUM_S 16
-#define PF_RES_DATA_2_PF_SRQC_BT_NUM_M GENMASK(25, 16)
-
-#define PF_RES_DATA_3_PF_CQC_BT_IDX_S 0
-#define PF_RES_DATA_3_PF_CQC_BT_IDX_M GENMASK(8, 0)
-
-#define PF_RES_DATA_3_PF_CQC_BT_NUM_S 16
-#define PF_RES_DATA_3_PF_CQC_BT_NUM_M GENMASK(25, 16)
-
-#define PF_RES_DATA_4_PF_MPT_BT_IDX_S 0
-#define PF_RES_DATA_4_PF_MPT_BT_IDX_M GENMASK(8, 0)
-
-#define PF_RES_DATA_4_PF_MPT_BT_NUM_S 16
-#define PF_RES_DATA_4_PF_MPT_BT_NUM_M GENMASK(25, 16)
-
-#define PF_RES_DATA_5_PF_EQC_BT_IDX_S 0
-#define PF_RES_DATA_5_PF_EQC_BT_IDX_M GENMASK(8, 0)
-
-#define PF_RES_DATA_5_PF_EQC_BT_NUM_S 16
-#define PF_RES_DATA_5_PF_EQC_BT_NUM_M GENMASK(25, 16)
-
-struct hns_roce_pf_res_b {
-       __le32  rsv0;
-       __le32  smac_idx_num;
-       __le32  sgid_idx_num;
-       __le32  qid_idx_sl_num;
-       __le32  sccc_bt_idx_num;
-       __le32  gmv_idx_num;
-};
-
-#define PF_RES_DATA_1_PF_SMAC_IDX_S 0
-#define PF_RES_DATA_1_PF_SMAC_IDX_M GENMASK(7, 0)
-
-#define PF_RES_DATA_1_PF_SMAC_NUM_S 8
-#define PF_RES_DATA_1_PF_SMAC_NUM_M GENMASK(16, 8)
-
-#define PF_RES_DATA_2_PF_SGID_IDX_S 0
-#define PF_RES_DATA_2_PF_SGID_IDX_M GENMASK(7, 0)
-
-#define PF_RES_DATA_2_PF_SGID_NUM_S 8
-#define PF_RES_DATA_2_PF_SGID_NUM_M GENMASK(16, 8)
-
-#define PF_RES_DATA_3_PF_QID_IDX_S 0
-#define PF_RES_DATA_3_PF_QID_IDX_M GENMASK(9, 0)
-
-#define PF_RES_DATA_3_PF_SL_NUM_S 16
-#define PF_RES_DATA_3_PF_SL_NUM_M GENMASK(26, 16)
-
-#define PF_RES_DATA_4_PF_SCCC_BT_IDX_S 0
-#define PF_RES_DATA_4_PF_SCCC_BT_IDX_M GENMASK(8, 0)
-
-#define PF_RES_DATA_4_PF_SCCC_BT_NUM_S 9
-#define PF_RES_DATA_4_PF_SCCC_BT_NUM_M GENMASK(17, 9)
-
-#define PF_RES_DATA_5_PF_GMV_BT_IDX_S 0
-#define PF_RES_DATA_5_PF_GMV_BT_IDX_M GENMASK(7, 0)
+/* Fields of HNS_ROCE_OPC_CFG_GLOBAL_PARAM */
+#define CFG_GLOBAL_PARAM_1US_CYCLES CMQ_REQ_FIELD_LOC(9, 0)
+#define CFG_GLOBAL_PARAM_UDP_PORT CMQ_REQ_FIELD_LOC(31, 16)
 
-#define PF_RES_DATA_5_PF_GMV_BT_NUM_S 8
-#define PF_RES_DATA_5_PF_GMV_BT_NUM_M GENMASK(16, 8)
-
-struct hns_roce_pf_timer_res_a {
-       __le32  rsv0;
-       __le32  qpc_timer_bt_idx_num;
-       __le32  cqc_timer_bt_idx_num;
-       __le32  rsv[3];
-};
-
-#define PF_RES_DATA_1_PF_QPC_TIMER_BT_IDX_S 0
-#define PF_RES_DATA_1_PF_QPC_TIMER_BT_IDX_M GENMASK(11, 0)
-
-#define PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S 16
-#define PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M GENMASK(28, 16)
-
-#define PF_RES_DATA_2_PF_CQC_TIMER_BT_IDX_S 0
-#define PF_RES_DATA_2_PF_CQC_TIMER_BT_IDX_M GENMASK(10, 0)
-
-#define PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S 16
-#define PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M GENMASK(27, 16)
-
-struct hns_roce_vf_res_a {
-       __le32 vf_id;
-       __le32 vf_qpc_bt_idx_num;
-       __le32 vf_srqc_bt_idx_num;
-       __le32 vf_cqc_bt_idx_num;
-       __le32 vf_mpt_bt_idx_num;
-       __le32 vf_eqc_bt_idx_num;
-};
-
-#define VF_RES_A_DATA_1_VF_QPC_BT_IDX_S 0
-#define VF_RES_A_DATA_1_VF_QPC_BT_IDX_M GENMASK(10, 0)
-
-#define VF_RES_A_DATA_1_VF_QPC_BT_NUM_S 16
-#define VF_RES_A_DATA_1_VF_QPC_BT_NUM_M GENMASK(27, 16)
-
-#define VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S 0
-#define VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M GENMASK(8, 0)
-
-#define VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S 16
-#define VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M GENMASK(25, 16)
-
-#define VF_RES_A_DATA_3_VF_CQC_BT_IDX_S 0
-#define VF_RES_A_DATA_3_VF_CQC_BT_IDX_M GENMASK(8, 0)
-
-#define VF_RES_A_DATA_3_VF_CQC_BT_NUM_S 16
-#define VF_RES_A_DATA_3_VF_CQC_BT_NUM_M GENMASK(25, 16)
-
-#define VF_RES_A_DATA_4_VF_MPT_BT_IDX_S 0
-#define VF_RES_A_DATA_4_VF_MPT_BT_IDX_M GENMASK(8, 0)
-
-#define VF_RES_A_DATA_4_VF_MPT_BT_NUM_S 16
-#define VF_RES_A_DATA_4_VF_MPT_BT_NUM_M GENMASK(25, 16)
-
-#define VF_RES_A_DATA_5_VF_EQC_IDX_S 0
-#define VF_RES_A_DATA_5_VF_EQC_IDX_M GENMASK(8, 0)
-
-#define VF_RES_A_DATA_5_VF_EQC_NUM_S 16
-#define VF_RES_A_DATA_5_VF_EQC_NUM_M GENMASK(25, 16)
-
-struct hns_roce_vf_res_b {
-       __le32 rsv0;
-       __le32 vf_smac_idx_num;
-       __le32 vf_sgid_idx_num;
-       __le32 vf_qid_idx_sl_num;
-       __le32 vf_sccc_idx_num;
-       __le32 vf_gmv_idx_num;
-};
-
-#define VF_RES_B_DATA_0_VF_ID_S 0
-#define VF_RES_B_DATA_0_VF_ID_M GENMASK(7, 0)
-
-#define VF_RES_B_DATA_1_VF_SMAC_IDX_S 0
-#define VF_RES_B_DATA_1_VF_SMAC_IDX_M GENMASK(7, 0)
-
-#define VF_RES_B_DATA_1_VF_SMAC_NUM_S 8
-#define VF_RES_B_DATA_1_VF_SMAC_NUM_M GENMASK(16, 8)
-
-#define VF_RES_B_DATA_2_VF_SGID_IDX_S 0
-#define VF_RES_B_DATA_2_VF_SGID_IDX_M GENMASK(7, 0)
-
-#define VF_RES_B_DATA_2_VF_SGID_NUM_S 8
-#define VF_RES_B_DATA_2_VF_SGID_NUM_M GENMASK(16, 8)
-
-#define VF_RES_B_DATA_3_VF_QID_IDX_S 0
-#define VF_RES_B_DATA_3_VF_QID_IDX_M GENMASK(9, 0)
-
-#define VF_RES_B_DATA_3_VF_SL_NUM_S 16
-#define VF_RES_B_DATA_3_VF_SL_NUM_M GENMASK(19, 16)
-
-#define VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S 0
-#define VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M GENMASK(8, 0)
-
-#define VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S 9
-#define VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M GENMASK(17, 9)
-
-#define VF_RES_B_DATA_5_VF_GMV_BT_IDX_S 0
-#define VF_RES_B_DATA_5_VF_GMV_BT_IDX_M GENMASK(7, 0)
-
-#define VF_RES_B_DATA_5_VF_GMV_BT_NUM_S 16
-#define VF_RES_B_DATA_5_VF_GMV_BT_NUM_M GENMASK(24, 16)
+/*
+ * Fields of HNS_ROCE_OPC_QUERY_PF_RES, HNS_ROCE_OPC_QUERY_VF_RES
+ * and HNS_ROCE_OPC_ALLOC_VF_RES
+ */
+#define FUNC_RES_A_VF_ID CMQ_REQ_FIELD_LOC(7, 0) /* word 0, bits 7:0 */
+#define FUNC_RES_A_QPC_BT_IDX CMQ_REQ_FIELD_LOC(42, 32) /* word 1, bits 10:0 */
+#define FUNC_RES_A_QPC_BT_NUM CMQ_REQ_FIELD_LOC(59, 48) /* word 1, bits 27:16 */
+#define FUNC_RES_A_SRQC_BT_IDX CMQ_REQ_FIELD_LOC(72, 64) /* word 2, bits 8:0 */
+#define FUNC_RES_A_SRQC_BT_NUM CMQ_REQ_FIELD_LOC(89, 80) /* word 2, bits 25:16 */
+#define FUNC_RES_A_CQC_BT_IDX CMQ_REQ_FIELD_LOC(104, 96) /* word 3, bits 8:0 */
+#define FUNC_RES_A_CQC_BT_NUM CMQ_REQ_FIELD_LOC(121, 112) /* word 3, bits 25:16 */
+#define FUNC_RES_A_MPT_BT_IDX CMQ_REQ_FIELD_LOC(136, 128) /* word 4, bits 8:0 */
+#define FUNC_RES_A_MPT_BT_NUM CMQ_REQ_FIELD_LOC(153, 144) /* word 4, bits 25:16 */
+#define FUNC_RES_A_EQC_BT_IDX CMQ_REQ_FIELD_LOC(168, 160) /* word 5, bits 8:0 */
+#define FUNC_RES_A_EQC_BT_NUM CMQ_REQ_FIELD_LOC(185, 176) /* word 5, bits 25:16 */
+#define FUNC_RES_B_SMAC_IDX CMQ_REQ_FIELD_LOC(39, 32) /* word 1, bits 7:0 */
+#define FUNC_RES_B_SMAC_NUM CMQ_REQ_FIELD_LOC(48, 40) /* word 1, bits 16:8 */
+#define FUNC_RES_B_SGID_IDX CMQ_REQ_FIELD_LOC(71, 64) /* word 2, bits 7:0 */
+#define FUNC_RES_B_SGID_NUM CMQ_REQ_FIELD_LOC(80, 72) /* word 2, bits 16:8 */
+#define FUNC_RES_B_QID_IDX CMQ_REQ_FIELD_LOC(105, 96) /* word 3, bits 9:0 */
+#define FUNC_RES_B_QID_NUM CMQ_REQ_FIELD_LOC(122, 112) /* word 3, bits 26:16 */
+#define FUNC_RES_V_QID_NUM CMQ_REQ_FIELD_LOC(115, 112) /* word 3, bits 19:16; overlaps FUNC_RES_B_QID_NUM - NOTE(review): presumably the VF layout, confirm */
+
+#define FUNC_RES_B_SCCC_BT_IDX CMQ_REQ_FIELD_LOC(136, 128) /* word 4, bits 8:0 */
+#define FUNC_RES_B_SCCC_BT_NUM CMQ_REQ_FIELD_LOC(145, 137) /* word 4, bits 17:9 */
+#define FUNC_RES_B_GMV_BT_IDX CMQ_REQ_FIELD_LOC(167, 160) /* word 5, bits 7:0 */
+#define FUNC_RES_B_GMV_BT_NUM CMQ_REQ_FIELD_LOC(176, 168) /* word 5, bits 16:8 */
+#define FUNC_RES_V_GMV_BT_NUM CMQ_REQ_FIELD_LOC(184, 176) /* word 5, bits 24:16; shares bit 176 with FUNC_RES_B_GMV_BT_NUM, so the two are alternative layouts - NOTE(review): presumably VF vs PF, confirm */
+
+/* Fields of HNS_ROCE_OPC_QUERY_PF_TIMER_RES */
+#define PF_TIMER_RES_QPC_ITEM_IDX CMQ_REQ_FIELD_LOC(43, 32)
+#define PF_TIMER_RES_QPC_ITEM_NUM CMQ_REQ_FIELD_LOC(60, 48)
+#define PF_TIMER_RES_CQC_ITEM_IDX CMQ_REQ_FIELD_LOC(74, 64)
+#define PF_TIMER_RES_CQC_ITEM_NUM CMQ_REQ_FIELD_LOC(91, 80)
 
 struct hns_roce_vf_switch {
        __le32 rocee_sel;
@@ -1578,59 +1445,43 @@ struct hns_roce_mbox_status {
        __le32  rsv[5];
 };
 
-struct hns_roce_cfg_bt_attr {
-       __le32 vf_qpc_cfg;
-       __le32 vf_srqc_cfg;
-       __le32 vf_cqc_cfg;
-       __le32 vf_mpt_cfg;
-       __le32 vf_sccc_cfg;
-       __le32 rsv;
+#define HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS 10000
+
+#define MB_ST_HW_RUN_M BIT(31)
+#define MB_ST_COMPLETE_M GENMASK(7, 0)
+
+#define MB_ST_COMPLETE_SUCC 1
+
+/* Fields of HNS_ROCE_OPC_CFG_BT_ATTR */
+#define CFG_BT_ATTR_QPC_BA_PGSZ CMQ_REQ_FIELD_LOC(3, 0)
+#define CFG_BT_ATTR_QPC_BUF_PGSZ CMQ_REQ_FIELD_LOC(7, 4)
+#define CFG_BT_ATTR_QPC_HOPNUM CMQ_REQ_FIELD_LOC(9, 8)
+#define CFG_BT_ATTR_SRQC_BA_PGSZ CMQ_REQ_FIELD_LOC(35, 32)
+#define CFG_BT_ATTR_SRQC_BUF_PGSZ CMQ_REQ_FIELD_LOC(39, 36)
+#define CFG_BT_ATTR_SRQC_HOPNUM CMQ_REQ_FIELD_LOC(41, 40)
+#define CFG_BT_ATTR_CQC_BA_PGSZ CMQ_REQ_FIELD_LOC(67, 64)
+#define CFG_BT_ATTR_CQC_BUF_PGSZ CMQ_REQ_FIELD_LOC(71, 68)
+#define CFG_BT_ATTR_CQC_HOPNUM CMQ_REQ_FIELD_LOC(73, 72)
+#define CFG_BT_ATTR_MPT_BA_PGSZ CMQ_REQ_FIELD_LOC(99, 96)
+#define CFG_BT_ATTR_MPT_BUF_PGSZ CMQ_REQ_FIELD_LOC(103, 100)
+#define CFG_BT_ATTR_MPT_HOPNUM CMQ_REQ_FIELD_LOC(105, 104)
+#define CFG_BT_ATTR_SCCC_BA_PGSZ CMQ_REQ_FIELD_LOC(131, 128)
+#define CFG_BT_ATTR_SCCC_BUF_PGSZ CMQ_REQ_FIELD_LOC(135, 132)
+#define CFG_BT_ATTR_SCCC_HOPNUM CMQ_REQ_FIELD_LOC(137, 136)
+
+/* Fields of HNS_ROCE_OPC_CFG_ENTRY_SIZE */
+#define CFG_HEM_ENTRY_SIZE_TYPE CMQ_REQ_FIELD_LOC(31, 0)
+enum { /* NOTE(review): presumably the values written to CFG_HEM_ENTRY_SIZE_TYPE - confirm against the caller */
+       HNS_ROCE_CFG_QPC_SIZE = BIT(0), /* 0x1 */
+       HNS_ROCE_CFG_SCCC_SIZE = BIT(1), /* 0x2 */
 };
 
-#define CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S 0
-#define CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_M GENMASK(3, 0)
-
-#define CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_S 4
-#define CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_M GENMASK(7, 4)
-
-#define CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_S 8
-#define CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_M GENMASK(9, 8)
-
-#define CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_S 0
-#define CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_M GENMASK(3, 0)
-
-#define CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_S 4
-#define CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_M GENMASK(7, 4)
-
-#define CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_S 8
-#define CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_M GENMASK(9, 8)
-
-#define CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_S 0
-#define CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_M GENMASK(3, 0)
-
-#define CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_S 4
-#define CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_M GENMASK(7, 4)
-
-#define CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_S 8
-#define CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_M GENMASK(9, 8)
-
-#define CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_S 0
-#define CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_M GENMASK(3, 0)
-
-#define CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_S 4
-#define CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_M GENMASK(7, 4)
-
-#define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S 8
-#define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M GENMASK(9, 8)
-
-#define CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_S 0
-#define CFG_BT_ATTR_DATA_4_VF_SCCC_BA_PGSZ_M GENMASK(3, 0)
-
-#define CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_S 4
-#define CFG_BT_ATTR_DATA_4_VF_SCCC_BUF_PGSZ_M GENMASK(7, 4)
+#define CFG_HEM_ENTRY_SIZE_VALUE CMQ_REQ_FIELD_LOC(191, 160)
 
-#define CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_S 8
-#define CFG_BT_ATTR_DATA_4_VF_SCCC_HOPNUM_M GENMASK(9, 8)
+/* Fields of HNS_ROCE_OPC_CFG_GMV_BT */
+#define CFG_GMV_BT_BA_L CMQ_REQ_FIELD_LOC(31, 0)
+#define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
+#define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
 
 struct hns_roce_cfg_sgid_tb {
        __le32  table_idx_rsv;
@@ -1641,17 +1492,6 @@ struct hns_roce_cfg_sgid_tb {
        __le32  vf_sgid_type_rsv;
 };
 
-enum {
-       HNS_ROCE_CFG_QPC_SIZE = BIT(0),
-       HNS_ROCE_CFG_SCCC_SIZE = BIT(1),
-};
-
-struct hns_roce_cfg_entry_size {
-       __le32  type;
-       __le32  rsv[4];
-       __le32  size;
-};
-
 #define CFG_SGID_TB_TABLE_IDX_S 0
 #define CFG_SGID_TB_TABLE_IDX_M GENMASK(7, 0)
 
@@ -1670,16 +1510,6 @@ struct hns_roce_cfg_smac_tb {
 #define CFG_SMAC_TB_VF_SMAC_H_S 0
 #define CFG_SMAC_TB_VF_SMAC_H_M GENMASK(15, 0)
 
-struct hns_roce_cfg_gmv_bt {
-       __le32 gmv_ba_l;
-       __le32 gmv_ba_h;
-       __le32 gmv_bt_idx;
-       __le32 rsv[3];
-};
-
-#define CFG_GMV_BA_H_S 0
-#define CFG_GMV_BA_H_M GENMASK(19, 0)
-
 struct hns_roce_cfg_gmv_tb_a {
        __le32 vf_sgid_l;
        __le32 vf_sgid_ml;
@@ -1805,6 +1635,14 @@ struct hns_roce_query_pf_caps_d {
 #define V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_S 24
 #define V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_M GENMASK(25, 24)
 
+#define V2_QUERY_PF_CAPS_D_CONG_TYPE_S 26
+#define V2_QUERY_PF_CAPS_D_CONG_TYPE_M GENMASK(29, 26)
+
+struct hns_roce_congestion_algorithm { /* driver-side selection of a congestion-control algorithm */
+       u8 alg_sel; /* NOTE(review): presumably the value for QPCEX_CONG_ALG_SEL - confirm */
+       u8 alg_sub_sel; /* NOTE(review): presumably the value for QPCEX_CONG_ALG_SUB_SEL - confirm */
+       u8 dip_vld; /* NOTE(review): presumably the value for QPCEX_DIP_CTX_IDX_VLD - confirm */
+};
 
 #define V2_QUERY_PF_CAPS_D_CEQ_DEPTH_S 0
 #define V2_QUERY_PF_CAPS_D_CEQ_DEPTH_M GENMASK(21, 0)
@@ -1859,18 +1697,27 @@ struct hns_roce_query_pf_caps_e {
 #define V2_QUERY_PF_CAPS_E_RSV_LKEYS_S 0
 #define V2_QUERY_PF_CAPS_E_RSV_LKEYS_M GENMASK(19, 0)
 
+struct hns_roce_cmq_req { /* payload view used only to address bit-fields via CMQ_REQ_FIELD_LOC() */
+       __le32 data[6]; /* six little-endian words = 192 bits, same size as hns_roce_cmq_desc.data */
+};
+
+#define CMQ_REQ_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_cmq_req, h, l) /* h/l: high and low bit of the field, counted across data[] (users range up to bit 191) */
+
 struct hns_roce_cmq_desc {
        __le16 opcode;
        __le16 flag;
        __le16 retval;
        __le16 rsv;
-       __le32 data[6];
-};
-
-#define HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS       10000
+       union { /* anonymous union: two views of the same 24-byte payload */
+               __le32 data[6]; /* raw words, still reachable as desc->data as before */
+               struct {
+                       __le32 own_func_num;
+                       __le32 own_mac_id;
+                       __le32 rsv[4]; /* pads func_info to the size of data[6] */
+               } func_info; /* NOTE(review): presumably the reply layout of HNS_ROCE_OPC_QUERY_FUNC_INFO - confirm */
+       };
 
-#define HNS_ROCE_HW_RUN_BIT_SHIFT      31
-#define HNS_ROCE_HW_MB_STATUS_MASK     0xFF
+};
 
 struct hns_roce_v2_cmq_ring {
        dma_addr_t desc_dma_addr;
@@ -1932,6 +1779,12 @@ struct hns_roce_eq_context {
        __le32  rsv[5];
 };
 
+struct hns_roce_dip { /* one list entry pairing a GID with an index */
+       u8 dgid[GID_LEN_V2]; /* NOTE(review): presumably the destination GID this entry is keyed by - confirm */
+       u8 dip_idx; /* NOTE(review): presumably the value programmed into QPCEX_DIP_CTX_IDX - confirm */
+       struct list_head node;  /* all dips are on a list */
+};
+
 #define HNS_ROCE_AEQ_DEFAULT_BURST_NUM 0x0
 #define HNS_ROCE_AEQ_DEFAULT_INTERVAL  0x0
 #define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x0
@@ -1966,8 +1819,7 @@ struct hns_roce_eq_context {
 #define HNS_ROCE_V2_ASYNC_EQE_NUM              0x1000
 
 #define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S   0
-#define HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S                1
-#define HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S      2
+#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S                1
 
 #define HNS_ROCE_EQ_DB_CMD_AEQ                 0x0
 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED           0x1
@@ -1982,96 +1834,38 @@ struct hns_roce_eq_context {
 #define HNS_ROCE_INT_NAME_LEN                  32
 #define HNS_ROCE_V2_EQN_M GENMASK(23, 0)
 
-#define HNS_ROCE_V2_CONS_IDX_M GENMASK(23, 0)
-
 #define HNS_ROCE_V2_VF_ABN_INT_EN_S 0
 #define HNS_ROCE_V2_VF_ABN_INT_EN_M GENMASK(0, 0)
 #define HNS_ROCE_V2_VF_ABN_INT_ST_M GENMASK(2, 0)
 #define HNS_ROCE_V2_VF_ABN_INT_CFG_M GENMASK(2, 0)
 #define HNS_ROCE_V2_VF_EVENT_INT_EN_M GENMASK(0, 0)
 
-/* WORD0 */
-#define HNS_ROCE_EQC_EQ_ST_S 0
-#define HNS_ROCE_EQC_EQ_ST_M GENMASK(1, 0)
-
-#define HNS_ROCE_EQC_HOP_NUM_S 2
-#define HNS_ROCE_EQC_HOP_NUM_M GENMASK(3, 2)
-
-#define HNS_ROCE_EQC_OVER_IGNORE_S 4
-#define HNS_ROCE_EQC_OVER_IGNORE_M GENMASK(4, 4)
-
-#define HNS_ROCE_EQC_COALESCE_S 5
-#define HNS_ROCE_EQC_COALESCE_M GENMASK(5, 5)
-
-#define HNS_ROCE_EQC_ARM_ST_S 6
-#define HNS_ROCE_EQC_ARM_ST_M GENMASK(7, 6)
-
-#define HNS_ROCE_EQC_EQN_S 8
-#define HNS_ROCE_EQC_EQN_M GENMASK(15, 8)
-
-#define HNS_ROCE_EQC_EQE_CNT_S 16
-#define HNS_ROCE_EQC_EQE_CNT_M GENMASK(31, 16)
-
-/* WORD1 */
-#define HNS_ROCE_EQC_BA_PG_SZ_S 0
-#define HNS_ROCE_EQC_BA_PG_SZ_M GENMASK(3, 0)
-
-#define HNS_ROCE_EQC_BUF_PG_SZ_S 4
-#define HNS_ROCE_EQC_BUF_PG_SZ_M GENMASK(7, 4)
-
-#define HNS_ROCE_EQC_PROD_INDX_S 8
-#define HNS_ROCE_EQC_PROD_INDX_M GENMASK(31, 8)
-
-/* WORD2 */
-#define HNS_ROCE_EQC_MAX_CNT_S 0
-#define HNS_ROCE_EQC_MAX_CNT_M GENMASK(15, 0)
-
-#define HNS_ROCE_EQC_PERIOD_S 16
-#define HNS_ROCE_EQC_PERIOD_M GENMASK(31, 16)
-
-/* WORD3 */
-#define HNS_ROCE_EQC_REPORT_TIMER_S 0
-#define HNS_ROCE_EQC_REPORT_TIMER_M GENMASK(31, 0)
-
-/* WORD4 */
-#define HNS_ROCE_EQC_EQE_BA_L_S 0
-#define HNS_ROCE_EQC_EQE_BA_L_M GENMASK(31, 0)
-
-/* WORD5 */
-#define HNS_ROCE_EQC_EQE_BA_H_S 0
-#define HNS_ROCE_EQC_EQE_BA_H_M GENMASK(28, 0)
-
-/* WORD6 */
-#define HNS_ROCE_EQC_SHIFT_S 0
-#define HNS_ROCE_EQC_SHIFT_M GENMASK(7, 0)
-
-#define HNS_ROCE_EQC_MSI_INDX_S 8
-#define HNS_ROCE_EQC_MSI_INDX_M GENMASK(15, 8)
-
-#define HNS_ROCE_EQC_CUR_EQE_BA_L_S 16
-#define HNS_ROCE_EQC_CUR_EQE_BA_L_M GENMASK(31, 16)
-
-/* WORD7 */
-#define HNS_ROCE_EQC_CUR_EQE_BA_M_S 0
-#define HNS_ROCE_EQC_CUR_EQE_BA_M_M GENMASK(31, 0)
-
-/* WORD8 */
-#define HNS_ROCE_EQC_CUR_EQE_BA_H_S 0
-#define HNS_ROCE_EQC_CUR_EQE_BA_H_M GENMASK(3, 0)
-
-#define HNS_ROCE_EQC_CONS_INDX_S 8
-#define HNS_ROCE_EQC_CONS_INDX_M GENMASK(31, 8)
-
-/* WORD9 */
-#define HNS_ROCE_EQC_NXT_EQE_BA_L_S 0
-#define HNS_ROCE_EQC_NXT_EQE_BA_L_M GENMASK(31, 0)
-
-/* WORD10 */
-#define HNS_ROCE_EQC_NXT_EQE_BA_H_S 0
-#define HNS_ROCE_EQC_NXT_EQE_BA_H_M GENMASK(19, 0)
-
-#define HNS_ROCE_EQC_EQE_SIZE_S 20
-#define HNS_ROCE_EQC_EQE_SIZE_M GENMASK(21, 20)
+#define EQC_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_eq_context, h, l)
+
+#define EQC_EQ_ST EQC_FIELD_LOC(1, 0)
+#define EQC_EQE_HOP_NUM EQC_FIELD_LOC(3, 2)
+#define EQC_OVER_IGNORE EQC_FIELD_LOC(4, 4)
+#define EQC_COALESCE EQC_FIELD_LOC(5, 5)
+#define EQC_ARM_ST EQC_FIELD_LOC(7, 6)
+#define EQC_EQN EQC_FIELD_LOC(15, 8)
+#define EQC_EQE_CNT EQC_FIELD_LOC(31, 16)
+#define EQC_EQE_BA_PG_SZ EQC_FIELD_LOC(35, 32)
+#define EQC_EQE_BUF_PG_SZ EQC_FIELD_LOC(39, 36)
+#define EQC_EQ_PROD_INDX EQC_FIELD_LOC(63, 40)
+#define EQC_EQ_MAX_CNT EQC_FIELD_LOC(79, 64)
+#define EQC_EQ_PERIOD EQC_FIELD_LOC(95, 80)
+#define EQC_EQE_REPORT_TIMER EQC_FIELD_LOC(127, 96)
+#define EQC_EQE_BA_L EQC_FIELD_LOC(159, 128)
+#define EQC_EQE_BA_H EQC_FIELD_LOC(188, 160)
+#define EQC_SHIFT EQC_FIELD_LOC(199, 192)
+#define EQC_MSI_INDX EQC_FIELD_LOC(207, 200)
+#define EQC_CUR_EQE_BA_L EQC_FIELD_LOC(223, 208)
+#define EQC_CUR_EQE_BA_M EQC_FIELD_LOC(255, 224)
+#define EQC_CUR_EQE_BA_H EQC_FIELD_LOC(259, 256)
+#define EQC_EQ_CONS_INDX EQC_FIELD_LOC(287, 264)
+#define EQC_NEX_EQE_BA_L EQC_FIELD_LOC(319, 288)
+#define EQC_NEX_EQE_BA_H EQC_FIELD_LOC(339, 320)
+#define EQC_EQE_SIZE EQC_FIELD_LOC(341, 340)
 
 #define HNS_ROCE_V2_CEQE_COMP_CQN_S 0
 #define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0)
@@ -2082,14 +1876,14 @@ struct hns_roce_eq_context {
 #define HNS_ROCE_V2_AEQE_SUB_TYPE_S 8
 #define HNS_ROCE_V2_AEQE_SUB_TYPE_M GENMASK(15, 8)
 
-#define HNS_ROCE_V2_EQ_DB_CMD_S        16
-#define HNS_ROCE_V2_EQ_DB_CMD_M        GENMASK(17, 16)
+#define V2_EQ_DB_TAG_S 0
+#define V2_EQ_DB_TAG_M GENMASK(7, 0)
 
-#define HNS_ROCE_V2_EQ_DB_TAG_S        0
-#define HNS_ROCE_V2_EQ_DB_TAG_M        GENMASK(7, 0)
+#define V2_EQ_DB_CMD_S 16
+#define V2_EQ_DB_CMD_M GENMASK(17, 16)
 
-#define HNS_ROCE_V2_EQ_DB_PARA_S 0
-#define HNS_ROCE_V2_EQ_DB_PARA_M GENMASK(23, 0)
+#define V2_EQ_DB_CONS_IDX_S 0
+#define V2_EQ_DB_CONS_IDX_M GENMASK(23, 0)
 
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0)
index c9c0836..6c6e82b 100644 (file)
@@ -42,7 +42,7 @@
 #include "hns_roce_device.h"
 #include "hns_roce_hem.h"
 
-static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr)
+static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, u8 *addr)
 {
        u8 phy_port;
        u32 i;
@@ -63,7 +63,7 @@ static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr)
 static int hns_roce_add_gid(const struct ib_gid_attr *attr, void **context)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
-       u8 port = attr->port_num - 1;
+       u32 port = attr->port_num - 1;
        int ret;
 
        if (port >= hr_dev->caps.num_ports)
@@ -77,7 +77,7 @@ static int hns_roce_add_gid(const struct ib_gid_attr *attr, void **context)
 static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
-       u8 port = attr->port_num - 1;
+       u32 port = attr->port_num - 1;
        int ret;
 
        if (port >= hr_dev->caps.num_ports)
@@ -88,7 +88,7 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
        return ret;
 }
 
-static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port,
+static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port,
                           unsigned long event)
 {
        struct device *dev = hr_dev->dev;
@@ -128,7 +128,7 @@ static int hns_roce_netdev_event(struct notifier_block *self,
        struct hns_roce_ib_iboe *iboe = NULL;
        struct hns_roce_dev *hr_dev = NULL;
        int ret;
-       u8 port;
+       u32 port;
 
        hr_dev = container_of(self, struct hns_roce_dev, iboe.nb);
        iboe = &hr_dev->iboe;
@@ -207,10 +207,13 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
                props->max_fast_reg_page_list_len = HNS_ROCE_FRMR_MAX_PA;
        }
 
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC)
+               props->device_cap_flags |= IB_DEVICE_XRC;
+
        return 0;
 }
 
-static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
+static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
                               struct ib_port_attr *props)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
@@ -218,7 +221,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
        struct net_device *net_dev;
        unsigned long flags;
        enum ib_mtu mtu;
-       u8 port;
+       u32 port;
 
        port = port_num - 1;
 
@@ -258,12 +261,12 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
 }
 
 static enum rdma_link_layer hns_roce_get_link_layer(struct ib_device *device,
-                                                   u8 port_num)
+                                                   u32 port_num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
 
-static int hns_roce_query_pkey(struct ib_device *ib_dev, u8 port, u16 index,
+static int hns_roce_query_pkey(struct ib_device *ib_dev, u32 port, u16 index,
                               u16 *pkey)
 {
        *pkey = PKEY_ID;
@@ -300,12 +303,14 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
                return -EAGAIN;
 
        resp.qp_tab_size = hr_dev->caps.num_qps;
+       resp.srq_tab_size = hr_dev->caps.num_srqs;
 
        ret = hns_roce_uar_alloc(hr_dev, &context->uar);
        if (ret)
                goto error_fail_uar_alloc;
 
-       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
+           hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) {
                INIT_LIST_HEAD(&context->page_list);
                mutex_init(&context->page_mutex);
        }
@@ -365,7 +370,7 @@ static int hns_roce_mmap(struct ib_ucontext *context,
        }
 }
 
-static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num,
+static int hns_roce_port_immutable(struct ib_device *ib_dev, u32 port_num,
                                   struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
@@ -390,6 +395,19 @@ static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
 }
 
+static void hns_roce_get_fw_ver(struct ib_device *device, char *str)
+{
+       u64 fw_ver = to_hr_dev(device)->caps.fw_ver;
+       unsigned int major, minor, sub_minor;
+
+       major = upper_32_bits(fw_ver);
+       minor = high_16_bits(lower_32_bits(fw_ver));
+       sub_minor = low_16_bits(fw_ver);
+
+       snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%04u", major, minor,
+                sub_minor);
+}
+
 static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_ib_iboe *iboe = &hr_dev->iboe;
@@ -405,6 +423,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
        .uverbs_abi_ver = 1,
        .uverbs_no_driver_id_binding = 1,
 
+       .get_dev_fw_str = hns_roce_get_fw_ver,
        .add_gid = hns_roce_add_gid,
        .alloc_pd = hns_roce_alloc_pd,
        .alloc_ucontext = hns_roce_alloc_ucontext,
@@ -461,6 +480,13 @@ static const struct ib_device_ops hns_roce_dev_srq_ops = {
        INIT_RDMA_OBJ_SIZE(ib_srq, hns_roce_srq, ibsrq),
 };
 
+static const struct ib_device_ops hns_roce_dev_xrcd_ops = {
+       .alloc_xrcd = hns_roce_alloc_xrcd,
+       .dealloc_xrcd = hns_roce_dealloc_xrcd,
+
+       INIT_RDMA_OBJ_SIZE(ib_xrcd, hns_roce_xrcd, ibxrcd),
+};
+
 static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 {
        int ret;
@@ -484,20 +510,20 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_REREG_MR)
                ib_set_device_ops(ib_dev, &hns_roce_dev_mr_ops);
 
-       /* MW */
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_MW)
                ib_set_device_ops(ib_dev, &hns_roce_dev_mw_ops);
 
-       /* FRMR */
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR)
                ib_set_device_ops(ib_dev, &hns_roce_dev_frmr_ops);
 
-       /* SRQ */
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
                ib_set_device_ops(ib_dev, &hns_roce_dev_srq_ops);
                ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops);
        }
 
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC)
+               ib_set_device_ops(ib_dev, &hns_roce_dev_xrcd_ops);
+
        ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
        ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
        for (i = 0; i < hr_dev->caps.num_ports; i++) {
@@ -704,7 +730,8 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
        spin_lock_init(&hr_dev->sm_lock);
        spin_lock_init(&hr_dev->bt_cmd_lock);
 
-       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
+           hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) {
                INIT_LIST_HEAD(&hr_dev->pgdir_list);
                mutex_init(&hr_dev->pgdir_mutex);
        }
@@ -727,10 +754,19 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev)
                goto err_uar_alloc_free;
        }
 
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC) {
+               ret = hns_roce_init_xrcd_table(hr_dev);
+               if (ret) {
+                       dev_err(dev, "failed to init xrcd table, ret = %d.\n",
+                               ret);
+                       goto err_pd_table_free;
+               }
+       }
+
        ret = hns_roce_init_mr_table(hr_dev);
        if (ret) {
                dev_err(dev, "Failed to init memory region table.\n");
-               goto err_pd_table_free;
+               goto err_xrcd_table_free;
        }
 
        hns_roce_init_cq_table(hr_dev);
@@ -759,6 +795,10 @@ err_cq_table_free:
        hns_roce_cleanup_cq_table(hr_dev);
        hns_roce_cleanup_mr_table(hr_dev);
 
+err_xrcd_table_free:
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC)
+               hns_roce_cleanup_xrcd_table(hr_dev);
+
 err_pd_table_free:
        hns_roce_cleanup_pd_table(hr_dev);
 
@@ -886,6 +926,8 @@ int hns_roce_init(struct hns_roce_dev *hr_dev)
 
        INIT_LIST_HEAD(&hr_dev->qp_list);
        spin_lock_init(&hr_dev->qp_list_lock);
+       INIT_LIST_HEAD(&hr_dev->dip_list);
+       spin_lock_init(&hr_dev->dip_list_lock);
 
        ret = hns_roce_register_device(hr_dev);
        if (ret)
index cca818d..a5813bf 100644 (file)
@@ -137,3 +137,62 @@ void hns_roce_cleanup_uar_table(struct hns_roce_dev *hr_dev)
 {
        hns_roce_bitmap_cleanup(&hr_dev->uar_table.bitmap);
 }
+
+static int hns_roce_xrcd_alloc(struct hns_roce_dev *hr_dev, u32 *xrcdn)
+{
+       unsigned long obj;
+       int ret;
+
+       ret = hns_roce_bitmap_alloc(&hr_dev->xrcd_bitmap, &obj);
+       if (ret)
+               return ret;
+
+       *xrcdn = obj;
+
+       return 0;
+}
+
+static void hns_roce_xrcd_free(struct hns_roce_dev *hr_dev,
+                              u32 xrcdn)
+{
+       hns_roce_bitmap_free(&hr_dev->xrcd_bitmap, xrcdn, BITMAP_NO_RR);
+}
+
+int hns_roce_init_xrcd_table(struct hns_roce_dev *hr_dev)
+{
+       return hns_roce_bitmap_init(&hr_dev->xrcd_bitmap,
+                                   hr_dev->caps.num_xrcds,
+                                   hr_dev->caps.num_xrcds - 1,
+                                   hr_dev->caps.reserved_xrcds, 0);
+}
+
+void hns_roce_cleanup_xrcd_table(struct hns_roce_dev *hr_dev)
+{
+       hns_roce_bitmap_cleanup(&hr_dev->xrcd_bitmap);
+}
+
+int hns_roce_alloc_xrcd(struct ib_xrcd *ib_xrcd, struct ib_udata *udata)
+{
+       struct hns_roce_dev *hr_dev = to_hr_dev(ib_xrcd->device);
+       struct hns_roce_xrcd *xrcd = to_hr_xrcd(ib_xrcd);
+       int ret;
+
+       if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC))
+               return -EINVAL;
+
+       ret = hns_roce_xrcd_alloc(hr_dev, &xrcd->xrcdn);
+       if (ret) {
+               dev_err(hr_dev->dev, "failed to alloc xrcdn, ret = %d.\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+int hns_roce_dealloc_xrcd(struct ib_xrcd *ib_xrcd, struct ib_udata *udata)
+{
+       hns_roce_xrcd_free(to_hr_dev(ib_xrcd->device),
+                          to_hr_xrcd(ib_xrcd)->xrcdn);
+
+       return 0;
+}
index 004aca9..230a909 100644 (file)
@@ -98,7 +98,9 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
        if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 &&
            (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR ||
             event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR ||
-            event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR)) {
+            event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR ||
+            event_type == HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION ||
+            event_type == HNS_ROCE_EVENT_TYPE_INVALID_XRCETH)) {
                qp->state = IB_QPS_ERR;
                if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
                        init_flush_work(hr_dev, qp);
@@ -142,6 +144,8 @@ static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp,
                        event.event = IB_EVENT_QP_REQ_ERR;
                        break;
                case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+               case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION:
+               case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH:
                        event.event = IB_EVENT_QP_ACCESS_ERR;
                        break;
                default:
@@ -366,8 +370,13 @@ void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
        unsigned long flags;
 
        list_del(&hr_qp->node);
-       list_del(&hr_qp->sq_node);
-       list_del(&hr_qp->rq_node);
+
+       if (hr_qp->ibqp.qp_type != IB_QPT_XRC_TGT)
+               list_del(&hr_qp->sq_node);
+
+       if (hr_qp->ibqp.qp_type != IB_QPT_XRC_INI &&
+           hr_qp->ibqp.qp_type != IB_QPT_XRC_TGT)
+               list_del(&hr_qp->rq_node);
 
        xa_lock_irqsave(xa, flags);
        __xa_erase(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1));
@@ -478,7 +487,9 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap,
                                            hr_qp->rq.max_gs);
 
        hr_qp->rq.wqe_cnt = cnt;
-       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
+       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE &&
+           hr_qp->ibqp.qp_type != IB_QPT_UD &&
+           hr_qp->ibqp.qp_type != IB_QPT_GSI)
                hr_qp->rq_inl_buf.wqe_cnt = cnt;
        else
                hr_qp->rq_inl_buf.wqe_cnt = 0;
@@ -776,7 +787,7 @@ static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev,
                                   struct hns_roce_ib_create_qp_resp *resp,
                                   struct hns_roce_ib_create_qp *ucmd)
 {
-       return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
+       return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) &&
                udata->outlen >= offsetofend(typeof(*resp), cap_flags) &&
                hns_roce_qp_has_sq(init_attr) &&
                udata->inlen >= offsetofend(typeof(*ucmd), sdb_addr));
@@ -787,7 +798,7 @@ static inline bool user_qp_has_rdb(struct hns_roce_dev *hr_dev,
                                   struct ib_udata *udata,
                                   struct hns_roce_ib_create_qp_resp *resp)
 {
-       return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+       return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) &&
                udata->outlen >= offsetofend(typeof(*resp), cap_flags) &&
                hns_roce_qp_has_rq(init_attr));
 }
@@ -795,7 +806,7 @@ static inline bool user_qp_has_rdb(struct hns_roce_dev *hr_dev,
 static inline bool kernel_qp_has_rdb(struct hns_roce_dev *hr_dev,
                                     struct ib_qp_init_attr *init_attr)
 {
-       return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+       return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) &&
                hns_roce_qp_has_rq(init_attr));
 }
 
@@ -840,11 +851,16 @@ static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
                        resp->cap_flags |= HNS_ROCE_QP_CAP_RQ_RECORD_DB;
                }
        } else {
-               /* QP doorbell register address */
-               hr_qp->sq.db_reg_l = hr_dev->reg_base + hr_dev->sdb_offset +
-                                    DB_REG_OFFSET * hr_dev->priv_uar.index;
-               hr_qp->rq.db_reg_l = hr_dev->reg_base + hr_dev->odb_offset +
-                                    DB_REG_OFFSET * hr_dev->priv_uar.index;
+               if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
+                       hr_qp->sq.db_reg = hr_dev->mem_base +
+                                          HNS_ROCE_DWQE_SIZE * hr_qp->qpn;
+               else
+                       hr_qp->sq.db_reg =
+                               hr_dev->reg_base + hr_dev->sdb_offset +
+                               DB_REG_OFFSET * hr_dev->priv_uar.index;
+
+               hr_qp->rq.db_reg = hr_dev->reg_base + hr_dev->odb_offset +
+                                  DB_REG_OFFSET * hr_dev->priv_uar.index;
 
                if (kernel_qp_has_rdb(hr_dev, init_attr)) {
                        ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0);
@@ -1011,36 +1027,36 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                }
        }
 
-       ret = alloc_qp_db(hr_dev, hr_qp, init_attr, udata, &ucmd, &resp);
-       if (ret) {
-               ibdev_err(ibdev, "failed to alloc QP doorbell, ret = %d.\n",
-                         ret);
-               goto err_wrid;
-       }
-
        ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
        if (ret) {
                ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret);
-               goto err_db;
+               goto err_buf;
        }
 
        ret = alloc_qpn(hr_dev, hr_qp);
        if (ret) {
                ibdev_err(ibdev, "failed to alloc QPN, ret = %d.\n", ret);
-               goto err_buf;
+               goto err_qpn;
+       }
+
+       ret = alloc_qp_db(hr_dev, hr_qp, init_attr, udata, &ucmd, &resp);
+       if (ret) {
+               ibdev_err(ibdev, "failed to alloc QP doorbell, ret = %d.\n",
+                         ret);
+               goto err_db;
        }
 
        ret = alloc_qpc(hr_dev, hr_qp);
        if (ret) {
                ibdev_err(ibdev, "failed to alloc QP context, ret = %d.\n",
                          ret);
-               goto err_qpn;
+               goto err_qpc;
        }
 
        ret = hns_roce_qp_store(hr_dev, hr_qp, init_attr);
        if (ret) {
                ibdev_err(ibdev, "failed to store QP, ret = %d.\n", ret);
-               goto err_qpc;
+               goto err_store;
        }
 
        if (udata) {
@@ -1055,7 +1071,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
                ret = hr_dev->hw->qp_flow_control_init(hr_dev, hr_qp);
                if (ret)
-                       goto err_store;
+                       goto err_flow_ctrl;
        }
 
        hr_qp->ibqp.qp_num = hr_qp->qpn;
@@ -1065,17 +1081,17 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 
        return 0;
 
-err_store:
+err_flow_ctrl:
        hns_roce_qp_remove(hr_dev, hr_qp);
-err_qpc:
+err_store:
        free_qpc(hr_dev, hr_qp);
-err_qpn:
+err_qpc:
+       free_qp_db(hr_dev, hr_qp, udata);
+err_db:
        free_qpn(hr_dev, hr_qp);
-err_buf:
+err_qpn:
        free_qp_buf(hr_dev, hr_qp);
-err_db:
-       free_qp_db(hr_dev, hr_qp, udata);
-err_wrid:
+err_buf:
        free_kernel_wrid(hr_qp);
        return ret;
 }
@@ -1100,11 +1116,16 @@ static int check_qp_type(struct hns_roce_dev *hr_dev, enum ib_qp_type type,
                         bool is_user)
 {
        switch (type) {
+       case IB_QPT_XRC_INI:
+       case IB_QPT_XRC_TGT:
+               if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC))
+                       goto out;
+               break;
        case IB_QPT_UD:
                if (hr_dev->pci_dev->revision <= PCI_REVISION_ID_HIP08 &&
                    is_user)
                        goto out;
-               fallthrough;
+               break;
        case IB_QPT_RC:
        case IB_QPT_GSI:
                break;
@@ -1124,8 +1145,8 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
                                 struct ib_qp_init_attr *init_attr,
                                 struct ib_udata *udata)
 {
-       struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
-       struct ib_device *ibdev = &hr_dev->ib_dev;
+       struct ib_device *ibdev = pd ? pd->device : init_attr->xrcd->device;
+       struct hns_roce_dev *hr_dev = to_hr_dev(ibdev);
        struct hns_roce_qp *hr_qp;
        int ret;
 
@@ -1137,6 +1158,15 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
        if (!hr_qp)
                return ERR_PTR(-ENOMEM);
 
+       if (init_attr->qp_type == IB_QPT_XRC_INI)
+               init_attr->recv_cq = NULL;
+
+       if (init_attr->qp_type == IB_QPT_XRC_TGT) {
+               hr_qp->xrcdn = to_hr_xrcd(init_attr->xrcd)->xrcdn;
+               init_attr->recv_cq = NULL;
+               init_attr->send_cq = NULL;
+       }
+
        if (init_attr->qp_type == IB_QPT_GSI) {
                hr_qp->port = init_attr->port_num - 1;
                hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
@@ -1156,20 +1186,18 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 
 int to_hr_qp_type(int qp_type)
 {
-       int transport_type;
-
-       if (qp_type == IB_QPT_RC)
-               transport_type = SERV_TYPE_RC;
-       else if (qp_type == IB_QPT_UC)
-               transport_type = SERV_TYPE_UC;
-       else if (qp_type == IB_QPT_UD)
-               transport_type = SERV_TYPE_UD;
-       else if (qp_type == IB_QPT_GSI)
-               transport_type = SERV_TYPE_UD;
-       else
-               transport_type = -1;
-
-       return transport_type;
+       switch (qp_type) {
+       case IB_QPT_RC:
+               return SERV_TYPE_RC;
+       case IB_QPT_UD:
+       case IB_QPT_GSI:
+               return SERV_TYPE_UD;
+       case IB_QPT_XRC_INI:
+       case IB_QPT_XRC_TGT:
+               return SERV_TYPE_XRC;
+       default:
+               return -1;
+       }
 }
 
 static int check_mtu_validate(struct hns_roce_dev *hr_dev,
index d5a6de0..546d182 100644 (file)
@@ -314,6 +314,9 @@ static void set_srq_ext_param(struct hns_roce_srq *srq,
 {
        srq->cqn = ib_srq_has_cq(init_attr->srq_type) ?
                   to_hr_cq(init_attr->ext.cq)->cqn : 0;
+
+       srq->xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
+                    to_hr_xrcd(init_attr->ext.xrc.xrcd)->xrcdn : 0;
 }
 
 static int set_srq_param(struct hns_roce_srq *srq,
@@ -412,7 +415,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
                }
        }
 
-       srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG;
+       srq->db_reg = hr_dev->reg_base + SRQ_DB_REG;
        srq->event = hns_roce_ib_srq_event;
        atomic_set(&srq->refcount, 1);
        init_completion(&srq->free);
index 6a79502..be4094a 100644 (file)
@@ -504,15 +504,6 @@ static inline void i40iw_free_resource(struct i40iw_device *iwdev,
        spin_unlock_irqrestore(&iwdev->resource_lock, flags);
 }
 
-/**
- * to_iwhdl - Get the handler from the device pointer
- * @iwdev: device pointer
- **/
-static inline struct i40iw_handler *to_iwhdl(struct i40iw_device *iw_dev)
-{
-       return container_of(iw_dev, struct i40iw_handler, device);
-}
-
 struct i40iw_handler *i40iw_find_netdev(struct net_device *netdev);
 
 /**
index ac65c82..2450b7d 100644 (file)
@@ -905,7 +905,7 @@ static int i40iw_send_mpa_reject(struct i40iw_cm_node *cm_node,
 }
 
 /**
- * recv_mpa - process an IETF MPA frame
+ * i40iw_parse_mpa - process an IETF MPA frame
  * @cm_node: connection's node
  * @buffer: Data pointer
  * @type: to return accept or reject
@@ -4360,7 +4360,7 @@ void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr,
 }
 
 /**
- * i40iw_ifdown_notify - process an ifdown on an interface
+ * i40iw_if_notify - process an ifdown on an interface
  * @iwdev: device pointer
  * @netdev: network interface device structure
  * @ipaddr: Pointer to IPv4 or IPv6 address
index 8bd72af..b44bfc1 100644 (file)
@@ -285,7 +285,7 @@ static enum i40iw_status_code i40iw_hmc_finish_add_sd_reg(struct i40iw_sc_dev *d
 }
 
 /**
- * i40iw_create_iw_hmc_obj - allocate backing store for hmc objects
+ * i40iw_sc_create_hmc_obj - allocate backing store for hmc objects
  * @dev: pointer to the device structure
  * @info: pointer to i40iw_hmc_iw_create_obj_info struct
  *
@@ -434,7 +434,7 @@ static enum i40iw_status_code i40iw_finish_del_sd_reg(struct i40iw_sc_dev *dev,
 }
 
 /**
- * i40iw_del_iw_hmc_obj - remove pe hmc objects
+ * i40iw_sc_del_hmc_obj - remove pe hmc objects
  * @dev: pointer to the device structure
  * @info: pointer to i40iw_hmc_del_obj_info struct
  * @reset: true if called before reset
index ab4cb11..b496f30 100644 (file)
@@ -78,7 +78,7 @@ static struct i40e_client i40iw_client;
 static char i40iw_client_name[I40E_CLIENT_STR_LENGTH] = "i40iw";
 
 static LIST_HEAD(i40iw_handlers);
-static spinlock_t i40iw_handler_lock;
+static DEFINE_SPINLOCK(i40iw_handler_lock);
 
 static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev,
                                                  u32 vf_id, u8 *msg, u16 len);
@@ -251,7 +251,7 @@ static void i40iw_destroy_cqp(struct i40iw_device *iwdev, bool free_hwcqp)
 }
 
 /**
- * i40iw_disable_irqs - disable device interrupts
+ * i40iw_disable_irq - disable device interrupts
  * @dev: hardware control device structure
  * @msix_vec: msix vector to disable irq
  * @dev_id: parameter to pass to free_irq (used during irq setup)
@@ -2043,7 +2043,6 @@ static int __init i40iw_init_module(void)
        i40iw_client.ops = &i40e_ops;
        memcpy(i40iw_client.name, i40iw_client_name, I40E_CLIENT_STR_LENGTH);
        i40iw_client.type = I40E_CLIENT_IWARP;
-       spin_lock_init(&i40iw_handler_lock);
        ret = i40e_register_client(&i40iw_client);
        i40iw_register_notifiers();
 
index d474aad..d938ccb 100644 (file)
@@ -50,17 +50,6 @@ static inline void set_64bit_val(u64 *wqe_words, u32 byte_index, u64 value)
        wqe_words[byte_index >> 3] = value;
 }
 
-/**
- * set_32bit_val - set 32 value to hw wqe
- * @wqe_words: wqe addr to write
- * @byte_index: index in wqe
- * @value: value to write
- **/
-static inline void set_32bit_val(u32 *wqe_words, u32 byte_index, u32 value)
-{
-       wqe_words[byte_index >> 2] = value;
-}
-
 /**
  * get_64bit_val - read 64 bit value from wqe
  * @wqe_words: wqe addr
@@ -72,17 +61,6 @@ static inline void get_64bit_val(u64 *wqe_words, u32 byte_index, u64 *value)
        *value = wqe_words[byte_index >> 3];
 }
 
-/**
- * get_32bit_val - read 32 bit value from wqe
- * @wqe_words: wqe addr
- * @byte_index: index to reaad from
- * @value: return 32 bit value
- **/
-static inline void get_32bit_val(u32 *wqe_words, u32 byte_index, u32 *value)
-{
-       *value = wqe_words[byte_index >> 2];
-}
-
 struct i40iw_dma_mem {
        void *va;
        dma_addr_t pa;
index 53e5cd1..146a414 100644 (file)
@@ -393,12 +393,9 @@ static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
        i40iw_debug(dev, I40IW_DEBUG_PBLE, "next_fpm_addr = %llx chunk_size[%u] = 0x%x\n",
                    pble_rsrc->next_fpm_addr, chunk->size, chunk->size);
        pble_rsrc->unallocated_pble -= (chunk->size >> 3);
-       list_add(&chunk->list, &pble_rsrc->pinfo.clist);
        sd_reg_val = (sd_entry_type == I40IW_SD_TYPE_PAGED) ?
                        sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa;
-       if (sd_entry->valid)
-               return 0;
-       if (dev->is_pf) {
+       if (dev->is_pf && !sd_entry->valid) {
                ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id,
                                            sd_reg_val, idx->sd_idx,
                                            sd_entry->entry_type, true);
@@ -409,6 +406,7 @@ static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
        }
 
        sd_entry->valid = true;
+       list_add(&chunk->list, &pble_rsrc->pinfo.clist);
        return 0;
  error:
        kfree(chunk);
index d1c8cc0..88fb68e 100644 (file)
@@ -1000,7 +1000,7 @@ static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx)
 }
 
 /**
- * i40iw_ieq_get_fpdu - given length return fpdu length
+ * i40iw_ieq_get_fpdu_length - given length return fpdu length
  * @length: length if fpdu
  */
 static u16 i40iw_ieq_get_fpdu_length(u16 length)
index 76f052b..9ff825f 100644 (file)
@@ -890,7 +890,7 @@ void i40iw_terminate_done(struct i40iw_sc_qp *qp, int timeout_occurred)
 }
 
 /**
- * i40iw_terminate_imeout - timeout happened
+ * i40iw_terminate_timeout - timeout happened
  * @t: points to iwarp qp
  */
 static void i40iw_terminate_timeout(struct timer_list *t)
index f18d146..b876d72 100644 (file)
@@ -94,7 +94,7 @@ static int i40iw_query_device(struct ib_device *ibdev,
  * @props: returning device attributes
  */
 static int i40iw_query_port(struct ib_device *ibdev,
-                           u8 port,
+                           u32 port,
                            struct ib_port_attr *props)
 {
        props->lid = 1;
@@ -647,7 +647,7 @@ error:
 }
 
 /**
- * i40iw_query - query qp attributes
+ * i40iw_query_qp - query qp attributes
  * @ibqp: qp pointer
  * @attr: attributes pointer
  * @attr_mask: Not used
@@ -1846,7 +1846,7 @@ static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc)
 }
 
 /**
- * i40iw_del_mem_list - Deleting pbl list entries for CQ/QP
+ * i40iw_del_memlist - Deleting pbl list entries for CQ/QP
  * @iwmr: iwmr for IB's user page addresses
  * @ucontext: ptr to user context
  */
@@ -2347,7 +2347,7 @@ static int i40iw_req_notify_cq(struct ib_cq *ibcq,
  * @port_num: port number
  * @immutable: immutable data for the port return
  */
-static int i40iw_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int i40iw_port_immutable(struct ib_device *ibdev, u32 port_num,
                                struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
@@ -2446,7 +2446,7 @@ static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str)
  * @port_num: port number
  */
 static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev,
-                                                 u8 port_num)
+                                                 u32 port_num)
 {
        struct i40iw_device *iwdev = to_iwdev(ibdev);
        struct i40iw_sc_dev *dev = &iwdev->sc_dev;
@@ -2477,7 +2477,7 @@ static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev,
  */
 static int i40iw_get_hw_stats(struct ib_device *ibdev,
                              struct rdma_hw_stats *stats,
-                             u8 port_num, int index)
+                             u32 port_num, int index)
 {
        struct i40iw_device *iwdev = to_iwdev(ibdev);
        struct i40iw_sc_dev *dev = &iwdev->sc_dev;
@@ -2504,7 +2504,7 @@ static int i40iw_get_hw_stats(struct ib_device *ibdev,
  * @gid: Global ID
  */
 static int i40iw_query_gid(struct ib_device *ibdev,
-                          u8 port,
+                          u32 port,
                           int index,
                           union ib_gid *gid)
 {
index aca9061..e34a152 100644 (file)
@@ -333,7 +333,7 @@ static void pf_cqp_get_hmc_fcn_callback(struct i40iw_sc_dev *dev, void *callback
 }
 
 /**
- * pf_add_hmc_obj - Callback for Add HMC Object
+ * pf_add_hmc_obj_callback - Callback for Add HMC Object
  * @work_vf_dev: pointer to the VF Device
  */
 static void pf_add_hmc_obj_callback(void *work_vf_dev)
index cca414e..571d9c5 100644 (file)
@@ -73,12 +73,12 @@ static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
                                     int *resched_delay_sec);
 
 void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
-                                        u8 port_num, u8 *p_data)
+                                        u32 port_num, u8 *p_data)
 {
        int i;
        u64 guid_indexes;
        int slave_id;
-       int port_index = port_num - 1;
+       u32 port_index = port_num - 1;
 
        if (!mlx4_is_master(dev->dev))
                return;
@@ -86,7 +86,7 @@ void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
        guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
                                   ports_guid[port_num - 1].
                                   all_rec_per_port[block_num].guid_indexes);
-       pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+       pr_debug("port: %u, guid_indexes: 0x%llx\n", port_num, guid_indexes);
 
        for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
                /* The location of the specific index starts from bit number 4
@@ -184,7 +184,7 @@ unlock:
  * port_number - 1 or 2
  */
 void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
-                                         int block_num, u8 port_num,
+                                         int block_num, u32 port_num,
                                          u8 *p_data)
 {
        int i;
@@ -206,7 +206,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
        guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
                                   ports_guid[port_num - 1].
                                   all_rec_per_port[block_num].guid_indexes);
-       pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+       pr_debug("port: %u, guid_indexes: 0x%llx\n", port_num, guid_indexes);
 
        /*calculate the slaves and notify them*/
        for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
@@ -260,11 +260,11 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
                        new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
                                                                  MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
                                                                  &gen_event);
-                       pr_debug("slave: %d, port: %d prev_port_state: %d,"
+                       pr_debug("slave: %d, port: %u prev_port_state: %d,"
                                 " new_port_state: %d, gen_event: %d\n",
                                 slave_id, port_num, prev_state, new_state, gen_event);
                        if (gen_event == SLAVE_PORT_GEN_EVENT_UP) {
-                               pr_debug("sending PORT_UP event to slave: %d, port: %d\n",
+                               pr_debug("sending PORT_UP event to slave: %d, port: %u\n",
                                         slave_id, port_num);
                                mlx4_gen_port_state_change_eqe(dev->dev, slave_id,
                                                               port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
@@ -274,7 +274,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
                                                      MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
                                                      &gen_event);
                        if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) {
-                               pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+                               pr_debug("sending PORT DOWN event to slave: %d, port: %u\n",
                                         slave_id, port_num);
                                mlx4_gen_port_state_change_eqe(dev->dev,
                                                               slave_id,
index f3ace85..d13ecbd 100644 (file)
@@ -88,8 +88,8 @@ struct mlx4_rcv_tunnel_mad {
        struct ib_mad mad;
 } __packed;
 
-static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
-static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u32 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u32 port_num);
 static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
                                int block, u32 change_bitmap);
 
@@ -186,7 +186,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
        return err;
 }
 
-static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+static void update_sm_ah(struct mlx4_ib_dev *dev, u32 port_num, u16 lid, u8 sl)
 {
        struct ib_ah *new_ah;
        struct rdma_ah_attr ah_attr;
@@ -217,8 +217,8 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
  * Snoop SM MADs for port info, GUID info, and  P_Key table sets, so we can
  * synthesize LID change, Client-Rereg, GID change, and P_Key change events.
  */
-static void smp_snoop(struct ib_device *ibdev, u8 port_num, const struct ib_mad *mad,
-                     u16 prev_lid)
+static void smp_snoop(struct ib_device *ibdev, u32 port_num,
+                     const struct ib_mad *mad, u16 prev_lid)
 {
        struct ib_port_info *pinfo;
        u16 lid;
@@ -274,7 +274,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, const struct ib_mad
                                                be16_to_cpu(base[i]);
                                }
                        }
-                       pr_debug("PKEY Change event: port=%d, "
+                       pr_debug("PKEY Change event: port=%u, "
                                 "block=0x%x, change_bitmap=0x%x\n",
                                 port_num, bn, pkey_change_bitmap);
 
@@ -380,7 +380,8 @@ static void node_desc_override(struct ib_device *dev,
        }
 }
 
-static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, const struct ib_mad *mad)
+static void forward_trap(struct mlx4_ib_dev *dev, u32 port_num,
+                        const struct ib_mad *mad)
 {
        int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
        struct ib_mad_send_buf *send_buf;
@@ -429,7 +430,7 @@ static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave
        return ret;
 }
 
-int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u32 port, __be64 guid)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibdev);
        int i;
@@ -443,7 +444,7 @@ int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
 
 
 static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave,
-                                  u8 port, u16 pkey, u16 *ix)
+                                  u32 port, u16 pkey, u16 *ix)
 {
        int i, ret;
        u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF;
@@ -507,7 +508,7 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
        return (qpn >= proxy_start && qpn <= proxy_start + 1);
 }
 
-int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u32 port,
                          enum ib_qp_type dest_qpt, struct ib_wc *wc,
                          struct ib_grh *grh, struct ib_mad *mad)
 {
@@ -678,7 +679,7 @@ end:
        return ret;
 }
 
-static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
+static int mlx4_ib_demux_mad(struct ib_device *ibdev, u32 port,
                        struct ib_wc *wc, struct ib_grh *grh,
                        struct ib_mad *mad)
 {
@@ -818,7 +819,7 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
        return 0;
 }
 
-static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                        const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                        const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
@@ -932,9 +933,10 @@ static int iboe_process_mad_port_info(void *out_mad)
        return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
 }
 
-static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                       const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                       const struct ib_mad *in_mad, struct ib_mad *out_mad)
+static int iboe_process_mad(struct ib_device *ibdev, int mad_flags,
+                           u32 port_num, const struct ib_wc *in_wc,
+                           const struct ib_grh *in_grh,
+                           const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
        struct mlx4_counter counter_stats;
        struct mlx4_ib_dev *dev = to_mdev(ibdev);
@@ -979,7 +981,7 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
        return err;
 }
 
-int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                        const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                        const struct ib_mad *in, struct ib_mad *out,
                        size_t *out_mad_size, u16 *out_mad_pkey_index)
@@ -1073,7 +1075,7 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
        }
 }
 
-static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num)
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u32 port_num)
 {
        mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE);
 
@@ -1082,7 +1084,7 @@ static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num)
                                            MLX4_EQ_PORT_INFO_LID_CHANGE_MASK);
 }
 
-static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u32 port_num)
 {
        /* re-configure the alias-guid and mcg's */
        if (mlx4_is_master(dev->dev)) {
@@ -1121,7 +1123,7 @@ static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
                            GET_MASK_FROM_EQE(eqe));
 }
 
-static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num,
+static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u32 port_num,
                                      u32 guid_tbl_blk_num, u32 change_bitmap)
 {
        struct ib_smp *in_mad  = NULL;
@@ -1177,7 +1179,7 @@ void handle_port_mgmt_change_event(struct work_struct *work)
        struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
        struct mlx4_ib_dev *dev = ew->ib_dev;
        struct mlx4_eqe *eqe = &(ew->ib_eqe);
-       u8 port = eqe->event.port_mgmt_change.port;
+       u32 port = eqe->event.port_mgmt_change.port;
        u32 changed_attr;
        u32 tbl_block;
        u32 change_bitmap;
@@ -1274,7 +1276,7 @@ void handle_port_mgmt_change_event(struct work_struct *work)
        kfree(ew);
 }
 
-void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u32 port_num,
                            enum ib_event_type type)
 {
        struct ib_event event;
@@ -1351,7 +1353,7 @@ static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
        return ret;
 }
 
-int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u32 port,
                         enum ib_qp_type dest_qpt, u16 pkey_index,
                         u32 remote_qpn, u32 qkey, struct rdma_ah_attr *attr,
                         u8 *s_mac, u16 vlan_id, struct ib_mad *mad)
index f26a0d9..22898d9 100644 (file)
@@ -81,7 +81,7 @@ static const char mlx4_ib_version[] =
 
 static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
 static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device,
-                                                   u8 port_num);
+                                                   u32 port_num);
 
 static struct workqueue_struct *wq;
 
@@ -129,7 +129,8 @@ static int num_ib_ports(struct mlx4_dev *dev)
        return ib_ports;
 }
 
-static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
+static struct net_device *mlx4_ib_get_netdev(struct ib_device *device,
+                                            u32 port_num)
 {
        struct mlx4_ib_dev *ibdev = to_mdev(device);
        struct net_device *dev;
@@ -160,7 +161,7 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_n
 
 static int mlx4_ib_update_gids_v1(struct gid_entry *gids,
                                  struct mlx4_ib_dev *ibdev,
-                                 u8 port_num)
+                                 u32 port_num)
 {
        struct mlx4_cmd_mailbox *mailbox;
        int err;
@@ -193,7 +194,7 @@ static int mlx4_ib_update_gids_v1(struct gid_entry *gids,
 
 static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids,
                                     struct mlx4_ib_dev *ibdev,
-                                    u8 port_num)
+                                    u32 port_num)
 {
        struct mlx4_cmd_mailbox *mailbox;
        int err;
@@ -238,7 +239,7 @@ static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids,
 
 static int mlx4_ib_update_gids(struct gid_entry *gids,
                               struct mlx4_ib_dev *ibdev,
-                              u8 port_num)
+                              u32 port_num)
 {
        if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
                return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num);
@@ -407,7 +408,7 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
        int real_index = -EINVAL;
        int i;
        unsigned long flags;
-       u8 port_num = attr->port_num;
+       u32 port_num = attr->port_num;
 
        if (port_num > MLX4_MAX_PORTS)
                return -EINVAL;
@@ -649,7 +650,7 @@ out:
 }
 
 static enum rdma_link_layer
-mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
+mlx4_ib_port_link_layer(struct ib_device *device, u32 port_num)
 {
        struct mlx4_dev *dev = to_mdev(device)->dev;
 
@@ -657,7 +658,7 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
                IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
 }
 
-static int ib_link_query_port(struct ib_device *ibdev, u8 port,
+static int ib_link_query_port(struct ib_device *ibdev, u32 port,
                              struct ib_port_attr *props, int netw_view)
 {
        struct ib_smp *in_mad  = NULL;
@@ -753,7 +754,7 @@ static u8 state_to_phys_state(enum ib_port_state state)
                IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
 }
 
-static int eth_link_query_port(struct ib_device *ibdev, u8 port,
+static int eth_link_query_port(struct ib_device *ibdev, u32 port,
                               struct ib_port_attr *props)
 {
 
@@ -814,7 +815,7 @@ out:
        return err;
 }
 
-int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+int __mlx4_ib_query_port(struct ib_device *ibdev, u32 port,
                         struct ib_port_attr *props, int netw_view)
 {
        int err;
@@ -828,14 +829,14 @@ int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
        return err;
 }
 
-static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+static int mlx4_ib_query_port(struct ib_device *ibdev, u32 port,
                              struct ib_port_attr *props)
 {
        /* returns host view */
        return __mlx4_ib_query_port(ibdev, port, props, 0);
 }
 
-int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
                        union ib_gid *gid, int netw_view)
 {
        struct ib_smp *in_mad  = NULL;
@@ -891,7 +892,7 @@ out:
        return err;
 }
 
-static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
                             union ib_gid *gid)
 {
        if (rdma_protocol_ib(ibdev, port))
@@ -899,7 +900,8 @@ static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
        return 0;
 }
 
-static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u8 port, u64 *sl2vl_tbl)
+static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u32 port,
+                              u64 *sl2vl_tbl)
 {
        union sl2vl_tbl_to_u64 sl2vl64;
        struct ib_smp *in_mad  = NULL;
@@ -959,7 +961,7 @@ static void mlx4_init_sl2vl_tbl(struct mlx4_ib_dev *mdev)
        }
 }
 
-int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                         u16 *pkey, int netw_view)
 {
        struct ib_smp *in_mad  = NULL;
@@ -992,7 +994,8 @@ out:
        return err;
 }
 
-static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
+                             u16 *pkey)
 {
        return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0);
 }
@@ -1033,8 +1036,8 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
        return 0;
 }
 
-static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
-                           u32 cap_mask)
+static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u32 port,
+                           int reset_qkey_viols, u32 cap_mask)
 {
        struct mlx4_cmd_mailbox *mailbox;
        int err;
@@ -1059,7 +1062,7 @@ static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_vio
        return err;
 }
 
-static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u32 port, int mask,
                               struct ib_port_modify *props)
 {
        struct mlx4_ib_dev *mdev = to_mdev(ibdev);
@@ -2103,7 +2106,7 @@ static const struct diag_counter diag_device_only[] = {
 };
 
 static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev,
-                                                   u8 port_num)
+                                                   u32 port_num)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibdev);
        struct mlx4_ib_diag_counters *diag = dev->diag_counters;
@@ -2118,7 +2121,7 @@ static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev,
 
 static int mlx4_ib_get_hw_stats(struct ib_device *ibdev,
                                struct rdma_hw_stats *stats,
-                               u8 port, int index)
+                               u32 port, int index)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibdev);
        struct mlx4_ib_diag_counters *diag = dev->diag_counters;
@@ -2466,7 +2469,7 @@ static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
        ibdev->eq_table = NULL;
 }
 
-static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int mlx4_port_immutable(struct ib_device *ibdev, u32 port_num,
                               struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
index 78c9bb7..e856cf2 100644 (file)
@@ -429,7 +429,7 @@ struct mlx4_sriov_alias_guid_port_rec_det {
        struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT];
        struct workqueue_struct *wq;
        struct delayed_work alias_guid_work;
-       u8 port;
+       u32 port;
        u32 state_flags;
        struct mlx4_sriov_alias_guid *parent;
        struct list_head cb_list;
@@ -657,7 +657,7 @@ struct mlx4_ib_qp_tunnel_init_attr {
        struct ib_qp_init_attr init_attr;
        int slave;
        enum ib_qp_type proxy_qp_type;
-       u8 port;
+       u32 port;
 };
 
 struct mlx4_uverbs_ex_query_device {
@@ -810,24 +810,24 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
                 int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                 const void *in_mad, void *response_mad);
-int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                        const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                        const struct ib_mad *in, struct ib_mad *out,
                        size_t *out_mad_size, u16 *out_mad_pkey_index);
 int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
 void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
 
-int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+int __mlx4_ib_query_port(struct ib_device *ibdev, u32 port,
                         struct ib_port_attr *props, int netw_view);
-int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                         u16 *pkey, int netw_view);
 
-int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
                        union ib_gid *gid, int netw_view);
 
 static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
 {
-       u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
+       u32 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
 
        if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET)
                return true;
@@ -841,7 +841,7 @@ void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
 int mlx4_ib_mcg_init(void);
 void mlx4_ib_mcg_destroy(void);
 
-int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid);
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u32 port, __be64 guid);
 
 int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
                                  struct ib_sa_mad *sa_mad);
@@ -851,16 +851,16 @@ int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
 int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
                   union ib_gid *gid);
 
-void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u32 port_num,
                            enum ib_event_type type);
 
 void mlx4_ib_tunnels_update_work(struct work_struct *work);
 
-int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u32 port,
                          enum ib_qp_type qpt, struct ib_wc *wc,
                          struct ib_grh *grh, struct ib_mad *mad);
 
-int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u32 port,
                         enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
                         u32 qkey, struct rdma_ah_attr *attr, u8 *s_mac,
                         u16 vlan_id, struct ib_mad *mad);
@@ -884,10 +884,10 @@ void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port);
 
 void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
                                          int block_num,
-                                         u8 port_num, u8 *p_data);
+                                         u32 port_num, u8 *p_data);
 
 void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev,
-                                        int block_num, u8 port_num,
+                                        int block_num, u32 port_num,
                                         u8 *p_data);
 
 int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
index 651785b..92ddbcc 100644 (file)
@@ -3135,7 +3135,6 @@ static int build_mlx_header(struct mlx4_ib_qp *qp, const struct ib_ud_wr *wr,
        }
 
        if (is_eth) {
-               struct in6_addr in6;
                u16 ether_type;
                u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
 
@@ -3148,8 +3147,6 @@ static int build_mlx_header(struct mlx4_ib_qp *qp, const struct ib_ud_wr *wr,
                memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
                memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
                memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
-               memcpy(&in6, sgid.raw, sizeof(in6));
-
 
                if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
                        mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
index b4c009b..f433801 100644 (file)
@@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \
             cong.o \
             counters.o \
             cq.o \
+            dm.o \
             doorbell.o \
             gsi.o \
             ib_virt.o \
index 234f299..a8db8a0 100644 (file)
@@ -47,107 +47,6 @@ int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
        return mlx5_cmd_exec_inout(dev, query_cong_params, in, out);
 }
 
-int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
-                        u64 length, u32 alignment)
-{
-       struct mlx5_core_dev *dev = dm->dev;
-       u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size)
-                                       >> PAGE_SHIFT;
-       u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
-       u32 max_alignment = MLX5_CAP_DEV_MEM(dev, log_max_memic_addr_alignment);
-       u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
-       u32 out[MLX5_ST_SZ_DW(alloc_memic_out)] = {};
-       u32 in[MLX5_ST_SZ_DW(alloc_memic_in)] = {};
-       u32 mlx5_alignment;
-       u64 page_idx = 0;
-       int ret = 0;
-
-       if (!length || (length & MLX5_MEMIC_ALLOC_SIZE_MASK))
-               return -EINVAL;
-
-       /* mlx5 device sets alignment as 64*2^driver_value
-        * so normalizing is needed.
-        */
-       mlx5_alignment = (alignment < MLX5_MEMIC_BASE_ALIGN) ? 0 :
-                        alignment - MLX5_MEMIC_BASE_ALIGN;
-       if (mlx5_alignment > max_alignment)
-               return -EINVAL;
-
-       MLX5_SET(alloc_memic_in, in, opcode, MLX5_CMD_OP_ALLOC_MEMIC);
-       MLX5_SET(alloc_memic_in, in, range_size, num_pages * PAGE_SIZE);
-       MLX5_SET(alloc_memic_in, in, memic_size, length);
-       MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment,
-                mlx5_alignment);
-
-       while (page_idx < num_memic_hw_pages) {
-               spin_lock(&dm->lock);
-               page_idx = bitmap_find_next_zero_area(dm->memic_alloc_pages,
-                                                     num_memic_hw_pages,
-                                                     page_idx,
-                                                     num_pages, 0);
-
-               if (page_idx < num_memic_hw_pages)
-                       bitmap_set(dm->memic_alloc_pages,
-                                  page_idx, num_pages);
-
-               spin_unlock(&dm->lock);
-
-               if (page_idx >= num_memic_hw_pages)
-                       break;
-
-               MLX5_SET64(alloc_memic_in, in, range_start_addr,
-                          hw_start_addr + (page_idx * PAGE_SIZE));
-
-               ret = mlx5_cmd_exec_inout(dev, alloc_memic, in, out);
-               if (ret) {
-                       spin_lock(&dm->lock);
-                       bitmap_clear(dm->memic_alloc_pages,
-                                    page_idx, num_pages);
-                       spin_unlock(&dm->lock);
-
-                       if (ret == -EAGAIN) {
-                               page_idx++;
-                               continue;
-                       }
-
-                       return ret;
-               }
-
-               *addr = dev->bar_addr +
-                       MLX5_GET64(alloc_memic_out, out, memic_start_addr);
-
-               return 0;
-       }
-
-       return -ENOMEM;
-}
-
-void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
-{
-       struct mlx5_core_dev *dev = dm->dev;
-       u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
-       u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
-       u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {};
-       u64 start_page_idx;
-       int err;
-
-       addr -= dev->bar_addr;
-       start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT;
-
-       MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC);
-       MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr);
-       MLX5_SET(dealloc_memic_in, in, memic_size, length);
-
-       err =  mlx5_cmd_exec_in(dev, dealloc_memic, in);
-       if (err)
-               return;
-
-       spin_lock(&dm->lock);
-       bitmap_clear(dm->memic_alloc_pages,
-                    start_page_idx, num_pages);
-       spin_unlock(&dm->lock);
-}
-
 void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid)
 {
        u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {};
index 88ea6ef..66c9629 100644 (file)
@@ -41,9 +41,6 @@ int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey);
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
 int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
                               void *out);
-int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
-                        u64 length, u32 alignment);
-void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
 int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid);
 void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid);
 void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid);
index b9291e4..0b61df5 100644 (file)
@@ -267,7 +267,7 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset,
        }
 }
 
-static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num,
+static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u32 port_num,
                                 int offset, u32 *var)
 {
        int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
@@ -304,7 +304,7 @@ alloc_err:
        return err;
 }
 
-static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num,
+static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 port_num,
                                 int offset, u32 var)
 {
        int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
@@ -397,7 +397,7 @@ static const struct file_operations dbg_cc_fops = {
        .read   = get_param,
 };
 
-void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
+void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num)
 {
        if (!mlx5_debugfs_root ||
            !dev->port[port_num].dbg_cc_params ||
@@ -409,7 +409,7 @@ void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
        dev->port[port_num].dbg_cc_params = NULL;
 }
 
-void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
+void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num)
 {
        struct mlx5_ib_dbg_cc_params *dbg_cc_params;
        struct mlx5_core_dev *mdev;
index 084652e..e365341 100644 (file)
@@ -139,7 +139,7 @@ static int mlx5_ib_create_counters(struct ib_counters *counters,
 
 
 static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
-                                                  u8 port_num)
+                                                  u32 port_num)
 {
        return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
                                                   &dev->port[port_num].cnts;
@@ -154,7 +154,7 @@ static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
  * device port combination in switchdev and non switchdev mode of the
  * parent device.
  */
-u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num)
 {
        const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
 
@@ -162,7 +162,7 @@ u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
 }
 
 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
-                                                   u8 port_num)
+                                                   u32 port_num)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        const struct mlx5_ib_counters *cnts;
@@ -236,13 +236,13 @@ free:
 
 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
                                struct rdma_hw_stats *stats,
-                               u8 port_num, int index)
+                               u32 port_num, int index)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
        struct mlx5_core_dev *mdev;
        int ret, num_counters;
-       u8 mdev_port_num;
+       u32 mdev_port_num;
 
        if (!stats)
                return -EINVAL;
index 1aa30c2..6bcaaa5 100644 (file)
@@ -13,5 +13,5 @@ void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev);
 void mlx5_ib_counters_clear_description(struct ib_counters *counters);
 int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
                                   struct mlx5_ib_create_flow *ucmd);
-u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num);
 #endif /* _MLX5_IB_COUNTERS_H */
index 07b8350..a0b677a 100644 (file)
@@ -2185,27 +2185,69 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
        return 0;
 }
 
+/*
+ * Choose the page size used to describe a DEVX umem to the device.
+ *
+ * @pgsz_bitmap is first restricted to page sizes between
+ * MLX5_ADAPTER_PAGE_SHIFT and max(order_base_2(umem->length), PAGE_SHIFT).
+ * The best candidate reported by ib_umem_find_best_pgoff() is then halved
+ * for as long as it is larger than PAGE_SIZE and either the umem DMA offset
+ * within that page size is non-zero or the umem length is not a multiple
+ * of it.
+ *
+ * Returns the selected page size, or 0 when the restricted bitmap is empty
+ * or ib_umem_find_best_pgoff() finds no usable size.
+ */
+static unsigned int devx_umem_find_best_pgsize(struct ib_umem *umem,
+                                              unsigned long pgsz_bitmap)
+{
+       unsigned long page_size;
+
+       /* Don't bother checking larger page sizes as offset must be zero and
+        * total DEVX umem length must be equal to total umem length.
+        */
+       pgsz_bitmap &= GENMASK_ULL(max_t(u64, order_base_2(umem->length),
+                                        PAGE_SHIFT),
+                                  MLX5_ADAPTER_PAGE_SHIFT);
+       if (!pgsz_bitmap)
+               return 0;
+
+       page_size = ib_umem_find_best_pgoff(umem, pgsz_bitmap, U64_MAX);
+       if (!page_size)
+               return 0;
+
+       /* If the page_size is less than the CPU page size then we can use the
+        * offset and create a umem which is a subset of the page list.
+        * For larger page sizes we can't be sure the DMA list reflects the
+        * VA so we must ensure that the umem extent is exactly equal to the
+        * page list. Reduce the page size until one of these cases is true.
+        */
+       while ((ib_umem_dma_offset(umem, page_size) != 0 ||
+               (umem->length % page_size) != 0) &&
+               page_size > PAGE_SIZE)
+               page_size /= 2;
+
+       return page_size;
+}
+
 static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
                                   struct uverbs_attr_bundle *attrs,
                                   struct devx_umem *obj,
                                   struct devx_umem_reg_cmd *cmd)
 {
+       unsigned long pgsz_bitmap;
        unsigned int page_size;
        __be64 *mtt;
        void *umem;
+       int ret;
 
        /*
-        * We don't know what the user intends to use this umem for, but the HW
-        * restrictions must be met. MR, doorbell records, QP, WQ and CQ all
-        * have different requirements. Since we have no idea how to sort this
-        * out, only support PAGE_SIZE with the expectation that userspace will
-        * provide the necessary alignments inside the known PAGE_SIZE and that
-        * FW will check everything.
+        * If the user does not pass in pgsz_bitmap then the user promises not
+        * to use umem_offset!=0 in any commands that allocate on top of the
+        * umem.
+        *
+        * If the user wants to use a umem_offset then it must pass in
+        * pgsz_bitmap which guides the maximum page size and thus maximum
+        * object alignment inside the umem. See the PRM.
+        *
+        * Users are not allowed to use IOVA here, mkeys are not supported on
+        * umem.
         */
-       page_size = ib_umem_find_best_pgoff(
-               obj->umem, PAGE_SIZE,
-               __mlx5_page_offset_to_bitmask(__mlx5_bit_sz(umem, page_offset),
-                                             0));
+       ret = uverbs_get_const_default(&pgsz_bitmap, attrs,
+                       MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
+                       GENMASK_ULL(63,
+                                   min(PAGE_SHIFT, MLX5_ADAPTER_PAGE_SHIFT)));
+       if (ret)
+               return ret;
+
+       page_size = devx_umem_find_best_pgsize(obj->umem, pgsz_bitmap);
        if (!page_size)
                return -EINVAL;
 
@@ -2791,6 +2833,8 @@ DECLARE_UVERBS_NAMED_METHOD(
                           UA_MANDATORY),
        UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
                             enum ib_access_flags),
+       UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
+                            u64),
        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID,
                            UVERBS_ATTR_TYPE(u32),
                            UA_MANDATORY));
diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c
new file mode 100644 (file)
index 0000000..094bf85
--- /dev/null
@@ -0,0 +1,587 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2021, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "dm.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+/*
+ * Allocate a MEMIC region from the device.
+ *
+ * Reserves num_pages consecutive pages in dm->memic_alloc_pages under
+ * dm->lock and then issues ALLOC_MEMIC restricted to that page range. If the
+ * command fails the reservation is rolled back; on -EAGAIN the search is
+ * resumed from the next page index, any other error is returned as is.
+ *
+ * On success *addr is set to dev->bar_addr plus the memic_start_addr
+ * returned by the command and 0 is returned.
+ *
+ * Returns -EINVAL when @length is zero, has bits set in
+ * MLX5_MEMIC_ALLOC_SIZE_MASK, or the normalized alignment exceeds the
+ * log_max_memic_addr_alignment capability, and -ENOMEM when no free page
+ * range is left.
+ */
+static int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
+                               u64 length, u32 alignment)
+{
+       struct mlx5_core_dev *dev = dm->dev;
+       u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size)
+                                       >> PAGE_SHIFT;
+       u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
+       u32 max_alignment = MLX5_CAP_DEV_MEM(dev, log_max_memic_addr_alignment);
+       u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+       u32 out[MLX5_ST_SZ_DW(alloc_memic_out)] = {};
+       u32 in[MLX5_ST_SZ_DW(alloc_memic_in)] = {};
+       u32 mlx5_alignment;
+       u64 page_idx = 0;
+       int ret = 0;
+
+       if (!length || (length & MLX5_MEMIC_ALLOC_SIZE_MASK))
+               return -EINVAL;
+
+       /* mlx5 device sets alignment as 64*2^driver_value
+        * so normalizing is needed.
+        */
+       mlx5_alignment = (alignment < MLX5_MEMIC_BASE_ALIGN) ? 0 :
+                        alignment - MLX5_MEMIC_BASE_ALIGN;
+       if (mlx5_alignment > max_alignment)
+               return -EINVAL;
+
+       MLX5_SET(alloc_memic_in, in, opcode, MLX5_CMD_OP_ALLOC_MEMIC);
+       MLX5_SET(alloc_memic_in, in, range_size, num_pages * PAGE_SIZE);
+       MLX5_SET(alloc_memic_in, in, memic_size, length);
+       MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment,
+                mlx5_alignment);
+
+       while (page_idx < num_memic_hw_pages) {
+               /* Find and reserve the page range atomically; the command
+                * itself is executed without holding the lock.
+                */
+               spin_lock(&dm->lock);
+               page_idx = bitmap_find_next_zero_area(dm->memic_alloc_pages,
+                                                     num_memic_hw_pages,
+                                                     page_idx,
+                                                     num_pages, 0);
+
+               if (page_idx < num_memic_hw_pages)
+                       bitmap_set(dm->memic_alloc_pages,
+                                  page_idx, num_pages);
+
+               spin_unlock(&dm->lock);
+
+               if (page_idx >= num_memic_hw_pages)
+                       break;
+
+               MLX5_SET64(alloc_memic_in, in, range_start_addr,
+                          hw_start_addr + (page_idx * PAGE_SIZE));
+
+               ret = mlx5_cmd_exec_inout(dev, alloc_memic, in, out);
+               if (ret) {
+                       /* Give the reserved pages back before retrying or
+                        * bailing out.
+                        */
+                       spin_lock(&dm->lock);
+                       bitmap_clear(dm->memic_alloc_pages,
+                                    page_idx, num_pages);
+                       spin_unlock(&dm->lock);
+
+                       if (ret == -EAGAIN) {
+                               page_idx++;
+                               continue;
+                       }
+
+                       return ret;
+               }
+
+               *addr = dev->bar_addr +
+                       MLX5_GET64(alloc_memic_out, out, memic_start_addr);
+
+               return 0;
+       }
+
+       return -ENOMEM;
+}
+
+/*
+ * Release a MEMIC region back to the device.
+ *
+ * @addr is the BAR based address handed out by mlx5_cmd_alloc_memic(); it is
+ * converted back to a device address by subtracting dev->bar_addr before
+ * DEALLOC_MEMIC is issued. The pages are cleared in dm->memic_alloc_pages
+ * only when the command succeeds; a command failure is not reported to the
+ * caller and leaves the pages marked as allocated.
+ */
+void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr,
+                           u64 length)
+{
+       struct mlx5_core_dev *dev = dm->dev;
+       u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
+       u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+       u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {};
+       u64 start_page_idx;
+       int err;
+
+       addr -= dev->bar_addr;
+       start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT;
+
+       MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC);
+       MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr);
+       MLX5_SET(dealloc_memic_in, in, memic_size, length);
+
+       err =  mlx5_cmd_exec_in(dev, dealloc_memic, in);
+       if (err)
+               return;
+
+       spin_lock(&dm->lock);
+       bitmap_clear(dm->memic_alloc_pages,
+                    start_page_idx, num_pages);
+       spin_unlock(&dm->lock);
+}
+
+/*
+ * Issue MODIFY_MEMIC with the DEALLOC op_mod for @operation on the MEMIC
+ * region whose BAR based address is @addr. The result of the command is
+ * not checked.
+ */
+void mlx5_cmd_dealloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr,
+                              u8 operation)
+{
+       u32 in[MLX5_ST_SZ_DW(modify_memic_in)] = {};
+       struct mlx5_core_dev *dev = dm->dev;
+
+       MLX5_SET(modify_memic_in, in, opcode, MLX5_CMD_OP_MODIFY_MEMIC);
+       MLX5_SET(modify_memic_in, in, op_mod, MLX5_MODIFY_MEMIC_OP_MOD_DEALLOC);
+       MLX5_SET(modify_memic_in, in, memic_operation_type, operation);
+       MLX5_SET64(modify_memic_in, in, memic_start_addr, addr - dev->bar_addr);
+
+       mlx5_cmd_exec_in(dev, modify_memic, in);
+}
+
+/*
+ * Issue MODIFY_MEMIC with the ALLOC op_mod for @operation on the MEMIC
+ * region whose BAR based address is @addr.
+ *
+ * On success *op_addr is set to dev->bar_addr plus the
+ * memic_operation_addr returned by the command and 0 is returned;
+ * otherwise the command error is returned and *op_addr is left untouched.
+ */
+static int mlx5_cmd_alloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr,
+                                  u8 operation, phys_addr_t *op_addr)
+{
+       u32 out[MLX5_ST_SZ_DW(modify_memic_out)] = {};
+       u32 in[MLX5_ST_SZ_DW(modify_memic_in)] = {};
+       struct mlx5_core_dev *dev = dm->dev;
+       int err;
+
+       MLX5_SET(modify_memic_in, in, opcode, MLX5_CMD_OP_MODIFY_MEMIC);
+       MLX5_SET(modify_memic_in, in, op_mod, MLX5_MODIFY_MEMIC_OP_MOD_ALLOC);
+       MLX5_SET(modify_memic_in, in, memic_operation_type, operation);
+       MLX5_SET64(modify_memic_in, in, memic_start_addr, addr - dev->bar_addr);
+
+       err = mlx5_cmd_exec_inout(dev, modify_memic, in, out);
+       if (err)
+               return err;
+
+       *op_addr = dev->bar_addr +
+                  MLX5_GET64(modify_memic_out, out, memic_operation_addr);
+       return 0;
+}
+
+/*
+ * Record @mmap_flag and @address in @mentry and insert it into the mmap
+ * entry table of @context. The page offset is taken from the 64K wide
+ * window that starts at MLX5_IB_MMAP_DEVICE_MEM << 16, which is why callers
+ * report only the low 16 bits of start_pgoff as the page index.
+ *
+ * Returns the result of rdma_user_mmap_entry_insert_range().
+ */
+static int add_dm_mmap_entry(struct ib_ucontext *context,
+                            struct mlx5_user_mmap_entry *mentry, u8 mmap_flag,
+                            size_t size, u64 address)
+{
+       mentry->mmap_flag = mmap_flag;
+       mentry->address = address;
+
+       return rdma_user_mmap_entry_insert_range(
+               context, &mentry->rdma_entry, size,
+               MLX5_IB_MMAP_DEVICE_MEM << 16,
+               (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
+}
+
+/*
+ * kref release callback for a MEMIC DM object: hand the device memory back
+ * through mlx5_cmd_dealloc_memic() and free the tracking structure.
+ */
+static void mlx5_ib_dm_memic_free(struct kref *kref)
+{
+       struct mlx5_ib_dm_memic *dm =
+               container_of(kref, struct mlx5_ib_dm_memic, ref);
+       struct mlx5_ib_dev *dev = to_mdev(dm->base.ibdm.device);
+
+       mlx5_cmd_dealloc_memic(&dev->dm, dm->base.dev_addr, dm->base.size);
+       kfree(dm);
+}
+
+/*
+ * Write the MAP_OP_ADDR response attributes for @op_entry: the page index
+ * (low 16 bits of the mmap entry's start_pgoff) and the offset of op_addr
+ * within its page.
+ *
+ * Returns 0 or the first uverbs_copy_to() error.
+ */
+static int copy_op_to_user(struct mlx5_ib_dm_op_entry *op_entry,
+                          struct uverbs_attr_bundle *attrs)
+{
+       u64 start_offset;
+       u16 page_idx;
+       int err;
+
+       page_idx = op_entry->mentry.rdma_entry.start_pgoff & 0xFFFF;
+       start_offset = op_entry->op_addr & ~PAGE_MASK;
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX,
+                            &page_idx, sizeof(page_idx));
+       if (err)
+               return err;
+
+       return uverbs_copy_to(attrs,
+                             MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET,
+                             &start_offset, sizeof(start_offset));
+}
+
+/*
+ * Report an operation address that was already mapped for @dm.
+ *
+ * Returns -ENOENT when no entry is stored for @op in dm->ops, otherwise the
+ * result of copying the stored entry to the response attributes.
+ */
+static int map_existing_op(struct mlx5_ib_dm_memic *dm, u8 op,
+                          struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_ib_dm_op_entry *existing = xa_load(&dm->ops, op);
+
+       return existing ? copy_op_to_user(existing, attrs) : -ENOENT;
+}
+
+/*
+ * MLX5_IB_METHOD_DM_MAP_OP_ADDR: map a MEMIC operation address of a DM
+ * object and report its mmap page index and in-page start offset.
+ *
+ * If the operation was already mapped for this DM the stored entry is
+ * reported again. Otherwise the operation address is allocated from the
+ * device, exposed through a new mmap entry and remembered in dm->ops, all
+ * under dm->ops_xa_lock.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the handle is not a MEMIC DM or the
+ * operation is not advertised by the device, -ENOMEM on allocation failure,
+ * or the error of the failing step.
+ */
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DM_MAP_OP_ADDR)(
+       struct uverbs_attr_bundle *attrs)
+{
+       struct ib_uobject *uobj = uverbs_attr_get_uobject(
+               attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE);
+       struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
+       struct ib_dm *ibdm = uobj->object;
+       struct mlx5_ib_dm_op_entry *op_entry;
+       struct mlx5_ib_dm_memic *dm;
+       int err;
+       u8 op;
+
+       /* Only MEMIC objects are struct mlx5_ib_dm_memic; SW ICM objects are
+        * the smaller struct mlx5_ib_dm_icm and must not be touched through
+        * to_memic().
+        */
+       if (to_mdm(ibdm)->type != MLX5_IB_UAPI_DM_TYPE_MEMIC)
+               return -EOPNOTSUPP;
+       dm = to_memic(ibdm);
+
+       err = uverbs_copy_from(&op, attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP);
+       if (err)
+               return err;
+
+       /* op comes from user space; reject values that cannot be a bit of
+        * the capability mask before using it as a shift count in BIT().
+        * NOTE(review): assumes memic_operations is a 32-bit capability
+        * field - confirm against mlx5_ifc.h.
+        */
+       if (op >= BITS_PER_TYPE(u32))
+               return -EOPNOTSUPP;
+
+       if (!(MLX5_CAP_DEV_MEM(dev->mdev, memic_operations) & BIT(op)))
+               return -EOPNOTSUPP;
+
+       mutex_lock(&dm->ops_xa_lock);
+       /* Anything but -ENOENT (success included) ends the request here. */
+       err = map_existing_op(dm, op, attrs);
+       if (err != -ENOENT)
+               goto err_unlock;
+
+       op_entry = kzalloc(sizeof(*op_entry), GFP_KERNEL);
+       if (!op_entry) {
+               err = -ENOMEM;
+               goto err_unlock;
+       }
+
+       err = mlx5_cmd_alloc_memic_op(&dev->dm, dm->base.dev_addr, op,
+                                     &op_entry->op_addr);
+       if (err) {
+               kfree(op_entry);
+               goto err_unlock;
+       }
+       op_entry->op = op;
+       op_entry->dm = dm;
+
+       err = add_dm_mmap_entry(uobj->context, &op_entry->mentry,
+                               MLX5_IB_MMAP_TYPE_MEMIC_OP, dm->base.size,
+                               op_entry->op_addr & PAGE_MASK);
+       if (err) {
+               mlx5_cmd_dealloc_memic_op(&dev->dm, dm->base.dev_addr, op);
+               kfree(op_entry);
+               goto err_unlock;
+       }
+       /* From this point, entry will be freed by mmap_free */
+       kref_get(&dm->ref);
+
+       err = copy_op_to_user(op_entry, attrs);
+       if (err)
+               goto err_remove;
+
+       err = xa_insert(&dm->ops, op, op_entry, GFP_KERNEL);
+       if (err)
+               goto err_remove;
+       mutex_unlock(&dm->ops_xa_lock);
+
+       return 0;
+
+err_remove:
+       rdma_user_mmap_entry_remove(&op_entry->mentry.rdma_entry);
+err_unlock:
+       mutex_unlock(&dm->ops_xa_lock);
+
+       return err;
+}
+
+/*
+ * Allocate a MEMIC DM object for @ctx.
+ *
+ * The requested length is rounded up to MLX5_MEMIC_BASE_SIZE for the device
+ * allocation while the original value is kept in req_length. After the
+ * device memory is allocated an mmap entry is inserted for it and the page
+ * index and in-page start offset are written to the response attributes.
+ *
+ * Returns the embedded ib_dm on success or an ERR_PTR(): -EOPNOTSUPP when
+ * the memic capability is not set, -ENOMEM when the object cannot be
+ * allocated, otherwise the error of the failing step.
+ */
+static struct ib_dm *handle_alloc_dm_memic(struct ib_ucontext *ctx,
+                                          struct ib_dm_alloc_attr *attr,
+                                          struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
+       struct mlx5_ib_dm_memic *dm;
+       u64 start_offset;
+       u16 page_idx;
+       int err;
+       u64 address;
+
+       if (!MLX5_CAP_DEV_MEM(dm_db->dev, memic))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       dm = kzalloc(sizeof(*dm), GFP_KERNEL);
+       if (!dm)
+               return ERR_PTR(-ENOMEM);
+
+       dm->base.type = MLX5_IB_UAPI_DM_TYPE_MEMIC;
+       dm->base.size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
+       dm->base.ibdm.device = ctx->device;
+
+       kref_init(&dm->ref);
+       xa_init(&dm->ops);
+       mutex_init(&dm->ops_xa_lock);
+       dm->req_length = attr->length;
+
+       err = mlx5_cmd_alloc_memic(dm_db, &dm->base.dev_addr,
+                                  dm->base.size, attr->alignment);
+       if (err) {
+               kfree(dm);
+               return ERR_PTR(err);
+       }
+
+       address = dm->base.dev_addr & PAGE_MASK;
+       err = add_dm_mmap_entry(ctx, &dm->mentry, MLX5_IB_MMAP_TYPE_MEMIC,
+                               dm->base.size, address);
+       if (err) {
+               mlx5_cmd_dealloc_memic(dm_db, dm->base.dev_addr, dm->base.size);
+               kfree(dm);
+               return ERR_PTR(err);
+       }
+
+       page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
+                            &page_idx, sizeof(page_idx));
+       if (err)
+               goto err_copy;
+
+       start_offset = dm->base.dev_addr & ~PAGE_MASK;
+       err = uverbs_copy_to(attrs,
+                            MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
+                            &start_offset, sizeof(start_offset));
+       if (err)
+               goto err_copy;
+
+       return &dm->base.ibdm;
+
+err_copy:
+       /* No kfree() here: once the mmap entry exists the object is
+        * presumably released through mlx5_ib_dm_mmap_free() when the entry
+        * is removed - confirm against the mmap_free callback wiring.
+        */
+       rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
+       return ERR_PTR(err);
+}
+
+/*
+ * Translate a uapi DM type into the core SW ICM type. Every value other
+ * than MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM maps to
+ * MLX5_SW_ICM_TYPE_HEADER_MODIFY.
+ */
+static enum mlx5_sw_icm_type get_icm_type(int uapi_type)
+{
+       if (uapi_type == MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM)
+               return MLX5_SW_ICM_TYPE_STEERING;
+
+       return MLX5_SW_ICM_TYPE_HEADER_MODIFY;
+}
+
+/*
+ * Allocate a SW ICM DM object of uapi type @type for @ctx.
+ *
+ * The caller needs both CAP_SYS_RAWIO and CAP_NET_RAW, and the device must
+ * advertise sw_owner or sw_owner_v2 on the NIC RX or TX flow table. The
+ * requested length is rounded up to the SW ICM block size and then to a
+ * power of two before the allocation; the resulting device address is
+ * written to the START_OFFSET response attribute.
+ *
+ * Returns the embedded ib_dm on success or an ERR_PTR() (-ENOMEM, -EPERM,
+ * -EOPNOTSUPP, or the error of the failing step).
+ */
+static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
+                                           struct ib_dm_alloc_attr *attr,
+                                           struct uverbs_attr_bundle *attrs,
+                                           int type)
+{
+       struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
+       enum mlx5_sw_icm_type icm_type = get_icm_type(type);
+       struct mlx5_ib_dm_icm *dm;
+       u64 act_size;
+       int err;
+
+       dm = kzalloc(sizeof(*dm), GFP_KERNEL);
+       if (!dm)
+               return ERR_PTR(-ENOMEM);
+
+       dm->base.type = type;
+       dm->base.ibdm.device = ctx->device;
+
+       if (!capable(CAP_SYS_RAWIO) || !capable(CAP_NET_RAW)) {
+               err = -EPERM;
+               goto free;
+       }
+
+       if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) ||
+             MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) ||
+             MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) ||
+             MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2))) {
+               err = -EOPNOTSUPP;
+               goto free;
+       }
+
+       /* Allocation size must be a multiple of the basic block size
+        * and a power of 2.
+        */
+       act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
+       act_size = roundup_pow_of_two(act_size);
+
+       dm->base.size = act_size;
+       err = mlx5_dm_sw_icm_alloc(dev, icm_type, act_size, attr->alignment,
+                                  to_mucontext(ctx)->devx_uid,
+                                  &dm->base.dev_addr, &dm->obj_id);
+       if (err)
+               goto free;
+
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
+                            &dm->base.dev_addr, sizeof(dm->base.dev_addr));
+       if (err) {
+               /* Undo the device allocation before freeing the object. */
+               mlx5_dm_sw_icm_dealloc(dev, icm_type, dm->base.size,
+                                      to_mucontext(ctx)->devx_uid,
+                                      dm->base.dev_addr, dm->obj_id);
+               goto free;
+       }
+       return &dm->base.ibdm;
+free:
+       kfree(dm);
+       return ERR_PTR(err);
+}
+
+/*
+ * alloc_dm device op: read the optional DM type attribute (defaulting to
+ * MLX5_IB_UAPI_DM_TYPE_MEMIC) and dispatch to the MEMIC or SW ICM
+ * allocator. Unknown types yield ERR_PTR(-EOPNOTSUPP).
+ */
+struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
+                              struct ib_ucontext *context,
+                              struct ib_dm_alloc_attr *attr,
+                              struct uverbs_attr_bundle *attrs)
+{
+       enum mlx5_ib_uapi_dm_type dm_type;
+       int ret;
+
+       ret = uverbs_get_const_default(&dm_type, attrs,
+                                      MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
+                                      MLX5_IB_UAPI_DM_TYPE_MEMIC);
+       if (ret)
+               return ERR_PTR(ret);
+
+       mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
+                   dm_type, attr->length, attr->alignment);
+
+       if (dm_type == MLX5_IB_UAPI_DM_TYPE_MEMIC)
+               return handle_alloc_dm_memic(context, attr, attrs);
+
+       /* Both SW ICM flavours share one allocator, keyed by the type. */
+       if (dm_type == MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM ||
+           dm_type == MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM)
+               return handle_alloc_dm_sw_icm(context, attr, attrs, dm_type);
+
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Drop every mapped operation of @dm: each entry is erased from dm->ops and
+ * its mmap entry is removed, all while holding dm->ops_xa_lock.
+ */
+static void dm_memic_remove_ops(struct mlx5_ib_dm_memic *dm)
+{
+       struct mlx5_ib_dm_op_entry *entry;
+       unsigned long idx;
+
+       mutex_lock(&dm->ops_xa_lock);
+       xa_for_each(&dm->ops, idx, entry) {
+               xa_erase(&dm->ops, idx);
+               rdma_user_mmap_entry_remove(&entry->mentry.rdma_entry);
+       }
+       mutex_unlock(&dm->ops_xa_lock);
+}
+
+/*
+ * Tear down a MEMIC DM object: remove the mmap entries of all mapped
+ * operations first, then the mmap entry of the DM itself. The object is not
+ * freed directly here.
+ */
+static void mlx5_dm_memic_dealloc(struct mlx5_ib_dm_memic *dm)
+{
+       dm_memic_remove_ops(dm);
+       rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
+}
+
+/*
+ * Release a SW ICM DM object.
+ *
+ * The device allocation is released with mlx5_dm_sw_icm_dealloc() and, only
+ * if that succeeds, the tracking structure is freed.
+ *
+ * Returns 0 on success. On failure the device error is returned and @dm is
+ * left intact, so the caller does not treat a still-allocated object as
+ * destroyed.
+ */
+static int mlx5_dm_icm_dealloc(struct mlx5_ib_ucontext *ctx,
+                              struct mlx5_ib_dm_icm *dm)
+{
+       enum mlx5_sw_icm_type type = get_icm_type(dm->base.type);
+       struct mlx5_core_dev *dev = to_mdev(dm->base.ibdm.device)->mdev;
+       int err;
+
+       err = mlx5_dm_sw_icm_dealloc(dev, type, dm->base.size, ctx->devx_uid,
+                                    dm->base.dev_addr, dm->obj_id);
+       if (err)
+               return err;
+
+       kfree(dm);
+       return 0;
+}
+
+/*
+ * dealloc_dm device op: dispatch on the stored DM type. MEMIC objects are
+ * torn down through their mmap entries and always report 0; SW ICM objects
+ * return the result of mlx5_dm_icm_dealloc(). Unknown types yield
+ * -EOPNOTSUPP.
+ */
+static int mlx5_ib_dealloc_dm(struct ib_dm *ibdm,
+                             struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
+               &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+       struct mlx5_ib_dm *dm = to_mdm(ibdm);
+
+       switch (dm->type) {
+       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+               mlx5_dm_memic_dealloc(to_memic(ibdm));
+               return 0;
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+               return mlx5_dm_icm_dealloc(ctx, to_icm(ibdm));
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+/*
+ * MLX5_IB_METHOD_DM_QUERY: report the mmap page index, the in-page start
+ * offset and the originally requested length of a MEMIC DM object.
+ *
+ * Returns -EOPNOTSUPP for any other DM type, otherwise 0 or the first
+ * uverbs_copy_to() error.
+ */
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DM_QUERY)(
+       struct uverbs_attr_bundle *attrs)
+{
+       struct ib_dm *ibdm =
+               uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_DM_REQ_HANDLE);
+       struct mlx5_ib_dm *dm = to_mdm(ibdm);
+       struct mlx5_ib_dm_memic *memic;
+       u64 start_offset;
+       u16 page_idx;
+       int err;
+
+       /* Check the type before converting with to_memic(). */
+       if (dm->type != MLX5_IB_UAPI_DM_TYPE_MEMIC)
+               return -EOPNOTSUPP;
+
+       memic = to_memic(ibdm);
+       page_idx = memic->mentry.rdma_entry.start_pgoff & 0xFFFF;
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_DM_RESP_PAGE_INDEX,
+                            &page_idx, sizeof(page_idx));
+       if (err)
+               return err;
+
+       start_offset = memic->base.dev_addr & ~PAGE_MASK;
+       err =  uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_DM_RESP_START_OFFSET,
+                             &start_offset, sizeof(start_offset));
+       if (err)
+               return err;
+
+       return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_DM_RESP_LENGTH,
+                             &memic->req_length,
+                             sizeof(memic->req_length));
+}
+
+/*
+ * Release the DM state behind an mmap entry, selected by
+ * mentry->mmap_flag:
+ *
+ *  - MLX5_IB_MMAP_TYPE_MEMIC: drop the reference of the owning MEMIC object.
+ *  - MLX5_IB_MMAP_TYPE_MEMIC_OP: deallocate the operation on the device,
+ *    free the op entry and drop the reference it held on the MEMIC object.
+ *
+ * Any other flag triggers a warning and is otherwise ignored.
+ */
+void mlx5_ib_dm_mmap_free(struct mlx5_ib_dev *dev,
+                         struct mlx5_user_mmap_entry *mentry)
+{
+       struct mlx5_ib_dm_op_entry *op_entry;
+       struct mlx5_ib_dm_memic *mdm;
+
+       switch (mentry->mmap_flag) {
+       case MLX5_IB_MMAP_TYPE_MEMIC:
+               mdm = container_of(mentry, struct mlx5_ib_dm_memic, mentry);
+               kref_put(&mdm->ref, mlx5_ib_dm_memic_free);
+               break;
+       case MLX5_IB_MMAP_TYPE_MEMIC_OP:
+               op_entry = container_of(mentry, struct mlx5_ib_dm_op_entry,
+                                       mentry);
+               /* Read the owner before op_entry is freed below. */
+               mdm = op_entry->dm;
+               mlx5_cmd_dealloc_memic_op(&dev->dm, mdm->base.dev_addr,
+                                         op_entry->op);
+               kfree(op_entry);
+               kref_put(&mdm->ref, mlx5_ib_dm_memic_free);
+               break;
+       default:
+               WARN_ON(true);
+       }
+}
+
+/* DM_QUERY: DM handle in; start offset, page index and length out. */
+DECLARE_UVERBS_NAMED_METHOD(
+       MLX5_IB_METHOD_DM_QUERY,
+       UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_DM_REQ_HANDLE, UVERBS_OBJECT_DM,
+                       UVERBS_ACCESS_READ, UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_DM_RESP_START_OFFSET,
+                           UVERBS_ATTR_TYPE(u64), UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_DM_RESP_PAGE_INDEX,
+                           UVERBS_ATTR_TYPE(u16), UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_DM_RESP_LENGTH,
+                           UVERBS_ATTR_TYPE(u64), UA_MANDATORY));
+
+/* Driver specific attributes added to the DM_ALLOC method: start offset out
+ * (mandatory), page index out and DM type in (both optional).
+ */
+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+       mlx5_ib_dm, UVERBS_OBJECT_DM, UVERBS_METHOD_DM_ALLOC,
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
+                           UVERBS_ATTR_TYPE(u64), UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
+                           UVERBS_ATTR_TYPE(u16), UA_OPTIONAL),
+       UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
+                            enum mlx5_ib_uapi_dm_type, UA_OPTIONAL));
+
+/* DM_MAP_OP_ADDR: DM handle and operation in; start offset (mandatory) and
+ * page index (optional) out.
+ */
+DECLARE_UVERBS_NAMED_METHOD(
+       MLX5_IB_METHOD_DM_MAP_OP_ADDR,
+       UVERBS_ATTR_IDR(MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE,
+                       UVERBS_OBJECT_DM,
+                       UVERBS_ACCESS_READ,
+                       UA_MANDATORY),
+       UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP,
+                          UVERBS_ATTR_TYPE(u8),
+                          UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET,
+                           UVERBS_ATTR_TYPE(u64),
+                           UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX,
+                           UVERBS_ATTR_TYPE(u16),
+                           UA_OPTIONAL));
+
+/* Methods attached to UVERBS_OBJECT_DM by this driver. */
+DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DM,
+                             &UVERBS_METHOD(MLX5_IB_METHOD_DM_MAP_OP_ADDR),
+                             &UVERBS_METHOD(MLX5_IB_METHOD_DM_QUERY));
+
+/* uapi definitions for DM: the extra DM_ALLOC attributes plus the named
+ * methods declared for UVERBS_OBJECT_DM; terminated by an empty entry.
+ */
+const struct uapi_definition mlx5_ib_dm_defs[] = {
+       UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
+       UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM),
+       {},
+};
+
+/* Device-memory related ib_device_ops implemented by this driver. */
+const struct ib_device_ops mlx5_ib_dev_dm_ops = {
+       .alloc_dm = mlx5_ib_alloc_dm,
+       .dealloc_dm = mlx5_ib_dealloc_dm,
+       .reg_dm_mr = mlx5_ib_reg_dm_mr,
+};
diff --git a/drivers/infiniband/hw/mlx5/dm.h b/drivers/infiniband/hw/mlx5/dm.h
new file mode 100644 (file)
index 0000000..9674a80
--- /dev/null
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2021, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_DM_H
+#define _MLX5_IB_DM_H
+
+#include "mlx5_ib.h"
+
+extern const struct ib_device_ops mlx5_ib_dev_dm_ops;
+extern const struct uapi_definition mlx5_ib_dm_defs[];
+
+/* Common part of every mlx5 DM object; embedded first in the type specific
+ * structures below.
+ */
+struct mlx5_ib_dm {
+       struct ib_dm            ibdm;
+       /* MLX5_IB_UAPI_DM_TYPE_* value selected at allocation time */
+       u32                     type;
+       /* address returned by the device allocation */
+       phys_addr_t             dev_addr;
+       /* allocated size, i.e. the requested length after rounding */
+       size_t                  size;
+};
+
+/* One mapped MEMIC operation address of a MEMIC DM object. */
+struct mlx5_ib_dm_op_entry {
+       struct mlx5_user_mmap_entry     mentry;
+       phys_addr_t                     op_addr;
+       /* owning MEMIC object */
+       struct mlx5_ib_dm_memic         *dm;
+       /* operation type, also the index of this entry in dm->ops */
+       u8                              op;
+};
+
+/* MEMIC DM object. */
+struct mlx5_ib_dm_memic {
+       struct mlx5_ib_dm           base;
+       struct mlx5_user_mmap_entry mentry;
+       /* mapped operations (struct mlx5_ib_dm_op_entry), indexed by op */
+       struct xarray               ops;
+       /* serializes lookup, insertion and removal in ops */
+       struct mutex                ops_xa_lock;
+       /* released through the mmap entries of the object and of its ops */
+       struct kref                 ref;
+       /* length requested by the user, before rounding */
+       size_t                      req_length;
+};
+
+/* SW ICM DM object (steering or header-modify). */
+struct mlx5_ib_dm_icm {
+       struct mlx5_ib_dm      base;
+       /* object id returned by the SW ICM allocation */
+       u32                    obj_id;
+};
+
+/* Convert a core ib_dm to the common mlx5 DM part; valid for every type. */
+static inline struct mlx5_ib_dm *to_mdm(struct ib_dm *ibdm)
+{
+       return container_of(ibdm, struct mlx5_ib_dm, ibdm);
+}
+
+/* Convert a core ib_dm to a MEMIC object. Performs no type check: only use
+ * it when the DM type is MLX5_IB_UAPI_DM_TYPE_MEMIC.
+ */
+static inline struct mlx5_ib_dm_memic *to_memic(struct ib_dm *ibdm)
+{
+       return container_of(ibdm, struct mlx5_ib_dm_memic, base.ibdm);
+}
+
+/* Convert a core ib_dm to a SW ICM object. Performs no type check: only use
+ * it when the DM type is one of the SW ICM types.
+ */
+static inline struct mlx5_ib_dm_icm *to_icm(struct ib_dm *ibdm)
+{
+       return container_of(ibdm, struct mlx5_ib_dm_icm, base.ibdm);
+}
+
+struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
+                              struct ib_ucontext *context,
+                              struct ib_dm_alloc_attr *attr,
+                              struct uverbs_attr_bundle *attrs);
+void mlx5_ib_dm_mmap_free(struct mlx5_ib_dev *dev,
+                         struct mlx5_user_mmap_entry *mentry);
+void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr,
+                           u64 length);
+void mlx5_cmd_dealloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr,
+                              u8 operation);
+
+#endif /* _MLX5_IB_DM_H */
index 01370d9..2fc6a60 100644 (file)
@@ -1528,8 +1528,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add(
                dst_num++;
        }
 
-       handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
-                                       flow_context, flow_act,
+       handler = _create_raw_flow_rule(dev, ft_prio, dst_num ? dst : NULL,
+                                       fs_matcher, flow_context, flow_act,
                                        cmd_in, inlen, dst_num);
 
        if (IS_ERR(handler)) {
@@ -1885,8 +1885,9 @@ static int get_dests(struct uverbs_attr_bundle *attrs,
                else
                        *dest_id = mqp->raw_packet_qp.rq.tirn;
                *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-       } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
-                  fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+       } else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
+                   fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) &&
+                  !(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) {
                *dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
        }
 
index db5de72..b25e0b3 100644 (file)
@@ -29,7 +29,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
-       int num_ports = mlx5_eswitch_get_total_vports(dev);
+       u32 num_ports = mlx5_eswitch_get_total_vports(dev);
        const struct mlx5_ib_profile *profile;
        struct mlx5_ib_dev *ibdev;
        int vport_index;
@@ -110,7 +110,7 @@ struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 
 struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
                                                   struct mlx5_ib_sq *sq,
-                                                  u16 port)
+                                                  u32 port)
 {
        struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
        struct mlx5_eswitch_rep *rep;
index ce1dcb1..9c55e5c 100644 (file)
@@ -16,7 +16,7 @@ int mlx5r_rep_init(void);
 void mlx5r_rep_cleanup(void);
 struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
                                                   struct mlx5_ib_sq *sq,
-                                                  u16 port);
+                                                  u32 port);
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
                                          u16 vport_num);
 #else /* CONFIG_MLX5_ESWITCH */
@@ -25,7 +25,7 @@ static inline void mlx5r_rep_cleanup(void) {}
 static inline
 struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
                                                   struct mlx5_ib_sq *sq,
-                                                  u16 port)
+                                                  u32 port)
 {
        return NULL;
 }
index 46b2d37..f2f6287 100644 (file)
@@ -48,7 +48,7 @@ static inline u32 mlx_to_net_policy(enum port_state_policy mlx_policy)
        }
 }
 
-int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u32 port,
                          struct ifla_vf_info *info)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
@@ -91,7 +91,7 @@ static inline enum port_state_policy net_to_mlx_policy(int policy)
 }
 
 int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf,
-                             u8 port, int state)
+                             u32 port, int state)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
        struct mlx5_core_dev *mdev = dev->mdev;
@@ -119,7 +119,7 @@ out:
 }
 
 int mlx5_ib_get_vf_stats(struct ib_device *device, int vf,
-                        u8 port, struct ifla_vf_stats *stats)
+                        u32 port, struct ifla_vf_stats *stats)
 {
        int out_sz = MLX5_ST_SZ_BYTES(query_vport_counter_out);
        struct mlx5_core_dev *mdev;
@@ -149,7 +149,8 @@ ex:
        return err;
 }
 
-static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid)
+static int set_vf_node_guid(struct ib_device *device, int vf, u32 port,
+                           u64 guid)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
        struct mlx5_core_dev *mdev = dev->mdev;
@@ -172,7 +173,8 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid)
        return err;
 }
 
-static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid)
+static int set_vf_port_guid(struct ib_device *device, int vf, u32 port,
+                           u64 guid)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
        struct mlx5_core_dev *mdev = dev->mdev;
@@ -195,7 +197,7 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid)
        return err;
 }
 
-int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
+int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u32 port,
                        u64 guid, int type)
 {
        if (type == IFLA_VF_IB_NODE_GUID)
@@ -206,7 +208,7 @@ int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
        return -EINVAL;
 }
 
-int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
                        struct ifla_vf_guid *node_guid,
                        struct ifla_vf_guid *port_guid)
 {
index 652c6cc..ec242a5 100644 (file)
@@ -42,7 +42,7 @@ enum {
        MLX5_IB_VENDOR_CLASS2 = 0xa
 };
 
-static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num,
+static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u32 port_num,
                           struct ib_mad *in_mad)
 {
        if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
@@ -52,7 +52,7 @@ static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num,
 }
 
 static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
-                       int ignore_bkey, u8 port, const struct ib_wc *in_wc,
+                       int ignore_bkey, u32 port, const struct ib_wc *in_wc,
                        const struct ib_grh *in_grh, const void *in_mad,
                        void *response_mad)
 {
@@ -147,12 +147,12 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt,
                             vl_15_dropped);
 }
 
-static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num,
+static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num,
                           const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
        struct mlx5_core_dev *mdev;
        bool native_port = true;
-       u8 mdev_port_num;
+       u32 mdev_port_num;
        void *out_cnt;
        int err;
 
@@ -216,7 +216,7 @@ done:
        return err;
 }
 
-int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                        const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                        const struct ib_mad *in, struct ib_mad *out,
                        size_t *out_mad_size, u16 *out_mad_pkey_index)
@@ -444,7 +444,7 @@ out:
        return err;
 }
 
-int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u32 port, u16 index,
                            u16 *pkey)
 {
        struct ib_smp *in_mad  = NULL;
@@ -473,7 +473,7 @@ out:
        return err;
 }
 
-int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index,
+int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u32 port, int index,
                            union ib_gid *gid)
 {
        struct ib_smp *in_mad  = NULL;
@@ -513,7 +513,7 @@ out:
        return err;
 }
 
-int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
+int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port,
                            struct ib_port_attr *props)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
index 7a7f6cc..6d1dd09 100644 (file)
@@ -34,6 +34,7 @@
 #include "ib_rep.h"
 #include "cmd.h"
 #include "devx.h"
+#include "dm.h"
 #include "fs.h"
 #include "srq.h"
 #include "qp.h"
@@ -42,6 +43,7 @@
 #include "counters.h"
 #include <linux/mlx5/accel.h>
 #include <rdma/uverbs_std_types.h>
+#include <rdma/uverbs_ioctl.h>
 #include <rdma/mlx5_user_ioctl_verbs.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
 #include <rdma/ib_umem_odp.h>
@@ -100,7 +102,7 @@ mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
 }
 
 static enum rdma_link_layer
-mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
+mlx5_ib_port_link_layer(struct ib_device *device, u32 port_num)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
        int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
@@ -109,7 +111,7 @@ mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
 }
 
 static int get_port_state(struct ib_device *ibdev,
-                         u8 port_num,
+                         u32 port_num,
                          enum ib_port_state *state)
 {
        struct ib_port_attr attr;
@@ -124,7 +126,7 @@ static int get_port_state(struct ib_device *ibdev,
 
 static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
                                           struct net_device *ndev,
-                                          u8 *port_num)
+                                          u32 *port_num)
 {
        struct net_device *rep_ndev;
        struct mlx5_ib_port *port;
@@ -154,7 +156,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
 {
        struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
        struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
-       u8 port_num = roce->native_port_num;
+       u32 port_num = roce->native_port_num;
        struct mlx5_core_dev *mdev;
        struct mlx5_ib_dev *ibdev;
 
@@ -233,7 +235,7 @@ done:
 }
 
 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
-                                            u8 port_num)
+                                            u32 port_num)
 {
        struct mlx5_ib_dev *ibdev = to_mdev(device);
        struct net_device *ndev;
@@ -261,8 +263,8 @@ out:
 }
 
 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
-                                                  u8 ib_port_num,
-                                                  u8 *native_port_num)
+                                                  u32 ib_port_num,
+                                                  u32 *native_port_num)
 {
        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
                                                          ib_port_num);
@@ -296,7 +298,7 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
        return mdev;
 }
 
-void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
+void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 port_num)
 {
        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
                                                          port_num);
@@ -452,7 +454,7 @@ static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed,
                                                active_width);
 }
 
-static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
+static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
                                struct ib_port_attr *props)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
@@ -462,7 +464,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
        enum ib_mtu ndev_ib_mtu;
        bool put_mdev = true;
        u32 eth_prot_oper;
-       u8 mdev_port_num;
+       u32 mdev_port_num;
        bool ext;
        int err;
 
@@ -498,7 +500,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
        translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
                                 &props->active_width, ext);
 
-       if (!dev->is_rep && mlx5_is_roce_enabled(mdev)) {
+       if (!dev->is_rep && dev->mdev->roce.roce_en) {
                u16 qkey_viol_cntr;
 
                props->port_cap_flags |= IB_PORT_CM_SUP;
@@ -549,19 +551,19 @@ out:
        return err;
 }
 
-static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
+static int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num,
                         unsigned int index, const union ib_gid *gid,
                         const struct ib_gid_attr *attr)
 {
-       enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
+       enum ib_gid_type gid_type;
        u16 vlan_id = 0xffff;
        u8 roce_version = 0;
        u8 roce_l3_type = 0;
        u8 mac[ETH_ALEN];
        int ret;
 
+       gid_type = attr->gid_type;
        if (gid) {
-               gid_type = attr->gid_type;
                ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
                if (ret)
                        return ret;
@@ -573,7 +575,7 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
                break;
        case IB_GID_TYPE_ROCE_UDP_ENCAP:
                roce_version = MLX5_ROCE_VERSION_2;
-               if (ipv6_addr_v4mapped((void *)gid))
+               if (gid && ipv6_addr_v4mapped((void *)gid))
                        roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
                else
                        roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
@@ -600,7 +602,7 @@ static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
                           __always_unused void **context)
 {
        return set_roce_addr(to_mdev(attr->device), attr->port_num,
-                            attr->index, NULL, NULL);
+                            attr->index, NULL, attr);
 }
 
 __be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
@@ -1267,7 +1269,7 @@ static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
        return 0;
 }
 
-static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
+static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
                               struct ib_port_attr *props)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
@@ -1335,7 +1337,7 @@ out:
        return err;
 }
 
-int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+int mlx5_ib_query_port(struct ib_device *ibdev, u32 port,
                       struct ib_port_attr *props)
 {
        unsigned int count;
@@ -1380,13 +1382,13 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
        return ret;
 }
 
-static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
+static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u32 port,
                                  struct ib_port_attr *props)
 {
        return mlx5_query_port_roce(ibdev, port, props);
 }
 
-static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                                  u16 *pkey)
 {
        /* Default special Pkey for representor device port as per the
@@ -1396,7 +1398,7 @@ static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
        return 0;
 }
 
-static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
                             union ib_gid *gid)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
@@ -1415,13 +1417,13 @@ static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 
 }
 
-static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
+static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u32 port,
                                   u16 index, u16 *pkey)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_core_dev *mdev;
        bool put_mdev = true;
-       u8 mdev_port_num;
+       u32 mdev_port_num;
        int err;
 
        mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
@@ -1442,7 +1444,7 @@ static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
        return err;
 }
 
-static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+static int mlx5_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                              u16 *pkey)
 {
        switch (mlx5_get_vport_access_method(ibdev)) {
@@ -1486,12 +1488,12 @@ static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
        return err;
 }
 
-static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
+static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u32 port_num, u32 mask,
                                u32 value)
 {
        struct mlx5_hca_vport_context ctx = {};
        struct mlx5_core_dev *mdev;
-       u8 mdev_port_num;
+       u32 mdev_port_num;
        int err;
 
        mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
@@ -1520,7 +1522,7 @@ out:
        return err;
 }
 
-static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+static int mlx5_ib_modify_port(struct ib_device *ibdev, u32 port, int mask,
                               struct ib_port_modify *props)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
@@ -1929,7 +1931,7 @@ uar_done:
        print_lib_caps(dev, context->lib_caps);
 
        if (mlx5_ib_lag_should_assign_affinity(dev)) {
-               u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
+               u32 port = mlx5_core_native_port_num(dev->mdev) - 1;
 
                atomic_set(&context->tx_port_affinity,
                           atomic_add_return(
@@ -2087,14 +2089,11 @@ static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
        struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
        struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
        struct mlx5_var_table *var_table = &dev->var_table;
-       struct mlx5_ib_dm *mdm;
 
        switch (mentry->mmap_flag) {
        case MLX5_IB_MMAP_TYPE_MEMIC:
-               mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
-               mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
-                                      mdm->size);
-               kfree(mdm);
+       case MLX5_IB_MMAP_TYPE_MEMIC_OP:
+               mlx5_ib_dm_mmap_free(dev, mentry);
                break;
        case MLX5_IB_MMAP_TYPE_VAR:
                mutex_lock(&var_table->bitmap_lock);
@@ -2219,19 +2218,6 @@ free_bfreg:
        return err;
 }
 
-static int add_dm_mmap_entry(struct ib_ucontext *context,
-                            struct mlx5_ib_dm *mdm,
-                            u64 address)
-{
-       mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
-       mdm->mentry.address = address;
-       return rdma_user_mmap_entry_insert_range(
-                       context, &mdm->mentry.rdma_entry,
-                       mdm->size,
-                       MLX5_IB_MMAP_DEVICE_MEM << 16,
-                       (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
-}
-
 static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
 {
        unsigned long idx;
@@ -2333,206 +2319,6 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
        return 0;
 }
 
-static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
-                                       u32 type)
-{
-       switch (type) {
-       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
-               if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
-                       return -EOPNOTSUPP;
-               break;
-       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
-       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
-               if (!capable(CAP_SYS_RAWIO) ||
-                   !capable(CAP_NET_RAW))
-                       return -EPERM;
-
-               if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
-                     MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner) ||
-                     MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2) ||
-                     MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner_v2)))
-                       return -EOPNOTSUPP;
-               break;
-       }
-
-       return 0;
-}
-
-static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
-                                struct mlx5_ib_dm *dm,
-                                struct ib_dm_alloc_attr *attr,
-                                struct uverbs_attr_bundle *attrs)
-{
-       struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
-       u64 start_offset;
-       u16 page_idx;
-       int err;
-       u64 address;
-
-       dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
-
-       err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
-                                  dm->size, attr->alignment);
-       if (err)
-               return err;
-
-       address = dm->dev_addr & PAGE_MASK;
-       err = add_dm_mmap_entry(ctx, dm, address);
-       if (err)
-               goto err_dealloc;
-
-       page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
-       err = uverbs_copy_to(attrs,
-                            MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
-                            &page_idx,
-                            sizeof(page_idx));
-       if (err)
-               goto err_copy;
-
-       start_offset = dm->dev_addr & ~PAGE_MASK;
-       err = uverbs_copy_to(attrs,
-                            MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
-                            &start_offset, sizeof(start_offset));
-       if (err)
-               goto err_copy;
-
-       return 0;
-
-err_copy:
-       rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
-err_dealloc:
-       mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
-
-       return err;
-}
-
-static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
-                                 struct mlx5_ib_dm *dm,
-                                 struct ib_dm_alloc_attr *attr,
-                                 struct uverbs_attr_bundle *attrs,
-                                 int type)
-{
-       struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
-       u64 act_size;
-       int err;
-
-       /* Allocation size must a multiple of the basic block size
-        * and a power of 2.
-        */
-       act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
-       act_size = roundup_pow_of_two(act_size);
-
-       dm->size = act_size;
-       err = mlx5_dm_sw_icm_alloc(dev, type, act_size, attr->alignment,
-                                  to_mucontext(ctx)->devx_uid, &dm->dev_addr,
-                                  &dm->icm_dm.obj_id);
-       if (err)
-               return err;
-
-       err = uverbs_copy_to(attrs,
-                            MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
-                            &dm->dev_addr, sizeof(dm->dev_addr));
-       if (err)
-               mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
-                                      to_mucontext(ctx)->devx_uid, dm->dev_addr,
-                                      dm->icm_dm.obj_id);
-
-       return err;
-}
-
-struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
-                              struct ib_ucontext *context,
-                              struct ib_dm_alloc_attr *attr,
-                              struct uverbs_attr_bundle *attrs)
-{
-       struct mlx5_ib_dm *dm;
-       enum mlx5_ib_uapi_dm_type type;
-       int err;
-
-       err = uverbs_get_const_default(&type, attrs,
-                                      MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
-                                      MLX5_IB_UAPI_DM_TYPE_MEMIC);
-       if (err)
-               return ERR_PTR(err);
-
-       mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
-                   type, attr->length, attr->alignment);
-
-       err = check_dm_type_support(to_mdev(ibdev), type);
-       if (err)
-               return ERR_PTR(err);
-
-       dm = kzalloc(sizeof(*dm), GFP_KERNEL);
-       if (!dm)
-               return ERR_PTR(-ENOMEM);
-
-       dm->type = type;
-
-       switch (type) {
-       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
-               err = handle_alloc_dm_memic(context, dm,
-                                           attr,
-                                           attrs);
-               break;
-       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
-               err = handle_alloc_dm_sw_icm(context, dm,
-                                            attr, attrs,
-                                            MLX5_SW_ICM_TYPE_STEERING);
-               break;
-       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
-               err = handle_alloc_dm_sw_icm(context, dm,
-                                            attr, attrs,
-                                            MLX5_SW_ICM_TYPE_HEADER_MODIFY);
-               break;
-       default:
-               err = -EOPNOTSUPP;
-       }
-
-       if (err)
-               goto err_free;
-
-       return &dm->ibdm;
-
-err_free:
-       kfree(dm);
-       return ERR_PTR(err);
-}
-
-int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
-{
-       struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
-               &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
-       struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
-       struct mlx5_ib_dm *dm = to_mdm(ibdm);
-       int ret;
-
-       switch (dm->type) {
-       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
-               rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
-               return 0;
-       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
-               ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
-                                            dm->size, ctx->devx_uid, dm->dev_addr,
-                                            dm->icm_dm.obj_id);
-               if (ret)
-                       return ret;
-               break;
-       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
-               ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
-                                            dm->size, ctx->devx_uid, dm->dev_addr,
-                                            dm->icm_dm.obj_id);
-               if (ret)
-                       return ret;
-               break;
-       default:
-               return -EOPNOTSUPP;
-       }
-
-       kfree(dm);
-
-       return 0;
-}
-
 static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct mlx5_ib_pd *pd = to_mpd(ibpd);
@@ -2779,7 +2565,7 @@ static void delay_drop_handler(struct work_struct *work)
 static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
                                 struct ib_event *ibev)
 {
-       u8 port = (eqe->data.port.port >> 4) & 0xf;
+       u32 port = (eqe->data.port.port >> 4) & 0xf;
 
        switch (eqe->sub_type) {
        case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
@@ -2795,7 +2581,7 @@ static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe
 static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
                              struct ib_event *ibev)
 {
-       u8 port = (eqe->data.port.port >> 4) & 0xf;
+       u32 port = (eqe->data.port.port >> 4) & 0xf;
 
        ibev->element.port_num = port;
 
@@ -3152,7 +2938,7 @@ static u32 get_core_cap_flags(struct ib_device *ibdev,
        return ret;
 }
 
-static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num,
                               struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
@@ -3180,7 +2966,7 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
-static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num,
+static int mlx5_port_rep_immutable(struct ib_device *ibdev, u32 port_num,
                                   struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
@@ -3252,7 +3038,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
        }
 }
 
-static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
+static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num)
 {
        int err;
 
@@ -3266,7 +3052,7 @@ static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
        return 0;
 }
 
-static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
+static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num)
 {
        if (dev->port[port_num].roce.nb.notifier_call) {
                unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
@@ -3300,7 +3086,7 @@ static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
        mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
-static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
+static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num,
                                 enum rdma_netdev_t type,
                                 struct rdma_netdev_alloc_params *params)
 {
@@ -3352,7 +3138,7 @@ static const struct file_operations fops_delay_drop_timeout = {
 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
                                      struct mlx5_ib_multiport_info *mpi)
 {
-       u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
+       u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
        struct mlx5_ib_port *port = &ibdev->port[port_num];
        int comps;
        int err;
@@ -3398,7 +3184,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 
        err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
 
-       mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
+       mlx5_ib_dbg(ibdev, "unaffiliated port %u\n", port_num + 1);
        /* Log an error, still needed to cleanup the pointers and add
         * it back to the list.
         */
@@ -3412,14 +3198,14 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
                                    struct mlx5_ib_multiport_info *mpi)
 {
-       u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
+       u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
        int err;
 
        lockdep_assert_held(&mlx5_ib_multiport_mutex);
 
        spin_lock(&ibdev->port[port_num].mp.mpi_lock);
        if (ibdev->port[port_num].mp.mpi) {
-               mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
+               mlx5_ib_dbg(ibdev, "port %u already affiliated.\n",
                            port_num + 1);
                spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
                return false;
@@ -3455,12 +3241,12 @@ unbind:
 
 static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
 {
-       int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+       u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
                                                          port_num + 1);
        struct mlx5_ib_multiport_info *mpi;
        int err;
-       int i;
+       u32 i;
 
        if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
                return 0;
@@ -3523,10 +3309,10 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
 
 static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
 {
-       int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+       u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
                                                          port_num + 1);
-       int i;
+       u32 i;
 
        if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
                return;
@@ -3539,7 +3325,8 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
                                kfree(dev->port[i].mp.mpi);
                                dev->port[i].mp.mpi = NULL;
                        } else {
-                               mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
+                               mlx5_ib_dbg(dev, "unbinding port_num: %u\n",
+                                           i + 1);
                                mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
                        }
                }
@@ -3815,20 +3602,6 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
                            &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
                            &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
 
-ADD_UVERBS_ATTRIBUTES_SIMPLE(
-       mlx5_ib_dm,
-       UVERBS_OBJECT_DM,
-       UVERBS_METHOD_DM_ALLOC,
-       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
-                           UVERBS_ATTR_TYPE(u64),
-                           UA_MANDATORY),
-       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
-                           UVERBS_ATTR_TYPE(u16),
-                           UA_OPTIONAL),
-       UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
-                            enum mlx5_ib_uapi_dm_type,
-                            UA_OPTIONAL));
-
 ADD_UVERBS_ATTRIBUTES_SIMPLE(
        mlx5_ib_flow_action,
        UVERBS_OBJECT_FLOW_ACTION,
@@ -3851,10 +3624,10 @@ static const struct uapi_definition mlx5_ib_defs[] = {
        UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
        UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
        UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
+       UAPI_DEF_CHAIN(mlx5_ib_dm_defs),
 
        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
                                &mlx5_ib_flow_action),
-       UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
                                UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
@@ -3891,8 +3664,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
                dev->port[i].roce.last_port_state = IB_PORT_DOWN;
        }
 
-       mlx5_ib_internal_fill_odp_caps(dev);
-
        err = mlx5_ib_init_multiport_master(dev);
        if (err)
                return err;
@@ -4032,12 +3803,6 @@ static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
        INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd),
 };
 
-static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
-       .alloc_dm = mlx5_ib_alloc_dm,
-       .dealloc_dm = mlx5_ib_dealloc_dm,
-       .reg_dm_mr = mlx5_ib_reg_dm_mr,
-};
-
 static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
 {
        struct mlx5_core_dev *mdev = dev->mdev;
@@ -4160,7 +3925,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
        struct mlx5_core_dev *mdev = dev->mdev;
        enum rdma_link_layer ll;
        int port_type_cap;
-       u8 port_num = 0;
+       u32 port_num = 0;
        int err;
 
        port_type_cap = MLX5_CAP_GEN(mdev, port_type);
@@ -4173,7 +3938,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 
                /* Register only for native ports */
                err = mlx5_add_netdev_notifier(dev, port_num);
-               if (err || dev->is_rep || !mlx5_is_roce_enabled(mdev))
+               if (err || dev->is_rep || !mlx5_is_roce_init_enabled(mdev))
                        /*
                         * We don't enable ETH interface for
                         * 1. IB representors
@@ -4197,7 +3962,7 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
        struct mlx5_core_dev *mdev = dev->mdev;
        enum rdma_link_layer ll;
        int port_type_cap;
-       u8 port_num;
+       u32 port_num;
 
        port_type_cap = MLX5_CAP_GEN(mdev, port_type);
        ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
@@ -4710,7 +4475,7 @@ static int mlx5r_probe(struct auxiliary_device *adev,
        dev->mdev = mdev;
        dev->num_ports = num_ports;
 
-       if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
+       if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_init_enabled(mdev))
                profile = &raw_eth_profile;
        else
                profile = &pf_profile;
index 88cc26e..e9a3f34 100644 (file)
@@ -166,6 +166,7 @@ enum mlx5_ib_mmap_type {
        MLX5_IB_MMAP_TYPE_VAR = 2,
        MLX5_IB_MMAP_TYPE_UAR_WC = 3,
        MLX5_IB_MMAP_TYPE_UAR_NC = 4,
+       MLX5_IB_MMAP_TYPE_MEMIC_OP = 5,
 };
 
 struct mlx5_bfreg_info {
@@ -406,7 +407,7 @@ struct mlx5_ib_qp_base {
 struct mlx5_ib_qp_trans {
        struct mlx5_ib_qp_base  base;
        u16                     xrcdn;
-       u8                      alt_port;
+       u32                     alt_port;
        u8                      atomic_rd_en;
        u8                      resp_depth;
 };
@@ -453,7 +454,7 @@ struct mlx5_ib_dct {
 
 struct mlx5_ib_gsi_qp {
        struct ib_qp *rx_qp;
-       u8 port_num;
+       u32 port_num;
        struct ib_qp_cap cap;
        struct ib_cq *cq;
        struct mlx5_ib_gsi_wr *outstanding_wrs;
@@ -490,7 +491,7 @@ struct mlx5_ib_qp {
        struct mutex            mutex;
        /* cached variant of create_flags from struct ib_qp_init_attr */
        u32                     flags;
-       u8                      port;
+       u32                     port;
        u8                      state;
        int                     max_inline_data;
        struct mlx5_bf          bf;
@@ -547,11 +548,6 @@ static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr)
        return container_of(wr, struct mlx5_umr_wr, wr);
 }
 
-struct mlx5_shared_mr_info {
-       int mr_id;
-       struct ib_umem          *umem;
-};
-
 enum mlx5_ib_cq_pr_flags {
        MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0,
 };
@@ -623,20 +619,6 @@ struct mlx5_user_mmap_entry {
        u32 page_idx;
 };
 
-struct mlx5_ib_dm {
-       struct ib_dm            ibdm;
-       phys_addr_t             dev_addr;
-       u32                     type;
-       size_t                  size;
-       union {
-               struct {
-                       u32     obj_id;
-               } icm_dm;
-               /* other dm types specific params should be added here */
-       };
-       struct mlx5_user_mmap_entry mentry;
-};
-
 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
 
 #define MLX5_IB_DM_MEMIC_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
@@ -654,47 +636,69 @@ struct mlx5_ib_dm {
        atomic64_add(value, &((mr)->odp_stats.counter_name))
 
 struct mlx5_ib_mr {
-       struct ib_mr            ibmr;
-       void                    *descs;
-       dma_addr_t              desc_map;
-       int                     ndescs;
-       int                     data_length;
-       int                     meta_ndescs;
-       int                     meta_length;
-       int                     max_descs;
-       int                     desc_size;
-       int                     access_mode;
-       unsigned int            page_shift;
-       struct mlx5_core_mkey   mmkey;
-       struct ib_umem         *umem;
-       struct mlx5_shared_mr_info      *smr_info;
-       struct list_head        list;
-       struct mlx5_cache_ent  *cache_ent;
-       u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
-       struct mlx5_core_sig_ctx    *sig;
-       void                    *descs_alloc;
-       int                     access_flags; /* Needed for rereg MR */
-
-       struct mlx5_ib_mr      *parent;
-       /* Needed for IB_MR_TYPE_INTEGRITY */
-       struct mlx5_ib_mr      *pi_mr;
-       struct mlx5_ib_mr      *klm_mr;
-       struct mlx5_ib_mr      *mtt_mr;
-       u64                     data_iova;
-       u64                     pi_iova;
-
-       /* For ODP and implicit */
-       struct xarray           implicit_children;
+       struct ib_mr ibmr;
+       struct mlx5_core_mkey mmkey;
+
+       /* User MR data */
+       struct mlx5_cache_ent *cache_ent;
+       struct ib_umem *umem;
+
+       /* This is zero'd when the MR is allocated */
        union {
-               struct list_head elm;
-               struct work_struct work;
-       } odp_destroy;
-       struct ib_odp_counters  odp_stats;
-       bool                    is_odp_implicit;
+               /* Used only while the MR is in the cache */
+               struct {
+                       u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
+                       struct mlx5_async_work cb_work;
+                       /* Cache list element */
+                       struct list_head list;
+               };
 
-       struct mlx5_async_work  cb_work;
+               /* Used only by kernel MRs (umem == NULL) */
+               struct {
+                       void *descs;
+                       void *descs_alloc;
+                       dma_addr_t desc_map;
+                       int max_descs;
+                       int ndescs;
+                       int desc_size;
+                       int access_mode;
+
+                       /* For Kernel IB_MR_TYPE_INTEGRITY */
+                       struct mlx5_core_sig_ctx *sig;
+                       struct mlx5_ib_mr *pi_mr;
+                       struct mlx5_ib_mr *klm_mr;
+                       struct mlx5_ib_mr *mtt_mr;
+                       u64 data_iova;
+                       u64 pi_iova;
+                       int meta_ndescs;
+                       int meta_length;
+                       int data_length;
+               };
+
+               /* Used only by User MRs (umem != NULL) */
+               struct {
+                       unsigned int page_shift;
+                       /* Current access_flags */
+                       int access_flags;
+
+                       /* For User ODP */
+                       struct mlx5_ib_mr *parent;
+                       struct xarray implicit_children;
+                       union {
+                               struct work_struct work;
+                       } odp_destroy;
+                       struct ib_odp_counters odp_stats;
+                       bool is_odp_implicit;
+               };
+       };
 };
 
+/* Zero the fields in the mr that are variant depending on usage */
+static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr)
+{
+       memset(mr->out, 0, sizeof(*mr) - offsetof(struct mlx5_ib_mr, out));
+}
+
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
 {
        return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
@@ -822,7 +826,7 @@ struct mlx5_roce {
        atomic_t                tx_port_affinity;
        enum ib_port_state last_port_state;
        struct mlx5_ib_dev      *dev;
-       u8                      native_port_num;
+       u32                     native_port_num;
 };
 
 struct mlx5_ib_port {
@@ -837,7 +841,7 @@ struct mlx5_ib_dbg_param {
        int                     offset;
        struct mlx5_ib_dev      *dev;
        struct dentry           *dentry;
-       u8                      port_num;
+       u32                     port_num;
 };
 
 enum mlx5_ib_dbg_cc_types {
@@ -1063,6 +1067,7 @@ struct mlx5_ib_dev {
        struct mutex                    slow_path_mutex;
        struct ib_odp_caps      odp_caps;
        u64                     odp_max_size;
+       struct mutex            odp_eq_mutex;
        struct mlx5_ib_pf_eq    odp_pf_eq;
 
        struct xarray           odp_mkeys;
@@ -1170,11 +1175,6 @@ static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
        return container_of(msrq, struct mlx5_ib_srq, msrq);
 }
 
-static inline struct mlx5_ib_dm *to_mdm(struct ib_dm *ibdm)
-{
-       return container_of(ibdm, struct mlx5_ib_dm, ibdm);
-}
-
 static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr)
 {
        return container_of(ibmr, struct mlx5_ib_mr, ibmr);
@@ -1268,8 +1268,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
                                             struct ib_udata *udata,
                                             int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
-void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
-void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr);
 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
                                    u64 length, u64 virt_addr, int access_flags,
                                    struct ib_pd *pd, struct ib_udata *udata);
@@ -1285,7 +1284,7 @@ int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
                         int data_sg_nents, unsigned int *data_sg_offset,
                         struct scatterlist *meta_sg, int meta_sg_nents,
                         unsigned int *meta_sg_offset);
-int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                        const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                        const struct ib_mad *in, struct ib_mad *out,
                        size_t *out_mad_size, u16 *out_mad_pkey_index);
@@ -1300,13 +1299,13 @@ int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev,
                                 u32 *vendor_id);
 int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc);
 int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid);
-int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u32 port, u16 index,
                            u16 *pkey);
-int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index,
+int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u32 port, int index,
                            union ib_gid *gid);
-int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
+int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port,
                            struct ib_port_attr *props);
-int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+int mlx5_ib_query_port(struct ib_device *ibdev, u32 port,
                       struct ib_port_attr *props);
 void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
                          u64 access_flags);
@@ -1317,8 +1316,6 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
                                       unsigned int entry, int access_flags);
-void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
-int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
                            struct ib_mr_status *mr_status);
@@ -1332,18 +1329,13 @@ int mlx5_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table,
                                 struct ib_rwq_ind_table_init_attr *init_attr,
                                 struct ib_udata *udata);
 int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
-struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
-                              struct ib_ucontext *context,
-                              struct ib_dm_alloc_attr *attr,
-                              struct uverbs_attr_bundle *attrs);
-int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs);
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
                                struct ib_dm_mr_attr *attr,
                                struct uverbs_attr_bundle *attrs);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
@@ -1357,12 +1349,12 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
-static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
+static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
+                                     struct mlx5_ib_pf_eq *eq)
 {
-       return;
+       return 0;
 }
-
-static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)                               {}
@@ -1397,22 +1389,22 @@ int __mlx5_ib_add(struct mlx5_ib_dev *dev,
                  const struct mlx5_ib_profile *profile);
 
 int mlx5_ib_get_vf_config(struct ib_device *device, int vf,
-                         u8 port, struct ifla_vf_info *info);
+                         u32 port, struct ifla_vf_info *info);
 int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf,
-                             u8 port, int state);
+                             u32 port, int state);
 int mlx5_ib_get_vf_stats(struct ib_device *device, int vf,
-                        u8 port, struct ifla_vf_stats *stats);
-int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+                        u32 port, struct ifla_vf_stats *stats);
+int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
                        struct ifla_vf_guid *node_guid,
                        struct ifla_vf_guid *port_guid);
-int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
+int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u32 port,
                        u64 guid, int type);
 
 __be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
                                   const struct ib_gid_attr *attr);
 
-void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
-void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
+void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num);
+void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num);
 
 /* GSI QP helper functions */
 int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp,
@@ -1435,10 +1427,10 @@ void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi,
                        int bfregn);
 struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi);
 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev,
-                                                  u8 ib_port_num,
-                                                  u8 *native_port_num);
+                                                  u32 ib_port_num,
+                                                  u32 *native_port_num);
 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
-                                 u8 port_num);
+                                 u32 port_num);
 
 extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
index db05b0e..4388afe 100644 (file)
@@ -42,6 +42,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
+#include "dm.h"
 #include "mlx5_ib.h"
 
 /*
@@ -119,8 +120,6 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
                                create_mkey_callback, context);
 }
 
-static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
-static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
@@ -590,6 +589,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
                ent->available_mrs--;
                queue_adjust_cache_locked(ent);
                spin_unlock_irq(&ent->lock);
+
+               mlx5_clear_mr(mr);
        }
        mr->access_flags = access_flags;
        return mr;
@@ -615,42 +616,20 @@ static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
                        ent->available_mrs--;
                        queue_adjust_cache_locked(ent);
                        spin_unlock_irq(&ent->lock);
-                       break;
+                       mlx5_clear_mr(mr);
+                       return mr;
                }
                queue_adjust_cache_locked(ent);
                spin_unlock_irq(&ent->lock);
        }
-
-       if (!mr)
-               req_ent->miss++;
-
-       return mr;
-}
-
-static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
-{
-       struct mlx5_cache_ent *ent = mr->cache_ent;
-
-       mr->cache_ent = NULL;
-       spin_lock_irq(&ent->lock);
-       ent->total_mrs--;
-       spin_unlock_irq(&ent->lock);
+       req_ent->miss++;
+       return NULL;
 }
 
-void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
        struct mlx5_cache_ent *ent = mr->cache_ent;
 
-       if (!ent)
-               return;
-
-       if (mlx5_mr_cache_invalidate(mr)) {
-               detach_mr_from_cache(mr);
-               destroy_mkey(dev, mr);
-               kfree(mr);
-               return;
-       }
-
        spin_lock_irq(&ent->lock);
        list_add_tail(&mr->list, &ent->head);
        ent->available_mrs++;
@@ -993,8 +972,6 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 
        mr->ibmr.pd = pd;
        mr->umem = umem;
-       mr->access_flags = access_flags;
-       mr->desc_size = sizeof(struct mlx5_mtt);
        mr->mmkey.iova = iova;
        mr->mmkey.size = umem->length;
        mr->mmkey.pd = to_mpd(pd)->pdn;
@@ -1028,7 +1005,7 @@ static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
         */
        might_sleep();
 
-       gfp_mask |= __GFP_ZERO;
+       gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
 
        /*
         * If the system already has a suitable high order page then just use
@@ -1505,7 +1482,7 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
                 */
                err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
                if (err) {
-                       dereg_mr(dev, mr);
+                       mlx5_ib_dereg_mr(&mr->ibmr, NULL);
                        return ERR_PTR(err);
                }
        }
@@ -1524,6 +1501,9 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
        if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
                return ERR_PTR(-EOPNOTSUPP);
 
+       err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
+       if (err)
+               return ERR_PTR(err);
        if (!start && length == U64_MAX) {
                if (iova != 0)
                        return ERR_PTR(-EINVAL);
@@ -1562,7 +1542,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
        return &mr->ibmr;
 
 err_dereg_mr:
-       dereg_mr(dev, mr);
+       mlx5_ib_dereg_mr(&mr->ibmr, NULL);
        return ERR_PTR(err);
 }
 
@@ -1659,19 +1639,19 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
        return &mr->ibmr;
 
 err_dereg_mr:
-       dereg_mr(dev, mr);
+       mlx5_ib_dereg_mr(&mr->ibmr, NULL);
        return ERR_PTR(err);
 }
 
 /**
- * mlx5_mr_cache_invalidate - Fence all DMA on the MR
+ * revoke_mr - Fence all DMA on the MR
  * @mr: The MR to fence
  *
  * Upon return the NIC will not be doing any DMA to the pages under the MR,
- * and any DMA inprogress will be completed. Failure of this function
+ * and any DMA in progress will be completed. Failure of this function
  * indicates the HW has failed catastrophically.
  */
-int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
+static int revoke_mr(struct mlx5_ib_mr *mr)
 {
        struct mlx5_umr_wr umrwr = {};
 
@@ -1765,7 +1745,7 @@ static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
         * with it. This ensure the change is atomic relative to any use of the
         * MR.
         */
-       err = mlx5_mr_cache_invalidate(mr);
+       err = revoke_mr(mr);
        if (err)
                return err;
 
@@ -1844,7 +1824,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
                 * Only one active MR can refer to a umem at one time, revoke
                 * the old MR before assigning the umem to the new one.
                 */
-               err = mlx5_mr_cache_invalidate(mr);
+               err = revoke_mr(mr);
                if (err)
                        return ERR_PTR(err);
                umem = mr->umem;
@@ -1931,7 +1911,7 @@ err:
 static void
 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
 {
-       if (mr->descs) {
+       if (!mr->umem && mr->descs) {
                struct ib_device *device = mr->ibmr.device;
                int size = mr->max_descs * mr->desc_size;
                struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1943,69 +1923,82 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
        }
 }
 
-static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
-       if (mr->sig) {
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+       int rc;
+
+       /*
+        * Any async use of the mr must hold the refcount, once the refcount
+        * goes to zero no other thread, such as ODP page faults, prefetch, any
+        * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
+        */
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
+           refcount_read(&mr->mmkey.usecount) != 0 &&
+           xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
+               mlx5r_deref_wait_odp_mkey(&mr->mmkey);
+
+       if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
+               xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), ibmr,
+                          NULL, GFP_KERNEL);
+
+               if (mr->mtt_mr) {
+                       rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
+                       if (rc)
+                               return rc;
+                       mr->mtt_mr = NULL;
+               }
+               if (mr->klm_mr) {
+                       rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
+                       if (rc)
+                               return rc;
+                       mr->klm_mr = NULL;
+               }
+
                if (mlx5_core_destroy_psv(dev->mdev,
                                          mr->sig->psv_memory.psv_idx))
                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
                                     mr->sig->psv_memory.psv_idx);
-               if (mlx5_core_destroy_psv(dev->mdev,
-                                         mr->sig->psv_wire.psv_idx))
+               if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
                                     mr->sig->psv_wire.psv_idx);
-               xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
                kfree(mr->sig);
                mr->sig = NULL;
        }
 
+       /* Stop DMA */
+       if (mr->cache_ent) {
+               if (revoke_mr(mr)) {
+                       spin_lock_irq(&mr->cache_ent->lock);
+                       mr->cache_ent->total_mrs--;
+                       spin_unlock_irq(&mr->cache_ent->lock);
+                       mr->cache_ent = NULL;
+               }
+       }
        if (!mr->cache_ent) {
-               destroy_mkey(dev, mr);
-               mlx5_free_priv_descs(mr);
+               rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
+               if (rc)
+                       return rc;
        }
-}
 
-static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
-{
-       struct ib_umem *umem = mr->umem;
+       if (mr->umem) {
+               bool is_odp = is_odp_mr(mr);
 
-       /* Stop all DMA */
-       if (is_odp_mr(mr))
-               mlx5_ib_fence_odp_mr(mr);
-       else if (is_dmabuf_mr(mr))
-               mlx5_ib_fence_dmabuf_mr(mr);
-       else
-               clean_mr(dev, mr);
-
-       if (umem) {
-               if (!is_odp_mr(mr))
-                       atomic_sub(ib_umem_num_pages(umem),
+               if (!is_odp)
+                       atomic_sub(ib_umem_num_pages(mr->umem),
                                   &dev->mdev->priv.reg_pages);
-               ib_umem_release(umem);
+               ib_umem_release(mr->umem);
+               if (is_odp)
+                       mlx5_ib_free_odp_mr(mr);
        }
 
-       if (mr->cache_ent)
+       if (mr->cache_ent) {
                mlx5_mr_cache_free(dev, mr);
-       else
+       } else {
+               mlx5_free_priv_descs(mr);
                kfree(mr);
-}
-
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
-{
-       struct mlx5_ib_mr *mmr = to_mmr(ibmr);
-
-       if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
-               dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
-               dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
        }
-
-       if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
-               mlx5_ib_free_implicit_mr(mmr);
-               return 0;
-       }
-
-       dereg_mr(to_mdev(ibmr->device), mmr);
-
        return 0;
 }
 
@@ -2177,10 +2170,10 @@ err_free_descs:
        destroy_mkey(dev, mr);
        mlx5_free_priv_descs(mr);
 err_free_mtt_mr:
-       dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
+       mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
        mr->mtt_mr = NULL;
 err_free_klm_mr:
-       dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
+       mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
        mr->klm_mr = NULL;
 err_destroy_psv:
        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
index b103555..782b2af 100644 (file)
@@ -181,64 +181,29 @@ void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
        }
 }
 
-static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
-{
-       struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-
-       /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
-       mutex_lock(&odp->umem_mutex);
-       if (odp->npages) {
-               mlx5_mr_cache_invalidate(mr);
-               ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
-                                           ib_umem_end(odp));
-               WARN_ON(odp->npages);
-       }
-       odp->private = NULL;
-       mutex_unlock(&odp->umem_mutex);
-
-       if (!mr->cache_ent) {
-               mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
-               WARN_ON(mr->descs);
-       }
-}
-
 /*
  * This must be called after the mr has been removed from implicit_children.
  * NOTE: The MR does not necessarily have to be
  * empty here, parallel page faults could have raced with the free process and
  * added pages to it.
  */
-static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
+static void free_implicit_child_mr_work(struct work_struct *work)
 {
+       struct mlx5_ib_mr *mr =
+               container_of(work, struct mlx5_ib_mr, odp_destroy.work);
        struct mlx5_ib_mr *imr = mr->parent;
        struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-       unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 
        mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 
-       if (need_imr_xlt) {
-               mutex_lock(&odp_imr->umem_mutex);
-               mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
-                                  MLX5_IB_UPD_XLT_INDIRECT |
-                                  MLX5_IB_UPD_XLT_ATOMIC);
-               mutex_unlock(&odp_imr->umem_mutex);
-       }
-
-       dma_fence_odp_mr(mr);
-
-       mr->parent = NULL;
-       mlx5_mr_cache_free(mr_to_mdev(mr), mr);
-       ib_umem_odp_release(odp);
-}
-
-static void free_implicit_child_mr_work(struct work_struct *work)
-{
-       struct mlx5_ib_mr *mr =
-               container_of(work, struct mlx5_ib_mr, odp_destroy.work);
-       struct mlx5_ib_mr *imr = mr->parent;
+       mutex_lock(&odp_imr->umem_mutex);
+       mlx5_ib_update_xlt(mr->parent, ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT,
+                          1, 0,
+                          MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
+       mutex_unlock(&odp_imr->umem_mutex);
+       mlx5_ib_dereg_mr(&mr->ibmr, NULL);
 
-       free_implicit_child_mr(mr, true);
        mlx5r_deref_odp_mkey(&imr->mmkey);
 }
 
@@ -352,7 +317,7 @@ const struct mmu_interval_notifier_ops mlx5_mn_ops = {
        .invalidate = mlx5_ib_invalidate_range,
 };
 
-void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
+static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
        struct ib_odp_caps *caps = &dev->odp_caps;
 
@@ -455,8 +420,10 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 
        ret = mr = mlx5_mr_cache_alloc(
                mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
-       if (IS_ERR(mr))
-               goto out_umem;
+       if (IS_ERR(mr)) {
+               ib_umem_odp_release(odp);
+               return mr;
+       }
 
        mr->ibmr.pd = imr->ibmr.pd;
        mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
@@ -506,9 +473,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 out_lock:
        xa_unlock(&imr->implicit_children);
 out_mr:
-       mlx5_mr_cache_free(mr_to_mdev(imr), mr);
-out_umem:
-       ib_umem_odp_release(odp);
+       mlx5_ib_dereg_mr(&mr->ibmr, NULL);
        return ret;
 }
 
@@ -531,8 +496,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 
        imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
        if (IS_ERR(imr)) {
-               err = PTR_ERR(imr);
-               goto out_umem;
+               ib_umem_odp_release(umem_odp);
+               return imr;
        }
 
        imr->ibmr.pd = &pd->ibpd;
@@ -562,93 +527,22 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
        return imr;
 out_mr:
        mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
-       mlx5_mr_cache_free(dev, imr);
-out_umem:
-       ib_umem_odp_release(umem_odp);
+       mlx5_ib_dereg_mr(&imr->ibmr, NULL);
        return ERR_PTR(err);
 }
 
-void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
+void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
 {
-       struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
-       struct mlx5_ib_dev *dev = mr_to_mdev(imr);
        struct mlx5_ib_mr *mtt;
        unsigned long idx;
 
-       xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
-       /*
-        * All work on the prefetch list must be completed, xa_erase() prevented
-        * new work from being created.
-        */
-       mlx5r_deref_wait_odp_mkey(&imr->mmkey);
-       /*
-        * At this point it is forbidden for any other thread to enter
-        * pagefault_mr() on this imr. It is already forbidden to call
-        * pagefault_mr() on an implicit child. Due to this additions to
-        * implicit_children are prevented.
-        * In addition, any new call to destroy_unused_implicit_child_mr()
-        * may return immediately.
-        */
-
        /*
-        * Fence the imr before we destroy the children. This allows us to
-        * skip updating the XLT of the imr during destroy of the child mkey
-        * the imr points to.
+        * If this is an implicit MR it is already invalidated so we can just
+        * delete the children mkeys.
         */
-       mlx5_mr_cache_invalidate(imr);
-
-       xa_for_each(&imr->implicit_children, idx, mtt) {
-               xa_erase(&imr->implicit_children, idx);
-               free_implicit_child_mr(mtt, false);
-       }
-
-       mlx5_mr_cache_free(dev, imr);
-       ib_umem_odp_release(odp_imr);
-}
-
-/**
- * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
- * @mr: to fence
- *
- * On return no parallel threads will be touching this MR and no DMA will be
- * active.
- */
-void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
-{
-       /* Prevent new page faults and prefetch requests from succeeding */
-       xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-
-       /* Wait for all running page-fault handlers to finish. */
-       mlx5r_deref_wait_odp_mkey(&mr->mmkey);
-
-       dma_fence_odp_mr(mr);
-}
-
-/**
- * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
- * @mr: to fence
- *
- * On return no parallel threads will be touching this MR and no DMA will be
- * active.
- */
-void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
-{
-       struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
-
-       /* Prevent new page faults and prefetch requests from succeeding */
-       xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
-
-       mlx5r_deref_wait_odp_mkey(&mr->mmkey);
-
-       dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
-       mlx5_mr_cache_invalidate(mr);
-       umem_dmabuf->private = NULL;
-       ib_umem_dmabuf_unmap_pages(umem_dmabuf);
-       dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
-
-       if (!mr->cache_ent) {
-               mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
-               WARN_ON(mr->descs);
+       xa_for_each(&mr->implicit_children, idx, mtt) {
+               xa_erase(&mr->implicit_children, idx);
+               mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
        }
 }
 
@@ -1637,20 +1531,24 @@ enum {
        MLX5_IB_NUM_PF_DRAIN    = 64,
 };
 
-static int
-mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 {
        struct mlx5_eq_param param = {};
-       int err;
+       int err = 0;
 
+       mutex_lock(&dev->odp_eq_mutex);
+       if (eq->core)
+               goto unlock;
        INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
        spin_lock_init(&eq->lock);
        eq->dev = dev;
 
        eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
                                               sizeof(struct mlx5_pagefault));
-       if (!eq->pool)
-               return -ENOMEM;
+       if (!eq->pool) {
+               err = -ENOMEM;
+               goto unlock;
+       }
 
        eq->wq = alloc_workqueue("mlx5_ib_page_fault",
                                 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
@@ -1661,7 +1559,7 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
        }
 
        eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
-       param = (struct mlx5_eq_param) {
+       param = (struct mlx5_eq_param){
                .irq_index = 0,
                .nent = MLX5_IB_NUM_PF_EQE,
        };
@@ -1677,21 +1575,27 @@ mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
                goto err_eq;
        }
 
+       mutex_unlock(&dev->odp_eq_mutex);
        return 0;
 err_eq:
        mlx5_eq_destroy_generic(dev->mdev, eq->core);
 err_wq:
+       eq->core = NULL;
        destroy_workqueue(eq->wq);
 err_mempool:
        mempool_destroy(eq->pool);
+unlock:
+       mutex_unlock(&dev->odp_eq_mutex);
        return err;
 }
 
 static int
-mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 {
        int err;
 
+       if (!eq->core)
+               return 0;
        mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
        err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
        cancel_work_sync(&eq->work);
@@ -1735,6 +1639,8 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
        int ret = 0;
 
+       internal_fill_odp_caps(dev);
+
        if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
                return ret;
 
@@ -1748,8 +1654,7 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
                }
        }
 
-       ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
-
+       mutex_init(&dev->odp_eq_mutex);
        return ret;
 }
 
@@ -1758,7 +1663,7 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
        if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
                return;
 
-       mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
+       mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq);
 }
 
 int mlx5_ib_odp_init(void)
index f5a52a6..9282eb1 100644 (file)
@@ -67,7 +67,7 @@ struct mlx5_modify_raw_qp_param {
        struct mlx5_rate_limit rl;
 
        u8 rq_q_ctr_id;
-       u16 port;
+       u32 port;
 };
 
 static void get_cqs(enum ib_qp_type qp_type,
@@ -3146,6 +3146,19 @@ enum {
        MLX5_PATH_FLAG_COUNTER  = 1 << 2,
 };
 
+static int mlx5_to_ib_rate_map(u8 rate)
+{
+       static const int rates[] = { IB_RATE_PORT_CURRENT, IB_RATE_56_GBPS,
+                                    IB_RATE_25_GBPS,      IB_RATE_100_GBPS,
+                                    IB_RATE_200_GBPS,     IB_RATE_50_GBPS,
+                                    IB_RATE_400_GBPS };
+
+       if (rate < ARRAY_SIZE(rates))
+               return rates[rate];
+
+       return rate - MLX5_STAT_RATE_OFFSET;
+}
+
 static int ib_to_mlx5_rate_map(u8 rate)
 {
        switch (rate) {
@@ -4485,7 +4498,7 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev,
        rdma_ah_set_path_bits(ah_attr, MLX5_GET(ads, path, mlid));
 
        static_rate = MLX5_GET(ads, path, stat_rate);
-       rdma_ah_set_static_rate(ah_attr, static_rate ? static_rate - 5 : 0);
+       rdma_ah_set_static_rate(ah_attr, mlx5_to_ib_rate_map(static_rate));
        if (MLX5_GET(ads, path, grh) ||
            ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
                rdma_ah_set_grh(ah_attr, NULL, MLX5_GET(ads, path, flow_label),
index 16145fd..c0ddf7b 100644 (file)
@@ -7,6 +7,8 @@
 #include <rdma/mlx5_user_ioctl_cmds.h>
 #include <rdma/mlx5_user_ioctl_verbs.h>
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/eswitch.h>
+#include <linux/mlx5/vport.h>
 #include "mlx5_ib.h"
 
 #define UVERBS_MODULE_NAME mlx5_ib
@@ -23,6 +25,174 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_PD_QUERY)(
                              &mpd->pdn, sizeof(mpd->pdn));
 }
 
+static int fill_vport_icm_addr(struct mlx5_core_dev *mdev, u16 vport,
+                              struct mlx5_ib_uapi_query_port *info)
+{
+       u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {};
+       u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {};
+       bool sw_owner_supp;
+       u64 icm_rx;
+       u64 icm_tx;
+       int err;
+
+       sw_owner_supp = MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, sw_owner) ||
+                       MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, sw_owner_v2);
+
+       if (vport == MLX5_VPORT_UPLINK) {
+               icm_rx = MLX5_CAP64_ESW_FLOWTABLE(mdev,
+                       sw_steering_uplink_icm_address_rx);
+               icm_tx = MLX5_CAP64_ESW_FLOWTABLE(mdev,
+                       sw_steering_uplink_icm_address_tx);
+       } else {
+               MLX5_SET(query_esw_vport_context_in, in, opcode,
+                        MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT);
+               MLX5_SET(query_esw_vport_context_in, in, vport_number, vport);
+               MLX5_SET(query_esw_vport_context_in, in, other_vport, true);
+
+               err = mlx5_cmd_exec_inout(mdev, query_esw_vport_context, in,
+                                         out);
+
+               if (err)
+                       return err;
+
+               icm_rx = MLX5_GET64(
+                       query_esw_vport_context_out, out,
+                       esw_vport_context.sw_steering_vport_icm_address_rx);
+
+               icm_tx = MLX5_GET64(
+                       query_esw_vport_context_out, out,
+                       esw_vport_context.sw_steering_vport_icm_address_tx);
+       }
+
+       if (sw_owner_supp && icm_rx) {
+               info->vport_steering_icm_rx = icm_rx;
+               info->flags |=
+                       MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_RX;
+       }
+
+       if (sw_owner_supp && icm_tx) {
+               info->vport_steering_icm_tx = icm_tx;
+               info->flags |=
+                       MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_TX;
+       }
+
+       return 0;
+}
+
+static int fill_vport_vhca_id(struct mlx5_core_dev *mdev, u16 vport,
+                             struct mlx5_ib_uapi_query_port *info)
+{
+       size_t out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+       u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
+       void *out;
+       int err;
+
+       out = kzalloc(out_sz, GFP_KERNEL);
+       if (!out)
+               return -ENOMEM;
+
+       MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+       MLX5_SET(query_hca_cap_in, in, other_function, true);
+       MLX5_SET(query_hca_cap_in, in, function_id, vport);
+       MLX5_SET(query_hca_cap_in, in, op_mod,
+                MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE |
+                HCA_CAP_OPMOD_GET_CUR);
+
+       err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_sz);
+       if (err)
+               goto out;
+
+       info->vport_vhca_id = MLX5_GET(query_hca_cap_out, out,
+                                      capability.cmd_hca_cap.vhca_id);
+
+       info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID;
+out:
+       kfree(out);
+       return err;
+}
+
+static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num,
+                              struct mlx5_ib_uapi_query_port *info)
+{
+       struct mlx5_core_dev *mdev = dev->mdev;
+       struct mlx5_eswitch_rep *rep;
+       int err;
+
+       rep = dev->port[port_num - 1].rep;
+       if (!rep)
+               return -EOPNOTSUPP;
+
+       info->vport = rep->vport;
+       info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT;
+
+       if (rep->vport != MLX5_VPORT_UPLINK) {
+               err = fill_vport_vhca_id(mdev, rep->vport, info);
+               if (err)
+                       return err;
+       }
+
+       info->esw_owner_vhca_id = MLX5_CAP_GEN(mdev, vhca_id);
+       info->flags |= MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID;
+
+       err = fill_vport_icm_addr(mdev, rep->vport, info);
+       if (err)
+               return err;
+
+       if (mlx5_eswitch_vport_match_metadata_enabled(mdev->priv.eswitch)) {
+               info->reg_c0.value = mlx5_eswitch_get_vport_metadata_for_match(
+                       mdev->priv.eswitch, rep->vport);
+               info->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask();
+               info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_REG_C0;
+       }
+
+       return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)(
+       struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_ib_uapi_query_port info = {};
+       struct mlx5_ib_ucontext *c;
+       struct mlx5_ib_dev *dev;
+       u32 port_num;
+       int ret;
+
+       if (uverbs_copy_from(&port_num, attrs,
+                            MLX5_IB_ATTR_QUERY_PORT_PORT_NUM))
+               return -EFAULT;
+
+       c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+       if (IS_ERR(c))
+               return PTR_ERR(c);
+       dev = to_mdev(c->ibucontext.device);
+
+       if (!rdma_is_port_valid(&dev->ib_dev, port_num))
+               return -EINVAL;
+
+       if (mlx5_eswitch_mode(dev->mdev) == MLX5_ESWITCH_OFFLOADS) {
+               ret = fill_switchdev_info(dev, port_num, &info);
+               if (ret)
+                       return ret;
+       }
+
+       return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info,
+                                            sizeof(info));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+       MLX5_IB_METHOD_QUERY_PORT,
+       UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM,
+                          UVERBS_ATTR_TYPE(u32), UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(
+               MLX5_IB_ATTR_QUERY_PORT,
+               UVERBS_ATTR_STRUCT(struct mlx5_ib_uapi_query_port,
+                                  reg_c0),
+               UA_MANDATORY));
+
+ADD_UVERBS_METHODS(mlx5_ib_device,
+                  UVERBS_OBJECT_DEVICE,
+                  &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT));
+
 DECLARE_UVERBS_NAMED_METHOD(
        MLX5_IB_METHOD_PD_QUERY,
        UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_PD_HANDLE,
@@ -41,5 +211,8 @@ const struct uapi_definition mlx5_ib_std_types_defs[] = {
        UAPI_DEF_CHAIN_OBJ_TREE(
                UVERBS_OBJECT_PD,
                &mlx5_ib_pd),
+       UAPI_DEF_CHAIN_OBJ_TREE(
+               UVERBS_OBJECT_DEVICE,
+               &mlx5_ib_device),
        {},
 };
index f051f4e..3df1f5f 100644 (file)
@@ -91,7 +91,7 @@ static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate)
        }
 }
 
-enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port)
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u32 port)
 {
        if (mthca_is_memfree(dev)) {
                /* Handle old Arbel FW */
@@ -131,7 +131,7 @@ static u8 ib_rate_to_tavor(u8 static_rate)
        }
 }
 
-u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port)
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u32 port)
 {
        u8 rate;
 
@@ -293,7 +293,7 @@ int mthca_ah_query(struct ib_ah *ibah, struct rdma_ah_attr *attr)
 {
        struct mthca_ah *ah   = to_mah(ibah);
        struct mthca_dev *dev = to_mdev(ibah->device);
-       u8 port_num = be32_to_cpu(ah->av->port_pd) >> 24;
+       u32 port_num = be32_to_cpu(ah->av->port_pd) >> 24;
 
        /* Only implement for MAD and memfree ah for now. */
        if (ah->type == MTHCA_AH_ON_HCA)
index a445160..a4a9d87 100644 (file)
@@ -546,7 +546,7 @@ int mthca_alloc_sqp(struct mthca_dev *dev,
                    enum ib_sig_type send_policy,
                    struct ib_qp_cap *cap,
                    int qpn,
-                   int port,
+                   u32 port,
                    struct mthca_qp *qp,
                    struct ib_udata *udata);
 void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
@@ -559,13 +559,13 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
                  struct ib_ud_header *header);
 int mthca_ah_query(struct ib_ah *ibah, struct rdma_ah_attr *attr);
 int mthca_ah_grh_present(struct mthca_ah *ah);
-u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port);
-enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port);
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u32 port);
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u32 port);
 
 int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 
-int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                      const struct ib_mad *in, struct ib_mad *out,
                      size_t *out_mad_size, u16 *out_mad_pkey_index);
index 99aa818..0425270 100644 (file)
@@ -162,7 +162,7 @@ static void node_desc_override(struct ib_device *dev,
 }
 
 static void forward_trap(struct mthca_dev *dev,
-                        u8 port_num,
+                        u32 port_num,
                         const struct ib_mad *mad)
 {
        int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
@@ -196,7 +196,7 @@ static void forward_trap(struct mthca_dev *dev,
        }
 }
 
-int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                      const struct ib_mad *in, struct ib_mad *out,
                      size_t *out_mad_size, u16 *out_mad_pkey_index)
index 1a3dd07..522bb60 100644 (file)
@@ -127,7 +127,7 @@ static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *pr
 }
 
 static int mthca_query_port(struct ib_device *ibdev,
-                           u8 port, struct ib_port_attr *props)
+                           u32 port, struct ib_port_attr *props)
 {
        struct ib_smp *in_mad  = NULL;
        struct ib_smp *out_mad = NULL;
@@ -194,7 +194,7 @@ static int mthca_modify_device(struct ib_device *ibdev,
 }
 
 static int mthca_modify_port(struct ib_device *ibdev,
-                            u8 port, int port_modify_mask,
+                            u32 port, int port_modify_mask,
                             struct ib_port_modify *props)
 {
        struct mthca_set_ib_param set_ib;
@@ -223,7 +223,7 @@ out:
 }
 
 static int mthca_query_pkey(struct ib_device *ibdev,
-                           u8 port, u16 index, u16 *pkey)
+                           u32 port, u16 index, u16 *pkey)
 {
        struct ib_smp *in_mad  = NULL;
        struct ib_smp *out_mad = NULL;
@@ -251,7 +251,7 @@ static int mthca_query_pkey(struct ib_device *ibdev,
        return err;
 }
 
-static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+static int mthca_query_gid(struct ib_device *ibdev, u32 port,
                           int index, union ib_gid *gid)
 {
        struct ib_smp *in_mad  = NULL;
@@ -1051,7 +1051,7 @@ out:
        return err;
 }
 
-static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int mthca_port_immutable(struct ib_device *ibdev, u32 port_num,
                                struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
index 07cfc09..69bba0e 100644 (file)
@@ -1370,7 +1370,7 @@ int mthca_alloc_sqp(struct mthca_dev *dev,
                    enum ib_sig_type send_policy,
                    struct ib_qp_cap *cap,
                    int qpn,
-                   int port,
+                   u32 port,
                    struct mthca_qp *qp,
                    struct ib_udata *udata)
 {
index 699a8b7..88c4592 100644 (file)
@@ -250,7 +250,7 @@ int ocrdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
 }
 
 int ocrdma_process_mad(struct ib_device *ibdev, int process_mad_flags,
-                      u8 port_num, const struct ib_wc *in_wc,
+                      u32 port_num, const struct ib_wc *in_wc,
                       const struct ib_grh *in_grh, const struct ib_mad *in,
                       struct ib_mad *out, size_t *out_mad_size,
                       u16 *out_mad_pkey_index)
index 35cf2e2..2626679 100644 (file)
@@ -57,7 +57,7 @@ int ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 
 int ocrdma_process_mad(struct ib_device *dev, int process_mad_flags,
-                      u8 port_num, const struct ib_wc *in_wc,
+                      u32 port_num, const struct ib_wc *in_wc,
                       const struct ib_grh *in_grh, const struct ib_mad *in,
                       struct ib_mad *out, size_t *out_mad_size,
                       u16 *out_mad_pkey_index);
index 9a834a9..4882b31 100644 (file)
@@ -77,12 +77,12 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
        guid[7] = mac_addr[5];
 }
 static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
-                                             u8 port_num)
+                                             u32 port_num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
 
-static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int ocrdma_port_immutable(struct ib_device *ibdev, u32 port_num,
                                 struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
index 3acb5c1..58619ce 100644 (file)
@@ -54,7 +54,7 @@
 #include "ocrdma_verbs.h"
 #include <rdma/ocrdma-abi.h>
 
-int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+int ocrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey)
 {
        if (index > 0)
                return -EINVAL;
@@ -150,7 +150,7 @@ static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
 }
 
 int ocrdma_query_port(struct ib_device *ibdev,
-                     u8 port, struct ib_port_attr *props)
+                     u32 port, struct ib_port_attr *props)
 {
        enum ib_port_state port_state;
        struct ocrdma_dev *dev;
index 425d554..b1c5fad 100644 (file)
@@ -53,13 +53,14 @@ int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags);
 
 int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props,
                        struct ib_udata *uhw);
-int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
+int ocrdma_query_port(struct ib_device *ibdev, u32 port,
+                     struct ib_port_attr *props);
 
 enum rdma_protocol_type
-ocrdma_query_protocol(struct ib_device *device, u8 port_num);
+ocrdma_query_protocol(struct ib_device *device, u32 port_num);
 
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
-int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
+int ocrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey);
 
 int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void ocrdma_dealloc_ucontext(struct ib_ucontext *uctx);
index 8e7c069..8334a98 100644 (file)
@@ -53,7 +53,7 @@ MODULE_LICENSE("Dual BSD/GPL");
 
 #define QEDR_WQ_MULTIPLIER_DFT (3)
 
-static void qedr_ib_dispatch_event(struct qedr_dev *dev, u8 port_num,
+static void qedr_ib_dispatch_event(struct qedr_dev *dev, u32 port_num,
                                   enum ib_event_type type)
 {
        struct ib_event ibev;
@@ -66,7 +66,7 @@ static void qedr_ib_dispatch_event(struct qedr_dev *dev, u8 port_num,
 }
 
 static enum rdma_link_layer qedr_link_layer(struct ib_device *device,
-                                           u8 port_num)
+                                           u32 port_num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
@@ -81,7 +81,7 @@ static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str)
                 (fw_ver >> 8) & 0xFF, fw_ver & 0xFF);
 }
 
-static int qedr_roce_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int qedr_roce_port_immutable(struct ib_device *ibdev, u32 port_num,
                                    struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
@@ -100,7 +100,7 @@ static int qedr_roce_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
-static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int qedr_iw_port_immutable(struct ib_device *ibdev, u32 port_num,
                                  struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
index c4bc587..1715fbe 100644 (file)
@@ -636,8 +636,10 @@ int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        memcpy(in_params.local_mac_addr, dev->ndev->dev_addr, ETH_ALEN);
 
        if (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT,
-                            &qp->iwarp_cm_flags))
+                            &qp->iwarp_cm_flags)) {
+               rc = -ENODEV;
                goto err; /* QP already being destroyed */
+       }
 
        rc = dev->ops->iwarp_connect(dev->rdma_ctx, &in_params, &out_params);
        if (rc) {
index 9ea5422..fdc47ef 100644 (file)
@@ -72,7 +72,7 @@ static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src,
        return ib_copy_to_udata(udata, src, min_len);
 }
 
-int qedr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+int qedr_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey)
 {
        if (index >= QEDR_ROCE_PKEY_TABLE_LEN)
                return -EINVAL;
@@ -81,7 +81,7 @@ int qedr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
        return 0;
 }
 
-int qedr_iw_query_gid(struct ib_device *ibdev, u8 port,
+int qedr_iw_query_gid(struct ib_device *ibdev, u32 port,
                      int index, union ib_gid *sgid)
 {
        struct qedr_dev *dev = get_qedr_dev(ibdev);
@@ -210,7 +210,8 @@ static inline void get_link_speed_and_width(int speed, u16 *ib_speed,
        }
 }
 
-int qedr_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr)
+int qedr_query_port(struct ib_device *ibdev, u32 port,
+                   struct ib_port_attr *attr)
 {
        struct qedr_dev *dev;
        struct qed_rdma_port *rdma_port;
@@ -4483,7 +4484,7 @@ int qedr_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 }
 
 int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags,
-                    u8 port_num, const struct ib_wc *in_wc,
+                    u32 port_num, const struct ib_wc *in_wc,
                     const struct ib_grh *in_grh, const struct ib_mad *in,
                     struct ib_mad *out_mad, size_t *out_mad_size,
                     u16 *out_mad_pkey_index)
index 2672c32..34ad475 100644 (file)
 
 int qedr_query_device(struct ib_device *ibdev,
                      struct ib_device_attr *attr, struct ib_udata *udata);
-int qedr_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
+int qedr_query_port(struct ib_device *ibdev, u32 port,
+                   struct ib_port_attr *props);
 
-int qedr_iw_query_gid(struct ib_device *ibdev, u8 port,
+int qedr_iw_query_gid(struct ib_device *ibdev, u32 port,
                      int index, union ib_gid *gid);
 
-int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
+int qedr_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey);
 
 int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
@@ -92,11 +93,11 @@ int qedr_post_send(struct ib_qp *, const struct ib_send_wr *,
 int qedr_post_recv(struct ib_qp *, const struct ib_recv_wr *,
                   const struct ib_recv_wr **bad_wr);
 int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags,
-                    u8 port_num, const struct ib_wc *in_wc,
+                    u32 port_num, const struct ib_wc *in_wc,
                     const struct ib_grh *in_grh, const struct ib_mad *in_mad,
                     struct ib_mad *out_mad, size_t *out_mad_size,
                     u16 *out_mad_pkey_index);
 
-int qedr_port_immutable(struct ib_device *ibdev, u8 port_num,
+int qedr_port_immutable(struct ib_device *ibdev, u32 port_num,
                        struct ib_port_immutable *immutable);
 #endif
index ee21142..8849773 100644 (file)
@@ -630,7 +630,7 @@ struct qib_pportdata {
        u8 rx_pol_inv;
 
        u8 hw_pidx;     /* physical port index */
-       u8 port;        /* IB port number and index into dd->pports - 1 */
+       u32 port;        /* IB port number and index into dd->pports - 1 */
 
        u8 delay_mult;
 
@@ -1200,10 +1200,10 @@ static inline struct qib_pportdata *ppd_from_ibp(struct qib_ibport *ibp)
        return container_of(ibp, struct qib_pportdata, ibport_data);
 }
 
-static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port)
+static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u32 port)
 {
        struct qib_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+       u32 pidx = port - 1; /* IB number port from 1, hdw from 0 */
 
        WARN_ON(pidx >= dd->num_pports);
        return &dd->pport[pidx].ibport_data;
@@ -1303,11 +1303,6 @@ int qib_sdma_verbs_send(struct qib_pportdata *, struct rvt_sge_state *,
 /* ppd->sdma_lock should be locked before calling this. */
 int qib_sdma_make_progress(struct qib_pportdata *dd);
 
-static inline int qib_sdma_empty(const struct qib_pportdata *ppd)
-{
-       return ppd->sdma_descq_added == ppd->sdma_descq_removed;
-}
-
 /* must be called under qib_sdma_lock */
 static inline u16 qib_sdma_descq_freecnt(const struct qib_pportdata *ppd)
 {
@@ -1364,27 +1359,6 @@ static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd)
                *((volatile __le64 *)rcd->rcvhdrtail_kvaddr)); /* DMA'ed */
 }
 
-static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd)
-{
-       const struct qib_devdata *dd = rcd->dd;
-       u32 hdrqtail;
-
-       if (dd->flags & QIB_NODMA_RTAIL) {
-               __le32 *rhf_addr;
-               u32 seq;
-
-               rhf_addr = (__le32 *) rcd->rcvhdrq +
-                       rcd->head + dd->rhf_offset;
-               seq = qib_hdrget_seq(rhf_addr);
-               hdrqtail = rcd->head;
-               if (seq == rcd->seq_cnt)
-                       hdrqtail++;
-       } else
-               hdrqtail = qib_get_rcvhdrtail(rcd);
-
-       return hdrqtail;
-}
-
 /*
  * sysfs interface.
  */
@@ -1395,7 +1369,7 @@ extern const struct attribute_group qib_attr_group;
 int qib_device_create(struct qib_devdata *);
 void qib_device_remove(struct qib_devdata *);
 
-int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
+int qib_create_port_files(struct ib_device *ibdev, u32 port_num,
                          struct kobject *kobj);
 void qib_verbs_unregister_sysfs(struct qib_devdata *);
 /* Hook for sysfs read of QSFP */
index f91f23e..cf65283 100644 (file)
@@ -795,11 +795,4 @@ static inline __u32 qib_hdrget_use_egr_buf(const __le32 *rbuf)
 {
        return __le32_to_cpu(rbuf[0]) & QLOGIC_IB_RHF_L_USE_EGR;
 }
-
-static inline __u32 qib_hdrget_qib_ver(__le32 hdrword)
-{
-       return (__le32_to_cpu(hdrword) >> QLOGIC_IB_I_VERS_SHIFT) &
-               QLOGIC_IB_I_VERS_MASK;
-}
-
 #endif                          /* _QIB_COMMON_H */
index ff87a67..c60e79d 100644 (file)
@@ -1758,7 +1758,8 @@ bail:
 }
 
 /**
- * unlock_exptid - unlock any expected TID entries context still had in use
+ * unlock_expected_tids - unlock any expected TID entries context still had
+ * in use
  * @rcd: ctxt
  *
  * We don't actually update the chip here, because we do a bulk update
@@ -2247,7 +2248,7 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
        if (!iter_is_iovec(from) || !from->nr_segs || !pq)
                return -EINVAL;
-                        
+
        return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs);
 }
 
index e336d77..a0c5f3b 100644 (file)
@@ -427,79 +427,21 @@ bail:
        return ret;
 }
 
-static int remove_file(struct dentry *parent, char *name)
-{
-       struct dentry *tmp;
-       int ret;
-
-       tmp = lookup_one_len(name, parent, strlen(name));
-
-       if (IS_ERR(tmp)) {
-               ret = PTR_ERR(tmp);
-               goto bail;
-       }
-
-       spin_lock(&tmp->d_lock);
-       if (simple_positive(tmp)) {
-               __d_drop(tmp);
-               spin_unlock(&tmp->d_lock);
-               simple_unlink(d_inode(parent), tmp);
-       } else {
-               spin_unlock(&tmp->d_lock);
-       }
-       dput(tmp);
-
-       ret = 0;
-bail:
-       /*
-        * We don't expect clients to care about the return value, but
-        * it's there if they need it.
-        */
-       return ret;
-}
-
 static int remove_device_files(struct super_block *sb,
                               struct qib_devdata *dd)
 {
-       struct dentry *dir, *root;
+       struct dentry *dir;
        char unit[10];
-       int ret, i;
 
-       root = dget(sb->s_root);
-       inode_lock(d_inode(root));
        snprintf(unit, sizeof(unit), "%u", dd->unit);
-       dir = lookup_one_len(unit, root, strlen(unit));
+       dir = lookup_one_len_unlocked(unit, sb->s_root, strlen(unit));
 
        if (IS_ERR(dir)) {
-               ret = PTR_ERR(dir);
                pr_err("Lookup of %s failed\n", unit);
-               goto bail;
+               return PTR_ERR(dir);
        }
-
-       inode_lock(d_inode(dir));
-       remove_file(dir, "counters");
-       remove_file(dir, "counter_names");
-       remove_file(dir, "portcounter_names");
-       for (i = 0; i < dd->num_pports; i++) {
-               char fname[24];
-
-               sprintf(fname, "port%dcounters", i + 1);
-               remove_file(dir, fname);
-               if (dd->flags & QIB_HAS_QSFP) {
-                       sprintf(fname, "qsfp%d", i + 1);
-                       remove_file(dir, fname);
-               }
-       }
-       remove_file(dir, "flash");
-       inode_unlock(d_inode(dir));
-       ret = simple_rmdir(d_inode(root), dir);
-       d_drop(dir);
-       dput(dir);
-
-bail:
-       inode_unlock(d_inode(root));
-       dput(root);
-       return ret;
+       simple_recursive_removal(dir, NULL);
+       return 0;
 }
 
 /*
index b35e117..a9b83bc 100644 (file)
@@ -2609,7 +2609,7 @@ static void qib_chk_6120_errormask(struct qib_devdata *dd)
 }
 
 /**
- * qib_get_faststats - get word counters from chip before they overflow
+ * qib_get_6120_faststats - get word counters from chip before they overflow
  * @t: contains a pointer to the qlogic_ib device qib_devdata
  *
  * This needs more work; in particular, decision on whether we really
index 229dcd6..d1c0bc3 100644 (file)
@@ -2236,7 +2236,7 @@ static void qib_7220_tidtemplate(struct qib_devdata *dd)
 }
 
 /**
- * qib_init_7220_get_base_info - set chip-specific flags for user code
+ * qib_7220_get_base_info - set chip-specific flags for user code
  * @rcd: the qlogic_ib ctxt
  * @kinfo: qib_base_info pointer
  *
@@ -4411,7 +4411,7 @@ static void writescratch(struct qib_devdata *dd, u32 val)
 
 #define VALID_TS_RD_REG_MASK 0xBF
 /**
- * qib_7220_tempsense_read - read register of temp sensor via TWSI
+ * qib_7220_tempsense_rd - read register of temp sensor via TWSI
  * @dd: the qlogic_ib device
  * @regnum: register to read from
  *
index 9fe6ea7..ab98b6a 100644 (file)
@@ -790,28 +790,6 @@ static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
                 (char __iomem *)dd->kregbase + dd->uregbase)));
 }
 
-/**
- * qib_read_ureg - read virtualized per-context register
- * @dd: device
- * @regno: register number
- * @ctxt: context number
- *
- * Return the contents of a register that is virtualized to be per context.
- * Returns -1 on errors (not distinguishable from valid contents at
- * runtime; we may add a separate error variable at some point).
- */
-static inline u64 qib_read_ureg(const struct qib_devdata *dd,
-                               enum qib_ureg regno, int ctxt)
-{
-
-       if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
-               return 0;
-       return readq(regno + (u64 __iomem *)(
-               (dd->ureg_align * ctxt) + (dd->userbase ?
-                (char __iomem *)dd->userbase :
-                (char __iomem *)dd->kregbase + dd->uregbase)));
-}
-
 /**
  * qib_write_ureg - write virtualized per-context register
  * @dd: device
@@ -2513,7 +2491,7 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd)
 }
 
 /**
- * qib_7322_quiet_serdes - set serdes to txidle
+ * qib_7322_mini_quiet_serdes - set serdes to txidle
  * @ppd: the qlogic_ib device
  * Called when driver is being unloaded
  */
@@ -3859,7 +3837,7 @@ static void qib_7322_tidtemplate(struct qib_devdata *dd)
 }
 
 /**
- * qib_init_7322_get_base_info - set chip-specific flags for user code
+ * qib_7322_get_base_info - set chip-specific flags for user code
  * @rcd: the qlogic_ib ctxt
  * @kinfo: qib_base_info pointer
  *
index 43c8ee1..b5a7857 100644 (file)
@@ -1609,7 +1609,7 @@ bail:
 }
 
 /**
- * allocate eager buffers, both kernel and user contexts.
+ * qib_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
  * @rcd: the context we are setting up.
  *
  * Allocate the eager TID buffers and program them into hip.
index 44e2f81..ef02f2b 100644 (file)
@@ -203,7 +203,7 @@ static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp)
 /*
  * Send a Port Capability Mask Changed trap (ch. 14.3.11).
  */
-void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
+void qib_cap_mask_chg(struct rvt_dev_info *rdi, u32 port_num)
 {
        struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi);
        struct qib_devdata *dd = dd_from_dev(ibdev);
@@ -2360,7 +2360,7 @@ static int process_cc(struct ib_device *ibdev, int mad_flags,
  *
  * This is called by the ib_mad module.
  */
-int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+int qib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port,
                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                    const struct ib_mad *in, struct ib_mad *out,
                    size_t *out_mad_size, u16 *out_mad_pkey_index)
index ca39a02..1974ceb 100644 (file)
@@ -125,7 +125,7 @@ static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map)
  * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
  */
 int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
-                 enum ib_qp_type type, u8 port)
+                 enum ib_qp_type type, u32 port)
 {
        u32 i, offset, max_scan, qpn;
        struct rvt_qpn_map *map;
@@ -136,7 +136,7 @@ int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
        u16 qpt_mask = dd->qpn_mask;
 
        if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
-               unsigned n;
+               u32 n;
 
                ret = type == IB_QPT_GSI;
                n = 1 << (ret + 2 * (port - 1));
index 4f4a09c..81b810d 100644 (file)
@@ -687,7 +687,6 @@ static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc,
                spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
                return -1;
        }
-       ret = 0;
        for (tries = EPB_TRANS_TRIES; tries; --tries) {
                transval = qib_read_kreg32(dd, trans);
                if (transval & EPB_TRANS_RDY)
index 62c179f..5e9e66f 100644 (file)
@@ -728,7 +728,7 @@ const struct attribute_group qib_attr_group = {
        .attrs = qib_attributes,
 };
 
-int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
+int qib_create_port_files(struct ib_device *ibdev, u32 port_num,
                          struct kobject *kobj)
 {
        struct qib_pportdata *ppd;
index 8e0de26..d17d034 100644 (file)
@@ -1188,7 +1188,7 @@ full:
        }
 }
 
-static int qib_query_port(struct rvt_dev_info *rdi, u8 port_num,
+static int qib_query_port(struct rvt_dev_info *rdi, u32 port_num,
                          struct ib_port_attr *props)
 {
        struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi);
@@ -1273,7 +1273,7 @@ bail:
        return ret;
 }
 
-static int qib_shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
+static int qib_shut_down_port(struct rvt_dev_info *rdi, u32 port_num)
 {
        struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi);
        struct qib_devdata *dd = dd_from_dev(ibdev);
@@ -1342,7 +1342,7 @@ struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid)
        struct rvt_qp *qp0;
        struct qib_pportdata *ppd = ppd_from_ibp(ibp);
        struct qib_devdata *dd = dd_from_ppd(ppd);
-       u8 port_num = ppd->port;
+       u32 port_num = ppd->port;
 
        memset(&attr, 0, sizeof(attr));
        attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num);
index dc0e81f..07548fa 100644 (file)
@@ -239,10 +239,10 @@ static inline int qib_pkey_ok(u16 pkey1, u16 pkey2)
 
 void qib_bad_pkey(struct qib_ibport *ibp, u32 key, u32 sl,
                  u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
-void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
+void qib_cap_mask_chg(struct rvt_dev_info *rdi, u32 port_num);
 void qib_sys_guid_chg(struct qib_ibport *ibp);
 void qib_node_desc_chg(struct qib_ibport *ibp);
-int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int qib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                    const struct ib_mad *in, struct ib_mad *out,
                    size_t *out_mad_size, u16 *out_mad_pkey_index);
@@ -273,7 +273,7 @@ void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp);
 void qib_qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
 void qib_notify_qp_reset(struct rvt_qp *qp);
 int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
-                 enum ib_qp_type type, u8 port);
+                 enum ib_qp_type type, u32 port);
 void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait);
 #ifdef CONFIG_DEBUG_FS
 
index 1b63a49..ff6a40e 100644 (file)
@@ -303,7 +303,7 @@ static struct notifier_block usnic_ib_inetaddr_notifier = {
 };
 /* End of inet section*/
 
-static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int usnic_port_immutable(struct ib_device *ibdev, u32 port_num,
                                struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
index 3705c6b..57d210c 100644 (file)
@@ -270,7 +270,7 @@ static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd)
 /* Start of ib callback functions */
 
 enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device,
-                                               u8 port_num)
+                                             u32 port_num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
@@ -332,7 +332,7 @@ int usnic_ib_query_device(struct ib_device *ibdev,
        return 0;
 }
 
-int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
+int usnic_ib_query_port(struct ib_device *ibdev, u32 port,
                                struct ib_port_attr *props)
 {
        struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
@@ -420,7 +420,7 @@ err_out:
        return err;
 }
 
-int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+int usnic_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
                                union ib_gid *gid)
 {
 
index 11fe1ba..6b82d0f 100644 (file)
 #include "usnic_ib.h"
 
 enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device,
-                                               u8 port_num);
+                                             u32 port_num);
 int usnic_ib_query_device(struct ib_device *ibdev,
                                struct ib_device_attr *props,
                          struct ib_udata *uhw);
-int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
+int usnic_ib_query_port(struct ib_device *ibdev, u32 port,
                                struct ib_port_attr *props);
 int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                                int qp_attr_mask,
                                struct ib_qp_init_attr *qp_init_attr);
-int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+int usnic_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
                                union ib_gid *gid);
 int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 int usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
index de57f2f..763ddc6 100644 (file)
@@ -344,11 +344,6 @@ static inline enum ib_port_state pvrdma_port_state_to_ib(
        return (enum ib_port_state)state;
 }
 
-static inline int ib_port_cap_flags_to_pvrdma(int flags)
-{
-       return flags & PVRDMA_MASK(PVRDMA_PORT_CAP_FLAGS_MAX);
-}
-
 static inline int pvrdma_port_cap_flags_to_ib(int flags)
 {
        return flags;
@@ -410,11 +405,6 @@ static inline enum pvrdma_qp_type ib_qp_type_to_pvrdma(enum ib_qp_type type)
        return (enum pvrdma_qp_type)type;
 }
 
-static inline enum ib_qp_type pvrdma_qp_type_to_ib(enum pvrdma_qp_type type)
-{
-       return (enum ib_qp_type)type;
-}
-
 static inline enum pvrdma_qp_state ib_qp_state_to_pvrdma(enum ib_qp_state state)
 {
        return (enum pvrdma_qp_state)state;
index 4b6019e..6bf2d2e 100644 (file)
@@ -121,7 +121,7 @@ static int pvrdma_init_device(struct pvrdma_dev *dev)
        return 0;
 }
 
-static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int pvrdma_port_immutable(struct ib_device *ibdev, u32 port_num,
                                 struct ib_port_immutable *immutable)
 {
        struct pvrdma_dev *dev = to_vdev(ibdev);
index 1d3bdd7..67769b7 100644 (file)
@@ -882,7 +882,7 @@ out:
 }
 
 /**
- * pvrdma_post_receive - post receive work request entries on a QP
+ * pvrdma_post_recv - post receive work request entries on a QP
  * @ibqp: the QP
  * @wr: the work request list to post
  * @bad_wr: the first bad WR returned
index fc412cb..1917658 100644 (file)
@@ -125,7 +125,7 @@ int pvrdma_query_device(struct ib_device *ibdev,
  *
  * @return: 0 on success, otherwise negative errno
  */
-int pvrdma_query_port(struct ib_device *ibdev, u8 port,
+int pvrdma_query_port(struct ib_device *ibdev, u32 port,
                      struct ib_port_attr *props)
 {
        struct pvrdma_dev *dev = to_vdev(ibdev);
@@ -183,7 +183,7 @@ int pvrdma_query_port(struct ib_device *ibdev, u8 port,
  *
  * @return: 0 on success, otherwise negative errno
  */
-int pvrdma_query_gid(struct ib_device *ibdev, u8 port, int index,
+int pvrdma_query_gid(struct ib_device *ibdev, u32 port, int index,
                     union ib_gid *gid)
 {
        struct pvrdma_dev *dev = to_vdev(ibdev);
@@ -205,7 +205,7 @@ int pvrdma_query_gid(struct ib_device *ibdev, u8 port, int index,
  *
  * @return: 0 on success, otherwise negative errno
  */
-int pvrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+int pvrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
                      u16 *pkey)
 {
        int err = 0;
@@ -232,7 +232,7 @@ int pvrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 }
 
 enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev,
-                                           u8 port)
+                                           u32 port)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
@@ -274,7 +274,7 @@ int pvrdma_modify_device(struct ib_device *ibdev, int mask,
  *
  * @return: 0 on success, otherwise negative errno
  */
-int pvrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
+int pvrdma_modify_port(struct ib_device *ibdev, u32 port, int mask,
                       struct ib_port_modify *props)
 {
        struct ib_port_attr attr;
@@ -516,7 +516,7 @@ int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
        struct pvrdma_dev *dev = to_vdev(ibah->device);
        struct pvrdma_ah *ah = to_vah(ibah);
        const struct ib_global_route *grh;
-       u8 port_num = rdma_ah_get_port_num(ah_attr);
+       u32 port_num = rdma_ah_get_port_num(ah_attr);
 
        if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
                return -EINVAL;
index 97ed8f9..544b94d 100644 (file)
@@ -70,30 +70,6 @@ enum pvrdma_mtu {
        PVRDMA_MTU_4096 = 5,
 };
 
-static inline int pvrdma_mtu_enum_to_int(enum pvrdma_mtu mtu)
-{
-       switch (mtu) {
-       case PVRDMA_MTU_256:    return  256;
-       case PVRDMA_MTU_512:    return  512;
-       case PVRDMA_MTU_1024:   return 1024;
-       case PVRDMA_MTU_2048:   return 2048;
-       case PVRDMA_MTU_4096:   return 4096;
-       default:                return   -1;
-       }
-}
-
-static inline enum pvrdma_mtu pvrdma_mtu_int_to_enum(int mtu)
-{
-       switch (mtu) {
-       case 256:       return PVRDMA_MTU_256;
-       case 512:       return PVRDMA_MTU_512;
-       case 1024:      return PVRDMA_MTU_1024;
-       case 2048:      return PVRDMA_MTU_2048;
-       case 4096:
-       default:        return PVRDMA_MTU_4096;
-       }
-}
-
 enum pvrdma_port_state {
        PVRDMA_PORT_NOP                 = 0,
        PVRDMA_PORT_DOWN                = 1,
@@ -138,17 +114,6 @@ enum pvrdma_port_width {
        PVRDMA_WIDTH_12X        = 8,
 };
 
-static inline int pvrdma_width_enum_to_int(enum pvrdma_port_width width)
-{
-       switch (width) {
-       case PVRDMA_WIDTH_1X:   return  1;
-       case PVRDMA_WIDTH_4X:   return  4;
-       case PVRDMA_WIDTH_8X:   return  8;
-       case PVRDMA_WIDTH_12X:  return 12;
-       default:                return -1;
-       }
-}
-
 enum pvrdma_port_speed {
        PVRDMA_SPEED_SDR        = 1,
        PVRDMA_SPEED_DDR        = 2,
@@ -383,17 +348,17 @@ enum pvrdma_access_flags {
 int pvrdma_query_device(struct ib_device *ibdev,
                        struct ib_device_attr *props,
                        struct ib_udata *udata);
-int pvrdma_query_port(struct ib_device *ibdev, u8 port,
+int pvrdma_query_port(struct ib_device *ibdev, u32 port,
                      struct ib_port_attr *props);
-int pvrdma_query_gid(struct ib_device *ibdev, u8 port,
+int pvrdma_query_gid(struct ib_device *ibdev, u32 port,
                     int index, union ib_gid *gid);
-int pvrdma_query_pkey(struct ib_device *ibdev, u8 port,
+int pvrdma_query_pkey(struct ib_device *ibdev, u32 port,
                      u16 index, u16 *pkey);
 enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev,
-                                           u8 port);
+                                           u32 port);
 int pvrdma_modify_device(struct ib_device *ibdev, int mask,
                         struct ib_device_modify *props);
-int pvrdma_modify_port(struct ib_device *ibdev, u8 port,
+int pvrdma_modify_port(struct ib_device *ibdev, u32 port,
                       int mask, struct ib_port_modify *props);
 int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
index fa5be13..207bc0e 100644 (file)
@@ -70,7 +70,7 @@
  *
  * Return: IB_MAD_RESULT_SUCCESS or error
  */
-int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                    const struct ib_mad_hdr *in, size_t in_mad_size,
                    struct ib_mad_hdr *out, size_t *out_mad_size,
@@ -82,9 +82,6 @@ int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
         * future may choose to implement this but it should not be made into a
         * requirement.
         */
-       if (ibport_num_to_idx(ibdev, port_num) < 0)
-               return -EINVAL;
-
        return IB_MAD_RESULT_FAILURE;
 }
 
index a9d6eec..1eae5ef 100644 (file)
@@ -50,7 +50,7 @@
 
 #include <rdma/rdma_vt.h>
 
-int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                    const struct ib_mad_hdr *in, size_t in_mad_size,
                    struct ib_mad_hdr *out, size_t *out_mad_size,
index 8fd0128..12ebe04 100644 (file)
@@ -151,15 +151,12 @@ static int rvt_modify_device(struct ib_device *device,
  *
  * Return: 0 on success
  */
-static int rvt_query_port(struct ib_device *ibdev, u8 port_num,
+static int rvt_query_port(struct ib_device *ibdev, u32 port_num,
                          struct ib_port_attr *props)
 {
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
        struct rvt_ibport *rvp;
-       int port_index = ibport_num_to_idx(ibdev, port_num);
-
-       if (port_index < 0)
-               return -EINVAL;
+       u32 port_index = ibport_num_to_idx(ibdev, port_num);
 
        rvp = rdi->ports[port_index];
        /* props being zeroed by the caller, avoid zeroing it here */
@@ -186,16 +183,13 @@ static int rvt_query_port(struct ib_device *ibdev, u8 port_num,
  *
  * Return: 0 on success
  */
-static int rvt_modify_port(struct ib_device *ibdev, u8 port_num,
+static int rvt_modify_port(struct ib_device *ibdev, u32 port_num,
                           int port_modify_mask, struct ib_port_modify *props)
 {
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
        struct rvt_ibport *rvp;
        int ret = 0;
-       int port_index = ibport_num_to_idx(ibdev, port_num);
-
-       if (port_index < 0)
-               return -EINVAL;
+       u32 port_index = ibport_num_to_idx(ibdev, port_num);
 
        rvp = rdi->ports[port_index];
        if (port_modify_mask & IB_PORT_OPA_MASK_CHG) {
@@ -225,7 +219,7 @@ static int rvt_modify_port(struct ib_device *ibdev, u8 port_num,
  *
  * Return: 0 on failure pkey otherwise
  */
-static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index,
+static int rvt_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index,
                          u16 *pkey)
 {
        /*
@@ -235,11 +229,9 @@ static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index,
         * no way to protect against that anyway.
         */
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
-       int port_index;
+       u32 port_index;
 
        port_index = ibport_num_to_idx(ibdev, port_num);
-       if (port_index < 0)
-               return -EINVAL;
 
        if (index >= rvt_get_npkeys(rdi))
                return -EINVAL;
@@ -257,12 +249,12 @@ static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index,
  *
  * Return: 0 on success
  */
-static int rvt_query_gid(struct ib_device *ibdev, u8 port_num,
+static int rvt_query_gid(struct ib_device *ibdev, u32 port_num,
                         int guid_index, union ib_gid *gid)
 {
        struct rvt_dev_info *rdi;
        struct rvt_ibport *rvp;
-       int port_index;
+       u32 port_index;
 
        /*
         * Driver is responsible for updating the guid table. Which will be used
@@ -270,8 +262,6 @@ static int rvt_query_gid(struct ib_device *ibdev, u8 port_num,
         * is being done.
         */
        port_index = ibport_num_to_idx(ibdev, port_num);
-       if (port_index < 0)
-               return -EINVAL;
 
        rdi = ib_to_rvt(ibdev);
        rvp = rdi->ports[port_index];
@@ -301,16 +291,12 @@ static void rvt_dealloc_ucontext(struct ib_ucontext *context)
        return;
 }
 
-static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+static int rvt_get_port_immutable(struct ib_device *ibdev, u32 port_num,
                                  struct ib_port_immutable *immutable)
 {
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
        struct ib_port_attr attr;
-       int err, port_index;
-
-       port_index = ibport_num_to_idx(ibdev, port_num);
-       if (port_index < 0)
-               return -EINVAL;
+       int err;
 
        immutable->core_cap_flags = rdi->dparms.core_cap_flags;
 
index d19ff81..c0fed65 100644 (file)
 #define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \
        dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
-static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num)
+static inline u32 ibport_num_to_idx(struct ib_device *ibdev, u32 port_num)
 {
-       struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
-       int port_index;
-
-       port_index = port_num - 1; /* IB ports start at 1 our arrays at 0 */
-       if ((port_index < 0) || (port_index >= rdi->dparms.nports))
-               return -EINVAL;
-
-       return port_index;
+       return port_num - 1; /* IB ports start at 1 our arrays at 0 */
 }
 
 #endif          /* DEF_RDMAVT_H */
index df0d173..da2e867 100644 (file)
@@ -88,7 +88,7 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr)
                type = RXE_NETWORK_TYPE_IPV4;
                break;
        case RDMA_NETWORK_IPV6:
-               type = RXE_NETWORK_TYPE_IPV4;
+               type = RXE_NETWORK_TYPE_IPV6;
                break;
        default:
                /* not reached - checked in rxe_av_chk_attr */
index 17a361b..2af2673 100644 (file)
@@ -345,7 +345,7 @@ static inline enum comp_state do_read(struct rxe_qp *qp,
 
        ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE,
                        &wqe->dma, payload_addr(pkt),
-                       payload_size(pkt), to_mem_obj, NULL);
+                       payload_size(pkt), to_mr_obj, NULL);
        if (ret)
                return COMPST_ERROR;
 
@@ -365,7 +365,7 @@ static inline enum comp_state do_atomic(struct rxe_qp *qp,
 
        ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE,
                        &wqe->dma, &atomic_orig,
-                       sizeof(u64), to_mem_obj, NULL);
+                       sizeof(u64), to_mr_obj, NULL);
        if (ret)
                return COMPST_ERROR;
        else
@@ -676,7 +676,6 @@ int rxe_completer(void *arg)
 
                        /* there is nothing to retry in this case */
                        if (!wqe || (wqe->state == wqe_state_posted)) {
-                               pr_warn("Retry attempted without a valid wqe\n");
                                ret = -EAGAIN;
                                goto done;
                        }
index ac9154f..f469fd1 100644 (file)
@@ -26,7 +26,7 @@ static const char * const rxe_counter_name[] = {
 
 int rxe_ib_get_hw_stats(struct ib_device *ibdev,
                        struct rdma_hw_stats *stats,
-                       u8 port, int index)
+                       u32 port, int index)
 {
        struct rxe_dev *dev = to_rdev(ibdev);
        unsigned int cnt;
@@ -41,7 +41,7 @@ int rxe_ib_get_hw_stats(struct ib_device *ibdev,
 }
 
 struct rdma_hw_stats *rxe_ib_alloc_hw_stats(struct ib_device *ibdev,
-                                           u8 port_num)
+                                           u32 port_num)
 {
        BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_name) != RXE_NUM_OF_COUNTERS);
        /* We support only per port stats */
index 49ee6f9..2f369ac 100644 (file)
@@ -30,8 +30,8 @@ enum rxe_counters {
 };
 
 struct rdma_hw_stats *rxe_ib_alloc_hw_stats(struct ib_device *ibdev,
-                                           u8 port_num);
+                                           u32 port_num);
 int rxe_ib_get_hw_stats(struct ib_device *ibdev,
                        struct rdma_hw_stats *stats,
-                       u8 port, int index);
+                       u32 port, int index);
 #endif /* RXE_HW_COUNTERS_H */
index 0d75876..ef8061d 100644 (file)
@@ -72,40 +72,37 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
 /* rxe_mr.c */
 enum copy_direction {
-       to_mem_obj,
-       from_mem_obj,
+       to_mr_obj,
+       from_mr_obj,
 };
 
-void rxe_mem_init_dma(struct rxe_pd *pd,
-                     int access, struct rxe_mem *mem);
+void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr);
 
-int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
-                     u64 length, u64 iova, int access, struct ib_udata *udata,
-                     struct rxe_mem *mr);
+int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
+                    int access, struct ib_udata *udata, struct rxe_mr *mr);
 
-int rxe_mem_init_fast(struct rxe_pd *pd,
-                     int max_pages, struct rxe_mem *mem);
+int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr);
 
-int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr,
-                int length, enum copy_direction dir, u32 *crcp);
+int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+               enum copy_direction dir, u32 *crcp);
 
 int copy_data(struct rxe_pd *pd, int access,
              struct rxe_dma_info *dma, void *addr, int length,
              enum copy_direction dir, u32 *crcp);
 
-void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length);
+void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length);
 
 enum lookup_type {
        lookup_local,
        lookup_remote,
 };
 
-struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
-                          enum lookup_type type);
+struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
+                        enum lookup_type type);
 
-int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length);
+int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
 
-void rxe_mem_cleanup(struct rxe_pool_entry *arg);
+void rxe_mr_cleanup(struct rxe_pool_entry *arg);
 
 int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
 
@@ -116,7 +113,6 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
                                int paylen, struct rxe_pkt_info *pkt);
 int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc);
 const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num);
-struct device *rxe_dma_device(struct rxe_dev *rxe);
 int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid);
 int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid);
 
index 6e8c415..9f63947 100644 (file)
@@ -24,16 +24,15 @@ static u8 rxe_get_key(void)
        return key;
 }
 
-int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length)
+int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
 {
-       switch (mem->type) {
-       case RXE_MEM_TYPE_DMA:
+       switch (mr->type) {
+       case RXE_MR_TYPE_DMA:
                return 0;
 
-       case RXE_MEM_TYPE_MR:
-               if (iova < mem->iova ||
-                   length > mem->length ||
-                   iova > mem->iova + mem->length - length)
+       case RXE_MR_TYPE_MR:
+               if (iova < mr->iova || length > mr->length ||
+                   iova > mr->iova + mr->length - length)
                        return -EFAULT;
                return 0;
 
@@ -46,85 +45,83 @@ int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length)
                                | IB_ACCESS_REMOTE_WRITE        \
                                | IB_ACCESS_REMOTE_ATOMIC)
 
-static void rxe_mem_init(int access, struct rxe_mem *mem)
+static void rxe_mr_init(int access, struct rxe_mr *mr)
 {
-       u32 lkey = mem->pelem.index << 8 | rxe_get_key();
+       u32 lkey = mr->pelem.index << 8 | rxe_get_key();
        u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;
 
-       mem->ibmr.lkey          = lkey;
-       mem->ibmr.rkey          = rkey;
-       mem->state              = RXE_MEM_STATE_INVALID;
-       mem->type               = RXE_MEM_TYPE_NONE;
-       mem->map_shift          = ilog2(RXE_BUF_PER_MAP);
+       mr->ibmr.lkey = lkey;
+       mr->ibmr.rkey = rkey;
+       mr->state = RXE_MR_STATE_INVALID;
+       mr->type = RXE_MR_TYPE_NONE;
+       mr->map_shift = ilog2(RXE_BUF_PER_MAP);
 }
 
-void rxe_mem_cleanup(struct rxe_pool_entry *arg)
+void rxe_mr_cleanup(struct rxe_pool_entry *arg)
 {
-       struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem);
+       struct rxe_mr *mr = container_of(arg, typeof(*mr), pelem);
        int i;
 
-       ib_umem_release(mem->umem);
+       ib_umem_release(mr->umem);
 
-       if (mem->map) {
-               for (i = 0; i < mem->num_map; i++)
-                       kfree(mem->map[i]);
+       if (mr->map) {
+               for (i = 0; i < mr->num_map; i++)
+                       kfree(mr->map[i]);
 
-               kfree(mem->map);
+               kfree(mr->map);
        }
 }
 
-static int rxe_mem_alloc(struct rxe_mem *mem, int num_buf)
+static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
 {
        int i;
        int num_map;
-       struct rxe_map **map = mem->map;
+       struct rxe_map **map = mr->map;
 
        num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
 
-       mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
-       if (!mem->map)
+       mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
+       if (!mr->map)
                goto err1;
 
        for (i = 0; i < num_map; i++) {
-               mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
-               if (!mem->map[i])
+               mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
+               if (!mr->map[i])
                        goto err2;
        }
 
        BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));
 
-       mem->map_shift  = ilog2(RXE_BUF_PER_MAP);
-       mem->map_mask   = RXE_BUF_PER_MAP - 1;
+       mr->map_shift = ilog2(RXE_BUF_PER_MAP);
+       mr->map_mask = RXE_BUF_PER_MAP - 1;
 
-       mem->num_buf = num_buf;
-       mem->num_map = num_map;
-       mem->max_buf = num_map * RXE_BUF_PER_MAP;
+       mr->num_buf = num_buf;
+       mr->num_map = num_map;
+       mr->max_buf = num_map * RXE_BUF_PER_MAP;
 
        return 0;
 
 err2:
        for (i--; i >= 0; i--)
-               kfree(mem->map[i]);
+               kfree(mr->map[i]);
 
-       kfree(mem->map);
+       kfree(mr->map);
 err1:
        return -ENOMEM;
 }
 
-void rxe_mem_init_dma(struct rxe_pd *pd,
-                     int access, struct rxe_mem *mem)
+void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr)
 {
-       rxe_mem_init(access, mem);
+       rxe_mr_init(access, mr);
 
-       mem->ibmr.pd            = &pd->ibpd;
-       mem->access             = access;
-       mem->state              = RXE_MEM_STATE_VALID;
-       mem->type               = RXE_MEM_TYPE_DMA;
+       mr->ibmr.pd = &pd->ibpd;
+       mr->access = access;
+       mr->state = RXE_MR_STATE_VALID;
+       mr->type = RXE_MR_TYPE_DMA;
 }
 
-int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
-                     u64 length, u64 iova, int access, struct ib_udata *udata,
-                     struct rxe_mem *mem)
+int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
+                    int access, struct ib_udata *udata, struct rxe_mr *mr)
 {
        struct rxe_map          **map;
        struct rxe_phys_buf     *buf = NULL;
@@ -142,23 +139,23 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
                goto err1;
        }
 
-       mem->umem = umem;
+       mr->umem = umem;
        num_buf = ib_umem_num_pages(umem);
 
-       rxe_mem_init(access, mem);
+       rxe_mr_init(access, mr);
 
-       err = rxe_mem_alloc(mem, num_buf);
+       err = rxe_mr_alloc(mr, num_buf);
        if (err) {
-               pr_warn("err %d from rxe_mem_alloc\n", err);
+               pr_warn("err %d from rxe_mr_alloc\n", err);
                ib_umem_release(umem);
                goto err1;
        }
 
-       mem->page_shift         = PAGE_SHIFT;
-       mem->page_mask = PAGE_SIZE - 1;
+       mr->page_shift = PAGE_SHIFT;
+       mr->page_mask = PAGE_SIZE - 1;
 
        num_buf                 = 0;
-       map                     = mem->map;
+       map = mr->map;
        if (length > 0) {
                buf = map[0]->buf;
 
@@ -185,15 +182,15 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
                }
        }
 
-       mem->ibmr.pd            = &pd->ibpd;
-       mem->umem               = umem;
-       mem->access             = access;
-       mem->length             = length;
-       mem->iova               = iova;
-       mem->va                 = start;
-       mem->offset             = ib_umem_offset(umem);
-       mem->state              = RXE_MEM_STATE_VALID;
-       mem->type               = RXE_MEM_TYPE_MR;
+       mr->ibmr.pd = &pd->ibpd;
+       mr->umem = umem;
+       mr->access = access;
+       mr->length = length;
+       mr->iova = iova;
+       mr->va = start;
+       mr->offset = ib_umem_offset(umem);
+       mr->state = RXE_MR_STATE_VALID;
+       mr->type = RXE_MR_TYPE_MR;
 
        return 0;
 
@@ -201,24 +198,23 @@ err1:
        return err;
 }
 
-int rxe_mem_init_fast(struct rxe_pd *pd,
-                     int max_pages, struct rxe_mem *mem)
+int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
 {
        int err;
 
-       rxe_mem_init(0, mem);
+       rxe_mr_init(0, mr);
 
        /* In fastreg, we also set the rkey */
-       mem->ibmr.rkey = mem->ibmr.lkey;
+       mr->ibmr.rkey = mr->ibmr.lkey;
 
-       err = rxe_mem_alloc(mem, max_pages);
+       err = rxe_mr_alloc(mr, max_pages);
        if (err)
                goto err1;
 
-       mem->ibmr.pd            = &pd->ibpd;
-       mem->max_buf            = max_pages;
-       mem->state              = RXE_MEM_STATE_FREE;
-       mem->type               = RXE_MEM_TYPE_MR;
+       mr->ibmr.pd = &pd->ibpd;
+       mr->max_buf = max_pages;
+       mr->state = RXE_MR_STATE_FREE;
+       mr->type = RXE_MR_TYPE_MR;
 
        return 0;
 
@@ -226,28 +222,24 @@ err1:
        return err;
 }
 
-static void lookup_iova(
-       struct rxe_mem  *mem,
-       u64                     iova,
-       int                     *m_out,
-       int                     *n_out,
-       size_t                  *offset_out)
+static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
+                       size_t *offset_out)
 {
-       size_t                  offset = iova - mem->iova + mem->offset;
+       size_t offset = iova - mr->iova + mr->offset;
        int                     map_index;
        int                     buf_index;
        u64                     length;
 
-       if (likely(mem->page_shift)) {
-               *offset_out = offset & mem->page_mask;
-               offset >>= mem->page_shift;
-               *n_out = offset & mem->map_mask;
-               *m_out = offset >> mem->map_shift;
+       if (likely(mr->page_shift)) {
+               *offset_out = offset & mr->page_mask;
+               offset >>= mr->page_shift;
+               *n_out = offset & mr->map_mask;
+               *m_out = offset >> mr->map_shift;
        } else {
                map_index = 0;
                buf_index = 0;
 
-               length = mem->map[map_index]->buf[buf_index].size;
+               length = mr->map[map_index]->buf[buf_index].size;
 
                while (offset >= length) {
                        offset -= length;
@@ -257,7 +249,7 @@ static void lookup_iova(
                                map_index++;
                                buf_index = 0;
                        }
-                       length = mem->map[map_index]->buf[buf_index].size;
+                       length = mr->map[map_index]->buf[buf_index].size;
                }
 
                *m_out = map_index;
@@ -266,49 +258,49 @@ static void lookup_iova(
        }
 }
 
-void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length)
+void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
 {
        size_t offset;
        int m, n;
        void *addr;
 
-       if (mem->state != RXE_MEM_STATE_VALID) {
-               pr_warn("mem not in valid state\n");
+       if (mr->state != RXE_MR_STATE_VALID) {
+               pr_warn("mr not in valid state\n");
                addr = NULL;
                goto out;
        }
 
-       if (!mem->map) {
+       if (!mr->map) {
                addr = (void *)(uintptr_t)iova;
                goto out;
        }
 
-       if (mem_check_range(mem, iova, length)) {
+       if (mr_check_range(mr, iova, length)) {
                pr_warn("range violation\n");
                addr = NULL;
                goto out;
        }
 
-       lookup_iova(mem, iova, &m, &n, &offset);
+       lookup_iova(mr, iova, &m, &n, &offset);
 
-       if (offset + length > mem->map[m]->buf[n].size) {
+       if (offset + length > mr->map[m]->buf[n].size) {
                pr_warn("crosses page boundary\n");
                addr = NULL;
                goto out;
        }
 
-       addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset;
+       addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset;
 
 out:
        return addr;
 }
 
 /* copy data from a range (vaddr, vaddr+length-1) to or from
- * a mem object starting at iova. Compute incremental value of
- * crc32 if crcp is not zero. caller must hold a reference to mem
+ * a mr object starting at iova. Compute incremental value of
+ * crc32 if crcp is not zero. caller must hold a reference to mr
  */
-int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
-                enum copy_direction dir, u32 *crcp)
+int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+               enum copy_direction dir, u32 *crcp)
 {
        int                     err;
        int                     bytes;
@@ -323,43 +315,41 @@ int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
        if (length == 0)
                return 0;
 
-       if (mem->type == RXE_MEM_TYPE_DMA) {
+       if (mr->type == RXE_MR_TYPE_DMA) {
                u8 *src, *dest;
 
-               src  = (dir == to_mem_obj) ?
-                       addr : ((void *)(uintptr_t)iova);
+               src = (dir == to_mr_obj) ? addr : ((void *)(uintptr_t)iova);
 
-               dest = (dir == to_mem_obj) ?
-                       ((void *)(uintptr_t)iova) : addr;
+               dest = (dir == to_mr_obj) ? ((void *)(uintptr_t)iova) : addr;
 
                memcpy(dest, src, length);
 
                if (crcp)
-                       *crcp = rxe_crc32(to_rdev(mem->ibmr.device),
-                                       *crcp, dest, length);
+                       *crcp = rxe_crc32(to_rdev(mr->ibmr.device), *crcp, dest,
+                                         length);
 
                return 0;
        }
 
-       WARN_ON_ONCE(!mem->map);
+       WARN_ON_ONCE(!mr->map);
 
-       err = mem_check_range(mem, iova, length);
+       err = mr_check_range(mr, iova, length);
        if (err) {
                err = -EFAULT;
                goto err1;
        }
 
-       lookup_iova(mem, iova, &m, &i, &offset);
+       lookup_iova(mr, iova, &m, &i, &offset);
 
-       map     = mem->map + m;
+       map = mr->map + m;
        buf     = map[0]->buf + i;
 
        while (length > 0) {
                u8 *src, *dest;
 
                va      = (u8 *)(uintptr_t)buf->addr + offset;
-               src  = (dir == to_mem_obj) ? addr : va;
-               dest = (dir == to_mem_obj) ? va : addr;
+               src = (dir == to_mr_obj) ? addr : va;
+               dest = (dir == to_mr_obj) ? va : addr;
 
                bytes   = buf->size - offset;
 
@@ -369,8 +359,8 @@ int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
                memcpy(dest, src, bytes);
 
                if (crcp)
-                       crc = rxe_crc32(to_rdev(mem->ibmr.device),
-                                       crc, dest, bytes);
+                       crc = rxe_crc32(to_rdev(mr->ibmr.device), crc, dest,
+                                       bytes);
 
                length  -= bytes;
                addr    += bytes;
@@ -411,7 +401,7 @@ int copy_data(
        struct rxe_sge          *sge    = &dma->sge[dma->cur_sge];
        int                     offset  = dma->sge_offset;
        int                     resid   = dma->resid;
-       struct rxe_mem          *mem    = NULL;
+       struct rxe_mr           *mr     = NULL;
        u64                     iova;
        int                     err;
 
@@ -424,8 +414,8 @@ int copy_data(
        }
 
        if (sge->length && (offset < sge->length)) {
-               mem = lookup_mem(pd, access, sge->lkey, lookup_local);
-               if (!mem) {
+               mr = lookup_mr(pd, access, sge->lkey, lookup_local);
+               if (!mr) {
                        err = -EINVAL;
                        goto err1;
                }
@@ -435,9 +425,9 @@ int copy_data(
                bytes = length;
 
                if (offset >= sge->length) {
-                       if (mem) {
-                               rxe_drop_ref(mem);
-                               mem = NULL;
+                       if (mr) {
+                               rxe_drop_ref(mr);
+                               mr = NULL;
                        }
                        sge++;
                        dma->cur_sge++;
@@ -449,9 +439,9 @@ int copy_data(
                        }
 
                        if (sge->length) {
-                               mem = lookup_mem(pd, access, sge->lkey,
-                                                lookup_local);
-                               if (!mem) {
+                               mr = lookup_mr(pd, access, sge->lkey,
+                                              lookup_local);
+                               if (!mr) {
                                        err = -EINVAL;
                                        goto err1;
                                }
@@ -466,7 +456,7 @@ int copy_data(
                if (bytes > 0) {
                        iova = sge->addr + offset;
 
-                       err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp);
+                       err = rxe_mr_copy(mr, iova, addr, bytes, dir, crcp);
                        if (err)
                                goto err2;
 
@@ -480,14 +470,14 @@ int copy_data(
        dma->sge_offset = offset;
        dma->resid      = resid;
 
-       if (mem)
-               rxe_drop_ref(mem);
+       if (mr)
+               rxe_drop_ref(mr);
 
        return 0;
 
 err2:
-       if (mem)
-               rxe_drop_ref(mem);
+       if (mr)
+               rxe_drop_ref(mr);
 err1:
        return err;
 }
@@ -525,31 +515,30 @@ int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
        return 0;
 }
 
-/* (1) find the mem (mr or mw) corresponding to lkey/rkey
+/* (1) find the mr corresponding to lkey/rkey
  *     depending on lookup_type
- * (2) verify that the (qp) pd matches the mem pd
- * (3) verify that the mem can support the requested access
- * (4) verify that mem state is valid
+ * (2) verify that the (qp) pd matches the mr pd
+ * (3) verify that the mr can support the requested access
+ * (4) verify that mr state is valid
  */
-struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
-                          enum lookup_type type)
+struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
+                        enum lookup_type type)
 {
-       struct rxe_mem *mem;
+       struct rxe_mr *mr;
        struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
        int index = key >> 8;
 
-       mem = rxe_pool_get_index(&rxe->mr_pool, index);
-       if (!mem)
+       mr = rxe_pool_get_index(&rxe->mr_pool, index);
+       if (!mr)
                return NULL;
 
-       if (unlikely((type == lookup_local && mr_lkey(mem) != key) ||
-                    (type == lookup_remote && mr_rkey(mem) != key) ||
-                    mr_pd(mem) != pd ||
-                    (access && !(access & mem->access)) ||
-                    mem->state != RXE_MEM_STATE_VALID)) {
-               rxe_drop_ref(mem);
-               mem = NULL;
+       if (unlikely((type == lookup_local && mr_lkey(mr) != key) ||
+                    (type == lookup_remote && mr_rkey(mr) != key) ||
+                    mr_pd(mr) != pd || (access && !(access & mr->access)) ||
+                    mr->state != RXE_MR_STATE_VALID)) {
+               rxe_drop_ref(mr);
+               mr = NULL;
        }
 
-       return mem;
+       return mr;
 }
index 307d898..d24901f 100644 (file)
@@ -8,8 +8,6 @@
 #include "rxe_loc.h"
 
 /* info about object pools
- * note that mr and mw share a single index space
- * so that one can map an lkey to the correct type of object
  */
 struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
        [RXE_TYPE_UC] = {
@@ -56,18 +54,18 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
        },
        [RXE_TYPE_MR] = {
                .name           = "rxe-mr",
-               .size           = sizeof(struct rxe_mem),
-               .elem_offset    = offsetof(struct rxe_mem, pelem),
-               .cleanup        = rxe_mem_cleanup,
+               .size           = sizeof(struct rxe_mr),
+               .elem_offset    = offsetof(struct rxe_mr, pelem),
+               .cleanup        = rxe_mr_cleanup,
                .flags          = RXE_POOL_INDEX,
                .max_index      = RXE_MAX_MR_INDEX,
                .min_index      = RXE_MIN_MR_INDEX,
        },
        [RXE_TYPE_MW] = {
                .name           = "rxe-mw",
-               .size           = sizeof(struct rxe_mem),
-               .elem_offset    = offsetof(struct rxe_mem, pelem),
-               .flags          = RXE_POOL_INDEX,
+               .size           = sizeof(struct rxe_mw),
+               .elem_offset    = offsetof(struct rxe_mw, pelem),
+               .flags          = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC,
                .max_index      = RXE_MAX_MW_INDEX,
                .min_index      = RXE_MIN_MW_INDEX,
        },
index 8892907..3664cda 100644 (file)
@@ -464,7 +464,7 @@ static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
                } else {
                        err = copy_data(qp->pd, 0, &wqe->dma,
                                        payload_addr(pkt), paylen,
-                                       from_mem_obj,
+                                       from_mr_obj,
                                        &crc);
                        if (err)
                                return err;
@@ -596,7 +596,7 @@ next_wqe:
        if (wqe->mask & WR_REG_MASK) {
                if (wqe->wr.opcode == IB_WR_LOCAL_INV) {
                        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
-                       struct rxe_mem *rmr;
+                       struct rxe_mr *rmr;
 
                        rmr = rxe_pool_get_index(&rxe->mr_pool,
                                                 wqe->wr.ex.invalidate_rkey >> 8);
@@ -607,14 +607,14 @@ next_wqe:
                                wqe->status = IB_WC_MW_BIND_ERR;
                                goto exit;
                        }
-                       rmr->state = RXE_MEM_STATE_FREE;
+                       rmr->state = RXE_MR_STATE_FREE;
                        rxe_drop_ref(rmr);
                        wqe->state = wqe_state_done;
                        wqe->status = IB_WC_SUCCESS;
                } else if (wqe->wr.opcode == IB_WR_REG_MR) {
-                       struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr);
+                       struct rxe_mr *rmr = to_rmr(wqe->wr.wr.reg.mr);
 
-                       rmr->state = RXE_MEM_STATE_VALID;
+                       rmr->state = RXE_MR_STATE_VALID;
                        rmr->access = wqe->wr.wr.reg.access;
                        rmr->ibmr.lkey = wqe->wr.wr.reg.key;
                        rmr->ibmr.rkey = wqe->wr.wr.reg.key;
index 142f3d8..2b22065 100644 (file)
@@ -391,7 +391,7 @@ static enum resp_states check_length(struct rxe_qp *qp,
 static enum resp_states check_rkey(struct rxe_qp *qp,
                                   struct rxe_pkt_info *pkt)
 {
-       struct rxe_mem *mem = NULL;
+       struct rxe_mr *mr = NULL;
        u64 va;
        u32 rkey;
        u32 resid;
@@ -430,18 +430,18 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
        resid   = qp->resp.resid;
        pktlen  = payload_size(pkt);
 
-       mem = lookup_mem(qp->pd, access, rkey, lookup_remote);
-       if (!mem) {
+       mr = lookup_mr(qp->pd, access, rkey, lookup_remote);
+       if (!mr) {
                state = RESPST_ERR_RKEY_VIOLATION;
                goto err;
        }
 
-       if (unlikely(mem->state == RXE_MEM_STATE_FREE)) {
+       if (unlikely(mr->state == RXE_MR_STATE_FREE)) {
                state = RESPST_ERR_RKEY_VIOLATION;
                goto err;
        }
 
-       if (mem_check_range(mem, va, resid)) {
+       if (mr_check_range(mr, va, resid)) {
                state = RESPST_ERR_RKEY_VIOLATION;
                goto err;
        }
@@ -469,12 +469,12 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
 
        WARN_ON_ONCE(qp->resp.mr);
 
-       qp->resp.mr = mem;
+       qp->resp.mr = mr;
        return RESPST_EXECUTE;
 
 err:
-       if (mem)
-               rxe_drop_ref(mem);
+       if (mr)
+               rxe_drop_ref(mr);
        return state;
 }
 
@@ -484,7 +484,7 @@ static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr,
        int err;
 
        err = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma,
-                       data_addr, data_len, to_mem_obj, NULL);
+                       data_addr, data_len, to_mr_obj, NULL);
        if (unlikely(err))
                return (err == -ENOSPC) ? RESPST_ERR_LENGTH
                                        : RESPST_ERR_MALFORMED_WQE;
@@ -499,8 +499,8 @@ static enum resp_states write_data_in(struct rxe_qp *qp,
        int     err;
        int data_len = payload_size(pkt);
 
-       err = rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt),
-                          data_len, to_mem_obj, NULL);
+       err = rxe_mr_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt), data_len,
+                         to_mr_obj, NULL);
        if (err) {
                rc = RESPST_ERR_RKEY_VIOLATION;
                goto out;
@@ -522,9 +522,9 @@ static enum resp_states process_atomic(struct rxe_qp *qp,
        u64 iova = atmeth_va(pkt);
        u64 *vaddr;
        enum resp_states ret;
-       struct rxe_mem *mr = qp->resp.mr;
+       struct rxe_mr *mr = qp->resp.mr;
 
-       if (mr->state != RXE_MEM_STATE_VALID) {
+       if (mr->state != RXE_MR_STATE_VALID) {
                ret = RESPST_ERR_RKEY_VIOLATION;
                goto out;
        }
@@ -700,8 +700,8 @@ static enum resp_states read_reply(struct rxe_qp *qp,
        if (!skb)
                return RESPST_ERR_RNR;
 
-       err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt),
-                          payload, from_mem_obj, &icrc);
+       err = rxe_mr_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt),
+                         payload, from_mr_obj, &icrc);
        if (err)
                pr_err("Failed copying memory\n");
 
@@ -816,8 +816,8 @@ static enum resp_states do_complete(struct rxe_qp *qp,
        struct rxe_recv_wqe *wqe = qp->resp.wqe;
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
 
-       if (unlikely(!wqe))
-               return RESPST_CLEANUP;
+       if (!wqe)
+               goto finish;
 
        memset(&cqe, 0, sizeof(cqe));
 
@@ -883,7 +883,7 @@ static enum resp_states do_complete(struct rxe_qp *qp,
                        }
 
                        if (pkt->mask & RXE_IETH_MASK) {
-                               struct rxe_mem *rmr;
+                               struct rxe_mr *rmr;
 
                                wc->wc_flags |= IB_WC_WITH_INVALIDATE;
                                wc->ex.invalidate_rkey = ieth_rkey(pkt);
@@ -895,7 +895,7 @@ static enum resp_states do_complete(struct rxe_qp *qp,
                                               wc->ex.invalidate_rkey);
                                        return RESPST_ERROR;
                                }
-                               rmr->state = RXE_MEM_STATE_FREE;
+                               rmr->state = RXE_MR_STATE_FREE;
                                rxe_drop_ref(rmr);
                        }
 
@@ -917,12 +917,12 @@ static enum resp_states do_complete(struct rxe_qp *qp,
        if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1))
                return RESPST_ERR_CQ_OVERFLOW;
 
-       if (qp->resp.state == QP_STATE_ERROR)
+finish:
+       if (unlikely(qp->resp.state == QP_STATE_ERROR))
                return RESPST_CHK_RESOURCE;
-
-       if (!pkt)
+       if (unlikely(!pkt))
                return RESPST_DONE;
-       else if (qp_type(qp) == IB_QPT_RC)
+       if (qp_type(qp) == IB_QPT_RC)
                return RESPST_ACKNOWLEDGE;
        else
                return RESPST_CLEANUP;
@@ -1056,10 +1056,8 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
        if (pkt->mask & RXE_SEND_MASK ||
            pkt->mask & RXE_WRITE_MASK) {
                /* SEND. Ack again and cleanup. C9-105. */
-               if (bth_ack(pkt))
-                       send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn);
-               rc = RESPST_CLEANUP;
-               goto out;
+               send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn);
+               return RESPST_CLEANUP;
        } else if (pkt->mask & RXE_READ_MASK) {
                struct resp_res *res;
 
index dee5e0e..aeb5e23 100644 (file)
@@ -26,7 +26,7 @@ static int rxe_query_device(struct ib_device *dev,
 }
 
 static int rxe_query_port(struct ib_device *dev,
-                         u8 port_num, struct ib_port_attr *attr)
+                         u32 port_num, struct ib_port_attr *attr)
 {
        struct rxe_dev *rxe = to_rdev(dev);
        struct rxe_port *port;
@@ -54,7 +54,7 @@ static int rxe_query_port(struct ib_device *dev,
 }
 
 static int rxe_query_pkey(struct ib_device *device,
-                         u8 port_num, u16 index, u16 *pkey)
+                         u32 port_num, u16 index, u16 *pkey)
 {
        if (index > 0)
                return -EINVAL;
@@ -84,7 +84,7 @@ static int rxe_modify_device(struct ib_device *dev,
 }
 
 static int rxe_modify_port(struct ib_device *dev,
-                          u8 port_num, int mask, struct ib_port_modify *attr)
+                          u32 port_num, int mask, struct ib_port_modify *attr)
 {
        struct rxe_dev *rxe = to_rdev(dev);
        struct rxe_port *port;
@@ -101,7 +101,7 @@ static int rxe_modify_port(struct ib_device *dev,
 }
 
 static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
-                                              u8 port_num)
+                                              u32 port_num)
 {
        return IB_LINK_LAYER_ETHERNET;
 }
@@ -121,7 +121,7 @@ static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
        rxe_drop_ref(uc);
 }
 
-static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
+static int rxe_port_immutable(struct ib_device *dev, u32 port_num,
                              struct ib_port_immutable *immutable)
 {
        int err;
@@ -865,7 +865,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
 {
        struct rxe_dev *rxe = to_rdev(ibpd->device);
        struct rxe_pd *pd = to_rpd(ibpd);
-       struct rxe_mem *mr;
+       struct rxe_mr *mr;
 
        mr = rxe_alloc(&rxe->mr_pool);
        if (!mr)
@@ -873,7 +873,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
 
        rxe_add_index(mr);
        rxe_add_ref(pd);
-       rxe_mem_init_dma(pd, access, mr);
+       rxe_mr_init_dma(pd, access, mr);
 
        return &mr->ibmr;
 }
@@ -887,7 +887,7 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
        int err;
        struct rxe_dev *rxe = to_rdev(ibpd->device);
        struct rxe_pd *pd = to_rpd(ibpd);
-       struct rxe_mem *mr;
+       struct rxe_mr *mr;
 
        mr = rxe_alloc(&rxe->mr_pool);
        if (!mr) {
@@ -899,8 +899,7 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
 
        rxe_add_ref(pd);
 
-       err = rxe_mem_init_user(pd, start, length, iova,
-                               access, udata, mr);
+       err = rxe_mr_init_user(pd, start, length, iova, access, udata, mr);
        if (err)
                goto err3;
 
@@ -916,9 +915,9 @@ err2:
 
 static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
-       struct rxe_mem *mr = to_rmr(ibmr);
+       struct rxe_mr *mr = to_rmr(ibmr);
 
-       mr->state = RXE_MEM_STATE_ZOMBIE;
+       mr->state = RXE_MR_STATE_ZOMBIE;
        rxe_drop_ref(mr_pd(mr));
        rxe_drop_index(mr);
        rxe_drop_ref(mr);
@@ -930,7 +929,7 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 {
        struct rxe_dev *rxe = to_rdev(ibpd->device);
        struct rxe_pd *pd = to_rpd(ibpd);
-       struct rxe_mem *mr;
+       struct rxe_mr *mr;
        int err;
 
        if (mr_type != IB_MR_TYPE_MEM_REG)
@@ -946,7 +945,7 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 
        rxe_add_ref(pd);
 
-       err = rxe_mem_init_fast(pd, max_num_sg, mr);
+       err = rxe_mr_init_fast(pd, max_num_sg, mr);
        if (err)
                goto err2;
 
@@ -962,7 +961,7 @@ err1:
 
 static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
 {
-       struct rxe_mem *mr = to_rmr(ibmr);
+       struct rxe_mr *mr = to_rmr(ibmr);
        struct rxe_map *map;
        struct rxe_phys_buf *buf;
 
@@ -982,7 +981,7 @@ static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
 static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
                         int sg_nents, unsigned int *sg_offset)
 {
-       struct rxe_mem *mr = to_rmr(ibmr);
+       struct rxe_mr *mr = to_rmr(ibmr);
        int n;
 
        mr->nbuf = 0;
@@ -1110,6 +1109,7 @@ static const struct ib_device_ops rxe_dev_ops = {
        INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
+       INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
 };
 
 int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
index 79e0a5a..11eba7a 100644 (file)
@@ -156,7 +156,7 @@ struct resp_res {
                        struct sk_buff  *skb;
                } atomic;
                struct {
-                       struct rxe_mem  *mr;
+                       struct rxe_mr   *mr;
                        u64             va_org;
                        u32             rkey;
                        u32             length;
@@ -183,7 +183,7 @@ struct rxe_resp_info {
 
        /* RDMA read / atomic only */
        u64                     va;
-       struct rxe_mem          *mr;
+       struct rxe_mr           *mr;
        u32                     resid;
        u32                     rkey;
        u32                     length;
@@ -262,18 +262,18 @@ struct rxe_qp {
        struct execute_work     cleanup_work;
 };
 
-enum rxe_mem_state {
-       RXE_MEM_STATE_ZOMBIE,
-       RXE_MEM_STATE_INVALID,
-       RXE_MEM_STATE_FREE,
-       RXE_MEM_STATE_VALID,
+enum rxe_mr_state {
+       RXE_MR_STATE_ZOMBIE,
+       RXE_MR_STATE_INVALID,
+       RXE_MR_STATE_FREE,
+       RXE_MR_STATE_VALID,
 };
 
-enum rxe_mem_type {
-       RXE_MEM_TYPE_NONE,
-       RXE_MEM_TYPE_DMA,
-       RXE_MEM_TYPE_MR,
-       RXE_MEM_TYPE_MW,
+enum rxe_mr_type {
+       RXE_MR_TYPE_NONE,
+       RXE_MR_TYPE_DMA,
+       RXE_MR_TYPE_MR,
+       RXE_MR_TYPE_MW,
 };
 
 #define RXE_BUF_PER_MAP                (PAGE_SIZE / sizeof(struct rxe_phys_buf))
@@ -287,17 +287,14 @@ struct rxe_map {
        struct rxe_phys_buf     buf[RXE_BUF_PER_MAP];
 };
 
-struct rxe_mem {
+struct rxe_mr {
        struct rxe_pool_entry   pelem;
-       union {
-               struct ib_mr            ibmr;
-               struct ib_mw            ibmw;
-       };
+       struct ib_mr            ibmr;
 
        struct ib_umem          *umem;
 
-       enum rxe_mem_state      state;
-       enum rxe_mem_type       type;
+       enum rxe_mr_state       state;
+       enum rxe_mr_type        type;
        u64                     va;
        u64                     iova;
        size_t                  length;
@@ -318,6 +315,17 @@ struct rxe_mem {
        struct rxe_map          **map;
 };
 
+enum rxe_mw_state {
+       RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID,
+       RXE_MW_STATE_FREE = RXE_MR_STATE_FREE,
+       RXE_MW_STATE_VALID = RXE_MR_STATE_VALID,
+};
+
+struct rxe_mw {
+       struct ib_mw ibmw;
+       struct rxe_pool_entry pelem;
+};
+
 struct rxe_mc_grp {
        struct rxe_pool_entry   pelem;
        spinlock_t              mcg_lock; /* guard group */
@@ -422,27 +430,27 @@ static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
        return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL;
 }
 
-static inline struct rxe_mem *to_rmr(struct ib_mr *mr)
+static inline struct rxe_mr *to_rmr(struct ib_mr *mr)
 {
-       return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL;
+       return mr ? container_of(mr, struct rxe_mr, ibmr) : NULL;
 }
 
-static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
+static inline struct rxe_mw *to_rmw(struct ib_mw *mw)
 {
-       return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
+       return mw ? container_of(mw, struct rxe_mw, ibmw) : NULL;
 }
 
-static inline struct rxe_pd *mr_pd(struct rxe_mem *mr)
+static inline struct rxe_pd *mr_pd(struct rxe_mr *mr)
 {
        return to_rpd(mr->ibmr.pd);
 }
 
-static inline u32 mr_lkey(struct rxe_mem *mr)
+static inline u32 mr_lkey(struct rxe_mr *mr)
 {
        return mr->ibmr.lkey;
 }
 
-static inline u32 mr_rkey(struct rxe_mem *mr)
+static inline u32 mr_rkey(struct rxe_mr *mr)
 {
        return mr->ibmr.rkey;
 }
index e8a04d9..3f1dedb 100644 (file)
@@ -114,13 +114,6 @@ static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl)
        return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8;
 }
 
-static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version)
-{
-       ctrl->ddp_rdmap_ctrl =
-               (ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) |
-               (cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION);
-}
-
 static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
 {
        __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION;
@@ -128,12 +121,6 @@ static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
        return be16_to_cpu(ver) >> 6;
 }
 
-static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version)
-{
-       ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) |
-                              (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION);
-}
-
 static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl)
 {
        return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE);
index 1f9e15b..7a5ed86 100644 (file)
@@ -1300,7 +1300,7 @@ static void siw_cm_llp_state_change(struct sock *sk)
 }
 
 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
-                             struct sockaddr *raddr)
+                             struct sockaddr *raddr, bool afonly)
 {
        int rv, flags = 0;
        size_t size = laddr->sa_family == AF_INET ?
@@ -1311,6 +1311,12 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
         */
        sock_set_reuseaddr(s->sk);
 
+       if (afonly) {
+               rv = ip6_sock_set_v6only(s->sk);
+               if (rv)
+                       return rv;
+       }
+
        rv = s->ops->bind(s, laddr, size);
        if (rv < 0)
                return rv;
@@ -1371,7 +1377,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
         * mode. Might be reconsidered for async connection setup at
         * TCP level.
         */
-       rv = kernel_bindconnect(s, laddr, raddr);
+       rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
        if (rv != 0) {
                siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
                goto error;
@@ -1786,6 +1792,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
        } else {
                struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
 
+               if (id->afonly) {
+                       rv = ip6_sock_set_v6only(s->sk);
+                       if (rv) {
+                               siw_dbg(id->device,
+                                       "ip6_sock_set_v6only error: %d\n", rv);
+                               goto error;
+                       }
+               }
+
                /* For wildcard addr, limit binding to current device only */
                if (ipv6_addr_any(&laddr->sin6_addr))
                        s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
index 34a910c..61c17db 100644 (file)
@@ -106,8 +106,6 @@ int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
        mem->perms = rights & IWARP_ACCESS_MASK;
        kref_init(&mem->ref);
 
-       mr->mem = mem;
-
        get_random_bytes(&next, 4);
        next &= 0x00ffffff;
 
@@ -116,6 +114,8 @@ int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
                kfree(mem);
                return -ENOMEM;
        }
+
+       mr->mem = mem;
        /* Set the STag index part */
        mem->stag = id << 8;
        mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
index db138c8..f911287 100644 (file)
@@ -29,11 +29,6 @@ static inline void siw_mem_put(struct siw_mem *mem)
        kref_put(&mem->ref, siw_free_mem);
 }
 
-static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
-{
-       return container_of(m, struct siw_mr, mem);
-}
-
 static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge)
 {
        while (num_sge) {
index e389d44..d2313ef 100644 (file)
@@ -160,7 +160,7 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
        return 0;
 }
 
-int siw_query_port(struct ib_device *base_dev, u8 port,
+int siw_query_port(struct ib_device *base_dev, u32 port,
                   struct ib_port_attr *attr)
 {
        struct siw_device *sdev = to_siw_dev(base_dev);
@@ -194,7 +194,7 @@ int siw_query_port(struct ib_device *base_dev, u8 port,
        return rv;
 }
 
-int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
                           struct ib_port_immutable *port_immutable)
 {
        struct ib_port_attr attr;
@@ -209,7 +209,7 @@ int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
        return 0;
 }
 
-int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
                  union ib_gid *gid)
 {
        struct siw_device *sdev = to_siw_dev(base_dev);
@@ -1848,7 +1848,7 @@ void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
        }
 }
 
-void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
+void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
 {
        struct ib_event event;
 
index 6374545..67ac088 100644 (file)
@@ -36,17 +36,17 @@ static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge,
 
 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
-int siw_query_port(struct ib_device *base_dev, u8 port,
+int siw_query_port(struct ib_device *base_dev, u32 port,
                   struct ib_port_attr *attr);
-int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
                           struct ib_port_immutable *port_immutable);
 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
                     struct ib_udata *udata);
 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
                  struct ib_udata *udata);
-int siw_query_port(struct ib_device *base_dev, u8 port,
+int siw_query_port(struct ib_device *base_dev, u32 port,
                   struct ib_port_attr *attr);
-int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
                  union ib_gid *gid);
 int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
 int siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
@@ -86,6 +86,6 @@ void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
 void siw_qp_event(struct siw_qp *qp, enum ib_event_type type);
 void siw_cq_event(struct siw_cq *cq, enum ib_event_type type);
 void siw_srq_event(struct siw_srq *srq, enum ib_event_type type);
-void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type);
+void siw_port_event(struct siw_device *dev, u32 port, enum ib_event_type type);
 
 #endif
index 179ff1d..75cd447 100644 (file)
@@ -501,9 +501,9 @@ void ipoib_reap_ah(struct work_struct *work);
 struct ipoib_path *__path_find(struct net_device *dev, void *gid);
 void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
-struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
                                    const char *format);
-int ipoib_intf_init(struct ib_device *hca, u8 port, const char *format,
+int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format,
                    struct net_device *dev);
 void ipoib_ib_tx_timer_func(struct timer_list *t);
 void ipoib_ib_dev_flush_light(struct work_struct *work);
@@ -677,8 +677,6 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
 void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
 #else
 
-struct ipoib_cm_tx;
-
 #define ipoib_max_conn_qp 0
 
 static inline int ipoib_cm_admin_enabled(struct net_device *dev)
index d5d592b..9dbc85a 100644 (file)
@@ -1122,12 +1122,8 @@ static int ipoib_cm_modify_tx_init(struct net_device *dev,
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ib_qp_attr qp_attr;
        int qp_attr_mask, ret;
-       ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
-       if (ret) {
-               ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
-               return ret;
-       }
 
+       qp_attr.pkey_index = priv->pkey_index;
        qp_attr.qp_state = IB_QPS_INIT;
        qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
        qp_attr.port_num = priv->port;
index 494f413..ceabfb0 100644 (file)
@@ -1060,7 +1060,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
        union ib_gid *netdev_gid;
        int err;
        u16 index;
-       u8 port;
+       u32 port;
        bool ret = false;
 
        netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
index e16b40c..bbb1808 100644 (file)
@@ -90,7 +90,7 @@ static int ipoib_add_one(struct ib_device *device);
 static void ipoib_remove_one(struct ib_device *device, void *client_data);
 static void ipoib_neigh_reclaim(struct rcu_head *rp);
 static struct net_device *ipoib_get_net_dev_by_params(
-               struct ib_device *dev, u8 port, u16 pkey,
+               struct ib_device *dev, u32 port, u16 pkey,
                const union ib_gid *gid, const struct sockaddr *addr,
                void *client_data);
 static int ipoib_set_mac(struct net_device *dev, void *addr);
@@ -164,8 +164,13 @@ int ipoib_open(struct net_device *dev)
                        dev_change_flags(cpriv->dev, flags | IFF_UP, NULL);
                }
                up_read(&priv->vlan_rwsem);
-       }
+       } else if (priv->parent) {
+               struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
 
+               if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags))
+                       ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n",
+                                 ppriv->dev->name);
+       }
        netif_start_queue(dev);
 
        return 0;
@@ -438,7 +443,7 @@ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
 /* Returns the number of matching net_devs found (between 0 and 2). Also
  * return the matching net_device in the @net_dev parameter, holding a
  * reference to the net_device, if the number of matches >= 1 */
-static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
+static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port,
                                         u16 pkey_index,
                                         const union ib_gid *gid,
                                         const struct sockaddr *addr,
@@ -463,7 +468,7 @@ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
 }
 
 static struct net_device *ipoib_get_net_dev_by_params(
-               struct ib_device *dev, u8 port, u16 pkey,
+               struct ib_device *dev, u32 port, u16 pkey,
                const union ib_gid *gid, const struct sockaddr *addr,
                void *client_data)
 {
@@ -1181,7 +1186,12 @@ unref:
 static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
 {
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
+       struct rdma_netdev *rn = netdev_priv(dev);
 
+       if (rn->tx_timeout) {
+               rn->tx_timeout(dev, txqueue);
+               return;
+       }
        ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
                   jiffies_to_msecs(jiffies - dev_trans_start(dev)));
        ipoib_warn(priv,
@@ -2145,7 +2155,7 @@ static void ipoib_build_priv(struct net_device *dev)
        INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
 }
 
-static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
+static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port,
                                             const char *name)
 {
        struct net_device *dev;
@@ -2162,7 +2172,7 @@ static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
        return dev;
 }
 
-int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
+int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name,
                    struct net_device *dev)
 {
        struct rdma_netdev *rn = netdev_priv(dev);
@@ -2213,7 +2223,7 @@ out:
        return rc;
 }
 
-struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
                                    const char *name)
 {
        struct net_device *dev;
@@ -2456,7 +2466,7 @@ static int ipoib_intercept_dev_id_attr(struct net_device *dev)
 }
 
 static struct net_device *ipoib_add_port(const char *format,
-                                        struct ib_device *hca, u8 port)
+                                        struct ib_device *hca, u32 port)
 {
        struct rtnl_link_ops *ops = ipoib_get_link_ops();
        struct rdma_netdev_alloc_params params;
index 78ee944..9f6ac0a 100644 (file)
@@ -297,7 +297,6 @@ struct iser_login_desc {
 
 struct iser_conn;
 struct ib_conn;
-struct iscsi_iser_task;
 
 /**
  * struct iser_device - iSER device handle
index 7305ed8..18266f0 100644 (file)
@@ -438,23 +438,23 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
        isert_init_conn(isert_conn);
        isert_conn->cm_id = cma_id;
 
-       ret = isert_alloc_login_buf(isert_conn, cma_id->device);
-       if (ret)
-               goto out;
-
        device = isert_device_get(cma_id);
        if (IS_ERR(device)) {
                ret = PTR_ERR(device);
-               goto out_rsp_dma_map;
+               goto out;
        }
        isert_conn->device = device;
 
+       ret = isert_alloc_login_buf(isert_conn, cma_id->device);
+       if (ret)
+               goto out_conn_dev;
+
        isert_set_nego_params(isert_conn, &event->param.conn);
 
        isert_conn->qp = isert_create_qp(isert_conn, cma_id);
        if (IS_ERR(isert_conn->qp)) {
                ret = PTR_ERR(isert_conn->qp);
-               goto out_conn_dev;
+               goto out_rsp_dma_map;
        }
 
        ret = isert_login_post_recv(isert_conn);
@@ -473,10 +473,10 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 
 out_destroy_qp:
        isert_destroy_qp(isert_conn);
-out_conn_dev:
-       isert_device_put(device);
 out_rsp_dma_map:
        isert_free_login_buf(isert_conn);
+out_conn_dev:
+       isert_device_put(device);
 out:
        kfree(isert_conn);
        rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED);
index b6a0abf..7d53d18 100644 (file)
@@ -101,6 +101,9 @@ static ssize_t mpath_policy_show(struct device *dev,
        case MP_POLICY_MIN_INFLIGHT:
                return sysfs_emit(page, "min-inflight (MI: %d)\n",
                                  clt->mp_policy);
+       case MP_POLICY_MIN_LATENCY:
+               return sysfs_emit(page, "min-latency (ML: %d)\n",
+                                 clt->mp_policy);
        default:
                return sysfs_emit(page, "Unknown (%d)\n", clt->mp_policy);
        }
@@ -114,22 +117,32 @@ static ssize_t mpath_policy_store(struct device *dev,
        struct rtrs_clt *clt;
        int value;
        int ret;
+       size_t len = 0;
 
        clt = container_of(dev, struct rtrs_clt, dev);
 
        ret = kstrtoint(buf, 10, &value);
        if (!ret && (value == MP_POLICY_RR ||
-                    value == MP_POLICY_MIN_INFLIGHT)) {
+                    value == MP_POLICY_MIN_INFLIGHT ||
+                    value == MP_POLICY_MIN_LATENCY)) {
                clt->mp_policy = value;
                return count;
        }
 
+       /* distinguish "mi" and "min-latency" with length */
+       len = strnlen(buf, NAME_MAX);
+       if (len && buf[len - 1] == '\n')
+               len--;
+
        if (!strncasecmp(buf, "round-robin", 11) ||
-           !strncasecmp(buf, "rr", 2))
+           (len == 2 && !strncasecmp(buf, "rr", 2)))
                clt->mp_policy = MP_POLICY_RR;
        else if (!strncasecmp(buf, "min-inflight", 12) ||
-                !strncasecmp(buf, "mi", 2))
+                (len == 2 && !strncasecmp(buf, "mi", 2)))
                clt->mp_policy = MP_POLICY_MIN_INFLIGHT;
+       else if (!strncasecmp(buf, "min-latency", 11) ||
+                (len == 2 && !strncasecmp(buf, "ml", 2)))
+               clt->mp_policy = MP_POLICY_MIN_LATENCY;
        else
                return -EINVAL;
 
@@ -342,6 +355,21 @@ static ssize_t rtrs_clt_hca_name_show(struct kobject *kobj,
 static struct kobj_attribute rtrs_clt_hca_name_attr =
        __ATTR(hca_name, 0444, rtrs_clt_hca_name_show, NULL);
 
+static ssize_t rtrs_clt_cur_latency_show(struct kobject *kobj,
+                                   struct kobj_attribute *attr,
+                                   char *page)
+{
+       struct rtrs_clt_sess *sess;
+
+       sess = container_of(kobj, struct rtrs_clt_sess, kobj);
+
+       return sysfs_emit(page, "%lld ns\n",
+                         ktime_to_ns(sess->s.hb_cur_latency));
+}
+
+static struct kobj_attribute rtrs_clt_cur_latency_attr =
+       __ATTR(cur_latency, 0444, rtrs_clt_cur_latency_show, NULL);
+
 static ssize_t rtrs_clt_src_addr_show(struct kobject *kobj,
                                       struct kobj_attribute *attr,
                                       char *page)
@@ -385,6 +413,7 @@ static struct attribute *rtrs_clt_sess_attrs[] = {
        &rtrs_clt_reconnect_attr.attr,
        &rtrs_clt_disconnect_attr.attr,
        &rtrs_clt_remove_path_attr.attr,
+       &rtrs_clt_cur_latency_attr.attr,
        NULL,
 };
 
@@ -396,14 +425,13 @@ int rtrs_clt_create_sess_files(struct rtrs_clt_sess *sess)
 {
        struct rtrs_clt *clt = sess->clt;
        char str[NAME_MAX];
-       int err, cnt;
-
-       cnt = sockaddr_to_str((struct sockaddr *)&sess->s.src_addr,
-                             str, sizeof(str));
-       cnt += scnprintf(str + cnt, sizeof(str) - cnt, "@");
-       sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr,
-                       str + cnt, sizeof(str) - cnt);
+       int err;
+       struct rtrs_addr path = {
+               .src = &sess->s.src_addr,
+               .dst = &sess->s.dst_addr,
+       };
 
+       rtrs_addr_to_str(&path, str, sizeof(str));
        err = kobject_init_and_add(&sess->kobj, &ktype_sess, clt->kobj_paths,
                                   "%s", str);
        if (err) {
index b74a872..0a794d7 100644 (file)
@@ -325,7 +325,7 @@ static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
 
 static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_clt_con *con = cq->cq_context;
+       struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
 
        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                rtrs_err(con->c.sess, "Failed IB_WR_REG_MR: %s\n",
@@ -345,7 +345,7 @@ static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct rtrs_clt_io_req *req =
                container_of(wc->wr_cqe, typeof(*req), inv_cqe);
-       struct rtrs_clt_con *con = cq->cq_context;
+       struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
 
        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                rtrs_err(con->c.sess, "Failed IB_WR_LOCAL_INV: %s\n",
@@ -437,6 +437,13 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno,
        req->in_use = false;
        req->con = NULL;
 
+       if (errno) {
+               rtrs_err_rl(con->c.sess,
+                           "IO request failed: error=%d path=%s [%s:%u]\n",
+                           errno, kobject_name(&sess->kobj), sess->hca_name,
+                           sess->hca_port);
+       }
+
        if (notify)
                req->conf(req->priv, errno);
 }
@@ -586,7 +593,7 @@ static int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe)
 
 static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_clt_con *con = cq->cq_context;
+       struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
        struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
        u32 imm_type, imm_payload;
        bool w_inval = false;
@@ -628,6 +635,8 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
                } else if (imm_type == RTRS_HB_ACK_IMM) {
                        WARN_ON(con->c.cid);
                        sess->s.hb_missed_cnt = 0;
+                       sess->s.hb_cur_latency =
+                               ktime_sub(ktime_get(), sess->s.hb_last_sent);
                        if (sess->flags & RTRS_MSG_NEW_RKEY_F)
                                return  rtrs_clt_recv_done(con, wc);
                } else {
@@ -826,6 +835,57 @@ static struct rtrs_clt_sess *get_next_path_min_inflight(struct path_it *it)
        return min_path;
 }
 
+/**
+ * get_next_path_min_latency() - Returns path with minimal latency.
+ * @it:        the path pointer
+ *
+ * Return: a path with the lowest latency or NULL if all paths are tried
+ *
+ * Locks:
+ *    rcu_read_lock() must be held.
+ *
+ * Related to @MP_POLICY_MIN_LATENCY
+ *
+ * This DOES skip an already-tried path.
+ * There is a skip-list to skip a path if the path has tried but failed.
+ * It will try the minimum latency path and then the second minimum latency
+ * path and so on. Finally it will return NULL if all paths are tried.
+ * Therefore the caller MUST check the returned
+ * path is NULL and trigger the IO error.
+ */
+static struct rtrs_clt_sess *get_next_path_min_latency(struct path_it *it)
+{
+       struct rtrs_clt_sess *min_path = NULL;
+       struct rtrs_clt *clt = it->clt;
+       struct rtrs_clt_sess *sess;
+       ktime_t min_latency = KTIME_MAX;
+       ktime_t latency;
+
+       list_for_each_entry_rcu(sess, &clt->paths_list, s.entry) {
+               if (unlikely(READ_ONCE(sess->state) != RTRS_CLT_CONNECTED))
+                       continue;
+
+               if (unlikely(!list_empty(raw_cpu_ptr(sess->mp_skip_entry))))
+                       continue;
+
+               latency = sess->s.hb_cur_latency;
+
+               if (latency < min_latency) {
+                       min_latency = latency;
+                       min_path = sess;
+               }
+       }
+
+       /*
+        * add the path to the skip list, so that next time we can get
+        * a different one
+        */
+       if (min_path)
+               list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list);
+
+       return min_path;
+}
+
 static inline void path_it_init(struct path_it *it, struct rtrs_clt *clt)
 {
        INIT_LIST_HEAD(&it->skip_list);
@@ -834,8 +894,10 @@ static inline void path_it_init(struct path_it *it, struct rtrs_clt *clt)
 
        if (clt->mp_policy == MP_POLICY_RR)
                it->next_path = get_next_path_rr;
-       else
+       else if (clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
                it->next_path = get_next_path_min_inflight;
+       else
+               it->next_path = get_next_path_min_latency;
 }
 
 static inline void path_it_deinit(struct path_it *it)
@@ -1020,7 +1082,10 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req)
                                       req->usr_len + sizeof(*msg),
                                       imm);
        if (unlikely(ret)) {
-               rtrs_err(s, "Write request failed: %d\n", ret);
+               rtrs_err_rl(s,
+                           "Write request failed: error=%d path=%s [%s:%u]\n",
+                           ret, kobject_name(&sess->kobj), sess->hca_name,
+                           sess->hca_port);
                if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
                        atomic_dec(&sess->stats->inflight);
                if (req->sg_cnt)
@@ -1052,7 +1117,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_clt_sess *sess = to_clt_sess(s);
        struct rtrs_msg_rdma_read *msg;
-       struct rtrs_ib_dev *dev;
+       struct rtrs_ib_dev *dev = sess->s.dev;
 
        struct ib_reg_wr rwr;
        struct ib_send_wr *wr = NULL;
@@ -1062,9 +1127,6 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
 
        const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len;
 
-       s = &sess->s;
-       dev = sess->s.dev;
-
        if (unlikely(tsize > sess->chunk_size)) {
                rtrs_wrn(s,
                          "Read request failed, message size is %zu, bigger than CHUNK_SIZE %d\n",
@@ -1141,7 +1203,10 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req)
        ret = rtrs_post_send_rdma(req->con, req, &sess->rbufs[buf_id],
                                   req->data_len, imm, wr);
        if (unlikely(ret)) {
-               rtrs_err(s, "Read request failed: %d\n", ret);
+               rtrs_err_rl(s,
+                           "Read request failed: error=%d path=%s [%s:%u]\n",
+                           ret, kobject_name(&sess->kobj), sess->hca_name,
+                           sess->hca_port);
                if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT)
                        atomic_dec(&sess->stats->inflight);
                req->need_inv = false;
@@ -1863,12 +1928,14 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_ADDR_CHANGE:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
-               rtrs_wrn(s, "CM error event %d\n", ev->event);
+               rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
+                        rdma_event_msg(ev->event), ev->status);
                cm_err = -ECONNRESET;
                break;
        case RDMA_CM_EVENT_ADDR_ERROR:
        case RDMA_CM_EVENT_ROUTE_ERROR:
-               rtrs_wrn(s, "CM error event %d\n", ev->event);
+               rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n",
+                        rdma_event_msg(ev->event), ev->status);
                cm_err = -EHOSTUNREACH;
                break;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -1878,7 +1945,8 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id,
                rtrs_clt_close_conns(sess, false);
                return 0;
        default:
-               rtrs_err(s, "Unexpected RDMA CM event (%d)\n", ev->event);
+               rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n",
+                        rdma_event_msg(ev->event), ev->status);
                cm_err = -ECONNRESET;
                break;
        }
@@ -2251,7 +2319,7 @@ destroy:
 
 static void rtrs_clt_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_clt_con *con = cq->cq_context;
+       struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
        struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
        struct rtrs_iu *iu;
 
@@ -2333,7 +2401,7 @@ static int process_info_rsp(struct rtrs_clt_sess *sess,
 
 static void rtrs_clt_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_clt_con *con = cq->cq_context;
+       struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context);
        struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess);
        struct rtrs_msg_info_rsp *msg;
        enum rtrs_clt_state state;
@@ -2464,16 +2532,28 @@ out:
 static int init_sess(struct rtrs_clt_sess *sess)
 {
        int err;
+       char str[NAME_MAX];
+       struct rtrs_addr path = {
+               .src = &sess->s.src_addr,
+               .dst = &sess->s.dst_addr,
+       };
+
+       rtrs_addr_to_str(&path, str, sizeof(str));
 
        mutex_lock(&sess->init_mutex);
        err = init_conns(sess);
        if (err) {
-               rtrs_err(sess->clt, "init_conns(), err: %d\n", err);
+               rtrs_err(sess->clt,
+                        "init_conns() failed: err=%d path=%s [%s:%u]\n", err,
+                        str, sess->hca_name, sess->hca_port);
                goto out;
        }
        err = rtrs_send_sess_info(sess);
        if (err) {
-               rtrs_err(sess->clt, "rtrs_send_sess_info(), err: %d\n", err);
+               rtrs_err(
+                       sess->clt,
+                       "rtrs_send_sess_info() failed: err=%d path=%s [%s:%u]\n",
+                       err, str, sess->hca_name, sess->hca_port);
                goto out;
        }
        rtrs_clt_sess_up(sess);
@@ -2791,8 +2871,8 @@ int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_sess *sess,
        } while (!changed && old_state != RTRS_CLT_DEAD);
 
        if (likely(changed)) {
-               rtrs_clt_destroy_sess_files(sess, sysfs_self);
                rtrs_clt_remove_path_from_arr(sess);
+               rtrs_clt_destroy_sess_files(sess, sysfs_self);
                kobject_put(&sess->kobj);
        }
 
@@ -2896,7 +2976,8 @@ EXPORT_SYMBOL(rtrs_clt_request);
 
 int rtrs_clt_rdma_cq_direct(struct rtrs_clt *clt, unsigned int index)
 {
-       int cnt;
+       /* If no path, return -1 for block layer not to try again */
+       int cnt = -1;
        struct rtrs_con *con;
        struct rtrs_clt_sess *sess;
        struct path_it it;
@@ -2933,9 +3014,9 @@ int rtrs_clt_query(struct rtrs_clt *clt, struct rtrs_attrs *attr)
                return -ECOMM;
 
        attr->queue_depth      = clt->queue_depth;
-       attr->max_io_size      = clt->max_io_size;
-       attr->sess_kobj        = &clt->dev.kobj;
-       strlcpy(attr->sessname, clt->sessname, sizeof(attr->sessname));
+       /* Cap max_io_size to min of remote buffer size and the fr pages */
+       attr->max_io_size = min_t(int, clt->max_io_size,
+                                 clt->max_segments * SZ_4K);
 
        return 0;
 }
index 98ba5d0..4c52f30 100644 (file)
@@ -29,6 +29,7 @@ enum rtrs_clt_state {
 enum rtrs_mp_policy {
        MP_POLICY_RR,
        MP_POLICY_MIN_INFLIGHT,
+       MP_POLICY_MIN_LATENCY,
 };
 
 /* see Documentation/ABI/testing/sysfs-class-rtrs-client for details */
index 00eb450..86e65cf 100644 (file)
@@ -91,6 +91,7 @@ struct rtrs_con {
        struct ib_cq            *cq;
        struct rdma_cm_id       *cm_id;
        unsigned int            cid;
+       u16                     cq_size;
 };
 
 struct rtrs_sess {
@@ -112,6 +113,8 @@ struct rtrs_sess {
        unsigned int            hb_interval_ms;
        unsigned int            hb_missed_cnt;
        unsigned int            hb_missed_max;
+       ktime_t                 hb_last_sent;
+       ktime_t                 hb_cur_latency;
 };
 
 /* rtrs information unit */
index 126a96e..a928817 100644 (file)
@@ -176,7 +176,8 @@ static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_sess *sess)
        err = device_add(&srv->dev);
        if (err) {
                pr_err("device_add(): %d\n", err);
-               goto put;
+               put_device(&srv->dev);
+               goto unlock;
        }
        srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj);
        if (!srv->kobj_paths) {
@@ -188,10 +189,6 @@ static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_sess *sess)
        }
        dev_set_uevent_suppress(&srv->dev, false);
        kobject_uevent(&srv->dev.kobj, KOBJ_ADD);
-       goto unlock;
-
-put:
-       put_device(&srv->dev);
 unlock:
        mutex_unlock(&srv->paths_mutex);
 
@@ -262,14 +259,13 @@ int rtrs_srv_create_sess_files(struct rtrs_srv_sess *sess)
        struct rtrs_srv *srv = sess->srv;
        struct rtrs_sess *s = &sess->s;
        char str[NAME_MAX];
-       int err, cnt;
-
-       cnt = sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr,
-                             str, sizeof(str));
-       cnt += scnprintf(str + cnt, sizeof(str) - cnt, "@");
-       sockaddr_to_str((struct sockaddr *)&sess->s.src_addr,
-                       str + cnt, sizeof(str) - cnt);
+       int err;
+       struct rtrs_addr path = {
+               .src = &sess->s.dst_addr,
+               .dst = &sess->s.src_addr,
+       };
 
+       rtrs_addr_to_str(&path, str, sizeof(str));
        err = rtrs_srv_create_once_sysfs_root_folders(sess);
        if (err)
                return err;
index f7aa2a7..0fa116c 100644 (file)
@@ -199,7 +199,7 @@ static void rtrs_srv_wait_ops_ids(struct rtrs_srv_sess *sess)
 
 static void rtrs_srv_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_srv_con *con = cq->cq_context;
+       struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_srv_sess *sess = to_srv_sess(s);
 
@@ -518,8 +518,9 @@ bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status)
 
        if (unlikely(sess->state != RTRS_SRV_CONNECTED)) {
                rtrs_err_rl(s,
-                            "Sending I/O response failed,  session is disconnected, sess state %s\n",
-                            rtrs_srv_state_str(sess->state));
+                           "Sending I/O response failed,  session %s is disconnected, sess state %s\n",
+                           kobject_name(&sess->kobj),
+                           rtrs_srv_state_str(sess->state));
                goto out;
        }
        if (always_invalidate) {
@@ -529,7 +530,9 @@ bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status)
        }
        if (unlikely(atomic_sub_return(1,
                                       &con->sq_wr_avail) < 0)) {
-               pr_err("IB send queue full\n");
+               rtrs_err(s, "IB send queue full: sess=%s cid=%d\n",
+                        kobject_name(&sess->kobj),
+                        con->c.cid);
                atomic_add(1, &con->sq_wr_avail);
                spin_lock(&con->rsp_wr_wait_lock);
                list_add_tail(&id->wait_list, &con->rsp_wr_wait_list);
@@ -543,7 +546,8 @@ bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status)
                err = rdma_write_sg(id);
 
        if (unlikely(err)) {
-               rtrs_err_rl(s, "IO response failed: %d\n", err);
+               rtrs_err_rl(s, "IO response failed: %d: sess=%s\n", err,
+                           kobject_name(&sess->kobj));
                close_sess(sess);
        }
 out:
@@ -720,7 +724,7 @@ static void rtrs_srv_stop_hb(struct rtrs_srv_sess *sess)
 
 static void rtrs_srv_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_srv_con *con = cq->cq_context;
+       struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_srv_sess *sess = to_srv_sess(s);
        struct rtrs_iu *iu;
@@ -862,7 +866,7 @@ rwr_free:
 
 static void rtrs_srv_info_req_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_srv_con *con = cq->cq_context;
+       struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_srv_sess *sess = to_srv_sess(s);
        struct rtrs_msg_info_req *msg;
@@ -1110,7 +1114,7 @@ static void rtrs_srv_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct rtrs_srv_mr *mr =
                container_of(wc->wr_cqe, typeof(*mr), inv_cqe);
-       struct rtrs_srv_con *con = cq->cq_context;
+       struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_srv_sess *sess = to_srv_sess(s);
        struct rtrs_srv *srv = sess->srv;
@@ -1167,7 +1171,7 @@ static void rtrs_rdma_process_wr_wait_list(struct rtrs_srv_con *con)
 
 static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct rtrs_srv_con *con = cq->cq_context;
+       struct rtrs_srv_con *con = to_srv_con(wc->qp->qp_context);
        struct rtrs_sess *s = con->c.sess;
        struct rtrs_srv_sess *sess = to_srv_sess(s);
        struct rtrs_srv *srv = sess->srv;
@@ -1683,6 +1687,8 @@ static struct rtrs_srv_sess *__alloc_sess(struct rtrs_srv *srv,
 {
        struct rtrs_srv_sess *sess;
        int err = -ENOMEM;
+       char str[NAME_MAX];
+       struct rtrs_addr path;
 
        if (srv->paths_num >= MAX_PATHS_NUM) {
                err = -ECONNRESET;
@@ -1717,6 +1723,13 @@ static struct rtrs_srv_sess *__alloc_sess(struct rtrs_srv *srv,
        sess->cur_cq_vector = -1;
        sess->s.dst_addr = cm_id->route.addr.dst_addr;
        sess->s.src_addr = cm_id->route.addr.src_addr;
+
+       /* temporary until receiving session-name from client */
+       path.src = &sess->s.src_addr;
+       path.dst = &sess->s.dst_addr;
+       rtrs_addr_to_str(&path, str, sizeof(str));
+       strlcpy(sess->s.sessname, str, sizeof(sess->s.sessname));
+
        sess->s.con_num = con_num;
        sess->s.recon_cnt = recon_cnt;
        uuid_copy(&sess->s.uuid, uuid);
@@ -1908,13 +1921,10 @@ static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id,
        case RDMA_CM_EVENT_UNREACHABLE:
                rtrs_err(s, "CM error (CM event: %s, err: %d)\n",
                          rdma_event_msg(ev->event), ev->status);
-               close_sess(sess);
-               break;
+               fallthrough;
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_ADDR_CHANGE:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
-               close_sess(sess);
-               break;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                close_sess(sess);
                break;
index d13aff0..a784728 100644 (file)
@@ -218,14 +218,14 @@ static int create_cq(struct rtrs_con *con, int cq_vector, u16 cq_size,
        struct rdma_cm_id *cm_id = con->cm_id;
        struct ib_cq *cq;
 
-       cq = ib_alloc_cq(cm_id->device, con, cq_size,
-                        cq_vector, poll_ctx);
+       cq = ib_cq_pool_get(cm_id->device, cq_size, cq_vector, poll_ctx);
        if (IS_ERR(cq)) {
                rtrs_err(con->sess, "Creating completion queue failed, errno: %ld\n",
                          PTR_ERR(cq));
                return PTR_ERR(cq);
        }
        con->cq = cq;
+       con->cq_size = cq_size;
 
        return 0;
 }
@@ -273,7 +273,7 @@ int rtrs_cq_qp_create(struct rtrs_sess *sess, struct rtrs_con *con,
        err = create_qp(con, sess->dev->ib_pd, max_send_wr, max_recv_wr,
                        max_send_sge);
        if (err) {
-               ib_free_cq(con->cq);
+               ib_cq_pool_put(con->cq, con->cq_size);
                con->cq = NULL;
                return err;
        }
@@ -290,7 +290,7 @@ void rtrs_cq_qp_destroy(struct rtrs_con *con)
                con->qp = NULL;
        }
        if (con->cq) {
-               ib_free_cq(con->cq);
+               ib_cq_pool_put(con->cq, con->cq_size);
                con->cq = NULL;
        }
 }
@@ -337,6 +337,9 @@ static void hb_work(struct work_struct *work)
                schedule_hb(sess);
                return;
        }
+
+       sess->hb_last_sent = ktime_get();
+
        imm = rtrs_to_imm(RTRS_HB_MSG_IMM, 0);
        err = rtrs_post_rdma_write_imm_empty(usr_con, sess->hb_cqe, imm,
                                             0, NULL);
@@ -463,6 +466,30 @@ int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len)
 }
 EXPORT_SYMBOL(sockaddr_to_str);
 
+/**
+ * rtrs_addr_to_str() - convert rtrs_addr to a string "src@dst"
+ * @addr:      the rtrs_addr structure to be converted
+ * @buf:       string containing source and destination addr of a path
+ *             separated by '@', e.g. "ip:1.1.1.1@ip:1.1.1.2".
+ * @len:       size of @buf in bytes
+ *
+ * The return value is the number of characters written into buf not
+ * including the trailing '\0'; this covers the source address, the '@'
+ * separator and the destination address.
+ */
+int rtrs_addr_to_str(const struct rtrs_addr *addr, char *buf, size_t len)
+{
+       int cnt;
+
+       cnt = sockaddr_to_str((struct sockaddr *)addr->src,
+                             buf, len);
+       cnt += scnprintf(buf + cnt, len - cnt, "@");
+       cnt += sockaddr_to_str((struct sockaddr *)addr->dst,
+                              buf + cnt, len - cnt);
+       return cnt;
+}
+EXPORT_SYMBOL(rtrs_addr_to_str);
+
 /**
  * rtrs_addr_to_sockaddr() - convert path string "src,dst" or "src@dst"
  * to sockaddreses
index bebaa94..dc3e1af 100644 (file)
@@ -110,8 +110,6 @@ int rtrs_clt_rdma_cq_direct(struct rtrs_clt *clt, unsigned int index);
 struct rtrs_attrs {
        u32             queue_depth;
        u32             max_io_size;
-       u8              sessname[NAME_MAX];
-       struct kobject  *sess_kobj;
 };
 
 int rtrs_clt_query(struct rtrs_clt *sess, struct rtrs_attrs *attr);
@@ -185,4 +183,5 @@ int rtrs_addr_to_sockaddr(const char *str, size_t len, u16 port,
                          struct rtrs_addr *addr);
 
 int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len);
+int rtrs_addr_to_str(const struct rtrs_addr *addr, char *buf, size_t len);
 #endif
index 51c386a..ea44780 100644 (file)
@@ -2382,6 +2382,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev,
                pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n",
                        dev_name(&sdev->device->dev), port_num);
                mutex_unlock(&sport->mutex);
+               ret = -EINVAL;
                goto reject;
        }
 
@@ -3109,7 +3110,8 @@ static int srpt_add_one(struct ib_device *device)
 {
        struct srpt_device *sdev;
        struct srpt_port *sport;
-       int i, ret;
+       int ret;
+       u32 i;
 
        pr_debug("device = %p\n", device);
 
index d8f5310..037cc59 100644 (file)
@@ -7,6 +7,7 @@
 
 obj-$(CONFIG_INPUT)            += input-core.o
 input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o
+input-core-y += touchscreen.o
 
 obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o
 obj-$(CONFIG_INPUT_SPARSEKMAP) += sparse-keymap.o
index 9f0d07d..d69d765 100644 (file)
@@ -268,6 +268,7 @@ static const struct xpad_device {
        { 0x1689, 0xfd00, "Razer Onza Tournament Edition", 0, XTYPE_XBOX360 },
        { 0x1689, 0xfd01, "Razer Onza Classic Edition", 0, XTYPE_XBOX360 },
        { 0x1689, 0xfe00, "Razer Sabertooth", 0, XTYPE_XBOX360 },
+       { 0x1949, 0x041a, "Amazon Game Controller", 0, XTYPE_XBOX360 },
        { 0x1bad, 0x0002, "Harmonix Rock Band Guitar", 0, XTYPE_XBOX360 },
        { 0x1bad, 0x0003, "Harmonix Rock Band Drumkit", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 },
        { 0x1bad, 0x0130, "Ion Drum Rocker", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 },
@@ -440,6 +441,7 @@ static const struct usb_device_id xpad_table[] = {
        XPAD_XBOX360_VENDOR(0x15e4),            /* Numark X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x162e),            /* Joytech X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x1689),            /* Razer Onza */
+       XPAD_XBOX360_VENDOR(0x1949),            /* Amazon controllers */
        XPAD_XBOX360_VENDOR(0x1bad),            /* Harminix Rock Band Guitar and Drums */
        XPAD_XBOX360_VENDOR(0x20d6),            /* PowerA Controllers */
        XPAD_XBOXONE_VENDOR(0x20d6),            /* PowerA Controllers */
index 77bac4d..8dbf1e6 100644 (file)
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 
+#include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/interrupt.h>
@@ -36,10 +37,11 @@ struct gpio_button_data {
 
        unsigned short *code;
 
-       struct timer_list release_timer;
+       struct hrtimer release_timer;
        unsigned int release_delay;     /* in msecs, for IRQ-only buttons */
 
        struct delayed_work work;
+       struct hrtimer debounce_timer;
        unsigned int software_debounce; /* in msecs, for GPIO-driven buttons */
 
        unsigned int irq;
@@ -48,6 +50,7 @@ struct gpio_button_data {
        bool disabled;
        bool key_pressed;
        bool suspended;
+       bool debounce_use_hrtimer;
 };
 
 struct gpio_keys_drvdata {
@@ -122,6 +125,18 @@ static const unsigned long *get_bm_events_by_type(struct input_dev *dev,
        return (type == EV_KEY) ? dev->keybit : dev->swbit;
 }
 
+static void gpio_keys_quiesce_key(void *data)
+{
+       struct gpio_button_data *bdata = data;
+
+       if (!bdata->gpiod)      /* IRQ-only button: just the release timer */
+               hrtimer_cancel(&bdata->release_timer);
+       else if (bdata->debounce_use_hrtimer)
+               hrtimer_cancel(&bdata->debounce_timer);
+       else                    /* GPIO button debounced by delayed work */
+               cancel_delayed_work_sync(&bdata->work);
+}
+
 /**
  * gpio_keys_disable_button() - disables given GPIO button
  * @bdata: button data for button to be disabled
@@ -142,12 +157,7 @@ static void gpio_keys_disable_button(struct gpio_button_data *bdata)
                 * Disable IRQ and associated timer/work structure.
                 */
                disable_irq(bdata->irq);
-
-               if (bdata->gpiod)
-                       cancel_delayed_work_sync(&bdata->work);
-               else
-                       del_timer_sync(&bdata->release_timer);
-
+               gpio_keys_quiesce_key(bdata);
                bdata->disabled = true;
        }
 }
@@ -360,7 +370,9 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata)
        unsigned int type = button->type ?: EV_KEY;
        int state;
 
-       state = gpiod_get_value_cansleep(bdata->gpiod);
+       state = bdata->debounce_use_hrtimer ?
+                       gpiod_get_value(bdata->gpiod) :
+                       gpiod_get_value_cansleep(bdata->gpiod);
        if (state < 0) {
                dev_err(input->dev.parent,
                        "failed to get gpio state: %d\n", state);
@@ -373,7 +385,15 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata)
        } else {
                input_event(input, type, *bdata->code, state);
        }
-       input_sync(input);
+}
+
+static void gpio_keys_debounce_event(struct gpio_button_data *bdata)
+{
+       gpio_keys_gpio_report_event(bdata);
+       input_sync(bdata->input);       /* the report helper does not sync */
+       /* NOTE(review): presumably pairs with a pm_stay_awake() in the ISR. */
+       if (bdata->button->wakeup)
+               pm_relax(bdata->input->dev.parent);
+}
 
 static void gpio_keys_gpio_work_func(struct work_struct *work)
@@ -381,10 +401,17 @@ static void gpio_keys_gpio_work_func(struct work_struct *work)
        struct gpio_button_data *bdata =
                container_of(work, struct gpio_button_data, work.work);
 
-       gpio_keys_gpio_report_event(bdata);
+       gpio_keys_debounce_event(bdata);
+}
 
-       if (bdata->button->wakeup)
-               pm_relax(bdata->input->dev.parent);
+static enum hrtimer_restart gpio_keys_debounce_timer(struct hrtimer *t)
+{
+       struct gpio_button_data *bdata =
+               container_of(t, struct gpio_button_data, debounce_timer);
+
+       gpio_keys_debounce_event(bdata);
+       /* One-shot: gpio_keys_gpio_isr() arms the timer again when needed. */
+       return HRTIMER_NORESTART;
+}
 
 static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id)
@@ -408,26 +435,33 @@ static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id)
                }
        }
 
-       mod_delayed_work(system_wq,
-                        &bdata->work,
-                        msecs_to_jiffies(bdata->software_debounce));
+       if (bdata->debounce_use_hrtimer) {
+               hrtimer_start(&bdata->debounce_timer,
+                             ms_to_ktime(bdata->software_debounce),
+                             HRTIMER_MODE_REL);
+       } else {
+               mod_delayed_work(system_wq,
+                                &bdata->work,
+                                msecs_to_jiffies(bdata->software_debounce));
+       }
 
        return IRQ_HANDLED;
 }
 
-static void gpio_keys_irq_timer(struct timer_list *t)
+static enum hrtimer_restart gpio_keys_irq_timer(struct hrtimer *t)
 {
-       struct gpio_button_data *bdata = from_timer(bdata, t, release_timer);
+       struct gpio_button_data *bdata = container_of(t,
+                                                     struct gpio_button_data,
+                                                     release_timer);
        struct input_dev *input = bdata->input;
-       unsigned long flags;
 
-       spin_lock_irqsave(&bdata->lock, flags);
        if (bdata->key_pressed) {
                input_event(input, EV_KEY, *bdata->code, 0);
                input_sync(input);
                bdata->key_pressed = false;
        }
-       spin_unlock_irqrestore(&bdata->lock, flags);
+
+       return HRTIMER_NORESTART;
 }
 
 static irqreturn_t gpio_keys_irq_isr(int irq, void *dev_id)
@@ -457,23 +491,14 @@ static irqreturn_t gpio_keys_irq_isr(int irq, void *dev_id)
        }
 
        if (bdata->release_delay)
-               mod_timer(&bdata->release_timer,
-                       jiffies + msecs_to_jiffies(bdata->release_delay));
+               hrtimer_start(&bdata->release_timer,
+                             ms_to_ktime(bdata->release_delay),
+                             HRTIMER_MODE_REL_HARD);
 out:
        spin_unlock_irqrestore(&bdata->lock, flags);
        return IRQ_HANDLED;
 }
 
-static void gpio_keys_quiesce_key(void *data)
-{
-       struct gpio_button_data *bdata = data;
-
-       if (bdata->gpiod)
-               cancel_delayed_work_sync(&bdata->work);
-       else
-               del_timer_sync(&bdata->release_timer);
-}
-
 static int gpio_keys_setup_key(struct platform_device *pdev,
                                struct input_dev *input,
                                struct gpio_keys_drvdata *ddata,
@@ -543,6 +568,14 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
                        if (error < 0)
                                bdata->software_debounce =
                                                button->debounce_interval;
+
+                       /*
+                        * If reading the GPIO won't sleep, we can use a
+                        * hrtimer instead of a standard timer for the software
+                        * debounce, to reduce the latency as much as possible.
+                        */
+                       bdata->debounce_use_hrtimer =
+                                       !gpiod_cansleep(bdata->gpiod);
                }
 
                if (button->irq) {
@@ -561,6 +594,10 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 
                INIT_DELAYED_WORK(&bdata->work, gpio_keys_gpio_work_func);
 
+               hrtimer_init(&bdata->debounce_timer,
+                            CLOCK_REALTIME, HRTIMER_MODE_REL);
+               bdata->debounce_timer.function = gpio_keys_debounce_timer;
+
                isr = gpio_keys_gpio_isr;
                irqflags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING;
 
@@ -595,7 +632,9 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
                }
 
                bdata->release_delay = button->debounce_interval;
-               timer_setup(&bdata->release_timer, gpio_keys_irq_timer, 0);
+               hrtimer_init(&bdata->release_timer,
+                            CLOCK_REALTIME, HRTIMER_MODE_REL_HARD);
+               bdata->release_timer.function = gpio_keys_irq_timer;
 
                isr = gpio_keys_irq_isr;
                irqflags = 0;
index 1f5c9ea..ae93038 100644 (file)
@@ -408,27 +408,18 @@ open_err:
        return -EIO;
 }
 
-#ifdef CONFIG_OF
 static const struct of_device_id imx_keypad_of_match[] = {
        { .compatible = "fsl,imx21-kpp", },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, imx_keypad_of_match);
-#endif
 
 static int imx_keypad_probe(struct platform_device *pdev)
 {
-       const struct matrix_keymap_data *keymap_data =
-                       dev_get_platdata(&pdev->dev);
        struct imx_keypad *keypad;
        struct input_dev *input_dev;
        int irq, error, i, row, col;
 
-       if (!keymap_data && !pdev->dev.of_node) {
-               dev_err(&pdev->dev, "no keymap defined\n");
-               return -EINVAL;
-       }
-
        irq = platform_get_irq(pdev, 0);
        if (irq < 0)
                return irq;
@@ -469,7 +460,7 @@ static int imx_keypad_probe(struct platform_device *pdev)
        input_dev->open = imx_keypad_open;
        input_dev->close = imx_keypad_close;
 
-       error = matrix_keypad_build_keymap(keymap_data, NULL,
+       error = matrix_keypad_build_keymap(NULL, NULL,
                                           MAX_MATRIX_KEY_ROWS,
                                           MAX_MATRIX_KEY_COLS,
                                           keypad->keycodes, input_dev);
@@ -582,7 +573,7 @@ static struct platform_driver imx_keypad_driver = {
        .driver         = {
                .name   = "imx-keypad",
                .pm     = &imx_kbd_pm_ops,
-               .of_match_table = of_match_ptr(imx_keypad_of_match),
+               .of_match_table = imx_keypad_of_match,
        },
        .probe          = imx_keypad_probe,
 };
index 9b0f966..2a97559 100644 (file)
@@ -274,7 +274,7 @@ static int tca6416_keypad_probe(struct i2c_client *client,
                error = request_threaded_irq(chip->irqnum, NULL,
                                             tca6416_keys_isr,
                                             IRQF_TRIGGER_FALLING |
-                                               IRQF_ONESHOT,
+                                            IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                             "tca6416-keypad", chip);
                if (error) {
                        dev_dbg(&client->dev,
@@ -282,7 +282,6 @@ static int tca6416_keypad_probe(struct i2c_client *client,
                                chip->irqnum, error);
                        goto fail1;
                }
-               disable_irq(chip->irqnum);
        }
 
        error = input_register_device(input);
index 9671842..570fe18 100644 (file)
@@ -694,14 +694,13 @@ static int tegra_kbc_probe(struct platform_device *pdev)
        input_set_drvdata(kbc->idev, kbc);
 
        err = devm_request_irq(&pdev->dev, kbc->irq, tegra_kbc_isr,
-                              IRQF_TRIGGER_HIGH, pdev->name, kbc);
+                              IRQF_TRIGGER_HIGH | IRQF_NO_AUTOEN,
+                              pdev->name, kbc);
        if (err) {
                dev_err(&pdev->dev, "failed to request keyboard IRQ\n");
                return err;
        }
 
-       disable_irq(kbc->irq);
-
        err = input_register_device(kbc->idev);
        if (err) {
                dev_err(&pdev->dev, "failed to register input device\n");
index 7237dc4..498cde3 100644 (file)
@@ -763,6 +763,17 @@ config INPUT_IQS269A
          To compile this driver as a module, choose M here: the
          module will be called iqs269a.
 
+config INPUT_IQS626A
+       tristate "Azoteq IQS626A capacitive touch controller"
+       depends on I2C
+       select REGMAP_I2C
+       help
+         Say Y to enable support for the Azoteq IQS626A capacitive
+         touch controller.
+
+         To compile this driver as a module, choose M here: the
+         module will be called iqs626a.
+
 config INPUT_CMA3000
        tristate "VTI CMA3000 Tri-axis accelerometer"
        help
index 46db664..f593bee 100644 (file)
@@ -43,6 +43,7 @@ obj-$(CONFIG_INPUT_HISI_POWERKEY)     += hisi_powerkey.o
 obj-$(CONFIG_HP_SDC_RTC)               += hp_sdc_rtc.o
 obj-$(CONFIG_INPUT_IMS_PCU)            += ims-pcu.o
 obj-$(CONFIG_INPUT_IQS269A)            += iqs269a.o
+obj-$(CONFIG_INPUT_IQS626A)            += iqs626a.o
 obj-$(CONFIG_INPUT_IXP4XX_BEEPER)      += ixp4xx-beeper.o
 obj-$(CONFIG_INPUT_KEYSPAN_REMOTE)     += keyspan_remote.o
 obj-$(CONFIG_INPUT_KXTJ9)              += kxtj9.o
index 08b9b5c..81de8c4 100644 (file)
@@ -2018,7 +2018,6 @@ static int ims_pcu_probe(struct usb_interface *intf,
        }
 
        usb_set_intfdata(pcu->ctrl_intf, pcu);
-       usb_set_intfdata(pcu->data_intf, pcu);
 
        error = ims_pcu_buffers_alloc(pcu);
        if (error)
diff --git a/drivers/input/misc/iqs626a.c b/drivers/input/misc/iqs626a.c
new file mode 100644 (file)
index 0000000..d57e996
--- /dev/null
@@ -0,0 +1,1838 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS626A Capacitive Touch Controller
+ *
+ * Copyright (C) 2020 Jeff LaBundy <jeff@labundy.com>
+ *
+ * This driver registers up to 2 input devices: one representing capacitive or
+ * inductive keys as well as Hall-effect switches, and one for a trackpad that
+ * can express various gestures.
+ */
+
+#include <linux/bits.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/input.h>
+#include <linux/input/touchscreen.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+#define IQS626_VER_INFO                                0x00
+#define IQS626_VER_INFO_PROD_NUM               0x51
+
+#define IQS626_SYS_FLAGS                       0x02
+#define IQS626_SYS_FLAGS_SHOW_RESET            BIT(15)
+#define IQS626_SYS_FLAGS_IN_ATI                        BIT(12)
+#define IQS626_SYS_FLAGS_PWR_MODE_MASK         GENMASK(9, 8)
+#define IQS626_SYS_FLAGS_PWR_MODE_SHIFT                8
+
+#define IQS626_HALL_OUTPUT                     0x23
+
+#define IQS626_SYS_SETTINGS                    0x80
+#define IQS626_SYS_SETTINGS_CLK_DIV            BIT(15)
+#define IQS626_SYS_SETTINGS_ULP_AUTO           BIT(14)
+#define IQS626_SYS_SETTINGS_DIS_AUTO           BIT(13)
+#define IQS626_SYS_SETTINGS_PWR_MODE_MASK      GENMASK(12, 11)
+#define IQS626_SYS_SETTINGS_PWR_MODE_SHIFT     11
+#define IQS626_SYS_SETTINGS_PWR_MODE_MAX       3
+#define IQS626_SYS_SETTINGS_ULP_UPDATE_MASK    GENMASK(10, 8)
+#define IQS626_SYS_SETTINGS_ULP_UPDATE_SHIFT   8
+#define IQS626_SYS_SETTINGS_ULP_UPDATE_MAX     7
+#define IQS626_SYS_SETTINGS_EVENT_MODE         BIT(5)
+#define IQS626_SYS_SETTINGS_EVENT_MODE_LP      BIT(4)
+#define IQS626_SYS_SETTINGS_REDO_ATI           BIT(2)
+#define IQS626_SYS_SETTINGS_ACK_RESET          BIT(0)
+
+#define IQS626_MISC_A_ATI_BAND_DISABLE         BIT(7)
+#define IQS626_MISC_A_TPx_LTA_UPDATE_MASK      GENMASK(6, 4)
+#define IQS626_MISC_A_TPx_LTA_UPDATE_SHIFT     4
+#define IQS626_MISC_A_TPx_LTA_UPDATE_MAX       7
+#define IQS626_MISC_A_ATI_LP_ONLY              BIT(3)
+#define IQS626_MISC_A_GPIO3_SELECT_MASK                GENMASK(2, 0)
+#define IQS626_MISC_A_GPIO3_SELECT_MAX         7
+
+#define IQS626_EVENT_MASK_SYS                  BIT(6)
+#define IQS626_EVENT_MASK_GESTURE              BIT(3)
+#define IQS626_EVENT_MASK_DEEP                 BIT(2)
+#define IQS626_EVENT_MASK_TOUCH                        BIT(1)
+#define IQS626_EVENT_MASK_PROX                 BIT(0)
+
+#define IQS626_RATE_NP_MS_MAX                  255
+#define IQS626_RATE_LP_MS_MAX                  255
+#define IQS626_RATE_ULP_MS_MAX                 4080
+#define IQS626_TIMEOUT_PWR_MS_MAX              130560
+#define IQS626_TIMEOUT_LTA_MS_MAX              130560
+
+#define IQS626_MISC_B_RESEED_UI_SEL_MASK       GENMASK(7, 6)
+#define IQS626_MISC_B_RESEED_UI_SEL_SHIFT      6
+#define IQS626_MISC_B_RESEED_UI_SEL_MAX                3
+#define IQS626_MISC_B_THRESH_EXTEND            BIT(5)
+#define IQS626_MISC_B_TRACKING_UI_ENABLE       BIT(4)
+#define IQS626_MISC_B_TPx_SWIPE                        BIT(3)
+#define IQS626_MISC_B_RESEED_OFFSET            BIT(2)
+#define IQS626_MISC_B_FILT_STR_TPx             GENMASK(1, 0)
+
+#define IQS626_THRESH_SWIPE_MAX                        255
+#define IQS626_TIMEOUT_TAP_MS_MAX              4080
+#define IQS626_TIMEOUT_SWIPE_MS_MAX            4080
+
+#define IQS626_CHx_ENG_0_MEAS_CAP_SIZE         BIT(7)
+#define IQS626_CHx_ENG_0_RX_TERM_VSS           BIT(5)
+#define IQS626_CHx_ENG_0_LINEARIZE             BIT(4)
+#define IQS626_CHx_ENG_0_DUAL_DIR              BIT(3)
+#define IQS626_CHx_ENG_0_FILT_DISABLE          BIT(2)
+#define IQS626_CHx_ENG_0_ATI_MODE_MASK         GENMASK(1, 0)
+#define IQS626_CHx_ENG_0_ATI_MODE_MAX          3
+
+#define IQS626_CHx_ENG_1_CCT_HIGH_1            BIT(7)
+#define IQS626_CHx_ENG_1_CCT_HIGH_0            BIT(6)
+#define IQS626_CHx_ENG_1_PROJ_BIAS_MASK                GENMASK(5, 4)
+#define IQS626_CHx_ENG_1_PROJ_BIAS_SHIFT       4
+#define IQS626_CHx_ENG_1_PROJ_BIAS_MAX         3
+#define IQS626_CHx_ENG_1_CCT_ENABLE            BIT(3)
+#define IQS626_CHx_ENG_1_SENSE_FREQ_MASK       GENMASK(2, 1)
+#define IQS626_CHx_ENG_1_SENSE_FREQ_SHIFT      1
+#define IQS626_CHx_ENG_1_SENSE_FREQ_MAX                3
+#define IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN      BIT(0)
+
+#define IQS626_CHx_ENG_2_LOCAL_CAP_MASK                GENMASK(7, 6)
+#define IQS626_CHx_ENG_2_LOCAL_CAP_SHIFT       6
+#define IQS626_CHx_ENG_2_LOCAL_CAP_MAX         3
+#define IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE      BIT(5)
+#define IQS626_CHx_ENG_2_SENSE_MODE_MASK       GENMASK(3, 0)
+#define IQS626_CHx_ENG_2_SENSE_MODE_MAX                15
+
+#define IQS626_CHx_ENG_3_TX_FREQ_MASK          GENMASK(5, 4)
+#define IQS626_CHx_ENG_3_TX_FREQ_SHIFT         4
+#define IQS626_CHx_ENG_3_TX_FREQ_MAX           3
+#define IQS626_CHx_ENG_3_INV_LOGIC             BIT(0)
+
+#define IQS626_CHx_ENG_4_RX_TERM_VREG          BIT(6)
+#define IQS626_CHx_ENG_4_CCT_LOW_1             BIT(5)
+#define IQS626_CHx_ENG_4_CCT_LOW_0             BIT(4)
+#define IQS626_CHx_ENG_4_COMP_DISABLE          BIT(1)
+#define IQS626_CHx_ENG_4_STATIC_ENABLE         BIT(0)
+
+#define IQS626_TPx_ATI_BASE_MIN                        45
+#define IQS626_TPx_ATI_BASE_MAX                        300
+#define IQS626_CHx_ATI_BASE_MASK               GENMASK(7, 6)
+#define IQS626_CHx_ATI_BASE_75                 0x00
+#define IQS626_CHx_ATI_BASE_100                        0x40
+#define IQS626_CHx_ATI_BASE_150                        0x80
+#define IQS626_CHx_ATI_BASE_200                        0xC0
+#define IQS626_CHx_ATI_TARGET_MASK             GENMASK(5, 0)
+#define IQS626_CHx_ATI_TARGET_MAX              2016
+
+#define IQS626_CHx_THRESH_MAX                  255
+#define IQS626_CHx_HYST_DEEP_MASK              GENMASK(7, 4)
+#define IQS626_CHx_HYST_DEEP_SHIFT             4
+#define IQS626_CHx_HYST_TOUCH_MASK             GENMASK(3, 0)
+#define IQS626_CHx_HYST_MAX                    15
+
+#define IQS626_FILT_STR_NP_TPx_MASK            GENMASK(7, 6)
+#define IQS626_FILT_STR_NP_TPx_SHIFT           6
+#define IQS626_FILT_STR_LP_TPx_MASK            GENMASK(5, 4)
+#define IQS626_FILT_STR_LP_TPx_SHIFT           4
+
+#define IQS626_FILT_STR_NP_CNT_MASK            GENMASK(7, 6)
+#define IQS626_FILT_STR_NP_CNT_SHIFT           6
+#define IQS626_FILT_STR_LP_CNT_MASK            GENMASK(5, 4)
+#define IQS626_FILT_STR_LP_CNT_SHIFT           4
+#define IQS626_FILT_STR_NP_LTA_MASK            GENMASK(3, 2)
+#define IQS626_FILT_STR_NP_LTA_SHIFT           2
+#define IQS626_FILT_STR_LP_LTA_MASK            GENMASK(1, 0)
+#define IQS626_FILT_STR_MAX                    3
+
+#define IQS626_ULP_PROJ_ENABLE                 BIT(4)
+#define IQS626_GEN_WEIGHT_MAX                  255
+
+#define IQS626_MAX_REG                         0xFF
+
+#define IQS626_NUM_CH_TP_3                     9
+#define IQS626_NUM_CH_TP_2                     6
+#define IQS626_NUM_CH_GEN                      3
+#define IQS626_NUM_CRx_TX                      8
+
+#define IQS626_PWR_MODE_POLL_SLEEP_US          50000
+#define IQS626_PWR_MODE_POLL_TIMEOUT_US                500000
+
+#define iqs626_irq_wait()                      usleep_range(350, 400)
+
+enum iqs626_ch_id {    /* indexes iqs626_channels[] and the first dimension of kp_type/kp_code */
+       IQS626_CH_ULP_0,        /* "ulp-0" */
+       IQS626_CH_TP_2,         /* "trackpad-3x2" */
+       IQS626_CH_TP_3,         /* "trackpad-3x3" */
+       IQS626_CH_GEN_0,        /* "generic-0" */
+       IQS626_CH_GEN_1,        /* "generic-1" */
+       IQS626_CH_GEN_2,        /* "generic-2" */
+       IQS626_CH_HALL,         /* "hall" */
+};
+
+enum iqs626_rx_inactive {      /* values compared against the "azoteq,rx-inactive" property */
+       IQS626_RX_INACTIVE_VSS,
+       IQS626_RX_INACTIVE_FLOAT,
+       IQS626_RX_INACTIVE_VREG,
+};
+
+enum iqs626_st_offs {  /* stored in iqs626_event_desc.st_offs; presumably indexes iqs626_flags.states[] - TODO confirm */
+       IQS626_ST_OFFS_PROX,
+       IQS626_ST_OFFS_DIR,
+       IQS626_ST_OFFS_TOUCH,
+       IQS626_ST_OFFS_DEEP,
+};
+
+enum iqs626_th_offs {  /* byte offset added to a channel's thresh pointer by iqs626_parse_events() */
+       IQS626_TH_OFFS_PROX,
+       IQS626_TH_OFFS_TOUCH,
+       IQS626_TH_OFFS_DEEP,
+};
+
+enum iqs626_event_id { /* indexes iqs626_events[] and iqs626_channel_desc.events[] */
+       IQS626_EVENT_PROX_DN,   /* "event-prox" */
+       IQS626_EVENT_PROX_UP,   /* "event-prox-alt" */
+       IQS626_EVENT_TOUCH_DN,  /* "event-touch" */
+       IQS626_EVENT_TOUCH_UP,  /* "event-touch-alt" */
+       IQS626_EVENT_DEEP_DN,   /* "event-deep" */
+       IQS626_EVENT_DEEP_UP,   /* "event-deep-alt" */
+};
+
+enum iqs626_gesture_id {       /* trackpad gestures; presumably the order of "linux,keycodes" entries - TODO confirm */
+       IQS626_GESTURE_FLICK_X_POS,
+       IQS626_GESTURE_FLICK_X_NEG,
+       IQS626_GESTURE_FLICK_Y_POS,
+       IQS626_GESTURE_FLICK_Y_NEG,
+       IQS626_GESTURE_TAP,
+       IQS626_GESTURE_HOLD,
+       IQS626_NUM_GESTURES,    /* count: sizes tp_code[] and caps the number of keycodes */
+};
+
+struct iqs626_event_desc {     /* static description of one event type; see iqs626_events[] */
+       const char *name;       /* child node name looked up under the channel node */
+       enum iqs626_st_offs st_offs;
+       enum iqs626_th_offs th_offs;    /* offset of this event's threshold byte */
+       bool dir_up;
+       u8 mask;        /* bit cleared in sys_reg.event_mask once the event has a keycode */
+};
+
+static const struct iqs626_event_desc iqs626_events[] = {      /* indexed by enum iqs626_event_id */
+       [IQS626_EVENT_PROX_DN] = {
+               .name = "event-prox",
+               .st_offs = IQS626_ST_OFFS_PROX,
+               .th_offs = IQS626_TH_OFFS_PROX,
+               .mask = IQS626_EVENT_MASK_PROX,
+       },
+       [IQS626_EVENT_PROX_UP] = {      /* same offsets/mask as PROX_DN, but with dir_up set */
+               .name = "event-prox-alt",
+               .st_offs = IQS626_ST_OFFS_PROX,
+               .th_offs = IQS626_TH_OFFS_PROX,
+               .dir_up = true,
+               .mask = IQS626_EVENT_MASK_PROX,
+       },
+       [IQS626_EVENT_TOUCH_DN] = {
+               .name = "event-touch",
+               .st_offs = IQS626_ST_OFFS_TOUCH,
+               .th_offs = IQS626_TH_OFFS_TOUCH,
+               .mask = IQS626_EVENT_MASK_TOUCH,
+       },
+       [IQS626_EVENT_TOUCH_UP] = {     /* same offsets/mask as TOUCH_DN, but with dir_up set */
+               .name = "event-touch-alt",
+               .st_offs = IQS626_ST_OFFS_TOUCH,
+               .th_offs = IQS626_TH_OFFS_TOUCH,
+               .dir_up = true,
+               .mask = IQS626_EVENT_MASK_TOUCH,
+       },
+       [IQS626_EVENT_DEEP_DN] = {
+               .name = "event-deep",
+               .st_offs = IQS626_ST_OFFS_DEEP,
+               .th_offs = IQS626_TH_OFFS_DEEP,
+               .mask = IQS626_EVENT_MASK_DEEP,
+       },
+       [IQS626_EVENT_DEEP_UP] = {      /* same offsets/mask as DEEP_DN, but with dir_up set */
+               .name = "event-deep-alt",
+               .st_offs = IQS626_ST_OFFS_DEEP,
+               .th_offs = IQS626_TH_OFFS_DEEP,
+               .dir_up = true,
+               .mask = IQS626_EVENT_MASK_DEEP,
+       },
+};
+
+struct iqs626_ver_info {       /* packed; presumably the layout read at IQS626_VER_INFO - TODO confirm */
+       u8 prod_num;    /* presumably checked against IQS626_VER_INFO_PROD_NUM - TODO confirm */
+       u8 sw_num;
+       u8 hw_num;
+       u8 padding;
+} __packed;
+
+struct iqs626_flags {  /* packed status image; presumably read starting at IQS626_SYS_FLAGS - TODO confirm */
+       __be16 system;  /* big-endian; presumably carries the IQS626_SYS_FLAGS_* bits - TODO confirm */
+       u8 gesture;
+       u8 padding_a;
+       u8 states[4];   /* presumably indexed by enum iqs626_st_offs - TODO confirm */
+       u8 ref_active;
+       u8 padding_b;
+       u8 comp_min;
+       u8 comp_max;
+       u8 trackpad_x;
+       u8 trackpad_y;
+} __packed;
+
+struct iqs626_ch_reg_ulp {     /* packed register image of the "ulp-0" channel, embedded in iqs626_sys_reg */
+       u8 thresh[2];   /* indexed by enum iqs626_th_offs (prox, touch) */
+       u8 hyst;        /* touch hysteresis lives in IQS626_CHx_HYST_TOUCH_MASK */
+       u8 filter;
+       u8 engine[2];
+       u8 ati_target;  /* ATI base in bits 7:6, "azoteq,ati-target" / 32 in bits 5:0 */
+       u8 padding;
+       __be16 ati_comp;
+       u8 rx_enable;
+       u8 tx_enable;
+} __packed;
+
+struct iqs626_ch_reg_tp {      /* packed registers of one trackpad channel */
+       u8 thresh;      /* one element of the trackpad's "azoteq,thresh" array */
+       u8 ati_base;    /* "azoteq,ati-base" element minus IQS626_TPx_ATI_BASE_MIN */
+       __be16 ati_comp;
+} __packed;
+
+struct iqs626_tp_grp_reg {     /* packed registers shared by all trackpad channels, followed by the per-channel ones */
+       u8 hyst;        /* touch hysteresis (bits 3:0) plus NP/LP filter strengths (bits 7:6, 5:4) */
+       u8 ati_target;  /* "azoteq,ati-target" / 32 in bits 5:0 */
+       u8 engine[2];
+       struct iqs626_ch_reg_tp ch_reg_tp[IQS626_NUM_CH_TP_3]; /* "trackpad-3x2" fills only the first IQS626_NUM_CH_TP_2 */
+} __packed;
+
+struct iqs626_ch_reg_gen {     /* packed register image of one "generic-N" channel */
+       u8 thresh[3];   /* indexed by enum iqs626_th_offs (prox, touch, deep) */
+       u8 padding;
+       u8 hyst;        /* deep hysteresis in bits 7:4, touch hysteresis in bits 3:0 */
+       u8 ati_target;  /* ATI base in bits 7:6, "azoteq,ati-target" / 32 in bits 5:0 */
+       __be16 ati_comp;
+       u8 engine[5];
+       u8 filter;
+       u8 rx_enable;
+       u8 tx_enable;
+       u8 assoc_select;
+       u8 assoc_weight;
+} __packed;
+
+struct iqs626_ch_reg_hall {    /* packed register image of the "hall" channel */
+       u8 engine;
+       u8 thresh;      /* single threshold; iqs626_parse_events() ignores th_offs for this channel */
+       u8 hyst;        /* touch hysteresis in bits 3:0 */
+       u8 ati_target;  /* ATI base in bits 7:6, "azoteq,ati-target" / 32 in bits 5:0 */
+       __be16 ati_comp;
+} __packed;
+
+struct iqs626_sys_reg {        /* packed register image edited by the iqs626_parse_*() helpers; presumably mirrors the device map at IQS626_SYS_SETTINGS - TODO confirm */
+       __be16 general;
+       u8 misc_a;      /* includes the trackpad LTA update rate (IQS626_MISC_A_TPx_LTA_UPDATE_MASK) */
+       u8 event_mask;  /* a bit is cleared for every event or gesture that gets configured */
+       u8 active;
+       u8 reseed;
+       u8 rate_np;
+       u8 rate_lp;
+       u8 rate_ulp;
+       u8 timeout_pwr;
+       u8 timeout_rdy;
+       u8 timeout_lta;
+       u8 misc_b;      /* includes trackpad filter strength and the swipe-gesture enable bit */
+       u8 thresh_swipe;        /* "azoteq,thresh-swipe" */
+       u8 timeout_tap;         /* "azoteq,timeout-tap-ms" / 16 */
+       u8 timeout_swipe;       /* "azoteq,timeout-swipe-ms" / 16 */
+       u8 redo_ati;
+       u8 padding;
+       struct iqs626_ch_reg_ulp ch_reg_ulp;
+       struct iqs626_tp_grp_reg tp_grp_reg;
+       struct iqs626_ch_reg_gen ch_reg_gen[IQS626_NUM_CH_GEN];        /* index = ch_id - IQS626_CH_GEN_0 */
+       struct iqs626_ch_reg_hall ch_reg_hall;
+} __packed;
+
+struct iqs626_channel_desc {   /* static description of one channel; see iqs626_channels[] */
+       const char *name;       /* presumably the channel's child node name - TODO confirm against the caller */
+       int num_ch;     /* number of per-channel thresholds / ATI bases read for this channel */
+       u8 active;      /* bit mask; presumably the bits set in sys_reg.active - TODO confirm */
+       bool events[ARRAY_SIZE(iqs626_events)];        /* event types iqs626_parse_events() will look for */
+};
+
+static const struct iqs626_channel_desc iqs626_channels[] = {  /* indexed by enum iqs626_ch_id */
+       [IQS626_CH_ULP_0] = {   /* prox and touch events only */
+               .name = "ulp-0",
+               .num_ch = 1,
+               .active = BIT(0),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+               },
+       },
+       [IQS626_CH_TP_2] = {    /* trackpad variants enable a single touch event */
+               .name = "trackpad-3x2",
+               .num_ch = IQS626_NUM_CH_TP_2,
+               .active = BIT(1),
+               .events = {
+                       [IQS626_EVENT_TOUCH_DN] = true,
+               },
+       },
+       [IQS626_CH_TP_3] = {
+               .name = "trackpad-3x3",
+               .num_ch = IQS626_NUM_CH_TP_3,
+               .active = BIT(2) | BIT(1),
+               .events = {
+                       [IQS626_EVENT_TOUCH_DN] = true,
+               },
+       },
+       [IQS626_CH_GEN_0] = {   /* generic channels enable all six events */
+               .name = "generic-0",
+               .num_ch = 1,
+               .active = BIT(4),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+                       [IQS626_EVENT_DEEP_DN] = true,
+                       [IQS626_EVENT_DEEP_UP] = true,
+               },
+       },
+       [IQS626_CH_GEN_1] = {
+               .name = "generic-1",
+               .num_ch = 1,
+               .active = BIT(5),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+                       [IQS626_EVENT_DEEP_DN] = true,
+                       [IQS626_EVENT_DEEP_UP] = true,
+               },
+       },
+       [IQS626_CH_GEN_2] = {
+               .name = "generic-2",
+               .num_ch = 1,
+               .active = BIT(6),
+               .events = {
+                       [IQS626_EVENT_PROX_DN] = true,
+                       [IQS626_EVENT_PROX_UP] = true,
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+                       [IQS626_EVENT_DEEP_DN] = true,
+                       [IQS626_EVENT_DEEP_UP] = true,
+               },
+       },
+       [IQS626_CH_HALL] = {    /* touch events only */
+               .name = "hall",
+               .num_ch = 1,
+               .active = BIT(7),
+               .events = {
+                       [IQS626_EVENT_TOUCH_DN] = true,
+                       [IQS626_EVENT_TOUCH_UP] = true,
+               },
+       },
+};
+
+struct iqs626_private {        /* per-device driver state */
+       struct i2c_client *client;      /* its ->dev is used for all dev_err() output */
+       struct regmap *regmap;
+       struct iqs626_sys_reg sys_reg;  /* register image edited by the iqs626_parse_*() helpers */
+       struct completion ati_done;
+       struct input_dev *keypad;
+       struct input_dev *trackpad;
+       struct touchscreen_properties prop;
+       unsigned int kp_type[ARRAY_SIZE(iqs626_channels)]       /* [channel][event]: EV_KEY or EV_SW */
+                           [ARRAY_SIZE(iqs626_events)];
+       unsigned int kp_code[ARRAY_SIZE(iqs626_channels)]       /* [channel][event]: "linux,code" value */
+                           [ARRAY_SIZE(iqs626_events)];
+       unsigned int tp_code[IQS626_NUM_GESTURES];      /* filled from the trackpad's "linux,keycodes" */
+       unsigned int suspend_mode;
+};
+
+/*
+ * Parses the properties of a single event (ev_id) of channel ch_id from
+ * ev_node into the driver state and the cached register image.
+ *
+ * For non-trackpad channels an optional "linux,code" (with an optional
+ * "linux,input-type") is recorded and the event's bit is cleared in the event
+ * mask. "azoteq,hyst" and "azoteq,thresh" are then range-checked and stored;
+ * the trackpad reads "azoteq,thresh" as an array with one element per channel.
+ *
+ * ch_node is only used to name the channel in error messages. This helper
+ * neither takes nor drops a reference on either node.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range or unsupported value, or
+ * the error returned by fwnode_property_read_u32_array().
+ */
+static int iqs626_parse_event_props(struct iqs626_private *iqs626,
+                                    const struct fwnode_handle *ch_node,
+                                    const struct fwnode_handle *ev_node,
+                                    enum iqs626_ch_id ch_id, int ev_id,
+                                    u8 *thresh, u8 *hyst)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       bool is_tp = ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3;
+       unsigned int thresh_tp[IQS626_NUM_CH_TP_3];
+       unsigned int val;
+       int num_ch = iqs626_channels[ch_id].num_ch;
+       int error, j;
+
+       /* Keycodes are only looked up for key/switch channels. */
+       if (!is_tp && !fwnode_property_read_u32(ev_node, "linux,code", &val)) {
+               iqs626->kp_code[ch_id][ev_id] = val;
+
+               /* Without an explicit type, Hall is a switch; others are keys. */
+               if (fwnode_property_read_u32(ev_node, "linux,input-type",
+                                            &val)) {
+                       if (ch_id == IQS626_CH_HALL)
+                               val = EV_SW;
+                       else
+                               val = EV_KEY;
+               }
+
+               if (val != EV_KEY && val != EV_SW) {
+                       dev_err(&client->dev,
+                               "Invalid input type: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               iqs626->kp_type[ch_id][ev_id] = val;
+
+               sys_reg->event_mask &= ~iqs626_events[ev_id].mask;
+       }
+
+       if (!fwnode_property_read_u32(ev_node, "azoteq,hyst", &val)) {
+               if (val > IQS626_CHx_HYST_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel hysteresis: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               /* Deep and touch hysteresis share one byte; prox has none. */
+               if (ev_id == IQS626_EVENT_DEEP_DN ||
+                   ev_id == IQS626_EVENT_DEEP_UP) {
+                       *hyst &= ~IQS626_CHx_HYST_DEEP_MASK;
+                       *hyst |= (val << IQS626_CHx_HYST_DEEP_SHIFT);
+               } else if (ev_id == IQS626_EVENT_TOUCH_DN ||
+                          ev_id == IQS626_EVENT_TOUCH_UP) {
+                       *hyst &= ~IQS626_CHx_HYST_TOUCH_MASK;
+                       *hyst |= val;
+               }
+       }
+
+       if (!is_tp &&
+           !fwnode_property_read_u32(ev_node, "azoteq,thresh", &val)) {
+               if (val > IQS626_CHx_THRESH_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel threshold: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               /* The Hall channel has a single threshold byte. */
+               if (ch_id == IQS626_CH_HALL)
+                       *thresh = val;
+               else
+                       *(thresh + iqs626_events[ev_id].th_offs) = val;
+
+               return 0;
+       }
+
+       if (!fwnode_property_present(ev_node, "azoteq,thresh"))
+               return 0;
+
+       /* One threshold per channel, stored in the per-channel registers. */
+       error = fwnode_property_read_u32_array(ev_node, "azoteq,thresh",
+                                              thresh_tp, num_ch);
+       if (error) {
+               dev_err(&client->dev,
+                       "Failed to read %s channel thresholds: %d\n",
+                       fwnode_get_name(ch_node), error);
+               return error;
+       }
+
+       for (j = 0; j < num_ch; j++) {
+               if (thresh_tp[j] > IQS626_CHx_THRESH_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel threshold: %u\n",
+                               fwnode_get_name(ch_node), thresh_tp[j]);
+                       return -EINVAL;
+               }
+
+               sys_reg->tp_grp_reg.ch_reg_tp[j].thresh = thresh_tp[j];
+       }
+
+       return 0;
+}
+
+/*
+ * Parses every event that channel ch_id supports (per iqs626_channels[]).
+ *
+ * Trackpad events are described directly under ch_node; all other channels
+ * describe each event in a child node named after iqs626_events[].name, and
+ * events without such a child node are skipped.
+ *
+ * The reference taken by fwnode_get_named_child_node() is dropped with
+ * fwnode_handle_put() once the event has been parsed, on success and on error
+ * alike. ch_node itself is borrowed from the caller and is never put here.
+ *
+ * Returns 0 on success, -EINVAL for an unknown ch_id, or the error returned
+ * by iqs626_parse_event_props().
+ */
+static int iqs626_parse_events(struct iqs626_private *iqs626,
+                              const struct fwnode_handle *ch_node,
+                              enum iqs626_ch_id ch_id)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct fwnode_handle *ev_node;
+       u8 *thresh, *hyst;
+       int error, i;
+
+       switch (ch_id) {
+       case IQS626_CH_ULP_0:
+               thresh = sys_reg->ch_reg_ulp.thresh;
+               hyst = &sys_reg->ch_reg_ulp.hyst;
+               break;
+
+       case IQS626_CH_TP_2:
+       case IQS626_CH_TP_3:
+               thresh = &sys_reg->tp_grp_reg.ch_reg_tp[0].thresh;
+               hyst = &sys_reg->tp_grp_reg.hyst;
+               break;
+
+       case IQS626_CH_GEN_0:
+       case IQS626_CH_GEN_1:
+       case IQS626_CH_GEN_2:
+               i = ch_id - IQS626_CH_GEN_0;
+               thresh = sys_reg->ch_reg_gen[i].thresh;
+               hyst = &sys_reg->ch_reg_gen[i].hyst;
+               break;
+
+       case IQS626_CH_HALL:
+               thresh = &sys_reg->ch_reg_hall.thresh;
+               hyst = &sys_reg->ch_reg_hall.hyst;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_events); i++) {
+               if (!iqs626_channels[ch_id].events[i])
+                       continue;
+
+               if (ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3) {
+                       /*
+                        * Trackpad touch events are simply described under the
+                        * trackpad child node.
+                        */
+                       error = iqs626_parse_event_props(iqs626, ch_node,
+                                                        ch_node, ch_id, i,
+                                                        thresh, hyst);
+               } else {
+                       ev_node = fwnode_get_named_child_node(ch_node,
+                                                       iqs626_events[i].name);
+                       if (!ev_node)
+                               continue;
+
+                       error = iqs626_parse_event_props(iqs626, ch_node,
+                                                        ev_node, ch_id, i,
+                                                        thresh, hyst);
+
+                       /* Balance the reference taken by the lookup above. */
+                       fwnode_handle_put(ev_node);
+               }
+
+               if (error)
+                       return error;
+       }
+
+       return 0;
+}
+
+/*
+ * Translates a channel's "azoteq,ati-target" and "azoteq,ati-base" properties
+ * into the cached register image.
+ *
+ * The target is stored divided by 32 in the low six bits of the channel's
+ * ati_target byte. For non-trackpad channels the base must be one of 75, 100,
+ * 150 or 200 and is encoded in the top two bits of the same byte. For the
+ * trackpad the base is an array with one entry per channel, each stored
+ * relative to IQS626_TPx_ATI_BASE_MIN in the per-channel registers.
+ *
+ * Returns 0 on success, -EINVAL for an unknown ch_id or an out-of-range
+ * value, or the error returned by fwnode_property_read_u32_array().
+ */
+static int iqs626_parse_ati_target(struct iqs626_private *iqs626,
+                                  const struct fwnode_handle *ch_node,
+                                  enum iqs626_ch_id ch_id)
+{
+       struct iqs626_sys_reg *regs = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       bool is_tp = ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3;
+       unsigned int tp_base[IQS626_NUM_CH_TP_3];
+       unsigned int val;
+       u8 *reg;
+       int nr_ch = iqs626_channels[ch_id].num_ch;
+       int ret, ch;
+
+       /* Locate the ati_target byte that belongs to this channel. */
+       if (is_tp)
+               reg = &regs->tp_grp_reg.ati_target;
+       else if (ch_id == IQS626_CH_ULP_0)
+               reg = &regs->ch_reg_ulp.ati_target;
+       else if (ch_id == IQS626_CH_GEN_0 || ch_id == IQS626_CH_GEN_1 ||
+                ch_id == IQS626_CH_GEN_2)
+               reg = &regs->ch_reg_gen[ch_id - IQS626_CH_GEN_0].ati_target;
+       else if (ch_id == IQS626_CH_HALL)
+               reg = &regs->ch_reg_hall.ati_target;
+       else
+               return -EINVAL;
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,ati-target", &val);
+       if (!ret) {
+               if (val > IQS626_CHx_ATI_TARGET_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI target: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *reg = (*reg & ~IQS626_CHx_ATI_TARGET_MASK) | (val / 32);
+       }
+
+       /* Single-channel case: the base is an enumerated value. */
+       if (!is_tp &&
+           !fwnode_property_read_u32(ch_node, "azoteq,ati-base", &val)) {
+               unsigned int sel;
+
+               if (val == 75) {
+                       sel = IQS626_CHx_ATI_BASE_75;
+               } else if (val == 100) {
+                       sel = IQS626_CHx_ATI_BASE_100;
+               } else if (val == 150) {
+                       sel = IQS626_CHx_ATI_BASE_150;
+               } else if (val == 200) {
+                       sel = IQS626_CHx_ATI_BASE_200;
+               } else {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI base: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *reg = (*reg & ~IQS626_CHx_ATI_BASE_MASK) | sel;
+
+               return 0;
+       }
+
+       if (!fwnode_property_present(ch_node, "azoteq,ati-base"))
+               return 0;
+
+       /* Array case: one base per channel. */
+       ret = fwnode_property_read_u32_array(ch_node, "azoteq,ati-base",
+                                            tp_base, nr_ch);
+       if (ret) {
+               dev_err(&client->dev,
+                       "Failed to read %s channel ATI bases: %d\n",
+                       fwnode_get_name(ch_node), ret);
+               return ret;
+       }
+
+       for (ch = 0; ch < nr_ch; ch++) {
+               unsigned int base = tp_base[ch];
+
+               if (base < IQS626_TPx_ATI_BASE_MIN ||
+                   base > IQS626_TPx_ATI_BASE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI base: %u\n",
+                               fwnode_get_name(ch_node), base);
+                       return -EINVAL;
+               }
+
+               /* The register holds the base relative to the minimum. */
+               regs->tp_grp_reg.ch_reg_tp[ch].ati_base =
+                                       base - IQS626_TPx_ATI_BASE_MIN;
+       }
+
+       return 0;
+}
+
+/*
+ * Builds a pin-enable bit mask from the u32 array property propname.
+ *
+ * If the property is absent, *enable is left untouched and 0 is returned.
+ * Otherwise *enable is cleared and one bit is set per listed pin; each pin
+ * must be below IQS626_NUM_CRx_TX and at most IQS626_NUM_CRx_TX pins may be
+ * listed.
+ *
+ * Returns 0 on success, -EINVAL for too many pins or an invalid pin, or the
+ * error returned while counting or reading the property.
+ */
+static int iqs626_parse_pins(struct iqs626_private *iqs626,
+                            const struct fwnode_handle *ch_node,
+                            const char *propname, u8 *enable)
+{
+       struct i2c_client *client = iqs626->client;
+       unsigned int pins[IQS626_NUM_CRx_TX];
+       int nr_pins, ret, idx;
+
+       if (!fwnode_property_present(ch_node, propname))
+               return 0;
+
+       nr_pins = fwnode_property_count_u32(ch_node, propname);
+       if (nr_pins < 0) {
+               dev_err(&client->dev,
+                       "Failed to count %s channel CRX/TX pins: %d\n",
+                       fwnode_get_name(ch_node), nr_pins);
+               return nr_pins;
+       }
+
+       if (nr_pins > IQS626_NUM_CRx_TX) {
+               dev_err(&client->dev,
+                       "Too many %s channel CRX/TX pins present\n",
+                       fwnode_get_name(ch_node));
+               return -EINVAL;
+       }
+
+       ret = fwnode_property_read_u32_array(ch_node, propname, pins, nr_pins);
+       if (ret) {
+               dev_err(&client->dev,
+                       "Failed to read %s channel CRX/TX pins: %d\n",
+                       fwnode_get_name(ch_node), ret);
+               return ret;
+       }
+
+       /* The property replaces the mask; it does not add to it. */
+       *enable = 0;
+
+       for (idx = 0; idx < nr_pins; idx++) {
+               if (pins[idx] < IQS626_NUM_CRx_TX) {
+                       *enable |= BIT(pins[idx]);
+                       continue;
+               }
+
+               dev_err(&client->dev,
+                       "Invalid %s channel CRX/TX pin: %u\n",
+                       fwnode_get_name(ch_node), pins[idx]);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Applies the trackpad-specific properties found under ch_node to the cached
+ * register image: the LTA update rate, the filter strengths and, only when
+ * "linux,keycodes" is present, the gesture keycodes and gesture tuning.
+ *
+ * Returns 0 on success (including when no keycodes are present), -EINVAL if
+ * a property is out of range or too many keycodes are listed, or the error
+ * returned while counting or reading the keycodes.
+ */
+static int iqs626_parse_trackpad(struct iqs626_private *iqs626,
+                                const struct fwnode_handle *ch_node)
+{
+       struct iqs626_sys_reg *regs = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       u8 *filt = &regs->tp_grp_reg.hyst;
+       unsigned int val;
+       int nr_codes, ret;
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,lta-update", &val);
+       if (!ret) {
+               if (val > IQS626_MISC_A_TPx_LTA_UPDATE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel update rate: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               regs->misc_a = (regs->misc_a &
+                               ~IQS626_MISC_A_TPx_LTA_UPDATE_MASK) |
+                              (val << IQS626_MISC_A_TPx_LTA_UPDATE_SHIFT);
+       }
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,filt-str-trackpad",
+                                      &val);
+       if (!ret) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               regs->misc_b = (regs->misc_b & ~IQS626_MISC_B_FILT_STR_TPx) |
+                              val;
+       }
+
+       /* The NP/LP count filters share a byte with the touch hysteresis. */
+       ret = fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-cnt",
+                                      &val);
+       if (!ret) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filt = (*filt & ~IQS626_FILT_STR_NP_TPx_MASK) |
+                       (val << IQS626_FILT_STR_NP_TPx_SHIFT);
+       }
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-cnt",
+                                      &val);
+       if (!ret) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filt = (*filt & ~IQS626_FILT_STR_LP_TPx_MASK) |
+                       (val << IQS626_FILT_STR_LP_TPx_SHIFT);
+       }
+
+       /* Everything below configures gestures, which need keycodes. */
+       if (!fwnode_property_present(ch_node, "linux,keycodes"))
+               return 0;
+
+       nr_codes = fwnode_property_count_u32(ch_node, "linux,keycodes");
+       if (nr_codes < 0) {
+               dev_err(&client->dev, "Failed to count keycodes: %d\n",
+                       nr_codes);
+               return nr_codes;
+       }
+
+       if (nr_codes > IQS626_NUM_GESTURES) {
+               dev_err(&client->dev, "Too many keycodes present\n");
+               return -EINVAL;
+       }
+
+       ret = fwnode_property_read_u32_array(ch_node, "linux,keycodes",
+                                            iqs626->tp_code, nr_codes);
+       if (ret) {
+               dev_err(&client->dev, "Failed to read keycodes: %d\n", ret);
+               return ret;
+       }
+
+       if (fwnode_property_present(ch_node, "azoteq,gesture-swipe"))
+               regs->misc_b |= IQS626_MISC_B_TPx_SWIPE;
+       else
+               regs->misc_b &= ~IQS626_MISC_B_TPx_SWIPE;
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,timeout-tap-ms",
+                                      &val);
+       if (!ret) {
+               if (val > IQS626_TIMEOUT_TAP_MS_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel timeout: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               /* The register counts in units of 16 ms. */
+               regs->timeout_tap = val / 16;
+       }
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,timeout-swipe-ms",
+                                      &val);
+       if (!ret) {
+               if (val > IQS626_TIMEOUT_SWIPE_MS_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel timeout: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               regs->timeout_swipe = val / 16;
+       }
+
+       ret = fwnode_property_read_u32(ch_node, "azoteq,thresh-swipe", &val);
+       if (!ret) {
+               if (val > IQS626_THRESH_SWIPE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel threshold: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               regs->thresh_swipe = val;
+       }
+
+       regs->event_mask &= ~IQS626_EVENT_MASK_GESTURE;
+
+       return 0;
+}
+
+/*
+ * Parse the firmware properties of a single channel node and apply them to
+ * iqs626->sys_reg.
+ *
+ * How far the function proceeds depends on ch_id: the Hall channel returns
+ * once the first engine register has been set up, the trackpad channels hand
+ * off to iqs626_parse_trackpad(), the ULP channel returns after its filter
+ * and pin settings, and only the generic channels reach the remaining engine
+ * registers and the association settings.
+ *
+ * Returns 0 on success, -EINVAL if ch_id is not handled or a property value
+ * is out of range, or the error returned by a helper.
+ */
+static int iqs626_parse_channel(struct iqs626_private *iqs626,
+                               const struct fwnode_handle *ch_node,
+                               enum iqs626_ch_id ch_id)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       u8 *engine, *filter, *rx_enable, *tx_enable;
+       u8 *assoc_select, *assoc_weight;
+       unsigned int val;
+       int error, i;
+
+       /*
+        * Locate the first engine register for this channel. Both trackpad
+        * channels resolve to the same tp_grp_reg entry.
+        */
+       switch (ch_id) {
+       case IQS626_CH_ULP_0:
+               engine = sys_reg->ch_reg_ulp.engine;
+               break;
+
+       case IQS626_CH_TP_2:
+       case IQS626_CH_TP_3:
+               engine = sys_reg->tp_grp_reg.engine;
+               break;
+
+       case IQS626_CH_GEN_0:
+       case IQS626_CH_GEN_1:
+       case IQS626_CH_GEN_2:
+               i = ch_id - IQS626_CH_GEN_0;
+               engine = sys_reg->ch_reg_gen[i].engine;
+               break;
+
+       case IQS626_CH_HALL:
+               engine = &sys_reg->ch_reg_hall.engine;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       /*
+        * Boolean properties below share one pattern: force the bit to its
+        * default state, then flip it only if the property is present.
+        */
+       *engine |= IQS626_CHx_ENG_0_MEAS_CAP_SIZE;
+       if (fwnode_property_present(ch_node, "azoteq,meas-cap-decrease"))
+               *engine &= ~IQS626_CHx_ENG_0_MEAS_CAP_SIZE;
+
+       *engine |= IQS626_CHx_ENG_0_RX_TERM_VSS;
+       if (!fwnode_property_read_u32(ch_node, "azoteq,rx-inactive", &val)) {
+               switch (val) {
+               case IQS626_RX_INACTIVE_VSS:
+                       break;
+
+               case IQS626_RX_INACTIVE_FLOAT:
+                       *engine &= ~IQS626_CHx_ENG_0_RX_TERM_VSS;
+                       if (ch_id == IQS626_CH_GEN_0 ||
+                           ch_id == IQS626_CH_GEN_1 ||
+                           ch_id == IQS626_CH_GEN_2)
+                               *(engine + 4) &= ~IQS626_CHx_ENG_4_RX_TERM_VREG;
+                       break;
+
+               case IQS626_RX_INACTIVE_VREG:
+                       /*
+                        * VREG termination is accepted for the generic
+                        * channels only; any other channel falls through to
+                        * the error path below.
+                        */
+                       if (ch_id == IQS626_CH_GEN_0 ||
+                           ch_id == IQS626_CH_GEN_1 ||
+                           ch_id == IQS626_CH_GEN_2) {
+                               *engine &= ~IQS626_CHx_ENG_0_RX_TERM_VSS;
+                               *(engine + 4) |= IQS626_CHx_ENG_4_RX_TERM_VREG;
+                               break;
+                       }
+                       fallthrough;
+
+               default:
+                       dev_err(&client->dev,
+                               "Invalid %s channel CRX pin termination: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+       }
+
+       *engine &= ~IQS626_CHx_ENG_0_LINEARIZE;
+       if (fwnode_property_present(ch_node, "azoteq,linearize"))
+               *engine |= IQS626_CHx_ENG_0_LINEARIZE;
+
+       *engine &= ~IQS626_CHx_ENG_0_DUAL_DIR;
+       if (fwnode_property_present(ch_node, "azoteq,dual-direction"))
+               *engine |= IQS626_CHx_ENG_0_DUAL_DIR;
+
+       *engine &= ~IQS626_CHx_ENG_0_FILT_DISABLE;
+       if (fwnode_property_present(ch_node, "azoteq,filt-disable"))
+               *engine |= IQS626_CHx_ENG_0_FILT_DISABLE;
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,ati-mode", &val)) {
+               if (val > IQS626_CHx_ENG_0_ATI_MODE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel ATI mode: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *engine &= ~IQS626_CHx_ENG_0_ATI_MODE_MASK;
+               *engine |= val;
+       }
+
+       /* Nothing past the first engine register is set for the Hall channel. */
+       if (ch_id == IQS626_CH_HALL)
+               return 0;
+
+       /*
+        * The charge cycle time extension stays disabled unless the property
+        * is present and nonzero. A nonzero value N is stored as N - 1, while
+        * orig_val keeps the value as written for the error message.
+        */
+       *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_ENABLE;
+       if (!fwnode_property_read_u32(ch_node, "azoteq,cct-increase",
+                                     &val) && val) {
+               unsigned int orig_val = val--;
+
+               /*
+                * In the case of the generic channels, the charge cycle time
+                * field doubles in size and straddles two separate registers.
+                */
+               if (ch_id == IQS626_CH_GEN_0 ||
+                   ch_id == IQS626_CH_GEN_1 ||
+                   ch_id == IQS626_CH_GEN_2) {
+                       *(engine + 4) &= ~IQS626_CHx_ENG_4_CCT_LOW_1;
+                       if (val & BIT(1))
+                               *(engine + 4) |= IQS626_CHx_ENG_4_CCT_LOW_1;
+
+                       *(engine + 4) &= ~IQS626_CHx_ENG_4_CCT_LOW_0;
+                       if (val & BIT(0))
+                               *(engine + 4) |= IQS626_CHx_ENG_4_CCT_LOW_0;
+
+                       val >>= 2;
+               }
+
+               /* Whatever remains must fit in the two high-order bits. */
+               if (val & ~GENMASK(1, 0)) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel charge cycle time: %u\n",
+                               fwnode_get_name(ch_node), orig_val);
+                       return -EINVAL;
+               }
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_HIGH_1;
+               if (val & BIT(1))
+                       *(engine + 1) |= IQS626_CHx_ENG_1_CCT_HIGH_1;
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_HIGH_0;
+               if (val & BIT(0))
+                       *(engine + 1) |= IQS626_CHx_ENG_1_CCT_HIGH_0;
+
+               *(engine + 1) |= IQS626_CHx_ENG_1_CCT_ENABLE;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,proj-bias", &val)) {
+               if (val > IQS626_CHx_ENG_1_PROJ_BIAS_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel bias current: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_PROJ_BIAS_MASK;
+               *(engine + 1) |= (val << IQS626_CHx_ENG_1_PROJ_BIAS_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,sense-freq", &val)) {
+               if (val > IQS626_CHx_ENG_1_SENSE_FREQ_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel sensing frequency: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 1) &= ~IQS626_CHx_ENG_1_SENSE_FREQ_MASK;
+               *(engine + 1) |= (val << IQS626_CHx_ENG_1_SENSE_FREQ_SHIFT);
+       }
+
+       *(engine + 1) &= ~IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN;
+       if (fwnode_property_present(ch_node, "azoteq,ati-band-tighten"))
+               *(engine + 1) |= IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN;
+
+       /* The remaining trackpad settings are parsed by a dedicated helper. */
+       if (ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3)
+               return iqs626_parse_trackpad(iqs626, ch_node);
+
+       /*
+        * Only the ULP and generic channels get this far; pick the filter and
+        * pin-enable registers that belong to the channel at hand.
+        */
+       if (ch_id == IQS626_CH_ULP_0) {
+               sys_reg->ch_reg_ulp.hyst &= ~IQS626_ULP_PROJ_ENABLE;
+               if (fwnode_property_present(ch_node, "azoteq,proj-enable"))
+                       sys_reg->ch_reg_ulp.hyst |= IQS626_ULP_PROJ_ENABLE;
+
+               filter = &sys_reg->ch_reg_ulp.filter;
+
+               rx_enable = &sys_reg->ch_reg_ulp.rx_enable;
+               tx_enable = &sys_reg->ch_reg_ulp.tx_enable;
+       } else {
+               i = ch_id - IQS626_CH_GEN_0;
+               filter = &sys_reg->ch_reg_gen[i].filter;
+
+               rx_enable = &sys_reg->ch_reg_gen[i].rx_enable;
+               tx_enable = &sys_reg->ch_reg_gen[i].tx_enable;
+       }
+
+       /* Four filter strength fields share one register and one upper limit. */
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-cnt",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_NP_CNT_MASK;
+               *filter |= (val << IQS626_FILT_STR_NP_CNT_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-cnt",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_LP_CNT_MASK;
+               *filter |= (val << IQS626_FILT_STR_LP_CNT_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-lta",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_NP_LTA_MASK;
+               *filter |= (val << IQS626_FILT_STR_NP_LTA_SHIFT);
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-lta",
+                                     &val)) {
+               if (val > IQS626_FILT_STR_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel filter strength: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *filter &= ~IQS626_FILT_STR_LP_LTA_MASK;
+               *filter |= val;
+       }
+
+       error = iqs626_parse_pins(iqs626, ch_node, "azoteq,rx-enable",
+                                 rx_enable);
+       if (error)
+               return error;
+
+       error = iqs626_parse_pins(iqs626, ch_node, "azoteq,tx-enable",
+                                 tx_enable);
+       if (error)
+               return error;
+
+       /* Everything from here on applies to the generic channels only. */
+       if (ch_id == IQS626_CH_ULP_0)
+               return 0;
+
+       /*
+        * As with the charge cycle time, the local capacitor stays disabled
+        * unless the property is nonzero, and a value N is stored as N - 1.
+        */
+       *(engine + 2) &= ~IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE;
+       if (!fwnode_property_read_u32(ch_node, "azoteq,local-cap-size",
+                                     &val) && val) {
+               unsigned int orig_val = val--;
+
+               if (val > IQS626_CHx_ENG_2_LOCAL_CAP_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel local cap. size: %u\n",
+                               fwnode_get_name(ch_node), orig_val);
+                       return -EINVAL;
+               }
+
+               *(engine + 2) &= ~IQS626_CHx_ENG_2_LOCAL_CAP_MASK;
+               *(engine + 2) |= (val << IQS626_CHx_ENG_2_LOCAL_CAP_SHIFT);
+
+               *(engine + 2) |= IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,sense-mode", &val)) {
+               if (val > IQS626_CHx_ENG_2_SENSE_MODE_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel sensing mode: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 2) &= ~IQS626_CHx_ENG_2_SENSE_MODE_MASK;
+               *(engine + 2) |= val;
+       }
+
+       if (!fwnode_property_read_u32(ch_node, "azoteq,tx-freq", &val)) {
+               if (val > IQS626_CHx_ENG_3_TX_FREQ_MAX) {
+                       dev_err(&client->dev,
+                               "Invalid %s channel excitation frequency: %u\n",
+                               fwnode_get_name(ch_node), val);
+                       return -EINVAL;
+               }
+
+               *(engine + 3) &= ~IQS626_CHx_ENG_3_TX_FREQ_MASK;
+               *(engine + 3) |= (val << IQS626_CHx_ENG_3_TX_FREQ_SHIFT);
+       }
+
+       *(engine + 3) &= ~IQS626_CHx_ENG_3_INV_LOGIC;
+       if (fwnode_property_present(ch_node, "azoteq,invert-enable"))
+               *(engine + 3) |= IQS626_CHx_ENG_3_INV_LOGIC;
+
+       *(engine + 4) &= ~IQS626_CHx_ENG_4_COMP_DISABLE;
+       if (fwnode_property_present(ch_node, "azoteq,comp-disable"))
+               *(engine + 4) |= IQS626_CHx_ENG_4_COMP_DISABLE;
+
+       *(engine + 4) &= ~IQS626_CHx_ENG_4_STATIC_ENABLE;
+       if (fwnode_property_present(ch_node, "azoteq,static-enable"))
+               *(engine + 4) |= IQS626_CHx_ENG_4_STATIC_ENABLE;
+
+       i = ch_id - IQS626_CH_GEN_0;
+       assoc_select = &sys_reg->ch_reg_gen[i].assoc_select;
+       assoc_weight = &sys_reg->ch_reg_gen[i].assoc_weight;
+
+       /*
+        * The association mask is always cleared; if no channels are listed,
+        * the weight is left as-is and parsing ends here.
+        */
+       *assoc_select = 0;
+       if (!fwnode_property_present(ch_node, "azoteq,assoc-select"))
+               return 0;
+
+       /* Set the mask bit of every channel whose name appears in the list. */
+       for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) {
+               if (fwnode_property_match_string(ch_node, "azoteq,assoc-select",
+                                                iqs626_channels[i].name) < 0)
+                       continue;
+
+               *assoc_select |= iqs626_channels[i].active;
+       }
+
+       if (fwnode_property_read_u32(ch_node, "azoteq,assoc-weight", &val))
+               return 0;
+
+       if (val > IQS626_GEN_WEIGHT_MAX) {
+               dev_err(&client->dev,
+                       "Invalid %s channel associated weight: %u\n",
+                       fwnode_get_name(ch_node), val);
+               return -EINVAL;
+       }
+
+       *assoc_weight = val;
+
+       return 0;
+}
+
+/*
+ * Read the device's system settings into iqs626->sys_reg, override them with
+ * whatever the firmware node specifies, and write the result back.
+ *
+ * Device-level properties are handled first. Each child node whose name
+ * matches an entry in iqs626_channels is then parsed for its channel, ATI
+ * target and event settings, and the channel is marked active.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range property, or the error
+ * returned by regmap or by one of the per-channel parsers.
+ */
+static int iqs626_parse_prop(struct iqs626_private *iqs626)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct i2c_client *client = iqs626->client;
+       struct fwnode_handle *ch_node;
+       unsigned int val;
+       int error, i;
+       u16 general;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,suspend-mode",
+                                     &val)) {
+               if (val > IQS626_SYS_SETTINGS_PWR_MODE_MAX) {
+                       dev_err(&client->dev, "Invalid suspend mode: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               iqs626->suspend_mode = val;
+       }
+
+       error = regmap_raw_read(iqs626->regmap, IQS626_SYS_SETTINGS, sys_reg,
+                               sizeof(*sys_reg));
+       if (error)
+               return error;
+
+       /*
+        * Only the ULP update field of the general settings is carried over
+        * from the device; every other bit is rebuilt below.
+        */
+       general = be16_to_cpu(sys_reg->general);
+       general &= IQS626_SYS_SETTINGS_ULP_UPDATE_MASK;
+
+       if (device_property_present(&client->dev, "azoteq,clk-div"))
+               general |= IQS626_SYS_SETTINGS_CLK_DIV;
+
+       if (device_property_present(&client->dev, "azoteq,ulp-enable"))
+               general |= IQS626_SYS_SETTINGS_ULP_AUTO;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,ulp-update",
+                                     &val)) {
+               if (val > IQS626_SYS_SETTINGS_ULP_UPDATE_MAX) {
+                       dev_err(&client->dev, "Invalid update rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               general &= ~IQS626_SYS_SETTINGS_ULP_UPDATE_MASK;
+               general |= (val << IQS626_SYS_SETTINGS_ULP_UPDATE_SHIFT);
+       }
+
+       sys_reg->misc_a &= ~IQS626_MISC_A_ATI_BAND_DISABLE;
+       if (device_property_present(&client->dev, "azoteq,ati-band-disable"))
+               sys_reg->misc_a |= IQS626_MISC_A_ATI_BAND_DISABLE;
+
+       sys_reg->misc_a &= ~IQS626_MISC_A_ATI_LP_ONLY;
+       if (device_property_present(&client->dev, "azoteq,ati-lp-only"))
+               sys_reg->misc_a |= IQS626_MISC_A_ATI_LP_ONLY;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,gpio3-select",
+                                     &val)) {
+               if (val > IQS626_MISC_A_GPIO3_SELECT_MAX) {
+                       dev_err(&client->dev, "Invalid GPIO3 selection: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               sys_reg->misc_a &= ~IQS626_MISC_A_GPIO3_SELECT_MASK;
+               sys_reg->misc_a |= val;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,reseed-select",
+                                     &val)) {
+               if (val > IQS626_MISC_B_RESEED_UI_SEL_MAX) {
+                       dev_err(&client->dev, "Invalid reseed selection: %u\n",
+                               val);
+                       return -EINVAL;
+               }
+
+               sys_reg->misc_b &= ~IQS626_MISC_B_RESEED_UI_SEL_MASK;
+               sys_reg->misc_b |= (val << IQS626_MISC_B_RESEED_UI_SEL_SHIFT);
+       }
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_THRESH_EXTEND;
+       if (device_property_present(&client->dev, "azoteq,thresh-extend"))
+               sys_reg->misc_b |= IQS626_MISC_B_THRESH_EXTEND;
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_TRACKING_UI_ENABLE;
+       if (device_property_present(&client->dev, "azoteq,tracking-enable"))
+               sys_reg->misc_b |= IQS626_MISC_B_TRACKING_UI_ENABLE;
+
+       sys_reg->misc_b &= ~IQS626_MISC_B_RESEED_OFFSET;
+       if (device_property_present(&client->dev, "azoteq,reseed-offset"))
+               sys_reg->misc_b |= IQS626_MISC_B_RESEED_OFFSET;
+
+       if (!device_property_read_u32(&client->dev, "azoteq,rate-np-ms",
+                                     &val)) {
+               if (val > IQS626_RATE_NP_MS_MAX) {
+                       dev_err(&client->dev, "Invalid report rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->rate_np = val;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,rate-lp-ms",
+                                     &val)) {
+               if (val > IQS626_RATE_LP_MS_MAX) {
+                       dev_err(&client->dev, "Invalid report rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->rate_lp = val;
+       }
+
+       /* The ULP rate is stored as the millisecond value divided by 16. */
+       if (!device_property_read_u32(&client->dev, "azoteq,rate-ulp-ms",
+                                     &val)) {
+               if (val > IQS626_RATE_ULP_MS_MAX) {
+                       dev_err(&client->dev, "Invalid report rate: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->rate_ulp = val / 16;
+       }
+
+       /* Both timeouts are stored as the millisecond value divided by 512. */
+       if (!device_property_read_u32(&client->dev, "azoteq,timeout-pwr-ms",
+                                     &val)) {
+               if (val > IQS626_TIMEOUT_PWR_MS_MAX) {
+                       dev_err(&client->dev, "Invalid timeout: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->timeout_pwr = val / 512;
+       }
+
+       if (!device_property_read_u32(&client->dev, "azoteq,timeout-lta-ms",
+                                     &val)) {
+               if (val > IQS626_TIMEOUT_LTA_MS_MAX) {
+                       dev_err(&client->dev, "Invalid timeout: %u\n", val);
+                       return -EINVAL;
+               }
+
+               sys_reg->timeout_lta = val / 512;
+       }
+
+       /*
+        * The event mask and the per-channel bit masks start from a known
+        * state and are then built up by the channel loop below.
+        */
+       sys_reg->event_mask = ~((u8)IQS626_EVENT_MASK_SYS);
+       sys_reg->redo_ati = 0;
+
+       sys_reg->reseed = 0;
+       sys_reg->active = 0;
+
+       for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) {
+               ch_node = device_get_named_child_node(&client->dev,
+                                                     iqs626_channels[i].name);
+               if (!ch_node)
+                       continue;
+
+               error = iqs626_parse_channel(iqs626, ch_node, i);
+               if (!error)
+                       error = iqs626_parse_ati_target(iqs626, ch_node, i);
+               if (!error)
+                       error = iqs626_parse_events(iqs626, ch_node, i);
+
+               /*
+                * device_get_named_child_node() returns the node with a
+                * reference held; drop it on the error path as well as on
+                * the success path so the reference count stays balanced.
+                */
+               if (error) {
+                       fwnode_handle_put(ch_node);
+                       return error;
+               }
+
+               if (!fwnode_property_present(ch_node, "azoteq,ati-exclude"))
+                       sys_reg->redo_ati |= iqs626_channels[i].active;
+
+               if (!fwnode_property_present(ch_node, "azoteq,reseed-disable"))
+                       sys_reg->reseed |= iqs626_channels[i].active;
+
+               sys_reg->active |= iqs626_channels[i].active;
+
+               fwnode_handle_put(ch_node);
+       }
+
+       general |= IQS626_SYS_SETTINGS_EVENT_MODE;
+
+       /*
+        * Enable streaming during normal-power mode if the trackpad is used to
+        * report raw coordinates instead of gestures. In that case, the device
+        * returns to event mode during low-power mode.
+        */
+       if (sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active &&
+           sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE)
+               general |= IQS626_SYS_SETTINGS_EVENT_MODE_LP;
+
+       general |= IQS626_SYS_SETTINGS_REDO_ATI;
+       general |= IQS626_SYS_SETTINGS_ACK_RESET;
+
+       sys_reg->general = cpu_to_be16(general);
+
+       error = regmap_raw_write(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                &iqs626->sys_reg, sizeof(iqs626->sys_reg));
+       if (error)
+               return error;
+
+       iqs626_irq_wait();
+
+       return 0;
+}
+
+/*
+ * Allocate and describe the input devices exposed by the driver.
+ *
+ * The keypad is always allocated and given a capability for every event that
+ * has a type assigned on an active channel; it is not registered here. If
+ * the trackpad channel is active, a second device is allocated, configured
+ * either as an absolute pointer or as a set of gesture keys, and registered.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or the error from
+ * input_register_device().
+ */
+static int iqs626_input_init(struct iqs626_private *iqs626)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct device *dev = &iqs626->client->dev;
+       struct input_dev *keypad, *trackpad;
+       int ret, ch, ev, gesture;
+
+       iqs626->keypad = devm_input_allocate_device(dev);
+       keypad = iqs626->keypad;
+       if (!keypad)
+               return -ENOMEM;
+
+       keypad->name = "iqs626a_keypad";
+       keypad->id.bustype = BUS_I2C;
+
+       keypad->keycode = iqs626->kp_code;
+       keypad->keycodesize = sizeof(**iqs626->kp_code);
+       keypad->keycodemax = ARRAY_SIZE(iqs626->kp_code);
+
+       /* Advertise each mapped event belonging to an active channel. */
+       for (ch = 0; ch < ARRAY_SIZE(iqs626_channels); ch++) {
+               if (!(sys_reg->active & iqs626_channels[ch].active))
+                       continue;
+
+               for (ev = 0; ev < ARRAY_SIZE(iqs626_events); ev++) {
+                       if (iqs626->kp_type[ch][ev])
+                               input_set_capability(keypad,
+                                                    iqs626->kp_type[ch][ev],
+                                                    iqs626->kp_code[ch][ev]);
+               }
+       }
+
+       /* No trackpad device is needed unless its channel is in use. */
+       if (!(sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active))
+               return 0;
+
+       iqs626->trackpad = devm_input_allocate_device(dev);
+       trackpad = iqs626->trackpad;
+       if (!trackpad)
+               return -ENOMEM;
+
+       trackpad->name = "iqs626a_trackpad";
+       trackpad->id.bustype = BUS_I2C;
+
+       trackpad->keycode = iqs626->tp_code;
+       trackpad->keycodesize = sizeof(*iqs626->tp_code);
+       trackpad->keycodemax = ARRAY_SIZE(iqs626->tp_code);
+
+       if (!(sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE)) {
+               /* Gestures are mapped: expose one key per assigned keycode. */
+               for (gesture = 0; gesture < IQS626_NUM_GESTURES; gesture++) {
+                       if (iqs626->tp_code[gesture] == KEY_RESERVED)
+                               continue;
+
+                       input_set_capability(trackpad, EV_KEY,
+                                            iqs626->tp_code[gesture]);
+               }
+       } else {
+               /*
+                * With no gesture mapped to a keycode, the trackpad is
+                * presented as a traditional pointing device. The X axis
+                * spans 0-255 only if every bit of the second trackpad
+                * channel's mask is active, and 0-128 otherwise.
+                */
+               u8 tp_mask = iqs626_channels[IQS626_CH_TP_3].active;
+               int x_max = (sys_reg->active & tp_mask) == tp_mask ? 255 : 128;
+
+               input_set_capability(trackpad, EV_KEY, BTN_TOUCH);
+               input_set_abs_params(trackpad, ABS_Y, 0, 255, 0, 0);
+               input_set_abs_params(trackpad, ABS_X, 0, x_max, 0, 0);
+
+               touchscreen_parse_properties(trackpad, false, &iqs626->prop);
+       }
+
+       ret = input_register_device(trackpad);
+       if (ret) {
+               dev_err(dev, "Failed to register trackpad: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Read the device's status flags and forward the resulting events to the
+ * keypad and, if its channel is active, the trackpad.
+ *
+ * Returns 0 on success (including when the device is still in ATI, in which
+ * case nothing is reported) or the regmap error that interrupted the report.
+ */
+static int iqs626_report(struct iqs626_private *iqs626)
+{
+       struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg;
+       struct device *dev = &iqs626->client->dev;
+       struct iqs626_flags flags;
+       u8 *dir_mask = &flags.states[IQS626_ST_OFFS_DIR];
+       __le16 hall_output;
+       u16 status;
+       u8 state;
+       int ret, ch, ev, gesture;
+
+       ret = regmap_raw_read(iqs626->regmap, IQS626_SYS_FLAGS, &flags,
+                             sizeof(flags));
+       if (ret) {
+               dev_err(dev, "Failed to read device status: %d\n", ret);
+               return ret;
+       }
+
+       status = be16_to_cpu(flags.system);
+
+       /*
+        * The device resets itself if its own watchdog bites, which can happen
+        * in the event of an I2C communication error. In this case, the device
+        * asserts a SHOW_RESET interrupt and all registers must be restored.
+        */
+       if (status & IQS626_SYS_FLAGS_SHOW_RESET) {
+               dev_err(dev, "Unexpected device reset\n");
+
+               ret = regmap_raw_write(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                      sys_reg, sizeof(*sys_reg));
+               if (ret)
+                       dev_err(dev, "Failed to re-initialize device: %d\n",
+                               ret);
+
+               return ret;
+       }
+
+       if (status & IQS626_SYS_FLAGS_IN_ATI)
+               return 0;
+
+       /*
+        * Unlike the ULP or generic channels, the Hall channel does not have a
+        * direction flag. Instead, the direction (i.e. magnet polarity) can be
+        * derived based on the sign of the 2's complement differential output.
+        */
+       if (sys_reg->active & iqs626_channels[IQS626_CH_HALL].active) {
+               ret = regmap_raw_read(iqs626->regmap, IQS626_HALL_OUTPUT,
+                                     &hall_output, sizeof(hall_output));
+               if (ret) {
+                       dev_err(dev, "Failed to read Hall output: %d\n", ret);
+                       return ret;
+               }
+
+               if (le16_to_cpu(hall_output) < 0x8000)
+                       *dir_mask |= iqs626_channels[IQS626_CH_HALL].active;
+               else
+                       *dir_mask &= ~iqs626_channels[IQS626_CH_HALL].active;
+       }
+
+       for (ch = 0; ch < ARRAY_SIZE(iqs626_channels); ch++) {
+               if (!(sys_reg->active & iqs626_channels[ch].active))
+                       continue;
+
+               for (ev = 0; ev < ARRAY_SIZE(iqs626_events); ev++) {
+                       if (!iqs626->kp_type[ch][ev])
+                               continue;
+
+                       /*
+                        * An event is asserted when its status bit is set for
+                        * this channel and the direction bit agrees with the
+                        * event's dir_up setting.
+                        */
+                       state = flags.states[iqs626_events[ev].st_offs];
+                       if (iqs626_events[ev].dir_up)
+                               state &= *dir_mask;
+                       else
+                               state &= ~(*dir_mask);
+                       state &= iqs626_channels[ch].active;
+
+                       input_event(iqs626->keypad, iqs626->kp_type[ch][ev],
+                                   iqs626->kp_code[ch][ev], !!state);
+               }
+       }
+
+       input_sync(iqs626->keypad);
+
+       /*
+        * The following completion signals that ATI has finished, any initial
+        * switch states have been reported and the keypad can be registered.
+        */
+       complete_all(&iqs626->ati_done);
+
+       if (!(sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active))
+               return 0;
+
+       if (!(sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE)) {
+               /* Gesture mode: one key per bit of the gesture flags. */
+               for (gesture = 0; gesture < IQS626_NUM_GESTURES; gesture++)
+                       input_report_key(iqs626->trackpad,
+                                        iqs626->tp_code[gesture],
+                                        flags.gesture & BIT(gesture));
+
+               if (flags.gesture & GENMASK(IQS626_GESTURE_TAP, 0)) {
+                       input_sync(iqs626->trackpad);
+
+                       /*
+                        * Momentary gestures are followed by a complementary
+                        * release cycle so as to emulate a full keystroke.
+                        */
+                       for (gesture = 0; gesture < IQS626_GESTURE_HOLD;
+                            gesture++)
+                               input_report_key(iqs626->trackpad,
+                                                iqs626->tp_code[gesture], 0);
+               }
+       } else {
+               /* Pointer mode: report contact and, while touched, position. */
+               state = flags.states[IQS626_ST_OFFS_TOUCH];
+               state &= iqs626_channels[IQS626_CH_TP_2].active;
+
+               input_report_key(iqs626->trackpad, BTN_TOUCH, state);
+
+               if (state)
+                       touchscreen_report_pos(iqs626->trackpad, &iqs626->prop,
+                                              flags.trackpad_x,
+                                              flags.trackpad_y, false);
+       }
+
+       input_sync(iqs626->trackpad);
+
+       return 0;
+}
+
+/*
+ * Threaded interrupt handler: report the device's current state and, if that
+ * succeeds, wait before returning. A failed report yields IRQ_NONE without
+ * the wait.
+ */
+static irqreturn_t iqs626_irq(int irq, void *context)
+{
+       struct iqs626_private *iqs626 = context;
+       int ret;
+
+       ret = iqs626_report(iqs626);
+       if (ret)
+               return IRQ_NONE;
+
+       /*
+        * The device does not deassert its interrupt (RDY) pin until shortly
+        * after receiving an I2C stop condition; the following delay ensures
+        * the interrupt handler does not return before this time.
+        */
+       iqs626_irq_wait();
+
+       return IRQ_HANDLED;
+}
+
+/* Register map layout: 8-bit register addresses holding 16-bit values. */
+static const struct regmap_config iqs626_regmap_config = {
+       .max_register = IQS626_MAX_REG,
+       .val_bits = 16,
+       .reg_bits = 8,
+};
+
+/*
+ * Bind the driver to an I2C client: allocate the private state, set up the
+ * register map, verify the product number, apply the firmware properties,
+ * create the input devices, request the interrupt and wait for ATI to finish
+ * before registering the keypad.
+ *
+ * Returns 0 on success or a negative error code from the failing step.
+ */
+static int iqs626_probe(struct i2c_client *client)
+{
+       struct device *dev = &client->dev;
+       struct iqs626_private *iqs626;
+       struct iqs626_ver_info ver_info;
+       unsigned long remaining;
+       int ret;
+
+       iqs626 = devm_kzalloc(dev, sizeof(*iqs626), GFP_KERNEL);
+       if (!iqs626)
+               return -ENOMEM;
+
+       iqs626->client = client;
+       i2c_set_clientdata(client, iqs626);
+
+       iqs626->regmap = devm_regmap_init_i2c(client, &iqs626_regmap_config);
+       if (IS_ERR(iqs626->regmap)) {
+               ret = PTR_ERR(iqs626->regmap);
+               dev_err(dev, "Failed to initialize register map: %d\n", ret);
+               return ret;
+       }
+
+       init_completion(&iqs626->ati_done);
+
+       ret = regmap_raw_read(iqs626->regmap, IQS626_VER_INFO, &ver_info,
+                             sizeof(ver_info));
+       if (ret)
+               return ret;
+
+       if (ver_info.prod_num != IQS626_VER_INFO_PROD_NUM) {
+               dev_err(dev, "Unrecognized product number: 0x%02X\n",
+                       ver_info.prod_num);
+               return -EINVAL;
+       }
+
+       ret = iqs626_parse_prop(iqs626);
+       if (ret)
+               return ret;
+
+       ret = iqs626_input_init(iqs626);
+       if (ret)
+               return ret;
+
+       ret = devm_request_threaded_irq(dev, client->irq, NULL, iqs626_irq,
+                                       IRQF_ONESHOT, client->name, iqs626);
+       if (ret) {
+               dev_err(dev, "Failed to request IRQ: %d\n", ret);
+               return ret;
+       }
+
+       /* The interrupt handler completes ati_done after its first report. */
+       remaining = wait_for_completion_timeout(&iqs626->ati_done,
+                                               msecs_to_jiffies(2000));
+       if (!remaining) {
+               dev_err(dev, "Failed to complete ATI\n");
+               return -ETIMEDOUT;
+       }
+
+       /*
+        * The keypad may include one or more switches and is not registered
+        * until ATI is complete and the initial switch states are read.
+        */
+       ret = input_register_device(iqs626->keypad);
+       if (ret) {
+               dev_err(dev, "Failed to register keypad: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int __maybe_unused iqs626_suspend(struct device *dev)
+{
+       struct iqs626_private *iqs626 = dev_get_drvdata(dev);
+       struct i2c_client *client = iqs626->client;
+       unsigned int val;
+       int error;
+
+       if (!iqs626->suspend_mode)
+               return 0;
+
+       disable_irq(client->irq);
+
+       /*
+        * Automatic power mode switching must be disabled before the device is
+        * forced into any particular power mode. In this case, the device will
+        * transition into normal-power mode.
+        */
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_DIS_AUTO, ~0);
+       if (error)
+               goto err_irq;
+
+       /*
+        * The following check ensures the device has completed its transition
+        * into normal-power mode before a manual mode switch is performed.
+        */
+       error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val,
+                                       !(val & IQS626_SYS_FLAGS_PWR_MODE_MASK),
+                                        IQS626_PWR_MODE_POLL_SLEEP_US,
+                                        IQS626_PWR_MODE_POLL_TIMEOUT_US);
+       if (error)
+               goto err_irq;
+
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_PWR_MODE_MASK,
+                                  iqs626->suspend_mode <<
+                                  IQS626_SYS_SETTINGS_PWR_MODE_SHIFT);
+       if (error)
+               goto err_irq;
+
+       /*
+        * This last check ensures the device has completed its transition into
+        * the desired power mode to prevent any spurious interrupts from being
+        * triggered after iqs626_suspend has already returned.
+        */
+       error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val,
+                                        (val & IQS626_SYS_FLAGS_PWR_MODE_MASK)
+                                        == (iqs626->suspend_mode <<
+                                            IQS626_SYS_FLAGS_PWR_MODE_SHIFT),
+                                        IQS626_PWR_MODE_POLL_SLEEP_US,
+                                        IQS626_PWR_MODE_POLL_TIMEOUT_US);
+
+err_irq:
+       iqs626_irq_wait();
+       enable_irq(client->irq);
+
+       return error;
+}
+
+static int __maybe_unused iqs626_resume(struct device *dev)
+{
+       struct iqs626_private *iqs626 = dev_get_drvdata(dev);
+       struct i2c_client *client = iqs626->client;
+       unsigned int val;
+       int error;
+
+       if (!iqs626->suspend_mode)
+               return 0;
+
+       disable_irq(client->irq);
+
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_PWR_MODE_MASK, 0);
+       if (error)
+               goto err_irq;
+
+       /*
+        * This check ensures the device has returned to normal-power mode
+        * before automatic power mode switching is re-enabled.
+        */
+       error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val,
+                                       !(val & IQS626_SYS_FLAGS_PWR_MODE_MASK),
+                                        IQS626_PWR_MODE_POLL_SLEEP_US,
+                                        IQS626_PWR_MODE_POLL_TIMEOUT_US);
+       if (error)
+               goto err_irq;
+
+       error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS,
+                                  IQS626_SYS_SETTINGS_DIS_AUTO, 0);
+       if (error)
+               goto err_irq;
+
+       /*
+        * This step reports any events that may have been "swallowed" as a
+        * result of polling PWR_MODE (which automatically acknowledges any
+        * pending interrupts).
+        */
+       error = iqs626_report(iqs626);
+
+err_irq:
+       iqs626_irq_wait();
+       enable_irq(client->irq);
+
+       return error;
+}
+
+static SIMPLE_DEV_PM_OPS(iqs626_pm, iqs626_suspend, iqs626_resume);
+
+static const struct of_device_id iqs626_of_match[] = {
+       { .compatible = "azoteq,iqs626a" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, iqs626_of_match);
+
+static struct i2c_driver iqs626_i2c_driver = {
+       .driver = {
+               .name = "iqs626a",
+               .of_match_table = iqs626_of_match,
+               .pm = &iqs626_pm,
+       },
+       .probe_new = iqs626_probe,
+};
+module_i2c_driver(iqs626_i2c_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS626A Capacitive Touch Controller");
+MODULE_LICENSE("GPL");
index 20ff087..cd5e99e 100644 (file)
@@ -61,15 +61,10 @@ static int max8997_haptic_set_duty_cycle(struct max8997_haptic *chip)
                unsigned int duty = chip->pwm_period * chip->level / 100;
                ret = pwm_config(chip->pwm, duty, chip->pwm_period);
        } else {
-               int i;
                u8 duty_index = 0;
 
-               for (i = 0; i <= 64; i++) {
-                       if (chip->level <= i * 100 / 64) {
-                               duty_index = i;
-                               break;
-                       }
-               }
+               duty_index = DIV_ROUND_UP(chip->level * 64, 100);
+
                switch (chip->internal_mode_pattern) {
                case 0:
                        max8997_write_reg(chip->client,
index e12da5b..dc4a240 100644 (file)
 #define ETP_FW_PAGE_SIZE_512   512
 #define ETP_FW_SIGNATURE_SIZE  6
 
+#define ETP_PRODUCT_ID_DELBIN  0x00C2
+#define ETP_PRODUCT_ID_VOXEL   0x00BF
+#define ETP_PRODUCT_ID_MAGPIE  0x0120
+#define ETP_PRODUCT_ID_BOBBA   0x0121
+
 struct i2c_client;
 struct completion;
 
@@ -73,7 +78,7 @@ struct elan_transport_ops {
        int (*calibrate_result)(struct i2c_client *client, u8 *val);
 
        int (*get_baseline_data)(struct i2c_client *client,
-                                bool max_baseliune, u8 *value);
+                                bool max_baseline, u8 *value);
 
        int (*get_version)(struct i2c_client *client, u8 pattern, bool iap,
                           u8 *version);
index bef7382..dad22c1 100644 (file)
@@ -46,6 +46,9 @@
 #define ETP_FINGER_WIDTH       15
 #define ETP_RETRY_COUNT                3
 
+/* quirks to control the device */
+#define ETP_QUIRK_QUICK_WAKEUP BIT(0)
+
 /* The main device structure */
 struct elan_tp_data {
        struct i2c_client       *client;
@@ -90,8 +93,38 @@ struct elan_tp_data {
        bool                    baseline_ready;
        u8                      clickpad;
        bool                    middle_button;
+
+       u32                     quirks;         /* Various quirks */
 };
 
+static u32 elan_i2c_lookup_quirks(u16 ic_type, u16 product_id)
+{
+       static const struct {
+               u16 ic_type;
+               u16 product_id;
+               u32 quirks;
+       } elan_i2c_quirks[] = {
+               { 0x0D, ETP_PRODUCT_ID_DELBIN, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x10, ETP_PRODUCT_ID_VOXEL, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x14, ETP_PRODUCT_ID_MAGPIE, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x14, ETP_PRODUCT_ID_BOBBA, ETP_QUIRK_QUICK_WAKEUP },
+       };
+       u32 quirks = 0;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(elan_i2c_quirks); i++) {
+               if (elan_i2c_quirks[i].ic_type == ic_type &&
+                   elan_i2c_quirks[i].product_id == product_id) {
+                       quirks = elan_i2c_quirks[i].quirks;
+               }
+       }
+
+       if (ic_type >= 0x0D && product_id >= 0x123)
+               quirks |= ETP_QUIRK_QUICK_WAKEUP;
+
+       return quirks;
+}
+
 static int elan_get_fwinfo(u16 ic_type, u8 iap_version, u16 *validpage_count,
                           u32 *signature_address, u16 *page_size)
 {
@@ -258,16 +291,18 @@ static int elan_check_ASUS_special_fw(struct elan_tp_data *data)
        return false;
 }
 
-static int __elan_initialize(struct elan_tp_data *data)
+static int __elan_initialize(struct elan_tp_data *data, bool skip_reset)
 {
        struct i2c_client *client = data->client;
        bool woken_up = false;
        int error;
 
-       error = data->ops->initialize(client);
-       if (error) {
-               dev_err(&client->dev, "device initialize failed: %d\n", error);
-               return error;
+       if (!skip_reset) {
+               error = data->ops->initialize(client);
+               if (error) {
+                       dev_err(&client->dev, "device initialize failed: %d\n", error);
+                       return error;
+               }
        }
 
        error = elan_query_product(data);
@@ -311,16 +346,17 @@ static int __elan_initialize(struct elan_tp_data *data)
        return 0;
 }
 
-static int elan_initialize(struct elan_tp_data *data)
+static int elan_initialize(struct elan_tp_data *data, bool skip_reset)
 {
        int repeat = ETP_RETRY_COUNT;
        int error;
 
        do {
-               error = __elan_initialize(data);
+               error = __elan_initialize(data, skip_reset);
                if (!error)
                        return 0;
 
+               skip_reset = false;
                msleep(30);
        } while (--repeat > 0);
 
@@ -357,6 +393,8 @@ static int elan_query_device_info(struct elan_tp_data *data)
        if (error)
                return error;
 
+       data->quirks = elan_i2c_lookup_quirks(data->ic_type, data->product_id);
+
        error = elan_get_fwinfo(data->ic_type, data->iap_version,
                                &data->fw_validpage_count,
                                &data->fw_signature_address,
@@ -546,7 +584,7 @@ static int elan_update_firmware(struct elan_tp_data *data,
                data->ops->iap_reset(client);
        } else {
                /* Reinitialize TP after fw is updated */
-               elan_initialize(data);
+               elan_initialize(data, false);
                elan_query_device_info(data);
        }
 
@@ -1247,7 +1285,7 @@ static int elan_probe(struct i2c_client *client,
        }
 
        /* Initialize the touchpad. */
-       error = elan_initialize(data);
+       error = elan_initialize(data, false);
        if (error)
                return error;
 
@@ -1384,7 +1422,7 @@ static int __maybe_unused elan_resume(struct device *dev)
                goto err;
        }
 
-       error = elan_initialize(data);
+       error = elan_initialize(data, data->quirks & ETP_QUIRK_QUICK_WAKEUP);
        if (error)
                dev_err(dev, "initialize when resuming failed: %d\n", error);
 
index 594ac4e..974d7bf 100644 (file)
@@ -103,7 +103,6 @@ static int apbps2_open(struct serio *io)
 {
        struct apbps2_priv *priv = io->port_data;
        int limit;
-       unsigned long tmp;
 
        /* clear error flags */
        iowrite32be(0, &priv->regs->status);
@@ -111,7 +110,7 @@ static int apbps2_open(struct serio *io)
        /* Clear old data if available (unlikely) */
        limit = 1024;
        while ((ioread32be(&priv->regs->status) & APBPS2_STATUS_DR) && --limit)
-               tmp = ioread32be(&priv->regs->data);
+               ioread32be(&priv->regs->data);
 
        /* Enable reciever and it's interrupt */
        iowrite32be(APBPS2_CTRL_RE | APBPS2_CTRL_RI, &priv->regs->ctrl);
diff --git a/drivers/input/touchscreen.c b/drivers/input/touchscreen.c
new file mode 100644 (file)
index 0000000..dd18cb9
--- /dev/null
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Generic helper functions for touchscreens and other two-dimensional
+ *  pointing devices
+ *
+ *  Copyright (c) 2014 Sebastian Reichel <sre@kernel.org>
+ */
+
+#include <linux/property.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touchscreen.h>
+#include <linux/module.h>
+
+static bool touchscreen_get_prop_u32(struct device *dev,
+                                    const char *property,
+                                    unsigned int default_value,
+                                    unsigned int *value)
+{
+       u32 val;
+       int error;
+
+       error = device_property_read_u32(dev, property, &val);
+       if (error) {
+               *value = default_value;
+               return false;
+       }
+
+       *value = val;
+       return true;
+}
+
+static void touchscreen_set_params(struct input_dev *dev,
+                                  unsigned long axis,
+                                  int min, int max, int fuzz)
+{
+       struct input_absinfo *absinfo;
+
+       if (!test_bit(axis, dev->absbit)) {
+               dev_warn(&dev->dev,
+                        "Parameters are specified but the axis %lu is not set up\n",
+                        axis);
+               return;
+       }
+
+       absinfo = &dev->absinfo[axis];
+       absinfo->minimum = min;
+       absinfo->maximum = max;
+       absinfo->fuzz = fuzz;
+}
+
+/**
+ * touchscreen_parse_properties - parse common touchscreen properties
+ * @input: input device that should be parsed
+ * @multitouch: specifies whether parsed properties should be applied to
+ *     single-touch or multi-touch axes
+ * @prop: pointer to a struct touchscreen_properties into which to store
+ *     axis swap and invert info for use with touchscreen_report_x_y();
+ *     or %NULL
+ *
+ * This function parses common properties for touchscreens and sets up the
+ * input device accordingly. The function keeps previously set up default
+ * values if no value is specified.
+ */
+void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
+                                 struct touchscreen_properties *prop)
+{
+       struct device *dev = input->dev.parent;
+       struct input_absinfo *absinfo;
+       unsigned int axis, axis_x, axis_y;
+       unsigned int minimum, maximum, fuzz;
+       bool data_present;
+
+       input_alloc_absinfo(input);
+       if (!input->absinfo)
+               return;
+
+       axis_x = multitouch ? ABS_MT_POSITION_X : ABS_X;
+       axis_y = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
+
+       data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-x",
+                                               input_abs_get_min(input, axis_x),
+                                               &minimum) |
+                      touchscreen_get_prop_u32(dev, "touchscreen-size-x",
+                                               input_abs_get_max(input,
+                                                                 axis_x) + 1,
+                                               &maximum) |
+                      touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x",
+                                               input_abs_get_fuzz(input, axis_x),
+                                               &fuzz);
+       if (data_present)
+               touchscreen_set_params(input, axis_x, minimum, maximum - 1, fuzz);
+
+       data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-y",
+                                               input_abs_get_min(input, axis_y),
+                                               &minimum) |
+                      touchscreen_get_prop_u32(dev, "touchscreen-size-y",
+                                               input_abs_get_max(input,
+                                                                 axis_y) + 1,
+                                               &maximum) |
+                      touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y",
+                                               input_abs_get_fuzz(input, axis_y),
+                                               &fuzz);
+       if (data_present)
+               touchscreen_set_params(input, axis_y, minimum, maximum - 1, fuzz);
+
+       axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
+       data_present = touchscreen_get_prop_u32(dev,
+                                               "touchscreen-max-pressure",
+                                               input_abs_get_max(input, axis),
+                                               &maximum) |
+                      touchscreen_get_prop_u32(dev,
+                                               "touchscreen-fuzz-pressure",
+                                               input_abs_get_fuzz(input, axis),
+                                               &fuzz);
+       if (data_present)
+               touchscreen_set_params(input, axis, 0, maximum, fuzz);
+
+       if (!prop)
+               return;
+
+       prop->max_x = input_abs_get_max(input, axis_x);
+       prop->max_y = input_abs_get_max(input, axis_y);
+
+       prop->invert_x =
+               device_property_read_bool(dev, "touchscreen-inverted-x");
+       if (prop->invert_x) {
+               absinfo = &input->absinfo[axis_x];
+               absinfo->maximum -= absinfo->minimum;
+               absinfo->minimum = 0;
+       }
+
+       prop->invert_y =
+               device_property_read_bool(dev, "touchscreen-inverted-y");
+       if (prop->invert_y) {
+               absinfo = &input->absinfo[axis_y];
+               absinfo->maximum -= absinfo->minimum;
+               absinfo->minimum = 0;
+       }
+
+       prop->swap_x_y =
+               device_property_read_bool(dev, "touchscreen-swapped-x-y");
+       if (prop->swap_x_y)
+               swap(input->absinfo[axis_x], input->absinfo[axis_y]);
+}
+EXPORT_SYMBOL(touchscreen_parse_properties);
+
+static void
+touchscreen_apply_prop_to_x_y(const struct touchscreen_properties *prop,
+                             unsigned int *x, unsigned int *y)
+{
+       if (prop->invert_x)
+               *x = prop->max_x - *x;
+
+       if (prop->invert_y)
+               *y = prop->max_y - *y;
+
+       if (prop->swap_x_y)
+               swap(*x, *y);
+}
+
+/**
+ * touchscreen_set_mt_pos - Set input_mt_pos coordinates
+ * @pos: input_mt_pos to set coordinates of
+ * @prop: pointer to a struct touchscreen_properties
+ * @x: X coordinate to store in pos
+ * @y: Y coordinate to store in pos
+ *
+ * Adjust the passed in x and y values applying any axis inversion and
+ * swapping requested in the passed in touchscreen_properties and store
+ * the result in a struct input_mt_pos.
+ */
+void touchscreen_set_mt_pos(struct input_mt_pos *pos,
+                           const struct touchscreen_properties *prop,
+                           unsigned int x, unsigned int y)
+{
+       touchscreen_apply_prop_to_x_y(prop, &x, &y);
+       pos->x = x;
+       pos->y = y;
+}
+EXPORT_SYMBOL(touchscreen_set_mt_pos);
+
+/**
+ * touchscreen_report_pos - Report touchscreen coordinates
+ * @input: input_device to report coordinates for
+ * @prop: pointer to a struct touchscreen_properties
+ * @x: X coordinate to report
+ * @y: Y coordinate to report
+ * @multitouch: Report coordinates on single-touch or multi-touch axes
+ *
+ * Adjust the passed in x and y values applying any axis inversion and
+ * swapping requested in the passed in touchscreen_properties and then
+ * report the resulting coordinates on the input_dev's x and y axis.
+ */
+void touchscreen_report_pos(struct input_dev *input,
+                           const struct touchscreen_properties *prop,
+                           unsigned int x, unsigned int y,
+                           bool multitouch)
+{
+       touchscreen_apply_prop_to_x_y(prop, &x, &y);
+       input_report_abs(input, multitouch ? ABS_MT_POSITION_X : ABS_X, x);
+       input_report_abs(input, multitouch ? ABS_MT_POSITION_Y : ABS_Y, y);
+}
+EXPORT_SYMBOL(touchscreen_report_pos);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Helper functions for touchscreens and other devices");
index 529614d..ad454cd 100644 (file)
@@ -12,10 +12,6 @@ menuconfig INPUT_TOUCHSCREEN
 
 if INPUT_TOUCHSCREEN
 
-config TOUCHSCREEN_PROPERTIES
-       def_tristate INPUT
-       depends on INPUT
-
 config TOUCHSCREEN_88PM860X
        tristate "Marvell 88PM860x touchscreen"
        depends on MFD_88PM860X
@@ -415,6 +411,17 @@ config TOUCHSCREEN_HIDEEP
          To compile this driver as a module, choose M here : the
          module will be called hideep_ts.
 
+config TOUCHSCREEN_HYCON_HY46XX
+       tristate "Hycon hy46xx touchscreen support"
+       depends on I2C
+       help
+         Say Y here if you have a touchscreen using Hycon hy46xx.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called hycon-hy46xx.
+
 config TOUCHSCREEN_ILI210X
        tristate "Ilitek ILI210X based touchscreen"
        depends on I2C
@@ -430,6 +437,18 @@ config TOUCHSCREEN_ILI210X
          To compile this driver as a module, choose M here: the
          module will be called ili210x.
 
+config TOUCHSCREEN_ILITEK
+       tristate "Ilitek I2C 213X/23XX/25XX/Lego Series Touch ICs"
+       depends on I2C
+       help
+         Say Y here if you have a touchscreen with an ILITEK touch IC.
+         This driver supports the 213X/23XX/25XX and other Lego series.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called ilitek_ts_i2c.
+
 config TOUCHSCREEN_IPROC
        tristate "IPROC touch panel driver support"
        depends on ARCH_BCM_IPROC || COMPILE_TEST
@@ -594,6 +613,18 @@ config TOUCHSCREEN_MELFAS_MIP4
          To compile this driver as a module, choose M here:
          the module will be called melfas_mip4.
 
+config TOUCHSCREEN_MSG2638
+       tristate "MStar msg2638 touchscreen support"
+       depends on I2C
+       depends on GPIOLIB || COMPILE_TEST
+       help
+         Say Y here if you have an I2C touchscreen using MStar msg2638.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called msg2638.
+
 config TOUCHSCREEN_MTOUCH
        tristate "MicroTouch serial touchscreens"
        select SERIO
index 6233541..7d34100 100644 (file)
@@ -7,7 +7,6 @@
 
 wm97xx-ts-y := wm97xx-core.o
 
-obj-$(CONFIG_TOUCHSCREEN_PROPERTIES)   += of_touchscreen.o
 obj-$(CONFIG_TOUCHSCREEN_88PM860X)     += 88pm860x-ts.o
 obj-$(CONFIG_TOUCHSCREEN_AD7877)       += ad7877.o
 obj-$(CONFIG_TOUCHSCREEN_AD7879)       += ad7879.o
@@ -35,6 +34,7 @@ obj-$(CONFIG_TOUCHSCREEN_DA9052)      += da9052_tsi.o
 obj-$(CONFIG_TOUCHSCREEN_DYNAPRO)      += dynapro.o
 obj-$(CONFIG_TOUCHSCREEN_EDT_FT5X06)   += edt-ft5x06.o
 obj-$(CONFIG_TOUCHSCREEN_HAMPSHIRE)    += hampshire.o
+obj-$(CONFIG_TOUCHSCREEN_HYCON_HY46XX) += hycon-hy46xx.o
 obj-$(CONFIG_TOUCHSCREEN_GUNZE)                += gunze.o
 obj-$(CONFIG_TOUCHSCREEN_EETI)         += eeti_ts.o
 obj-$(CONFIG_TOUCHSCREEN_EKTF2127)     += ektf2127.o
@@ -47,6 +47,7 @@ obj-$(CONFIG_TOUCHSCREEN_FUJITSU)     += fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_GOODIX)       += goodix.o
 obj-$(CONFIG_TOUCHSCREEN_HIDEEP)       += hideep.o
 obj-$(CONFIG_TOUCHSCREEN_ILI210X)      += ili210x.o
+obj-$(CONFIG_TOUCHSCREEN_ILITEK)       += ilitek_ts_i2c.o
 obj-$(CONFIG_TOUCHSCREEN_IMX6UL_TSC)   += imx6ul_tsc.o
 obj-$(CONFIG_TOUCHSCREEN_INEXIO)       += inexio.o
 obj-$(CONFIG_TOUCHSCREEN_IPROC)                += bcm_iproc_tsc.o
@@ -59,6 +60,7 @@ obj-$(CONFIG_TOUCHSCREEN_MCS5000)     += mcs5000_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MELFAS_MIP4)  += melfas_mip4.o
 obj-$(CONFIG_TOUCHSCREEN_MIGOR)                += migor_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MMS114)       += mms114.o
+obj-$(CONFIG_TOUCHSCREEN_MSG2638)      += msg2638.o
 obj-$(CONFIG_TOUCHSCREEN_MTOUCH)       += mtouch.o
 obj-$(CONFIG_TOUCHSCREEN_MK712)                += mk712.o
 obj-$(CONFIG_TOUCHSCREEN_HP600)                += hp680_ts_input.o
index c0d5c24..dc6a853 100644 (file)
@@ -125,7 +125,7 @@ static int ar1021_i2c_probe(struct i2c_client *client,
 
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, ar1021_i2c_irq,
-                                         IRQF_ONESHOT,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          "ar1021_i2c", ar1021);
        if (error) {
                dev_err(&client->dev,
@@ -133,9 +133,6 @@ static int ar1021_i2c_probe(struct i2c_client *client,
                return error;
        }
 
-       /* Disable the IRQ, we'll enable it in ar1021_i2c_open() */
-       disable_irq(client->irq);
-
        error = input_register_device(ar1021->input);
        if (error) {
                dev_err(&client->dev,
index 383a848..05de92c 100644 (file)
@@ -31,6 +31,7 @@
 #include <media/v4l2-ioctl.h>
 #include <media/videobuf2-v4l2.h>
 #include <media/videobuf2-vmalloc.h>
+#include <dt-bindings/input/atmel-maxtouch.h>
 
 /* Firmware files */
 #define MXT_FW_NAME            "maxtouch.fw"
@@ -199,6 +200,7 @@ enum t100_type {
 #define MXT_CRC_TIMEOUT                1000    /* msec */
 #define MXT_FW_RESET_TIME      3000    /* msec */
 #define MXT_FW_CHG_TIMEOUT     300     /* msec */
+#define MXT_WAKEUP_TIME                25      /* msec */
 
 /* Command to unlock bootloader */
 #define MXT_UNLOCK_CMD_MSB     0xaa
@@ -312,6 +314,7 @@ struct mxt_data {
        struct mxt_dbg dbg;
        struct regulator_bulk_data regulators[2];
        struct gpio_desc *reset_gpio;
+       struct gpio_desc *wake_gpio;
        bool use_retrigen_workaround;
 
        /* Cached parameters from object table */
@@ -342,6 +345,8 @@ struct mxt_data {
        unsigned int t19_num_keys;
 
        enum mxt_suspend_mode suspend_mode;
+
+       u32 wakeup_method;
 };
 
 struct mxt_vb2_buffer {
@@ -621,10 +626,42 @@ static int mxt_send_bootloader_cmd(struct mxt_data *data, bool unlock)
        return mxt_bootloader_write(data, buf, sizeof(buf));
 }
 
+static bool mxt_wakeup_toggle(struct i2c_client *client,
+                             bool wake_up, bool in_i2c)
+{
+       struct mxt_data *data = i2c_get_clientdata(client);
+
+       switch (data->wakeup_method) {
+       case ATMEL_MXT_WAKEUP_I2C_SCL:
+               if (!in_i2c)
+                       return false;
+               break;
+
+       case ATMEL_MXT_WAKEUP_GPIO:
+               if (in_i2c)
+                       return false;
+
+               gpiod_set_value(data->wake_gpio, wake_up);
+               break;
+
+       default:
+               return false;
+       }
+
+       if (wake_up) {
+               dev_dbg(&client->dev, "waking up controller\n");
+
+               msleep(MXT_WAKEUP_TIME);
+       }
+
+       return true;
+}
+
 static int __mxt_read_reg(struct i2c_client *client,
                               u16 reg, u16 len, void *val)
 {
        struct i2c_msg xfer[2];
+       bool retried = false;
        u8 buf[2];
        int ret;
 
@@ -643,9 +680,13 @@ static int __mxt_read_reg(struct i2c_client *client,
        xfer[1].len = len;
        xfer[1].buf = val;
 
+retry:
        ret = i2c_transfer(client->adapter, xfer, 2);
        if (ret == 2) {
                ret = 0;
+       } else if (!retried && mxt_wakeup_toggle(client, true, true)) {
+               retried = true;
+               goto retry;
        } else {
                if (ret >= 0)
                        ret = -EIO;
@@ -659,6 +700,7 @@ static int __mxt_read_reg(struct i2c_client *client,
 static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len,
                           const void *val)
 {
+       bool retried = false;
        u8 *buf;
        size_t count;
        int ret;
@@ -672,9 +714,13 @@ static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len,
        buf[1] = (reg >> 8) & 0xff;
        memcpy(&buf[2], val, len);
 
+retry:
        ret = i2c_master_send(client, buf, count);
        if (ret == count) {
                ret = 0;
+       } else if (!retried && mxt_wakeup_toggle(client, true, true)) {
+               retried = true;
+               goto retry;
        } else {
                if (ret >= 0)
                        ret = -EIO;
@@ -2975,6 +3021,8 @@ static const struct attribute_group mxt_attr_group = {
 
 static void mxt_start(struct mxt_data *data)
 {
+       mxt_wakeup_toggle(data->client, true, false);
+
        switch (data->suspend_mode) {
        case MXT_SUSPEND_T9_CTRL:
                mxt_soft_reset(data);
@@ -3009,6 +3057,8 @@ static void mxt_stop(struct mxt_data *data)
                mxt_set_t7_power_cfg(data, MXT_POWER_CFG_DEEPSLEEP);
                break;
        }
+
+       mxt_wakeup_toggle(data->client, false, false);
 }
 
 static int mxt_input_open(struct input_dev *dev)
@@ -3155,16 +3205,24 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id)
                return error;
        }
 
+       /* Request the WAKE line as asserted so we go out of sleep */
+       data->wake_gpio = devm_gpiod_get_optional(&client->dev,
+                                                 "wake", GPIOD_OUT_HIGH);
+       if (IS_ERR(data->wake_gpio)) {
+               error = PTR_ERR(data->wake_gpio);
+               dev_err(&client->dev, "Failed to get wake gpio: %d\n", error);
+               return error;
+       }
+
        error = devm_request_threaded_irq(&client->dev, client->irq,
-                                         NULL, mxt_interrupt, IRQF_ONESHOT,
+                                         NULL, mxt_interrupt,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          client->name, data);
        if (error) {
                dev_err(&client->dev, "Failed to register interrupt\n");
                return error;
        }
 
-       disable_irq(client->irq);
-
        error = regulator_bulk_enable(ARRAY_SIZE(data->regulators),
                                      data->regulators);
        if (error) {
@@ -3185,6 +3243,25 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id)
                msleep(MXT_RESET_INVALID_CHG);
        }
 
+       /*
+        * Controllers like mXT1386 have a dedicated WAKE line that can be
+        * connected to a GPIO or the I2C SCL pin, or permanently asserted low.
+        *
+        * The WAKE line is used to wake the controller from deep sleep, and
+        * it must be asserted low for 25 milliseconds before the controller
+        * can accept I2C transfers if it was in deep-sleep mode.
+        * The controller will go to sleep automatically after 2 seconds of
+        * inactivity if the WAKE line is deasserted and deep sleep is enabled.
+        *
+        * If the WAKE line is connected to the I2C SCL pin, the first I2C
+        * transfer gets an instant NAK and must be retried after 25ms.
+        *
+        * If the WAKE line is connected to a GPIO, the line must be asserted
+        * 25ms before the host attempts to communicate with the controller.
+        */
+       device_property_read_u32(&client->dev, "atmel,wakeup-method",
+                                &data->wakeup_method);
+
        error = mxt_initialize(data);
        if (error)
                goto err_disable_regulators;
index 341925e..392950a 100644 (file)
@@ -401,10 +401,10 @@ static int bu21029_probe(struct i2c_client *client,
 
        input_set_drvdata(in_dev, bu21029);
 
-       irq_set_status_flags(client->irq, IRQ_NOAUTOEN);
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, bu21029_touch_soft_irq,
-                                         IRQF_ONESHOT, DRIVER_NAME, bu21029);
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         DRIVER_NAME, bu21029);
        if (error) {
                dev_err(&client->dev,
                        "unable to request touch irq: %d\n", error);
index 73c854f..106dd49 100644 (file)
@@ -229,16 +229,21 @@ static int cyttsp_set_sysinfo_regs(struct cyttsp *ts)
 static void cyttsp_hard_reset(struct cyttsp *ts)
 {
        if (ts->reset_gpio) {
+               /*
+                * According to the CY8CTMA340 datasheet page 21, the external
+                * reset pulse width should be >= 1 ms. The datasheet does not
+                * specify how long we have to wait after reset but a vendor
+                * tree specifies 5 ms here.
+                */
                gpiod_set_value_cansleep(ts->reset_gpio, 1);
-               msleep(CY_DELAY_DFLT);
+               usleep_range(1000, 2000);
                gpiod_set_value_cansleep(ts->reset_gpio, 0);
-               msleep(CY_DELAY_DFLT);
+               usleep_range(5000, 6000);
        }
 }
 
 static int cyttsp_soft_reset(struct cyttsp *ts)
 {
-       unsigned long timeout;
        int retval;
 
        /* wait for interrupt to set ready completion */
@@ -248,12 +253,16 @@ static int cyttsp_soft_reset(struct cyttsp *ts)
        enable_irq(ts->irq);
 
        retval = ttsp_send_command(ts, CY_SOFT_RESET_MODE);
-       if (retval)
+       if (retval) {
+               dev_err(ts->dev, "failed to send soft reset\n");
                goto out;
+       }
 
-       timeout = wait_for_completion_timeout(&ts->bl_ready,
-                       msecs_to_jiffies(CY_DELAY_DFLT * CY_DELAY_MAX));
-       retval = timeout ? 0 : -EIO;
+       if (!wait_for_completion_timeout(&ts->bl_ready,
+                       msecs_to_jiffies(CY_DELAY_DFLT * CY_DELAY_MAX))) {
+               dev_err(ts->dev, "timeout waiting for soft reset\n");
+               retval = -EIO;
+       }
 
 out:
        ts->state = CY_IDLE_STATE;
@@ -405,8 +414,10 @@ static int cyttsp_power_on(struct cyttsp *ts)
        if (GET_BOOTLOADERMODE(ts->bl_data.bl_status) &&
            IS_VALID_APP(ts->bl_data.bl_status)) {
                error = cyttsp_exit_bl_mode(ts);
-               if (error)
+               if (error) {
+                       dev_err(ts->dev, "failed to exit bootloader mode\n");
                        return error;
+               }
        }
 
        if (GET_HSTMODE(ts->bl_data.bl_file) != CY_OPERATE_MODE ||
@@ -629,10 +640,8 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops,
                return ERR_PTR(error);
 
        init_completion(&ts->bl_ready);
-       snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(dev));
 
        input_dev->name = "Cypress TTSP TouchScreen";
-       input_dev->phys = ts->phys;
        input_dev->id.bustype = bus_ops->bustype;
        input_dev->dev.parent = ts->dev;
 
@@ -643,16 +652,20 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops,
 
        input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_X);
        input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_Y);
+       /* One byte for width 0..255 so this is the limit */
+       input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, 255, 0, 0);
+
        touchscreen_parse_properties(input_dev, true, NULL);
 
-       error = input_mt_init_slots(input_dev, CY_MAX_ID, 0);
+       error = input_mt_init_slots(input_dev, CY_MAX_ID, INPUT_MT_DIRECT);
        if (error) {
                dev_err(dev, "Unable to init MT slots.\n");
                return ERR_PTR(error);
        }
 
        error = devm_request_threaded_irq(dev, ts->irq, NULL, cyttsp_irq,
-                                         IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+                                         IRQF_TRIGGER_FALLING | IRQF_ONESHOT |
+                                         IRQF_NO_AUTOEN,
                                          "cyttsp", ts);
        if (error) {
                dev_err(ts->dev, "failed to request IRQ %d, err: %d\n",
@@ -660,8 +673,6 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops,
                return ERR_PTR(error);
        }
 
-       disable_irq(ts->irq);
-
        cyttsp_hard_reset(ts);
 
        error = cyttsp_power_on(ts);
index 8c65133..9bc4fe7 100644 (file)
@@ -114,7 +114,6 @@ struct cyttsp {
        struct device *dev;
        int irq;
        struct input_dev *input;
-       char phys[32];
        const struct cyttsp_bus_ops *bus_ops;
        struct cyttsp_bootloader_data bl_data;
        struct cyttsp_sysinfo_data sysinfo_data;
index 5f7706f..17540bd 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/of.h>
 #include <linux/gpio/consumer.h>
 #include <linux/regulator/consumer.h>
+#include <linux/uuid.h>
 #include <asm/unaligned.h>
 
 /* Device, Driver information */
@@ -1334,6 +1335,40 @@ static void elants_i2c_power_off(void *_data)
        }
 }
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id i2c_hid_ids[] = {
+       {"ACPI0C50", 0 },
+       {"PNP0C50", 0 },
+       { },
+};
+
+static const guid_t i2c_hid_guid =
+       GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555,
+                 0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE);
+
+static bool elants_acpi_is_hid_device(struct device *dev)
+{
+       acpi_handle handle = ACPI_HANDLE(dev);
+       union acpi_object *obj;
+
+       if (acpi_match_device_ids(ACPI_COMPANION(dev), i2c_hid_ids))
+               return false;
+
+       obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL, ACPI_TYPE_INTEGER);
+       if (obj) {
+               ACPI_FREE(obj);
+               return true;
+       }
+
+       return false;
+}
+#else
+static bool elants_acpi_is_hid_device(struct device *dev)
+{
+       return false;
+}
+#endif
+
 static int elants_i2c_probe(struct i2c_client *client,
                            const struct i2c_device_id *id)
 {
@@ -1342,9 +1377,14 @@ static int elants_i2c_probe(struct i2c_client *client,
        unsigned long irqflags;
        int error;
 
+       /* Don't bind to i2c-hid compatible devices, these are handled by the i2c-hid drv. */
+       if (elants_acpi_is_hid_device(&client->dev)) {
+               dev_warn(&client->dev, "This device appears to be an I2C-HID device, not binding\n");
+               return -ENODEV;
+       }
+
        if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-               dev_err(&client->dev,
-                       "%s: i2c check functionality error\n", DEVICE_NAME);
+               dev_err(&client->dev, "I2C check functionality error\n");
                return -ENXIO;
        }
 
index a6597f0..cbe0dd4 100644 (file)
 #define EXC3000_NUM_SLOTS              10
 #define EXC3000_SLOTS_PER_FRAME                5
 #define EXC3000_LEN_FRAME              66
+#define EXC3000_LEN_VENDOR_REQUEST     68
 #define EXC3000_LEN_POINT              10
 
 #define EXC3000_LEN_MODEL_NAME         16
 #define EXC3000_LEN_FW_VERSION         16
 
+#define EXC3000_VENDOR_EVENT           0x03
 #define EXC3000_MT1_EVENT              0x06
 #define EXC3000_MT2_EVENT              0x18
 
@@ -76,9 +78,6 @@ struct exc3000_data {
        u8 buf[2 * EXC3000_LEN_FRAME];
        struct completion wait_event;
        struct mutex query_lock;
-       int query_result;
-       char model[EXC3000_LEN_MODEL_NAME];
-       char fw_version[EXC3000_LEN_FW_VERSION];
 };
 
 static void exc3000_report_slots(struct input_dev *input,
@@ -105,15 +104,16 @@ static void exc3000_timer(struct timer_list *t)
        input_sync(data->input);
 }
 
+static inline void exc3000_schedule_timer(struct exc3000_data *data)
+{
+       mod_timer(&data->timer, jiffies + msecs_to_jiffies(EXC3000_TIMEOUT_MS));
+}
+
 static int exc3000_read_frame(struct exc3000_data *data, u8 *buf)
 {
        struct i2c_client *client = data->client;
-       u8 expected_event = EXC3000_MT1_EVENT;
        int ret;
 
-       if (data->info->max_xy == SZ_16K - 1)
-               expected_event = EXC3000_MT2_EVENT;
-
        ret = i2c_master_send(client, "'", 2);
        if (ret < 0)
                return ret;
@@ -131,175 +131,196 @@ static int exc3000_read_frame(struct exc3000_data *data, u8 *buf)
        if (get_unaligned_le16(buf) != EXC3000_LEN_FRAME)
                return -EINVAL;
 
-       if (buf[2] != expected_event)
-               return -EINVAL;
-
        return 0;
 }
 
-static int exc3000_read_data(struct exc3000_data *data,
-                            u8 *buf, int *n_slots)
+static int exc3000_handle_mt_event(struct exc3000_data *data)
 {
-       int error;
-
-       error = exc3000_read_frame(data, buf);
-       if (error)
-               return error;
+       struct input_dev *input = data->input;
+       int ret, total_slots;
+       u8 *buf = data->buf;
 
-       *n_slots = buf[3];
-       if (!*n_slots || *n_slots > EXC3000_NUM_SLOTS)
-               return -EINVAL;
+       total_slots = buf[3];
+       if (!total_slots || total_slots > EXC3000_NUM_SLOTS) {
+               ret = -EINVAL;
+               goto out_fail;
+       }
 
-       if (*n_slots > EXC3000_SLOTS_PER_FRAME) {
+       if (total_slots > EXC3000_SLOTS_PER_FRAME) {
                /* Read 2nd frame to get the rest of the contacts. */
-               error = exc3000_read_frame(data, buf + EXC3000_LEN_FRAME);
-               if (error)
-                       return error;
+               ret = exc3000_read_frame(data, buf + EXC3000_LEN_FRAME);
+               if (ret)
+                       goto out_fail;
 
                /* 2nd chunk must have number of contacts set to 0. */
-               if (buf[EXC3000_LEN_FRAME + 3] != 0)
-                       return -EINVAL;
+               if (buf[EXC3000_LEN_FRAME + 3] != 0) {
+                       ret = -EINVAL;
+                       goto out_fail;
+               }
        }
 
-       return 0;
-}
-
-static int exc3000_query_interrupt(struct exc3000_data *data)
-{
-       u8 *buf = data->buf;
-       int error;
+       /*
+        * We read full state successfully, no contacts will be "stuck".
+        */
+       del_timer_sync(&data->timer);
 
-       error = i2c_master_recv(data->client, buf, EXC3000_LEN_FRAME);
-       if (error < 0)
-               return error;
+       while (total_slots > 0) {
+               int slots = min(total_slots, EXC3000_SLOTS_PER_FRAME);
 
-       if (buf[0] != 'B')
-               return -EPROTO;
+               exc3000_report_slots(input, &data->prop, buf + 4, slots);
+               total_slots -= slots;
+               buf += EXC3000_LEN_FRAME;
+       }
 
-       if (buf[4] == 'E')
-               strlcpy(data->model, buf + 5, sizeof(data->model));
-       else if (buf[4] == 'D')
-               strlcpy(data->fw_version, buf + 5, sizeof(data->fw_version));
-       else
-               return -EPROTO;
+       input_mt_sync_frame(input);
+       input_sync(input);
 
        return 0;
+
+out_fail:
+       /* Schedule a timer to release "stuck" contacts */
+       exc3000_schedule_timer(data);
+
+       return ret;
 }
 
 static irqreturn_t exc3000_interrupt(int irq, void *dev_id)
 {
        struct exc3000_data *data = dev_id;
-       struct input_dev *input = data->input;
        u8 *buf = data->buf;
-       int slots, total_slots;
-       int error;
-
-       if (mutex_is_locked(&data->query_lock)) {
-               data->query_result = exc3000_query_interrupt(data);
-               complete(&data->wait_event);
-               goto out;
-       }
+       int ret;
 
-       error = exc3000_read_data(data, buf, &total_slots);
-       if (error) {
+       ret = exc3000_read_frame(data, buf);
+       if (ret) {
                /* Schedule a timer to release "stuck" contacts */
-               mod_timer(&data->timer,
-                         jiffies + msecs_to_jiffies(EXC3000_TIMEOUT_MS));
+               exc3000_schedule_timer(data);
                goto out;
        }
 
-       /*
-        * We read full state successfully, no contacts will be "stuck".
-        */
-       del_timer_sync(&data->timer);
+       switch (buf[2]) {
+       case EXC3000_VENDOR_EVENT:
+               complete(&data->wait_event);
+               break;
 
-       while (total_slots > 0) {
-               slots = min(total_slots, EXC3000_SLOTS_PER_FRAME);
-               exc3000_report_slots(input, &data->prop, buf + 4, slots);
-               total_slots -= slots;
-               buf += EXC3000_LEN_FRAME;
-       }
+       case EXC3000_MT1_EVENT:
+       case EXC3000_MT2_EVENT:
+               exc3000_handle_mt_event(data);
+               break;
 
-       input_mt_sync_frame(input);
-       input_sync(input);
+       default:
+               break;
+       }
 
 out:
        return IRQ_HANDLED;
 }
 
-static ssize_t fw_version_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static int exc3000_vendor_data_request(struct exc3000_data *data, u8 *request,
+                                      u8 request_len, u8 *response, int timeout)
 {
-       struct i2c_client *client = to_i2c_client(dev);
-       struct exc3000_data *data = i2c_get_clientdata(client);
-       static const u8 request[68] = {
-               0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'D', 0x00
-       };
-       int error;
+       u8 buf[EXC3000_LEN_VENDOR_REQUEST] = { 0x67, 0x00, 0x42, 0x00, 0x03 };
+       int ret;
 
        mutex_lock(&data->query_lock);
 
-       data->query_result = -ETIMEDOUT;
        reinit_completion(&data->wait_event);
 
-       error = i2c_master_send(client, request, sizeof(request));
-       if (error < 0) {
-               mutex_unlock(&data->query_lock);
-               return error;
+       buf[5] = request_len;
+       memcpy(&buf[6], request, request_len);
+
+       ret = i2c_master_send(data->client, buf, EXC3000_LEN_VENDOR_REQUEST);
+       if (ret < 0)
+               goto out_unlock;
+
+       if (response) {
+               ret = wait_for_completion_timeout(&data->wait_event,
+                                                 timeout * HZ);
+               if (ret <= 0) {
+                       ret = -ETIMEDOUT;
+                       goto out_unlock;
+               }
+
+               if (data->buf[3] >= EXC3000_LEN_FRAME) {
+                       ret = -ENOSPC;
+                       goto out_unlock;
+               }
+
+               memcpy(response, &data->buf[4], data->buf[3]);
+               ret = data->buf[3];
        }
 
-       wait_for_completion_interruptible_timeout(&data->wait_event, 1 * HZ);
+out_unlock:
        mutex_unlock(&data->query_lock);
 
-       if (data->query_result < 0)
-               return data->query_result;
-
-       return sprintf(buf, "%s\n", data->fw_version);
+       return ret;
 }
-static DEVICE_ATTR_RO(fw_version);
 
-static ssize_t exc3000_get_model(struct exc3000_data *data)
+static ssize_t fw_version_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
 {
-       static const u8 request[68] = {
-               0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'E', 0x00
-       };
-       struct i2c_client *client = data->client;
-       int error;
+       struct i2c_client *client = to_i2c_client(dev);
+       struct exc3000_data *data = i2c_get_clientdata(client);
+       u8 response[EXC3000_LEN_FRAME];
+       int ret;
 
-       mutex_lock(&data->query_lock);
-       data->query_result = -ETIMEDOUT;
-       reinit_completion(&data->wait_event);
+       /* query bootloader info */
+       ret = exc3000_vendor_data_request(data,
+                                         (u8[]){0x39, 0x02}, 2, response, 1);
+       if (ret < 0)
+               return ret;
 
-       error = i2c_master_send(client, request, sizeof(request));
-       if (error < 0) {
-               mutex_unlock(&data->query_lock);
-               return error;
-       }
+       /*
+        * If the bootloader version is non-zero then the device is in
+        * bootloader mode and won't answer a query for the application FW
+        * version, so we just use the bootloader version info.
+        */
+       if (response[2] || response[3])
+               return sprintf(buf, "%d.%d\n", response[2], response[3]);
 
-       wait_for_completion_interruptible_timeout(&data->wait_event, 1 * HZ);
-       mutex_unlock(&data->query_lock);
+       ret = exc3000_vendor_data_request(data, (u8[]){'D'}, 1, response, 1);
+       if (ret < 0)
+               return ret;
 
-       return data->query_result;
+       return sprintf(buf, "%s\n", &response[1]);
 }
+static DEVICE_ATTR_RO(fw_version);
 
 static ssize_t model_show(struct device *dev,
                          struct device_attribute *attr, char *buf)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct exc3000_data *data = i2c_get_clientdata(client);
-       int error;
+       u8 response[EXC3000_LEN_FRAME];
+       int ret;
 
-       error = exc3000_get_model(data);
-       if (error < 0)
-               return error;
+       ret = exc3000_vendor_data_request(data, (u8[]){'E'}, 1, response, 1);
+       if (ret < 0)
+               return ret;
 
-       return sprintf(buf, "%s\n", data->model);
+       return sprintf(buf, "%s\n", &response[1]);
 }
 static DEVICE_ATTR_RO(model);
 
+static ssize_t type_show(struct device *dev,
+                         struct device_attribute *attr, char *buf)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct exc3000_data *data = i2c_get_clientdata(client);
+       u8 response[EXC3000_LEN_FRAME];
+       int ret;
+
+       ret = exc3000_vendor_data_request(data, (u8[]){'F'}, 1, response, 1);
+       if (ret < 0)
+               return ret;
+
+       return sprintf(buf, "%s\n", &response[1]);
+}
+static DEVICE_ATTR_RO(type);
+
 static struct attribute *sysfs_attrs[] = {
        &dev_attr_fw_version.attr,
        &dev_attr_model.attr,
+       &dev_attr_type.attr,
        NULL
 };
 
@@ -379,9 +400,15 @@ static int exc3000_probe(struct i2c_client *client)
         * or two touch events anyways).
         */
        for (retry = 0; retry < 3; retry++) {
-               error = exc3000_get_model(data);
-               if (!error)
+               u8 response[EXC3000_LEN_FRAME];
+
+               error = exc3000_vendor_data_request(data, (u8[]){'E'}, 1,
+                                                   response, 1);
+               if (error > 0) {
+                       dev_dbg(&client->dev, "TS Model: %s", &response[1]);
+                       error = 0;
                        break;
+               }
                dev_warn(&client->dev, "Retry %d get EETI EXC3000 model: %d\n",
                         retry + 1, error);
        }
@@ -389,8 +416,6 @@ static int exc3000_probe(struct i2c_client *client)
        if (error)
                return error;
 
-       dev_dbg(&client->dev, "TS Model: %s", data->model);
-
        i2c_set_clientdata(client, data);
 
        error = devm_device_add_group(&client->dev, &exc3000_attribute_group);
diff --git a/drivers/input/touchscreen/hycon-hy46xx.c b/drivers/input/touchscreen/hycon-hy46xx.c
new file mode 100644 (file)
index 0000000..891d043
--- /dev/null
@@ -0,0 +1,591 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021
+ * Author(s): Giulio Benetti <giulio.benetti@benettiengineering.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touchscreen.h>
+#include <linux/irq.h>
+#include <linux/regulator/consumer.h>
+#include <linux/regmap.h>
+
+#include <asm/unaligned.h>
+
+#define HY46XX_CHKSUM_CODE             0x1
+#define HY46XX_FINGER_NUM              0x2
+#define HY46XX_CHKSUM_LEN              0x7
+#define HY46XX_THRESHOLD               0x80
+#define HY46XX_GLOVE_EN                        0x84
+#define HY46XX_REPORT_SPEED            0x88
+#define HY46XX_PWR_NOISE_EN            0x89
+#define HY46XX_FILTER_DATA             0x8A
+#define HY46XX_GAIN                    0x92
+#define HY46XX_EDGE_OFFSET             0x93
+#define HY46XX_RX_NR_USED              0x94
+#define HY46XX_TX_NR_USED              0x95
+#define HY46XX_PWR_MODE                        0xA5
+#define HY46XX_FW_VERSION              0xA6
+#define HY46XX_LIB_VERSION             0xA7
+#define HY46XX_TP_INFO                 0xA8
+#define HY46XX_TP_CHIP_ID              0xA9
+#define HY46XX_BOOT_VER                        0xB0
+
+#define HY46XX_TPLEN                   0x6
+#define HY46XX_REPORT_PKT_LEN          0x44
+
+#define HY46XX_MAX_SUPPORTED_POINTS    11
+
+#define TOUCH_EVENT_DOWN               0x00
+#define TOUCH_EVENT_UP                 0x01
+#define TOUCH_EVENT_CONTACT            0x02
+#define TOUCH_EVENT_RESERVED           0x03
+
+struct hycon_hy46xx_data {
+       struct i2c_client *client;
+       struct input_dev *input;
+       struct touchscreen_properties prop;
+       struct regulator *vcc;
+
+       struct gpio_desc *reset_gpio;
+
+       struct mutex mutex;
+       struct regmap *regmap;
+
+       int threshold;
+       bool glove_enable;
+       int report_speed;
+       bool noise_filter_enable;
+       int filter_data;
+       int gain;
+       int edge_offset;
+       int rx_number_used;
+       int tx_number_used;
+       int power_mode;
+       int fw_version;
+       int lib_version;
+       int tp_information;
+       int tp_chip_id;
+       int bootloader_version;
+};
+
+static const struct regmap_config hycon_hy46xx_i2c_regmap_config = {
+       .reg_bits = 8,
+       .val_bits = 8,
+};
+
+static bool hycon_hy46xx_check_checksum(struct hycon_hy46xx_data *tsdata, u8 *buf)
+{
+       u8 chksum = 0;
+       int i;
+
+       for (i = 2; i < buf[HY46XX_CHKSUM_LEN]; i++)
+               chksum += buf[i];
+
+       if (chksum == buf[HY46XX_CHKSUM_CODE])
+               return true;
+
+       dev_err_ratelimited(&tsdata->client->dev,
+                           "checksum error: 0x%02x expected, got 0x%02x\n",
+                           chksum, buf[HY46XX_CHKSUM_CODE]);
+
+       return false;
+}
+
+static irqreturn_t hycon_hy46xx_isr(int irq, void *dev_id)
+{
+       struct hycon_hy46xx_data *tsdata = dev_id;
+       struct device *dev = &tsdata->client->dev;
+       u8 rdbuf[HY46XX_REPORT_PKT_LEN];
+       int i, x, y, id;
+       int error;
+
+       memset(rdbuf, 0, sizeof(rdbuf));
+
+       error = regmap_bulk_read(tsdata->regmap, 0, rdbuf, sizeof(rdbuf));
+       if (error) {
+               dev_err_ratelimited(dev, "Unable to fetch data, error: %d\n",
+                                   error);
+               goto out;
+       }
+
+       if (!hycon_hy46xx_check_checksum(tsdata, rdbuf))
+               goto out;
+
+       for (i = 0; i < HY46XX_MAX_SUPPORTED_POINTS; i++) {
+               u8 *buf = &rdbuf[3 + (HY46XX_TPLEN * i)];
+               int type = buf[0] >> 6;
+
+               if (type == TOUCH_EVENT_RESERVED)
+                       continue;
+
+               x = get_unaligned_be16(buf) & 0x0fff;
+               y = get_unaligned_be16(buf + 2) & 0x0fff;
+
+               id = buf[2] >> 4;
+
+               input_mt_slot(tsdata->input, id);
+               if (input_mt_report_slot_state(tsdata->input, MT_TOOL_FINGER,
+                                              type != TOUCH_EVENT_UP))
+                       touchscreen_report_pos(tsdata->input, &tsdata->prop,
+                                              x, y, true);
+       }
+
+       input_mt_report_pointer_emulation(tsdata->input, false);
+       input_sync(tsdata->input);
+
+out:
+       return IRQ_HANDLED;
+}
+
+struct hycon_hy46xx_attribute {
+       struct device_attribute dattr;
+       size_t field_offset;
+       u8 address;
+       u8 limit_low;
+       u8 limit_high;
+};
+
+#define HYCON_ATTR_U8(_field, _mode, _address, _limit_low, _limit_high)        \
+       struct hycon_hy46xx_attribute hycon_hy46xx_attr_##_field = {            \
+               .dattr = __ATTR(_field, _mode,                          \
+                               hycon_hy46xx_setting_show,                      \
+                               hycon_hy46xx_setting_store),                    \
+               .field_offset = offsetof(struct hycon_hy46xx_data, _field),     \
+               .address = _address,                                    \
+               .limit_low = _limit_low,                                \
+               .limit_high = _limit_high,                              \
+       }
+
+#define HYCON_ATTR_BOOL(_field, _mode, _address)                       \
+       struct hycon_hy46xx_attribute hycon_hy46xx_attr_##_field = {            \
+               .dattr = __ATTR(_field, _mode,                          \
+                               hycon_hy46xx_setting_show,                      \
+                               hycon_hy46xx_setting_store),                    \
+               .field_offset = offsetof(struct hycon_hy46xx_data, _field),     \
+               .address = _address,                                    \
+               .limit_low = false,                                     \
+               .limit_high = true,                                     \
+       }
+
+static ssize_t hycon_hy46xx_setting_show(struct device *dev,
+                                  struct device_attribute *dattr, char *buf)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct hycon_hy46xx_data *tsdata = i2c_get_clientdata(client);
+       struct hycon_hy46xx_attribute *attr =
+                       container_of(dattr, struct hycon_hy46xx_attribute, dattr);
+       u8 *field = (u8 *)tsdata + attr->field_offset;
+       size_t count = 0;
+       int error = 0;
+       int val;
+
+       mutex_lock(&tsdata->mutex);
+
+       error = regmap_read(tsdata->regmap, attr->address, &val);
+       if (error < 0) {
+               dev_err(&tsdata->client->dev,
+                       "Failed to fetch attribute %s, error %d\n",
+                       dattr->attr.name, error);
+               goto out;
+       }
+
+       if (val != *field) {
+               dev_warn(&tsdata->client->dev,
+                        "%s: read (%d) and stored value (%d) differ\n",
+                        dattr->attr.name, val, *field);
+               *field = val;
+       }
+
+       count = scnprintf(buf, PAGE_SIZE, "%d\n", val);
+
+out:
+       mutex_unlock(&tsdata->mutex);
+       return error ?: count;
+}
+
+static ssize_t hycon_hy46xx_setting_store(struct device *dev,
+                                       struct device_attribute *dattr,
+                                       const char *buf, size_t count)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct hycon_hy46xx_data *tsdata = i2c_get_clientdata(client);
+       struct hycon_hy46xx_attribute *attr =
+                       container_of(dattr, struct hycon_hy46xx_attribute, dattr);
+       u8 *field = (u8 *)tsdata + attr->field_offset;
+       unsigned int val;
+       int error;
+
+       mutex_lock(&tsdata->mutex);
+
+       error = kstrtouint(buf, 0, &val);
+       if (error)
+               goto out;
+
+       if (val < attr->limit_low || val > attr->limit_high) {
+               error = -ERANGE;
+               goto out;
+       }
+
+       error = regmap_write(tsdata->regmap, attr->address, val);
+       if (error < 0) {
+               dev_err(&tsdata->client->dev,
+                       "Failed to update attribute %s, error: %d\n",
+                       dattr->attr.name, error);
+               goto out;
+       }
+       *field = val;
+
+out:
+       mutex_unlock(&tsdata->mutex);
+       return error ?: count;
+}
+
+static HYCON_ATTR_U8(threshold, 0644, HY46XX_THRESHOLD, 0, 255);
+static HYCON_ATTR_BOOL(glove_enable, 0644, HY46XX_GLOVE_EN);
+static HYCON_ATTR_U8(report_speed, 0644, HY46XX_REPORT_SPEED, 0, 255);
+static HYCON_ATTR_BOOL(noise_filter_enable, 0644, HY46XX_PWR_NOISE_EN);
+static HYCON_ATTR_U8(filter_data, 0644, HY46XX_FILTER_DATA, 0, 5);
+static HYCON_ATTR_U8(gain, 0644, HY46XX_GAIN, 0, 5);
+static HYCON_ATTR_U8(edge_offset, 0644, HY46XX_EDGE_OFFSET, 0, 5);
+static HYCON_ATTR_U8(fw_version, 0444, HY46XX_FW_VERSION, 0, 255);
+static HYCON_ATTR_U8(lib_version, 0444, HY46XX_LIB_VERSION, 0, 255);
+static HYCON_ATTR_U8(tp_information, 0444, HY46XX_TP_INFO, 0, 255);
+static HYCON_ATTR_U8(tp_chip_id, 0444, HY46XX_TP_CHIP_ID, 0, 255);
+static HYCON_ATTR_U8(bootloader_version, 0444, HY46XX_BOOT_VER, 0, 255);
+
+static struct attribute *hycon_hy46xx_attrs[] = {
+       &hycon_hy46xx_attr_threshold.dattr.attr,
+       &hycon_hy46xx_attr_glove_enable.dattr.attr,
+       &hycon_hy46xx_attr_report_speed.dattr.attr,
+       &hycon_hy46xx_attr_noise_filter_enable.dattr.attr,
+       &hycon_hy46xx_attr_filter_data.dattr.attr,
+       &hycon_hy46xx_attr_gain.dattr.attr,
+       &hycon_hy46xx_attr_edge_offset.dattr.attr,
+       &hycon_hy46xx_attr_fw_version.dattr.attr,
+       &hycon_hy46xx_attr_lib_version.dattr.attr,
+       &hycon_hy46xx_attr_tp_information.dattr.attr,
+       &hycon_hy46xx_attr_tp_chip_id.dattr.attr,
+       &hycon_hy46xx_attr_bootloader_version.dattr.attr,
+       NULL
+};
+
+static const struct attribute_group hycon_hy46xx_attr_group = {
+       .attrs = hycon_hy46xx_attrs,
+};
+
+static void hycon_hy46xx_get_defaults(struct device *dev, struct hycon_hy46xx_data *tsdata)
+{
+       bool val_bool;
+       int error;
+       u32 val;
+
+       error = device_property_read_u32(dev, "hycon,threshold", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_THRESHOLD, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->threshold = val;
+       }
+
+       val_bool = device_property_read_bool(dev, "hycon,glove-enable");
+       error = regmap_write(tsdata->regmap, HY46XX_GLOVE_EN, val_bool);
+       if (error < 0)
+               goto out;
+       tsdata->glove_enable = val_bool;
+
+       error = device_property_read_u32(dev, "hycon,report-speed-hz", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_REPORT_SPEED, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->report_speed = val;
+       }
+
+       val_bool = device_property_read_bool(dev, "hycon,noise-filter-enable");
+       error = regmap_write(tsdata->regmap, HY46XX_PWR_NOISE_EN, val_bool);
+       if (error < 0)
+               goto out;
+       tsdata->noise_filter_enable = val_bool;
+
+       error = device_property_read_u32(dev, "hycon,filter-data", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_FILTER_DATA, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->filter_data = val;
+       }
+
+       error = device_property_read_u32(dev, "hycon,gain", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_GAIN, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->gain = val;
+       }
+
+       error = device_property_read_u32(dev, "hycon,edge-offset", &val);
+       if (!error) {
+               error = regmap_write(tsdata->regmap, HY46XX_EDGE_OFFSET, val);
+               if (error < 0)
+                       goto out;
+
+               tsdata->edge_offset = val;
+       }
+
+       return;
+out:
+       dev_err(&tsdata->client->dev, "Failed to set default settings");
+}
+
+/*
+ * Read the controller's current settings and identification registers over
+ * regmap and cache each value in @tsdata.
+ *
+ * Reading stops at the first failing register; fields after that point keep
+ * whatever value they already had. The failure is only logged, not returned.
+ */
+static void hycon_hy46xx_get_parameters(struct hycon_hy46xx_data *tsdata)
+{
+       int error;
+       u32 val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_THRESHOLD, &val);
+       if (error < 0)
+               goto out;
+       tsdata->threshold = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_GLOVE_EN, &val);
+       if (error < 0)
+               goto out;
+       tsdata->glove_enable = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_REPORT_SPEED, &val);
+       if (error < 0)
+               goto out;
+       tsdata->report_speed = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_PWR_NOISE_EN, &val);
+       if (error < 0)
+               goto out;
+       tsdata->noise_filter_enable = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_FILTER_DATA, &val);
+       if (error < 0)
+               goto out;
+       tsdata->filter_data = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_GAIN, &val);
+       if (error < 0)
+               goto out;
+       tsdata->gain = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_EDGE_OFFSET, &val);
+       if (error < 0)
+               goto out;
+       tsdata->edge_offset = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_RX_NR_USED, &val);
+       if (error < 0)
+               goto out;
+       tsdata->rx_number_used = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_TX_NR_USED, &val);
+       if (error < 0)
+               goto out;
+       tsdata->tx_number_used = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_PWR_MODE, &val);
+       if (error < 0)
+               goto out;
+       tsdata->power_mode = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_FW_VERSION, &val);
+       if (error < 0)
+               goto out;
+       tsdata->fw_version = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_LIB_VERSION, &val);
+       if (error < 0)
+               goto out;
+       tsdata->lib_version = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_TP_INFO, &val);
+       if (error < 0)
+               goto out;
+       tsdata->tp_information = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_TP_CHIP_ID, &val);
+       if (error < 0)
+               goto out;
+       tsdata->tp_chip_id = val;
+
+       error = regmap_read(tsdata->regmap, HY46XX_BOOT_VER, &val);
+       if (error < 0)
+               goto out;
+       tsdata->bootloader_version = val;
+
+       return;
+out:
+       /* Terminate the message with a newline and report why the read failed. */
+       dev_err(&tsdata->client->dev,
+               "Failed to read default settings: %d\n", error);
+}
+
+/*
+ * Cleanup action registered in probe through devm_add_action_or_reset():
+ * switches off the "vcc" supply held in the driver data passed as @arg.
+ */
+static void hycon_hy46xx_disable_regulator(void *arg)
+{
+       struct hycon_hy46xx_data *tsdata = arg;
+
+       regulator_disable(tsdata->vcc);
+}
+
+/*
+ * Bind the driver to an I2C client: power the controller from "vcc", pulse
+ * the optional reset GPIO, set up regmap, apply firmware-property defaults,
+ * read back the controller parameters, then register the IRQ handler, the
+ * sysfs attribute group and the multitouch input device.
+ *
+ * All resources are device-managed, so every error path simply returns.
+ * Returns 0 on success or a negative errno.
+ */
+static int hycon_hy46xx_probe(struct i2c_client *client,
+                                        const struct i2c_device_id *id)
+{
+       struct hycon_hy46xx_data *tsdata;
+       struct input_dev *input;
+       int error;
+
+       dev_dbg(&client->dev, "probing for HYCON HY46XX I2C\n");
+
+       tsdata = devm_kzalloc(&client->dev, sizeof(*tsdata), GFP_KERNEL);
+       if (!tsdata)
+               return -ENOMEM;
+
+       /*
+        * dev_err_probe() stays quiet at error level for -EPROBE_DEFER, so
+        * both resource lookups below handle probe deferral the same way.
+        */
+       tsdata->vcc = devm_regulator_get(&client->dev, "vcc");
+       if (IS_ERR(tsdata->vcc))
+               return dev_err_probe(&client->dev, PTR_ERR(tsdata->vcc),
+                                    "failed to request regulator\n");
+
+       error = regulator_enable(tsdata->vcc);
+       if (error < 0) {
+               dev_err(&client->dev, "failed to enable vcc: %d\n", error);
+               return error;
+       }
+
+       /* From here on the regulator is disabled automatically on unbind. */
+       error = devm_add_action_or_reset(&client->dev,
+                                        hycon_hy46xx_disable_regulator,
+                                        tsdata);
+       if (error)
+               return error;
+
+       tsdata->reset_gpio = devm_gpiod_get_optional(&client->dev,
+                                                    "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(tsdata->reset_gpio))
+               return dev_err_probe(&client->dev,
+                                    PTR_ERR(tsdata->reset_gpio),
+                                    "Failed to request GPIO reset pin\n");
+
+       /* Pulse reset (if wired) and give the controller time to restart. */
+       if (tsdata->reset_gpio) {
+               usleep_range(5000, 6000);
+               gpiod_set_value_cansleep(tsdata->reset_gpio, 1);
+               usleep_range(5000, 6000);
+               gpiod_set_value_cansleep(tsdata->reset_gpio, 0);
+               msleep(1000);
+       }
+
+       input = devm_input_allocate_device(&client->dev);
+       if (!input) {
+               dev_err(&client->dev, "failed to allocate input device.\n");
+               return -ENOMEM;
+       }
+
+       mutex_init(&tsdata->mutex);
+       tsdata->client = client;
+       tsdata->input = input;
+
+       tsdata->regmap = devm_regmap_init_i2c(client,
+                                             &hycon_hy46xx_i2c_regmap_config);
+       if (IS_ERR(tsdata->regmap)) {
+               dev_err(&client->dev, "regmap allocation failed\n");
+               return PTR_ERR(tsdata->regmap);
+       }
+
+       /* Push property-provided defaults, then cache what the chip reports. */
+       hycon_hy46xx_get_defaults(&client->dev, tsdata);
+       hycon_hy46xx_get_parameters(tsdata);
+
+       input->name = "Hycon Capacitive Touch";
+       input->id.bustype = BUS_I2C;
+       input->dev.parent = &client->dev;
+
+       /* Placeholder ranges; touchscreen_parse_properties() runs right after. */
+       input_set_abs_params(input, ABS_MT_POSITION_X, 0, -1, 0, 0);
+       input_set_abs_params(input, ABS_MT_POSITION_Y, 0, -1, 0, 0);
+
+       touchscreen_parse_properties(input, true, &tsdata->prop);
+
+       error = input_mt_init_slots(input, HY46XX_MAX_SUPPORTED_POINTS,
+                                   INPUT_MT_DIRECT);
+       if (error) {
+               dev_err(&client->dev, "Unable to init MT slots.\n");
+               return error;
+       }
+
+       i2c_set_clientdata(client, tsdata);
+
+       error = devm_request_threaded_irq(&client->dev, client->irq,
+                                         NULL, hycon_hy46xx_isr, IRQF_ONESHOT,
+                                         client->name, tsdata);
+       if (error) {
+               dev_err(&client->dev, "Unable to request touchscreen IRQ.\n");
+               return error;
+       }
+
+       error = devm_device_add_group(&client->dev, &hycon_hy46xx_attr_group);
+       if (error)
+               return error;
+
+       error = input_register_device(input);
+       if (error)
+               return error;
+
+       dev_dbg(&client->dev,
+               "HYCON HY46XX initialized: IRQ %d, Reset pin %d.\n",
+               client->irq,
+               tsdata->reset_gpio ? desc_to_gpio(tsdata->reset_gpio) : -1);
+
+       return 0;
+}
+
+/* I2C device names this driver binds to (no per-device driver data). */
+static const struct i2c_device_id hycon_hy46xx_id[] = {
+       { "hy4613", 0 },
+       { "hy4614", 0 },
+       { "hy4621", 0 },
+       { "hy4623", 0 },
+       { "hy4633", 0 },
+       { "hy4635", 0 },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(i2c, hycon_hy46xx_id);
+
+/* Device-tree compatible strings matched by this driver. */
+static const struct of_device_id hycon_hy46xx_of_match[] = {
+       { .compatible = "hycon,hy4613", },
+       { .compatible = "hycon,hy4614", },
+       { .compatible = "hycon,hy4621", },
+       { .compatible = "hycon,hy4623", },
+       { .compatible = "hycon,hy4633", },
+       { .compatible = "hycon,hy4635", },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, hycon_hy46xx_of_match);
+
+/* I2C driver glue: match tables, probe entry point and async probing. */
+static struct i2c_driver hycon_hy46xx_driver = {
+       .probe    = hycon_hy46xx_probe,
+       .id_table = hycon_hy46xx_id,
+       .driver = {
+               .name = "hycon_hy46xx",
+               .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+               .of_match_table = hycon_hy46xx_of_match,
+       },
+};
+
+module_i2c_driver(hycon_hy46xx_driver);
+
+MODULE_AUTHOR("Giulio Benetti <giulio.benetti@benettiengineering.com>");
+MODULE_DESCRIPTION("HYCON HY46XX I2C Touchscreen Driver");
+MODULE_LICENSE("GPL v2");
index d8fccf0..30576a5 100644 (file)
@@ -87,7 +87,7 @@ static bool ili210x_touchdata_to_coords(const u8 *touchdata,
                                        unsigned int *x, unsigned int *y,
                                        unsigned int *z)
 {
-       if (touchdata[0] & BIT(finger))
+       if (!(touchdata[0] & BIT(finger)))
                return false;
 
        *x = get_unaligned_be16(touchdata + 1 + (finger * 4) + 0);
diff --git a/drivers/input/touchscreen/ilitek_ts_i2c.c b/drivers/input/touchscreen/ilitek_ts_i2c.c
new file mode 100644 (file)
index 0000000..c5d259c
--- /dev/null
@@ -0,0 +1,690 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ILITEK Touch IC driver for 23XX, 25XX and Lego series
+ *
+ * Copyright (C) 2011 ILI Technology Corporation.
+ * Copyright (C) 2020 Luca Hsu <luca_hsu@ilitek.com>
+ * Copyright (C) 2021 Joe Hung <joe_hung@ilitek.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/i2c.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include <linux/input/touchscreen.h>
+#include <asm/unaligned.h>
+
+
+#define ILITEK_TS_NAME                                 "ilitek_ts"
+#define BL_V1_8                                                0x108
+#define BL_V1_7                                                0x107
+#define BL_V1_6                                                0x106
+
+#define ILITEK_TP_CMD_GET_TP_RES                       0x20
+#define ILITEK_TP_CMD_GET_SCRN_RES                     0x21
+#define ILITEK_TP_CMD_SET_IC_SLEEP                     0x30
+#define ILITEK_TP_CMD_SET_IC_WAKE                      0x31
+#define ILITEK_TP_CMD_GET_FW_VER                       0x40
+#define ILITEK_TP_CMD_GET_PRL_VER                      0x42
+#define ILITEK_TP_CMD_GET_MCU_VER                      0x61
+#define ILITEK_TP_CMD_GET_IC_MODE                      0xC0
+
+#define REPORT_COUNT_ADDRESS                           61
+#define ILITEK_SUPPORT_MAX_POINT                       40
+
+/* Protocol version as reported by the GET_PTL_VER command. */
+struct ilitek_protocol_info {
+       u16 ver;        /* first two response bytes, big-endian */
+       u8 ver_major;   /* first response byte */
+};
+
+/* Per-device driver state, allocated in probe and stored as I2C clientdata. */
+struct ilitek_ts_data {
+       struct i2c_client *client;
+       struct gpio_desc *reset_gpio;           /* optional "reset" line */
+       struct input_dev *input_dev;
+       struct touchscreen_properties prop;
+
+       /* Command table indexed by enum ilitek_cmds. */
+       const struct ilitek_protocol_map *ptl_cb_func;
+       struct ilitek_protocol_info ptl;
+
+       /* Identification data cached from the controller's replies. */
+       char product_id[30];
+       u16 mcu_ver;
+       u8 ic_mode;
+       u8 firmware_ver[8];
+
+       s32 reset_time;                         /* delay passed to ilitek_reset() on resume */
+       /* Coordinate bounds from GET_SCRN_RES; touch limit from GET_TP_RES. */
+       s32 screen_max_x;
+       s32 screen_max_y;
+       s32 screen_min_x;
+       s32 screen_min_y;
+       s32 max_tp;
+};
+
+/* One entry of the command table: opcode, printable name and handler. */
+struct ilitek_protocol_map {
+       u16 cmd;
+       const char *name;
+       int (*func)(struct ilitek_ts_data *ts, u16 cmd, u8 *inbuf, u8 *outbuf);
+};
+
+/* Indices into the command table; api_protocol_set_cmd() takes one of these. */
+enum ilitek_cmds {
+       /* common cmds */
+       GET_PTL_VER = 0,
+       GET_FW_VER,
+       GET_SCRN_RES,
+       GET_TP_RES,
+       GET_IC_MODE,
+       GET_MCU_VER,
+       SET_IC_SLEEP,
+       SET_IC_WAKE,
+
+       /* Number of commands; ALWAYS keep as the last enumerator. */
+       MAX_CMD_CNT
+};
+
+/* ILITEK I2C R/W APIs */
+/*
+ * ilitek_i2c_write_and_read - optional write followed by optional read.
+ * @ts:        driver state (provides the I2C client)
+ * @cmd:       bytes to write, used only when @write_len > 0
+ * @write_len: number of bytes to write
+ * @delay:     milliseconds to wait between the write and the read
+ * @data:      destination buffer, used only when @read_len > 0
+ * @read_len:  number of bytes to read
+ *
+ * With no delay and both phases requested, the write and read are issued as
+ * one combined transfer; otherwise they are separate transfers with the
+ * delay in between.
+ *
+ * i2c_transfer() returns the number of messages executed, so a count lower
+ * than requested means the transaction did not complete and is reported as
+ * -EIO instead of being mistaken for success.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ilitek_i2c_write_and_read(struct ilitek_ts_data *ts,
+                                    u8 *cmd, int write_len, int delay,
+                                    u8 *data, int read_len)
+{
+       int error;
+       struct i2c_client *client = ts->client;
+       struct i2c_msg msgs[] = {
+               {
+                       .addr = client->addr,
+                       .flags = 0,
+                       .len = write_len,
+                       .buf = cmd,
+               },
+               {
+                       .addr = client->addr,
+                       .flags = I2C_M_RD,
+                       .len = read_len,
+                       .buf = data,
+               },
+       };
+
+       if (delay == 0 && write_len > 0 && read_len > 0) {
+               error = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs));
+               if (error < 0)
+                       return error;
+               if (error != ARRAY_SIZE(msgs))
+                       return -EIO;
+       } else {
+               if (write_len > 0) {
+                       error = i2c_transfer(client->adapter, msgs, 1);
+                       if (error < 0)
+                               return error;
+                       if (error != 1)
+                               return -EIO;
+               }
+               if (delay > 0)
+                       mdelay(delay);
+
+               if (read_len > 0) {
+                       error = i2c_transfer(client->adapter, msgs + 1, 1);
+                       if (error < 0)
+                               return error;
+                       if (error != 1)
+                               return -EIO;
+               }
+       }
+
+       return 0;
+}
+
+/* ILITEK ISR APIs */
+/* Report an active finger contact at (@x, @y) in multitouch slot @id. */
+static void ilitek_touch_down(struct ilitek_ts_data *ts, unsigned int id,
+                             unsigned int x, unsigned int y)
+{
+       struct input_dev *idev = ts->input_dev;
+
+       input_mt_slot(idev, id);
+       input_mt_report_slot_state(idev, MT_TOOL_FINGER, true);
+       touchscreen_report_pos(idev, &ts->prop, x, y, true);
+}
+
+/*
+ * Fetch one touch report from the controller and forward every active,
+ * in-range contact to the input core.
+ *
+ * The report is read in 64-byte chunks; the point count is taken from byte
+ * REPORT_COUNT_ADDRESS of the first chunk and one extra chunk is read per
+ * additional ten points. Each point is five bytes starting at buffer offset
+ * 1: a status/id byte (0x40 = touching, low six bits = id) followed by
+ * little-endian 16-bit X and Y.
+ *
+ * The input frame is synced on every path, including failures.
+ * Returns 0 on success or a negative errno.
+ */
+static int ilitek_process_and_report_v6(struct ilitek_ts_data *ts)
+{
+       struct device *dev = &ts->client->dev;
+       struct input_dev *input = ts->input_dev;
+       const int pkt_size = 5;
+       const int pts_per_chunk = 10;
+       unsigned int pos_x, pos_y, slot;
+       int n_points, n_chunks, idx;
+       const u8 *pkt;
+       u8 buf[512];
+       int error;
+
+       error = ilitek_i2c_write_and_read(ts, NULL, 0, 0, buf, 64);
+       if (error) {
+               dev_err(dev, "get touch info failed, err:%d\n", error);
+               goto err_sync_frame;
+       }
+
+       n_points = buf[REPORT_COUNT_ADDRESS];
+       if (n_points > ts->max_tp) {
+               dev_err(dev, "FW report max point:%d > panel info. max:%d\n",
+                       n_points, ts->max_tp);
+               error = -EINVAL;
+               goto err_sync_frame;
+       }
+
+       /* The first chunk is already in buf; pull in the remaining ones. */
+       n_chunks = DIV_ROUND_UP(n_points, pts_per_chunk);
+       for (idx = 1; idx < n_chunks; idx++) {
+               error = ilitek_i2c_write_and_read(ts, NULL, 0, 0,
+                                                 buf + idx * 64, 64);
+               if (error) {
+                       dev_err(dev, "get touch info. failed, cnt:%d, err:%d\n",
+                               n_chunks, error);
+                       goto err_sync_frame;
+               }
+       }
+
+       for (idx = 0; idx < n_points; idx++) {
+               pkt = &buf[idx * pkt_size + 1];
+
+               /* Skip entries whose "touching" bit is clear. */
+               if (!(pkt[0] & 0x40))
+                       continue;
+
+               slot = pkt[0] & 0x3F;
+               pos_x = get_unaligned_le16(pkt + 1);
+               pos_y = get_unaligned_le16(pkt + 3);
+
+               if (pos_x > ts->screen_max_x || pos_x < ts->screen_min_x ||
+                   pos_y > ts->screen_max_y || pos_y < ts->screen_min_y) {
+                       dev_warn(dev, "invalid position, X[%d,%u,%d], Y[%d,%u,%d]\n",
+                                ts->screen_min_x, pos_x, ts->screen_max_x,
+                                ts->screen_min_y, pos_y, ts->screen_max_y);
+                       continue;
+               }
+
+               ilitek_touch_down(ts, slot, pos_x, pos_y);
+       }
+
+err_sync_frame:
+       input_mt_sync_frame(input);
+       input_sync(input);
+       return error;
+}
+
+/* APIs of cmds for ILITEK Touch IC */
+/*
+ * Dispatch command table entry @idx: look up its opcode and invoke its
+ * handler with @inbuf/@outbuf. Returns -EINVAL for an out-of-range index,
+ * otherwise whatever the handler returns.
+ */
+static int api_protocol_set_cmd(struct ilitek_ts_data *ts,
+                               u16 idx, u8 *inbuf, u8 *outbuf)
+{
+       const struct ilitek_protocol_map *entry;
+
+       if (idx >= MAX_CMD_CNT)
+               return -EINVAL;
+
+       entry = &ts->ptl_cb_func[idx];
+
+       return entry->func(ts, entry->cmd, inbuf, outbuf);
+}
+
+/*
+ * Send @cmd, read a 3-byte reply into @outbuf and store the protocol
+ * version (big-endian 16-bit value plus its leading byte) in @ts->ptl.
+ */
+static int api_protocol_get_ptl_ver(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+       int ret;
+
+       request[0] = cmd;
+       ret = ilitek_i2c_write_and_read(ts, request, 1, 5, outbuf, 3);
+       if (!ret) {
+               ts->ptl.ver = get_unaligned_be16(outbuf);
+               ts->ptl.ver_major = outbuf[0];
+       }
+
+       return ret;
+}
+
+/*
+ * Send @cmd, read a 32-byte reply into @outbuf and cache the MCU version
+ * (little-endian 16-bit at offset 0) and the 26-byte product id at offset 6.
+ * product_id is cleared first, so the copied text stays NUL-terminated.
+ */
+static int api_protocol_get_mcu_ver(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+       int ret;
+
+       request[0] = cmd;
+       ret = ilitek_i2c_write_and_read(ts, request, 1, 5, outbuf, 32);
+       if (!ret) {
+               ts->mcu_ver = get_unaligned_le16(outbuf);
+               memset(ts->product_id, 0, sizeof(ts->product_id));
+               memcpy(ts->product_id, outbuf + 6, 26);
+       }
+
+       return ret;
+}
+
+/* Send @cmd, read an 8-byte reply and copy it into @ts->firmware_ver. */
+static int api_protocol_get_fw_ver(struct ilitek_ts_data *ts,
+                                  u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+       int ret;
+
+       request[0] = cmd;
+       ret = ilitek_i2c_write_and_read(ts, request, 1, 5, outbuf, 8);
+       if (!ret)
+               memcpy(ts->firmware_ver, outbuf, 8);
+
+       return ret;
+}
+
+/*
+ * Send @cmd, read an 8-byte reply and decode four little-endian 16-bit
+ * values in the order min X, min Y, max X, max Y.
+ */
+static int api_protocol_get_scrn_res(struct ilitek_ts_data *ts,
+                                    u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+       int ret;
+
+       request[0] = cmd;
+       ret = ilitek_i2c_write_and_read(ts, request, 1, 5, outbuf, 8);
+       if (ret)
+               return ret;
+
+       ts->screen_min_x = get_unaligned_le16(&outbuf[0]);
+       ts->screen_min_y = get_unaligned_le16(&outbuf[2]);
+       ts->screen_max_x = get_unaligned_le16(&outbuf[4]);
+       ts->screen_max_y = get_unaligned_le16(&outbuf[6]);
+
+       return 0;
+}
+
+/*
+ * Send @cmd, read a 15-byte reply and take the maximum touch-point count
+ * from byte 8. Counts above ILITEK_SUPPORT_MAX_POINT are rejected with
+ * -EINVAL (the value is still stored in @ts->max_tp).
+ */
+static int api_protocol_get_tp_res(struct ilitek_ts_data *ts,
+                                  u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+       int ret;
+
+       request[0] = cmd;
+       ret = ilitek_i2c_write_and_read(ts, request, 1, 5, outbuf, 15);
+       if (ret)
+               return ret;
+
+       ts->max_tp = outbuf[8];
+       if (ts->max_tp <= ILITEK_SUPPORT_MAX_POINT)
+               return 0;
+
+       dev_err(&ts->client->dev, "Invalid MAX_TP:%d from FW\n",
+               ts->max_tp);
+       return -EINVAL;
+}
+
+/* Send @cmd, read a 2-byte reply and keep its first byte as the IC mode. */
+static int api_protocol_get_ic_mode(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+       int ret;
+
+       request[0] = cmd;
+       ret = ilitek_i2c_write_and_read(ts, request, 1, 5, outbuf, 2);
+       if (!ret)
+               ts->ic_mode = outbuf[0];
+
+       return ret;
+}
+
+/* Write the single-byte sleep opcode @cmd; no reply is read. */
+static int api_protocol_set_ic_sleep(struct ilitek_ts_data *ts,
+                                    u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+
+       request[0] = cmd;
+
+       return ilitek_i2c_write_and_read(ts, request, 1, 0, NULL, 0);
+}
+
+/* Write the single-byte wake opcode @cmd; no reply is read. */
+static int api_protocol_set_ic_wake(struct ilitek_ts_data *ts,
+                                   u16 cmd, u8 *inbuf, u8 *outbuf)
+{
+       u8 request[64];
+
+       request[0] = cmd;
+
+       return ilitek_i2c_write_and_read(ts, request, 1, 0, NULL, 0);
+}
+
+/* Command table: maps each enum ilitek_cmds index to opcode, name, handler. */
+static const struct ilitek_protocol_map ptl_func_map[] = {
+       /* common cmds */
+       [GET_PTL_VER] = {
+               .cmd = ILITEK_TP_CMD_GET_PRL_VER,
+               .name = "GET_PTL_VER",
+               .func = api_protocol_get_ptl_ver,
+       },
+       [GET_FW_VER] = {
+               .cmd = ILITEK_TP_CMD_GET_FW_VER,
+               .name = "GET_FW_VER",
+               .func = api_protocol_get_fw_ver,
+       },
+       [GET_SCRN_RES] = {
+               .cmd = ILITEK_TP_CMD_GET_SCRN_RES,
+               .name = "GET_SCRN_RES",
+               .func = api_protocol_get_scrn_res,
+       },
+       [GET_TP_RES] = {
+               .cmd = ILITEK_TP_CMD_GET_TP_RES,
+               .name = "GET_TP_RES",
+               .func = api_protocol_get_tp_res,
+       },
+       [GET_IC_MODE] = {
+               .cmd = ILITEK_TP_CMD_GET_IC_MODE,
+               .name = "GET_IC_MODE",
+               .func = api_protocol_get_ic_mode,
+       },
+       [GET_MCU_VER] = {
+               .cmd = ILITEK_TP_CMD_GET_MCU_VER,
+               .name = "GET_MOD_VER",
+               .func = api_protocol_get_mcu_ver,
+       },
+       [SET_IC_SLEEP] = {
+               .cmd = ILITEK_TP_CMD_SET_IC_SLEEP,
+               .name = "SET_IC_SLEEP",
+               .func = api_protocol_set_ic_sleep,
+       },
+       [SET_IC_WAKE] = {
+               .cmd = ILITEK_TP_CMD_SET_IC_WAKE,
+               .name = "SET_IC_WAKE",
+               .func = api_protocol_set_ic_wake,
+       },
+};
+
+/* Probe APIs */
+/*
+ * Pulse the optional reset GPIO: assert for about 10 ms, deassert, then
+ * wait @delay milliseconds. Does nothing when no reset line was provided.
+ *
+ * Both callers (probe and resume) run in process context, so the waits
+ * sleep instead of busy-looping, and the GPIO is driven through the
+ * _cansleep accessor so lines behind sleeping controllers also work.
+ */
+static void ilitek_reset(struct ilitek_ts_data *ts, int delay)
+{
+       if (!ts->reset_gpio)
+               return;
+
+       gpiod_set_value_cansleep(ts->reset_gpio, 1);
+       usleep_range(10000, 11000);
+       gpiod_set_value_cansleep(ts->reset_gpio, 0);
+       msleep(delay);
+}
+
+/*
+ * Install the command table, set the resume reset delay and query the
+ * protocol version. Returns -EINVAL for protocol major version 3 and for
+ * versions BL_V1_6 / BL_V1_7, which this driver does not handle.
+ */
+static int ilitek_protocol_init(struct ilitek_ts_data *ts)
+{
+       u8 reply[64];
+       int ret;
+
+       ts->ptl_cb_func = ptl_func_map;
+       ts->reset_time = 600;
+
+       ret = api_protocol_set_cmd(ts, GET_PTL_VER, NULL, reply);
+       if (ret)
+               return ret;
+
+       /* Protocol v3 is not supported currently */
+       if (ts->ptl.ver_major == 0x3)
+               return -EINVAL;
+
+       switch (ts->ptl.ver) {
+       case BL_V1_6:
+       case BL_V1_7:
+               return -EINVAL;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Run the information queries in a fixed order and stop at the first
+ * failure. The screen-resolution query is issued only when @boot is true.
+ */
+static int ilitek_read_tp_info(struct ilitek_ts_data *ts, bool boot)
+{
+       static const u16 query_order[] = {
+               GET_PTL_VER,
+               GET_MCU_VER,
+               GET_FW_VER,
+               GET_SCRN_RES,
+               GET_TP_RES,
+               GET_IC_MODE,
+       };
+       u8 reply[256];
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < ARRAY_SIZE(query_order); i++) {
+               if (!boot && query_order[i] == GET_SCRN_RES)
+                       continue;
+
+               ret = api_protocol_set_cmd(ts, query_order[i], NULL, reply);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate, configure and register the managed multitouch input device,
+ * using the coordinate bounds and touch-point count already cached in @ts.
+ */
+static int ilitek_input_dev_init(struct device *dev, struct ilitek_ts_data *ts)
+{
+       struct input_dev *idev;
+       int ret;
+
+       idev = devm_input_allocate_device(dev);
+       if (!idev)
+               return -ENOMEM;
+
+       idev->name = ILITEK_TS_NAME;
+       idev->id.bustype = BUS_I2C;
+       ts->input_dev = idev;
+
+       __set_bit(INPUT_PROP_DIRECT, idev->propbit);
+
+       input_set_abs_params(idev, ABS_MT_POSITION_X,
+                            ts->screen_min_x, ts->screen_max_x, 0, 0);
+       input_set_abs_params(idev, ABS_MT_POSITION_Y,
+                            ts->screen_min_y, ts->screen_max_y, 0, 0);
+
+       touchscreen_parse_properties(idev, true, &ts->prop);
+
+       ret = input_mt_init_slots(idev, ts->max_tp,
+                                 INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
+       if (ret) {
+               dev_err(dev, "initialize MT slots failed, err:%d\n", ret);
+               return ret;
+       }
+
+       ret = input_register_device(idev);
+       if (ret)
+               dev_err(dev, "register input device failed, err:%d\n", ret);
+
+       return ret;
+}
+
+/*
+ * Threaded interrupt handler: process one touch report. A negative result
+ * is logged and answered with IRQ_NONE; anything else is IRQ_HANDLED.
+ */
+static irqreturn_t ilitek_i2c_isr(int irq, void *dev_id)
+{
+       struct ilitek_ts_data *ts = dev_id;
+       int ret = ilitek_process_and_report_v6(ts);
+
+       if (ret >= 0)
+               return IRQ_HANDLED;
+
+       dev_err(&ts->client->dev, "[%s] err:%d\n", __func__, ret);
+       return IRQ_NONE;
+}
+
+/* sysfs "firmware_version": the eight cached firmware bytes as hex pairs. */
+static ssize_t firmware_version_show(struct device *dev,
+                                    struct device_attribute *attr, char *buf)
+{
+       struct ilitek_ts_data *ts = i2c_get_clientdata(to_i2c_client(dev));
+       const u8 *fw = ts->firmware_ver;
+
+       return scnprintf(buf, PAGE_SIZE,
+                        "fw version: [%02X%02X.%02X%02X.%02X%02X.%02X%02X]\n",
+                        fw[0], fw[1], fw[2], fw[3],
+                        fw[4], fw[5], fw[6], fw[7]);
+}
+static DEVICE_ATTR_RO(firmware_version);
+
+/* sysfs "product_id": cached MCU version (hex) and product id string. */
+static ssize_t product_id_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct ilitek_ts_data *ts = i2c_get_clientdata(to_i2c_client(dev));
+
+       return scnprintf(buf, PAGE_SIZE, "product id: [%04X], module: [%s]\n",
+                        ts->mcu_ver, ts->product_id);
+}
+static DEVICE_ATTR_RO(product_id);
+
+/* NULL-terminated list of the read-only sysfs attributes exposed per device. */
+static struct attribute *ilitek_sysfs_attrs[] = {
+       &dev_attr_firmware_version.attr,
+       &dev_attr_product_id.attr,
+       NULL,
+};
+
+/* Attribute group added in probe with devm_device_add_group(). */
+static struct attribute_group ilitek_attrs_group = {
+       .attrs = ilitek_sysfs_attrs,
+};
+
+/*
+ * Bind to an I2C client: check adapter capabilities, allocate driver state,
+ * reset the controller through the optional reset GPIO, negotiate the
+ * protocol, read panel information, then register the input device, the
+ * threaded IRQ handler and the sysfs attribute group.
+ *
+ * All resources are device-managed. Returns 0 or a negative errno.
+ */
+static int ilitek_ts_i2c_probe(struct i2c_client *client,
+                              const struct i2c_device_id *id)
+{
+       struct ilitek_ts_data *ts;
+       struct device *dev = &client->dev;
+       int error;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(dev, "i2c check functionality failed\n");
+               return -ENXIO;
+       }
+
+       ts = devm_kzalloc(dev, sizeof(*ts), GFP_KERNEL);
+       if (!ts)
+               return -ENOMEM;
+
+       ts->client = client;
+       i2c_set_clientdata(client, ts);
+
+       ts->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(ts->reset_gpio)) {
+               error = PTR_ERR(ts->reset_gpio);
+               dev_err(dev, "request gpiod failed: %d\n", error);
+               return error;
+       }
+
+       ilitek_reset(ts, 1000);
+
+       error = ilitek_protocol_init(ts);
+       if (error) {
+               dev_err(dev, "protocol init failed: %d\n", error);
+               return error;
+       }
+
+       /* boot == true: also query the screen resolution. */
+       error = ilitek_read_tp_info(ts, true);
+       if (error) {
+               dev_err(dev, "read tp info failed: %d\n", error);
+               return error;
+       }
+
+       error = ilitek_input_dev_init(dev, ts);
+       if (error) {
+               dev_err(dev, "input dev init failed: %d\n", error);
+               return error;
+       }
+
+       error = devm_request_threaded_irq(dev, ts->client->irq,
+                                         NULL, ilitek_i2c_isr, IRQF_ONESHOT,
+                                         "ilitek_touch_irq", ts);
+       if (error) {
+               dev_err(dev, "request threaded irq failed: %d\n", error);
+               return error;
+       }
+
+       error = devm_device_add_group(dev, &ilitek_attrs_group);
+       if (error) {
+               dev_err(dev, "sysfs create group failed: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * System suspend: mask the touch interrupt and, unless the device is a
+ * wakeup source, send the sleep command to the controller.
+ *
+ * If the sleep command fails the suspend is reported as failed, so the
+ * interrupt is re-enabled first; otherwise it would stay disabled with no
+ * matching enable_irq() from the resume callback.
+ */
+static int __maybe_unused ilitek_suspend(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct ilitek_ts_data *ts = i2c_get_clientdata(client);
+       int error;
+
+       disable_irq(client->irq);
+
+       if (!device_may_wakeup(dev)) {
+               error = api_protocol_set_cmd(ts, SET_IC_SLEEP, NULL, NULL);
+               if (error) {
+                       enable_irq(client->irq);
+                       return error;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * System resume: unless the device is a wakeup source, send the wake
+ * command and reset the controller, then unmask the touch interrupt.
+ *
+ * NOTE(review): when the wake command fails this returns before
+ * enable_irq(), leaving the interrupt disabled - confirm that is intended.
+ */
+static int __maybe_unused ilitek_resume(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct ilitek_ts_data *ts = i2c_get_clientdata(client);
+       int ret;
+
+       if (device_may_wakeup(dev))
+               goto unmask;
+
+       ret = api_protocol_set_cmd(ts, SET_IC_WAKE, NULL, NULL);
+       if (ret)
+               return ret;
+
+       ilitek_reset(ts, ts->reset_time);
+
+unmask:
+       enable_irq(client->irq);
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(ilitek_pm_ops, ilitek_suspend, ilitek_resume);
+
+/* I2C device name this driver binds to (no per-device driver data). */
+static const struct i2c_device_id ilitek_ts_i2c_id[] = {
+       { .name = ILITEK_TS_NAME },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(i2c, ilitek_ts_i2c_id);
+
+#ifdef CONFIG_ACPI
+/* ACPI hardware ID matched by this driver. */
+static const struct acpi_device_id ilitekts_acpi_id[] = {
+       { .id = "ILTK0001" },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(acpi, ilitekts_acpi_id);
+#endif
+
+#ifdef CONFIG_OF
+/* Device-tree compatible strings matched by this driver. */
+static const struct of_device_id ilitek_ts_i2c_match[] = {
+       { .compatible = "ilitek,ili2130" },
+       { .compatible = "ilitek,ili2131" },
+       { .compatible = "ilitek,ili2132" },
+       { .compatible = "ilitek,ili2316" },
+       { .compatible = "ilitek,ili2322" },
+       { .compatible = "ilitek,ili2323" },
+       { .compatible = "ilitek,ili2326" },
+       { .compatible = "ilitek,ili2520" },
+       { .compatible = "ilitek,ili2521" },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, ilitek_ts_i2c_match);
+#endif
+
+/* I2C driver glue: probe entry point, match tables and PM callbacks. */
+static struct i2c_driver ilitek_ts_i2c_driver = {
+       .probe = ilitek_ts_i2c_probe,
+       .id_table = ilitek_ts_i2c_id,
+       .driver = {
+               .name = ILITEK_TS_NAME,
+               .of_match_table = of_match_ptr(ilitek_ts_i2c_match),
+               .acpi_match_table = ACPI_PTR(ilitekts_acpi_id),
+               .pm = &ilitek_pm_ops,
+       },
+};
+module_i2c_driver(ilitek_ts_i2c_driver);
+
+MODULE_AUTHOR("ILITEK");
+MODULE_DESCRIPTION("ILITEK I2C Touchscreen Driver");
+MODULE_LICENSE("GPL");
index 54f3003..b3fa712 100644 (file)
@@ -8,7 +8,7 @@
  * made available by the vendor. Firmware files may be pushed to the device's
  * nonvolatile memory by writing the filename to the 'fw_file' sysfs control.
  *
- * Link to PC-based configuration tool and data sheet: http://www.azoteq.com/
+ * Link to PC-based configuration tool and datasheet: https://www.azoteq.com/
  */
 
 #include <linux/bits.h>
 #define IQS5XX_NUM_RETRIES     10
 #define IQS5XX_NUM_CONTACTS    5
 #define IQS5XX_WR_BYTES_MAX    2
-#define IQS5XX_XY_RES_MAX      0xFFFE
 
 #define IQS5XX_PROD_NUM_IQS550 40
 #define IQS5XX_PROD_NUM_IQS572 58
 #define IQS5XX_PROD_NUM_IQS525 52
-#define IQS5XX_PROJ_NUM_A000   0
-#define IQS5XX_PROJ_NUM_B000   15
-#define IQS5XX_MAJOR_VER_MIN   2
 
 #define IQS5XX_SHOW_RESET      BIT(7)
 #define IQS5XX_ACK_RESET       BIT(7)
@@ -64,6 +60,7 @@
 #define IQS5XX_SYS_CFG1                0x058F
 #define IQS5XX_X_RES           0x066E
 #define IQS5XX_Y_RES           0x0670
+#define IQS5XX_EXP_FILE                0x0677
 #define IQS5XX_CHKSM           0x83C0
 #define IQS5XX_APP             0x8400
 #define IQS5XX_CSTM            0xBE00
 #define IQS5XX_BL_CMD_CRC      0x03
 #define IQS5XX_BL_BLK_LEN_MAX  64
 #define IQS5XX_BL_ID           0x0200
-#define IQS5XX_BL_STATUS_RESET 0x00
-#define IQS5XX_BL_STATUS_AVAIL 0xA5
 #define IQS5XX_BL_STATUS_NONE  0xEE
 #define IQS5XX_BL_CRC_PASS     0x00
 #define IQS5XX_BL_CRC_FAIL     0x01
 #define IQS5XX_BL_ATTEMPTS     3
 
-struct iqs5xx_private {
-       struct i2c_client *client;
-       struct input_dev *input;
-       struct gpio_desc *reset_gpio;
-       struct touchscreen_properties prop;
-       struct mutex lock;
-       u8 bl_status;
-};
-
 struct iqs5xx_dev_id_info {
        __be16 prod_num;
        __be16 proj_num;
@@ -134,6 +120,16 @@ struct iqs5xx_status {
        struct iqs5xx_touch_data touch_data[IQS5XX_NUM_CONTACTS];
 } __packed;
 
+struct iqs5xx_private {
+       struct i2c_client *client;
+       struct input_dev *input;
+       struct gpio_desc *reset_gpio;
+       struct touchscreen_properties prop;
+       struct mutex lock;
+       struct iqs5xx_dev_id_info dev_id_info;
+       u8 exp_file[2];
+};
+
 static int iqs5xx_read_burst(struct i2c_client *client,
                             u16 reg, void *val, u16 len)
 {
@@ -446,7 +442,7 @@ static int iqs5xx_set_state(struct i2c_client *client, u8 state)
        struct iqs5xx_private *iqs5xx = i2c_get_clientdata(client);
        int error1, error2;
 
-       if (iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET)
+       if (!iqs5xx->dev_id_info.bl_status)
                return 0;
 
        mutex_lock(&iqs5xx->lock);
@@ -504,10 +500,6 @@ static int iqs5xx_axis_init(struct i2c_client *client)
                input->open = iqs5xx_open;
                input->close = iqs5xx_close;
 
-               input_set_capability(input, EV_ABS, ABS_MT_POSITION_X);
-               input_set_capability(input, EV_ABS, ABS_MT_POSITION_Y);
-               input_set_capability(input, EV_ABS, ABS_MT_PRESSURE);
-
                input_set_drvdata(input, iqs5xx);
                iqs5xx->input = input;
        }
@@ -520,26 +512,29 @@ static int iqs5xx_axis_init(struct i2c_client *client)
        if (error)
                return error;
 
-       input_abs_set_max(iqs5xx->input, ABS_MT_POSITION_X, max_x);
-       input_abs_set_max(iqs5xx->input, ABS_MT_POSITION_Y, max_y);
+       input_set_abs_params(iqs5xx->input, ABS_MT_POSITION_X, 0, max_x, 0, 0);
+       input_set_abs_params(iqs5xx->input, ABS_MT_POSITION_Y, 0, max_y, 0, 0);
+       input_set_abs_params(iqs5xx->input, ABS_MT_PRESSURE, 0, U16_MAX, 0, 0);
 
        touchscreen_parse_properties(iqs5xx->input, true, prop);
 
-       if (prop->max_x > IQS5XX_XY_RES_MAX) {
-               dev_err(&client->dev, "Invalid maximum x-coordinate: %u > %u\n",
-                       prop->max_x, IQS5XX_XY_RES_MAX);
+       /*
+        * The device reserves 0xFFFF for coordinates that correspond to slots
+        * which are not in a state of touch.
+        */
+       if (prop->max_x >= U16_MAX || prop->max_y >= U16_MAX) {
+               dev_err(&client->dev, "Invalid touchscreen size: %u*%u\n",
+                       prop->max_x, prop->max_y);
                return -EINVAL;
-       } else if (prop->max_x != max_x) {
+       }
+
+       if (prop->max_x != max_x) {
                error = iqs5xx_write_word(client, IQS5XX_X_RES, prop->max_x);
                if (error)
                        return error;
        }
 
-       if (prop->max_y > IQS5XX_XY_RES_MAX) {
-               dev_err(&client->dev, "Invalid maximum y-coordinate: %u > %u\n",
-                       prop->max_y, IQS5XX_XY_RES_MAX);
-               return -EINVAL;
-       } else if (prop->max_y != max_y) {
+       if (prop->max_y != max_y) {
                error = iqs5xx_write_word(client, IQS5XX_Y_RES, prop->max_y);
                if (error)
                        return error;
@@ -574,7 +569,7 @@ static int iqs5xx_dev_init(struct i2c_client *client)
         * the missing zero is prepended).
         */
        buf[0] = 0;
-       dev_id_info = (struct iqs5xx_dev_id_info *)&buf[(buf[1] > 0) ? 0 : 1];
+       dev_id_info = (struct iqs5xx_dev_id_info *)&buf[buf[1] ? 0 : 1];
 
        switch (be16_to_cpu(dev_id_info->prod_num)) {
        case IQS5XX_PROD_NUM_IQS550:
@@ -587,35 +582,20 @@ static int iqs5xx_dev_init(struct i2c_client *client)
                return -EINVAL;
        }
 
-       switch (be16_to_cpu(dev_id_info->proj_num)) {
-       case IQS5XX_PROJ_NUM_A000:
-               dev_err(&client->dev, "Unsupported project number: %u\n",
-                       be16_to_cpu(dev_id_info->proj_num));
-               return iqs5xx_bl_open(client);
-       case IQS5XX_PROJ_NUM_B000:
-               break;
-       default:
-               dev_err(&client->dev, "Unrecognized project number: %u\n",
-                       be16_to_cpu(dev_id_info->proj_num));
-               return -EINVAL;
-       }
-
-       if (dev_id_info->major_ver < IQS5XX_MAJOR_VER_MIN) {
-               dev_err(&client->dev, "Unsupported major version: %u\n",
-                       dev_id_info->major_ver);
+       /*
+        * With the product number recognized yet shifted by one byte, open the
+        * bootloader and wait for user space to convert the A000 device into a
+        * B000 device via new firmware.
+        */
+       if (buf[1]) {
+               dev_err(&client->dev, "Opening bootloader for A000 device\n");
                return iqs5xx_bl_open(client);
        }
 
-       switch (dev_id_info->bl_status) {
-       case IQS5XX_BL_STATUS_AVAIL:
-       case IQS5XX_BL_STATUS_NONE:
-               break;
-       default:
-               dev_err(&client->dev,
-                       "Unrecognized bootloader status: 0x%02X\n",
-                       dev_id_info->bl_status);
-               return -EINVAL;
-       }
+       error = iqs5xx_read_burst(client, IQS5XX_EXP_FILE,
+                                 iqs5xx->exp_file, sizeof(iqs5xx->exp_file));
+       if (error)
+               return error;
 
        error = iqs5xx_axis_init(client);
        if (error)
@@ -640,7 +620,7 @@ static int iqs5xx_dev_init(struct i2c_client *client)
        if (error)
                return error;
 
-       iqs5xx->bl_status = dev_id_info->bl_status;
+       iqs5xx->dev_id_info = *dev_id_info;
 
        /*
         * The following delay allows ATI to complete before the open and close
@@ -666,7 +646,7 @@ static irqreturn_t iqs5xx_irq(int irq, void *data)
         * RDY output during bootloader mode. If the device operates outside of
         * bootloader mode, the input device is guaranteed to be allocated.
         */
-       if (iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET)
+       if (!iqs5xx->dev_id_info.bl_status)
                return IRQ_NONE;
 
        error = iqs5xx_read_burst(client, IQS5XX_SYS_INFO0,
@@ -852,12 +832,9 @@ static int iqs5xx_fw_file_parse(struct i2c_client *client,
 static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file)
 {
        struct iqs5xx_private *iqs5xx = i2c_get_clientdata(client);
-       int error, error_bl = 0;
+       int error, error_init = 0;
        u8 *pmap;
 
-       if (iqs5xx->bl_status == IQS5XX_BL_STATUS_NONE)
-               return -EPERM;
-
        pmap = kzalloc(IQS5XX_PMAP_LEN, GFP_KERNEL);
        if (!pmap)
                return -ENOMEM;
@@ -875,7 +852,7 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file)
         */
        disable_irq(client->irq);
 
-       iqs5xx->bl_status = IQS5XX_BL_STATUS_RESET;
+       iqs5xx->dev_id_info.bl_status = 0;
 
        error = iqs5xx_bl_cmd(client, IQS5XX_BL_CMD_VER, 0);
        if (error) {
@@ -895,21 +872,14 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file)
        error = iqs5xx_bl_verify(client, IQS5XX_CSTM,
                                 pmap + IQS5XX_CHKSM_LEN + IQS5XX_APP_LEN,
                                 IQS5XX_CSTM_LEN);
-       if (error)
-               goto err_reset;
-
-       error = iqs5xx_bl_cmd(client, IQS5XX_BL_CMD_EXEC, 0);
 
 err_reset:
-       if (error) {
-               iqs5xx_reset(client);
-               usleep_range(10000, 10100);
-       }
+       iqs5xx_reset(client);
+       usleep_range(15000, 15100);
 
-       error_bl = error;
-       error = iqs5xx_dev_init(client);
-       if (!error && iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET)
-               error = -EINVAL;
+       error_init = iqs5xx_dev_init(client);
+       if (!iqs5xx->dev_id_info.bl_status)
+               error_init = error_init ? : -EINVAL;
 
        enable_irq(client->irq);
 
@@ -918,10 +888,7 @@ err_reset:
 err_kfree:
        kfree(pmap);
 
-       if (error_bl)
-               return error_bl;
-
-       return error;
+       return error ? : error_init;
 }
 
 static ssize_t fw_file_store(struct device *dev,
@@ -968,14 +935,47 @@ static ssize_t fw_file_store(struct device *dev,
        return count;
 }
 
+/*
+ * sysfs show() for fw_info: prints "prod.proj.major.minor:exp0.exp1" from the
+ * cached device ID block and export file bytes.
+ * Returns the number of bytes written, or -ENODATA while bl_status is zero
+ * (the driver treats that as bootloader mode, where no valid ID is cached).
+ */
+static ssize_t fw_info_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct iqs5xx_private *iqs5xx = dev_get_drvdata(dev);
+       const struct iqs5xx_dev_id_info *id = &iqs5xx->dev_id_info;
+
+       if (!id->bl_status)
+               return -ENODATA;
+
+       /* sysfs_emit() is the sysfs-aware helper for show() callbacks */
+       return sysfs_emit(buf, "%u.%u.%u.%u:%u.%u\n",
+                         be16_to_cpu(id->prod_num), be16_to_cpu(id->proj_num),
+                         id->major_ver, id->minor_ver,
+                         iqs5xx->exp_file[0], iqs5xx->exp_file[1]);
+}
+
 static DEVICE_ATTR_WO(fw_file);
+static DEVICE_ATTR_RO(fw_info);
 
 static struct attribute *iqs5xx_attrs[] = {
        &dev_attr_fw_file.attr,
+       &dev_attr_fw_info.attr,
        NULL,
 };
 
+/*
+ * Attribute group visibility hook: fw_file is hidden when the device reports
+ * IQS5XX_BL_STATUS_NONE or when no reset GPIO was obtained; every other
+ * attribute keeps its declared mode.
+ */
+static umode_t iqs5xx_attr_is_visible(struct kobject *kobj,
+                                     struct attribute *attr, int i)
+{
+       struct iqs5xx_private *iqs5xx = dev_get_drvdata(kobj_to_dev(kobj));
+
+       if (attr != &dev_attr_fw_file.attr)
+               return attr->mode;
+
+       if (iqs5xx->dev_id_info.bl_status == IQS5XX_BL_STATUS_NONE)
+               return 0;
+
+       return iqs5xx->reset_gpio ? attr->mode : 0;
+}
+
 static const struct attribute_group iqs5xx_attr_group = {
+       .is_visible = iqs5xx_attr_is_visible,
        .attrs = iqs5xx_attrs,
 };
 
@@ -1032,8 +1032,8 @@ static int iqs5xx_probe(struct i2c_client *client,
        i2c_set_clientdata(client, iqs5xx);
        iqs5xx->client = client;
 
-       iqs5xx->reset_gpio = devm_gpiod_get(&client->dev,
-                                           "reset", GPIOD_OUT_LOW);
+       iqs5xx->reset_gpio = devm_gpiod_get_optional(&client->dev,
+                                                    "reset", GPIOD_OUT_LOW);
        if (IS_ERR(iqs5xx->reset_gpio)) {
                error = PTR_ERR(iqs5xx->reset_gpio);
                dev_err(&client->dev, "Failed to request GPIO: %d\n", error);
@@ -1042,9 +1042,6 @@ static int iqs5xx_probe(struct i2c_client *client,
 
        mutex_init(&iqs5xx->lock);
 
-       iqs5xx_reset(client);
-       usleep_range(10000, 10100);
-
        error = iqs5xx_dev_init(client);
        if (error)
                return error;
index b51450b..15b5cb7 100644 (file)
 #define LPC32XX_TSC_AUX_MIN                    0x38
 #define LPC32XX_TSC_AUX_MAX                    0x3C
 
-#define LPC32XX_TSC_STAT_FIFO_OVRRN            (1 << 8)
-#define LPC32XX_TSC_STAT_FIFO_EMPTY            (1 << 7)
+#define LPC32XX_TSC_STAT_FIFO_OVRRN            BIT(8)
+#define LPC32XX_TSC_STAT_FIFO_EMPTY            BIT(7)
 
 #define LPC32XX_TSC_SEL_DEFVAL                 0x0284
 
 #define LPC32XX_TSC_ADCCON_IRQ_TO_FIFO_4       (0x1 << 11)
 #define LPC32XX_TSC_ADCCON_X_SAMPLE_SIZE(s)    ((10 - (s)) << 7)
 #define LPC32XX_TSC_ADCCON_Y_SAMPLE_SIZE(s)    ((10 - (s)) << 4)
-#define LPC32XX_TSC_ADCCON_POWER_UP            (1 << 2)
-#define LPC32XX_TSC_ADCCON_AUTO_EN             (1 << 0)
+#define LPC32XX_TSC_ADCCON_POWER_UP            BIT(2)
+#define LPC32XX_TSC_ADCCON_AUTO_EN             BIT(0)
 
-#define LPC32XX_TSC_FIFO_TS_P_LEVEL            (1 << 31)
+#define LPC32XX_TSC_FIFO_TS_P_LEVEL            BIT(31)
 #define LPC32XX_TSC_FIFO_NORMALIZE_X_VAL(x)    (((x) & 0x03FF0000) >> 16)
 #define LPC32XX_TSC_FIFO_NORMALIZE_Y_VAL(y)    ((y) & 0x000003FF)
 
index 225796a..2745bf1 100644 (file)
@@ -1502,7 +1502,8 @@ static int mip4_probe(struct i2c_client *client, const struct i2c_device_id *id)
 
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, mip4_interrupt,
-                                         IRQF_ONESHOT, MIP4_DEVICE_NAME, ts);
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         MIP4_DEVICE_NAME, ts);
        if (error) {
                dev_err(&client->dev,
                        "Failed to request interrupt %d: %d\n",
@@ -1510,8 +1511,6 @@ static int mip4_probe(struct i2c_client *client, const struct i2c_device_id *id)
                return error;
        }
 
-       disable_irq(client->irq);
-
        error = input_register_device(input);
        if (error) {
                dev_err(&client->dev,
index 16557f5..0efd1a1 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-// Melfas MMS114/MMS152 touchscreen device driver
+// Melfas MMS114/MMS136/MMS152 touchscreen device driver
 //
 // Copyright (c) 2012 Samsung Electronics Co., Ltd.
 // Author: Joonyoung Shim <jy0922.shim@samsung.com>
@@ -44,7 +44,8 @@
 #define MMS114_MAX_AREA                        0xff
 
 #define MMS114_MAX_TOUCH               10
-#define MMS114_PACKET_NUM              8
+#define MMS114_EVENT_SIZE              8
+#define MMS136_EVENT_SIZE              6
 
 /* Touch type */
 #define MMS114_TYPE_NONE               0
@@ -53,6 +54,7 @@
 
 enum mms_type {
        TYPE_MMS114     = 114,
+       TYPE_MMS136     = 136,
        TYPE_MMS152     = 152,
        TYPE_MMS345L    = 345,
 };
@@ -209,7 +211,11 @@ static irqreturn_t mms114_interrupt(int irq, void *dev_id)
        if (packet_size <= 0)
                goto out;
 
-       touch_size = packet_size / MMS114_PACKET_NUM;
+       /* MMS136 has slightly different event size */
+       if (data->type == TYPE_MMS136)
+               touch_size = packet_size / MMS136_EVENT_SIZE;
+       else
+               touch_size = packet_size / MMS114_EVENT_SIZE;
 
        error = __mms114_read_reg(data, MMS114_INFORMATION, packet_size,
                        (u8 *)touch);
@@ -275,6 +281,7 @@ static int mms114_get_version(struct mms114_data *data)
                break;
 
        case TYPE_MMS114:
+       case TYPE_MMS136:
                error = __mms114_read_reg(data, MMS114_TSP_REV, 6, buf);
                if (error)
                        return error;
@@ -297,8 +304,8 @@ static int mms114_setup_regs(struct mms114_data *data)
        if (error < 0)
                return error;
 
-       /* Only MMS114 has configuration and power on registers */
-       if (data->type != TYPE_MMS114)
+       /* Only MMS114 and MMS136 have configuration and power on registers */
+       if (data->type != TYPE_MMS114 && data->type != TYPE_MMS136)
                return 0;
 
        error = mms114_set_active(data, true);
@@ -480,7 +487,7 @@ static int mms114_probe(struct i2c_client *client,
                                     0, data->props.max_y, 0, 0);
        }
 
-       if (data->type == TYPE_MMS114) {
+       if (data->type == TYPE_MMS114 || data->type == TYPE_MMS136) {
                /*
                 * The firmware handles movement and pressure fuzz, so
                 * don't duplicate that in software.
@@ -530,13 +537,13 @@ static int mms114_probe(struct i2c_client *client,
        }
 
        error = devm_request_threaded_irq(&client->dev, client->irq,
-                                         NULL, mms114_interrupt, IRQF_ONESHOT,
+                                         NULL, mms114_interrupt,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                          dev_name(&client->dev), data);
        if (error) {
                dev_err(&client->dev, "Failed to register interrupt\n");
                return error;
        }
-       disable_irq(client->irq);
 
        error = input_register_device(data->input_dev);
        if (error) {
@@ -604,6 +611,9 @@ static const struct of_device_id mms114_dt_match[] = {
        {
                .compatible = "melfas,mms114",
                .data = (void *)TYPE_MMS114,
+       }, {
+               .compatible = "melfas,mms136",
+               .data = (void *)TYPE_MMS136,
        }, {
                .compatible = "melfas,mms152",
                .data = (void *)TYPE_MMS152,
diff --git a/drivers/input/touchscreen/msg2638.c b/drivers/input/touchscreen/msg2638.c
new file mode 100644 (file)
index 0000000..75536bc
--- /dev/null
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for MStar msg2638 touchscreens
+ *
+ * Copyright (c) 2021 Vincent Knecht <vincent.knecht@mailoo.org>
+ *
+ * Checksum and IRQ handler based on mstar_drv_common.c and
+ * mstar_drv_mutual_fw_control.c
+ * Copyright (c) 2006-2012 MStar Semiconductor, Inc.
+ *
+ * Driver structure based on zinitix.c by Michael Srba <Michael.Srba@seznam.cz>
+ */
+
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/i2c.h>
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touchscreen.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+
+#define MODE_DATA_RAW                  0x5A    /* only events with this mode byte are processed */
+
+#define MAX_SUPPORTED_FINGER_NUM       5       /* packets per event and number of MT slots */
+
+#define CHIP_ON_DELAY_MS               15      /* sleep after enabling the regulators */
+#define FIRMWARE_ON_DELAY_MS           50      /* sleep after the reset GPIO is released */
+#define RESET_DELAY_MIN_US             10000   /* reset pulse width, lower bound */
+#define RESET_DELAY_MAX_US             11000   /* reset pulse width, upper bound */
+
+struct packet {        /* one finger's data inside a touch_event */
+       u8      xy_hi; /* bits 7:4 = x[11:8], bits 3:0 = y[11:8] */
+       u8      x_low; /* x[7:0] */
+       u8      y_low; /* y[7:0] */
+       u8      pressure; /* received but not reported by this driver */
+};
+
+struct touch_event {   /* buffer filled by the I2C read in the IRQ handler */
+       u8      mode;   /* compared against MODE_DATA_RAW */
+       struct  packet pkt[MAX_SUPPORTED_FINGER_NUM];   /* array index is used as the MT slot */
+       u8      proximity;      /* received but not used by this driver */
+       u8      checksum;       /* must match msg2638_checksum() over all preceding bytes */
+};
+
+struct msg2638_ts_data {       /* per-device driver state, stored as I2C client data */
+       struct i2c_client *client;
+       struct input_dev *input_dev;
+       struct touchscreen_properties prop;     /* filled by touchscreen_parse_properties() */
+       struct regulator_bulk_data supplies[2]; /* "vdd" and "vddio" */
+       struct gpio_desc *reset_gpiod;  /* "reset" GPIO, pulsed by msg2638_reset() */
+};
+
+/* Return the low byte of the negated sum of the first @length bytes of @data. */
+static u8 msg2638_checksum(u8 *data, u32 length)
+{
+       s32 total = 0;
+
+       while (length--)
+               total += *data++;
+
+       return (u8)(-total & 0xFF);
+}
+
+/*
+ * Threaded IRQ handler: read one touch_event over I2C, check its mode byte
+ * and checksum, then report each pressed finger on its own MT slot.
+ * Always returns IRQ_HANDLED; failed or invalid reads are dropped.
+ */
+static irqreturn_t msg2638_ts_irq_handler(int irq, void *msg2638_handler)
+{
+       struct msg2638_ts_data *msg2638 = msg2638_handler;
+       struct i2c_client *client = msg2638->client;
+       struct input_dev *input = msg2638->input_dev;
+       struct touch_event event;
+       struct i2c_msg xfer[] = {
+               {
+                       .addr   = client->addr,
+                       .flags  = I2C_M_RD,
+                       .len    = sizeof(event),
+                       .buf    = (u8 *)&event,
+               },
+       };
+       int slot;
+       int ret;
+
+       ret = i2c_transfer(client->adapter, xfer, ARRAY_SIZE(xfer));
+       if (ret != ARRAY_SIZE(xfer)) {
+               dev_err(&client->dev,
+                       "Failed I2C transfer in irq handler: %d\n",
+                       ret < 0 ? ret : -EIO);
+               return IRQ_HANDLED;
+       }
+
+       if (event.mode != MODE_DATA_RAW)
+               return IRQ_HANDLED;
+
+       /* The checksum covers every byte except the trailing checksum byte */
+       if (msg2638_checksum((u8 *)&event, sizeof(event) - 1) !=
+           event.checksum) {
+               dev_err(&client->dev, "Failed checksum!\n");
+               return IRQ_HANDLED;
+       }
+
+       for (slot = 0; slot < MAX_SUPPORTED_FINGER_NUM; slot++) {
+               const struct packet *pkt = &event.pkt[slot];
+               u16 x, y;
+
+               /* Ignore non-pressed finger data */
+               if (pkt->xy_hi == 0xFF && pkt->x_low == 0xFF &&
+                   pkt->y_low == 0xFF)
+                       continue;
+
+               /* xy_hi carries the upper nibble of each 12-bit coordinate */
+               x = ((pkt->xy_hi & 0xF0) << 4) | pkt->x_low;
+               y = ((pkt->xy_hi & 0x0F) << 8) | pkt->y_low;
+
+               input_mt_slot(input, slot);
+               input_mt_report_slot_state(input, MT_TOOL_FINGER, true);
+               touchscreen_report_pos(input, &msg2638->prop, x, y, true);
+       }
+
+       input_mt_sync_frame(input);
+       input_sync(input);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Assert the reset GPIO for RESET_DELAY_MIN_US..RESET_DELAY_MAX_US, release
+ * it, then sleep FIRMWARE_ON_DELAY_MS before returning.
+ */
+static void msg2638_reset(struct msg2638_ts_data *msg2638)
+{
+       struct gpio_desc *reset = msg2638->reset_gpiod;
+
+       gpiod_set_value_cansleep(reset, 1);
+       usleep_range(RESET_DELAY_MIN_US, RESET_DELAY_MAX_US);
+       gpiod_set_value_cansleep(reset, 0);
+
+       msleep(FIRMWARE_ON_DELAY_MS);
+}
+
+/*
+ * Power the controller up: enable both supplies, sleep CHIP_ON_DELAY_MS,
+ * reset the chip and finally enable its interrupt.
+ * Returns 0 on success or the error from regulator_bulk_enable().
+ */
+static int msg2638_start(struct msg2638_ts_data *msg2638)
+{
+       struct i2c_client *client = msg2638->client;
+       int ret;
+
+       ret = regulator_bulk_enable(ARRAY_SIZE(msg2638->supplies),
+                                   msg2638->supplies);
+       if (ret) {
+               dev_err(&client->dev,
+                       "Failed to enable regulators: %d\n", ret);
+               return ret;
+       }
+
+       msleep(CHIP_ON_DELAY_MS);
+       msg2638_reset(msg2638);
+       enable_irq(client->irq);
+
+       return 0;
+}
+
+/*
+ * Power the controller down: disable its interrupt, then both supplies.
+ * Returns 0 on success or the error from regulator_bulk_disable().
+ */
+static int msg2638_stop(struct msg2638_ts_data *msg2638)
+{
+       struct i2c_client *client = msg2638->client;
+       int ret;
+
+       disable_irq(client->irq);
+
+       ret = regulator_bulk_disable(ARRAY_SIZE(msg2638->supplies),
+                                    msg2638->supplies);
+       if (ret)
+               dev_err(&client->dev,
+                       "Failed to disable regulators: %d\n", ret);
+
+       return ret;
+}
+
+/* input_dev open() callback: start the controller, passing on its result. */
+static int msg2638_input_open(struct input_dev *dev)
+{
+       return msg2638_start(input_get_drvdata(dev));
+}
+
+/* input_dev close() callback: stop the controller; its result is ignored. */
+static void msg2638_input_close(struct input_dev *dev)
+{
+       msg2638_stop(input_get_drvdata(dev));
+}
+
+/*
+ * Allocate, configure and register the multitouch input device.
+ * Returns -ENOMEM if allocation fails, -EINVAL if either axis maximum is
+ * still zero after touchscreen_parse_properties(), otherwise the result of
+ * slot initialisation / device registration (0 on success).
+ */
+static int msg2638_init_input_dev(struct msg2638_ts_data *msg2638)
+{
+       struct device *dev = &msg2638->client->dev;
+       struct input_dev *input;
+       int ret;
+
+       input = devm_input_allocate_device(dev);
+       if (!input) {
+               dev_err(dev, "Failed to allocate input device.\n");
+               return -ENOMEM;
+       }
+
+       input->name = "MStar TouchScreen";
+       input->phys = "input/ts";
+       input->id.bustype = BUS_I2C;
+       input->open = msg2638_input_open;
+       input->close = msg2638_input_close;
+
+       input_set_drvdata(input, msg2638);
+       msg2638->input_dev = input;
+
+       input_set_capability(input, EV_ABS, ABS_MT_POSITION_X);
+       input_set_capability(input, EV_ABS, ABS_MT_POSITION_Y);
+
+       touchscreen_parse_properties(input, true, &msg2638->prop);
+       if (!msg2638->prop.max_x || !msg2638->prop.max_y) {
+               dev_err(dev, "touchscreen-size-x and/or touchscreen-size-y not set in properties\n");
+               return -EINVAL;
+       }
+
+       ret = input_mt_init_slots(input, MAX_SUPPORTED_FINGER_NUM,
+                                 INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
+       if (ret) {
+               dev_err(dev, "Failed to initialize MT slots: %d\n", ret);
+               return ret;
+       }
+
+       ret = input_register_device(input);
+       if (ret)
+               dev_err(dev, "Failed to register input device: %d\n", ret);
+
+       return ret;
+}
+
+/*
+ * Bind to a msg2638 controller: check adapter capabilities, acquire the
+ * "vdd"/"vddio" supplies and the "reset" GPIO, install the threaded IRQ
+ * handler (left disabled) and register the input device.
+ * All resources are device-managed. Returns 0 or a negative errno.
+ */
+static int msg2638_ts_probe(struct i2c_client *client)
+{
+       struct device *dev = &client->dev;
+       struct msg2638_ts_data *msg2638;
+       int error;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(dev, "Failed to assert adapter's support for plain I2C.\n");
+               return -ENXIO;
+       }
+
+       msg2638 = devm_kzalloc(dev, sizeof(*msg2638), GFP_KERNEL);
+       if (!msg2638)
+               return -ENOMEM;
+
+       msg2638->client = client;
+       i2c_set_clientdata(client, msg2638);
+
+       msg2638->supplies[0].supply = "vdd";
+       msg2638->supplies[1].supply = "vddio";
+       error = devm_regulator_bulk_get(dev, ARRAY_SIZE(msg2638->supplies),
+                                       msg2638->supplies);
+       if (error) {
+               dev_err(dev, "Failed to get regulators: %d\n", error);
+               return error;
+       }
+
+       msg2638->reset_gpiod = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(msg2638->reset_gpiod)) {
+               error = PTR_ERR(msg2638->reset_gpiod);
+               dev_err(dev, "Failed to request reset GPIO: %d\n", error);
+               return error;
+       }
+
+       /*
+        * Request the IRQ before registering the input device. Once the
+        * device is registered it may be opened at any time, and open()
+        * ends in enable_irq(); the handler must already be installed (in
+        * the disabled IRQF_NO_AUTOEN state) by then so the enable/disable
+        * calls in msg2638_start()/msg2638_stop() stay balanced. Devres
+        * also unwinds in reverse, so the input device is closed before
+        * the IRQ is freed.
+        */
+       error = devm_request_threaded_irq(dev, client->irq,
+                                         NULL, msg2638_ts_irq_handler,
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         client->name, msg2638);
+       if (error) {
+               dev_err(dev, "Failed to request IRQ: %d\n", error);
+               return error;
+       }
+
+       error = msg2638_init_input_dev(msg2638);
+       if (error) {
+               dev_err(dev, "Failed to initialize input device: %d\n", error);
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * System suspend: under the input device mutex, stop the controller if
+ * input_device_enabled() reports the device as enabled. Always returns 0.
+ */
+static int __maybe_unused msg2638_suspend(struct device *dev)
+{
+       struct msg2638_ts_data *msg2638;
+       struct input_dev *input;
+
+       msg2638 = i2c_get_clientdata(to_i2c_client(dev));
+       input = msg2638->input_dev;
+
+       mutex_lock(&input->mutex);
+       if (input_device_enabled(input))
+               msg2638_stop(msg2638);
+       mutex_unlock(&input->mutex);
+
+       return 0;
+}
+
+/*
+ * System resume: under the input device mutex, restart the controller if
+ * input_device_enabled() reports the device as enabled.
+ * Returns the result of msg2638_start(), or 0 when nothing had to be done.
+ */
+static int __maybe_unused msg2638_resume(struct device *dev)
+{
+       struct msg2638_ts_data *msg2638;
+       struct input_dev *input;
+       int error = 0;
+
+       msg2638 = i2c_get_clientdata(to_i2c_client(dev));
+       input = msg2638->input_dev;
+
+       mutex_lock(&input->mutex);
+       if (input_device_enabled(input))
+               error = msg2638_start(msg2638);
+       mutex_unlock(&input->mutex);
+
+       return error;
+}
+
+static SIMPLE_DEV_PM_OPS(msg2638_pm_ops, msg2638_suspend, msg2638_resume);
+
+static const struct of_device_id msg2638_of_match[] = {
+       { .compatible = "mstar,msg2638" },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, msg2638_of_match);
+
+static struct i2c_driver msg2638_ts_driver = {
+       .probe_new = msg2638_ts_probe,
+       .driver = {
+               .name = "MStar-TS",
+               .pm = &msg2638_pm_ops,  /* suspend/resume callbacks defined above */
+               .of_match_table = msg2638_of_match,     /* matched by compatible string */
+       },
+};
+module_i2c_driver(msg2638_ts_driver);
+
+MODULE_AUTHOR("Vincent Knecht <vincent.knecht@mailoo.org>");
+MODULE_DESCRIPTION("MStar MSG2638 touchscreen driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c
deleted file mode 100644 (file)
index 97342e1..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Generic DT helper functions for touchscreen devices
- *
- *  Copyright (c) 2014 Sebastian Reichel <sre@kernel.org>
- */
-
-#include <linux/property.h>
-#include <linux/input.h>
-#include <linux/input/mt.h>
-#include <linux/input/touchscreen.h>
-#include <linux/module.h>
-
-static bool touchscreen_get_prop_u32(struct device *dev,
-                                    const char *property,
-                                    unsigned int default_value,
-                                    unsigned int *value)
-{
-       u32 val;
-       int error;
-
-       error = device_property_read_u32(dev, property, &val);
-       if (error) {
-               *value = default_value;
-               return false;
-       }
-
-       *value = val;
-       return true;
-}
-
-static void touchscreen_set_params(struct input_dev *dev,
-                                  unsigned long axis,
-                                  int min, int max, int fuzz)
-{
-       struct input_absinfo *absinfo;
-
-       if (!test_bit(axis, dev->absbit)) {
-               dev_warn(&dev->dev,
-                        "DT specifies parameters but the axis %lu is not set up\n",
-                        axis);
-               return;
-       }
-
-       absinfo = &dev->absinfo[axis];
-       absinfo->minimum = min;
-       absinfo->maximum = max;
-       absinfo->fuzz = fuzz;
-}
-
-/**
- * touchscreen_parse_properties - parse common touchscreen DT properties
- * @input: input device that should be parsed
- * @multitouch: specifies whether parsed properties should be applied to
- *     single-touch or multi-touch axes
- * @prop: pointer to a struct touchscreen_properties into which to store
- *     axis swap and invert info for use with touchscreen_report_x_y();
- *     or %NULL
- *
- * This function parses common DT properties for touchscreens and setups the
- * input device accordingly. The function keeps previously set up default
- * values if no value is specified via DT.
- */
-void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
-                                 struct touchscreen_properties *prop)
-{
-       struct device *dev = input->dev.parent;
-       struct input_absinfo *absinfo;
-       unsigned int axis, axis_x, axis_y;
-       unsigned int minimum, maximum, fuzz;
-       bool data_present;
-
-       input_alloc_absinfo(input);
-       if (!input->absinfo)
-               return;
-
-       axis_x = multitouch ? ABS_MT_POSITION_X : ABS_X;
-       axis_y = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
-
-       data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-x",
-                                               input_abs_get_min(input, axis_x),
-                                               &minimum) |
-                      touchscreen_get_prop_u32(dev, "touchscreen-size-x",
-                                               input_abs_get_max(input,
-                                                                 axis_x) + 1,
-                                               &maximum) |
-                      touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x",
-                                               input_abs_get_fuzz(input, axis_x),
-                                               &fuzz);
-       if (data_present)
-               touchscreen_set_params(input, axis_x, minimum, maximum - 1, fuzz);
-
-       data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-y",
-                                               input_abs_get_min(input, axis_y),
-                                               &minimum) |
-                      touchscreen_get_prop_u32(dev, "touchscreen-size-y",
-                                               input_abs_get_max(input,
-                                                                 axis_y) + 1,
-                                               &maximum) |
-                      touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y",
-                                               input_abs_get_fuzz(input, axis_y),
-                                               &fuzz);
-       if (data_present)
-               touchscreen_set_params(input, axis_y, minimum, maximum - 1, fuzz);
-
-       axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
-       data_present = touchscreen_get_prop_u32(dev,
-                                               "touchscreen-max-pressure",
-                                               input_abs_get_max(input, axis),
-                                               &maximum) |
-                      touchscreen_get_prop_u32(dev,
-                                               "touchscreen-fuzz-pressure",
-                                               input_abs_get_fuzz(input, axis),
-                                               &fuzz);
-       if (data_present)
-               touchscreen_set_params(input, axis, 0, maximum, fuzz);
-
-       if (!prop)
-               return;
-
-       prop->max_x = input_abs_get_max(input, axis_x);
-       prop->max_y = input_abs_get_max(input, axis_y);
-
-       prop->invert_x =
-               device_property_read_bool(dev, "touchscreen-inverted-x");
-       if (prop->invert_x) {
-               absinfo = &input->absinfo[axis_x];
-               absinfo->maximum -= absinfo->minimum;
-               absinfo->minimum = 0;
-       }
-
-       prop->invert_y =
-               device_property_read_bool(dev, "touchscreen-inverted-y");
-       if (prop->invert_y) {
-               absinfo = &input->absinfo[axis_y];
-               absinfo->maximum -= absinfo->minimum;
-               absinfo->minimum = 0;
-       }
-
-       prop->swap_x_y =
-               device_property_read_bool(dev, "touchscreen-swapped-x-y");
-       if (prop->swap_x_y)
-               swap(input->absinfo[axis_x], input->absinfo[axis_y]);
-}
-EXPORT_SYMBOL(touchscreen_parse_properties);
-
-static void
-touchscreen_apply_prop_to_x_y(const struct touchscreen_properties *prop,
-                             unsigned int *x, unsigned int *y)
-{
-       if (prop->invert_x)
-               *x = prop->max_x - *x;
-
-       if (prop->invert_y)
-               *y = prop->max_y - *y;
-
-       if (prop->swap_x_y)
-               swap(*x, *y);
-}
-
-/**
- * touchscreen_set_mt_pos - Set input_mt_pos coordinates
- * @pos: input_mt_pos to set coordinates of
- * @prop: pointer to a struct touchscreen_properties
- * @x: X coordinate to store in pos
- * @y: Y coordinate to store in pos
- *
- * Adjust the passed in x and y values applying any axis inversion and
- * swapping requested in the passed in touchscreen_properties and store
- * the result in a struct input_mt_pos.
- */
-void touchscreen_set_mt_pos(struct input_mt_pos *pos,
-                           const struct touchscreen_properties *prop,
-                           unsigned int x, unsigned int y)
-{
-       touchscreen_apply_prop_to_x_y(prop, &x, &y);
-       pos->x = x;
-       pos->y = y;
-}
-EXPORT_SYMBOL(touchscreen_set_mt_pos);
-
-/**
- * touchscreen_report_pos - Report touchscreen coordinates
- * @input: input_device to report coordinates for
- * @prop: pointer to a struct touchscreen_properties
- * @x: X coordinate to report
- * @y: Y coordinate to report
- * @multitouch: Report coordinates on single-touch or multi-touch axes
- *
- * Adjust the passed in x and y values applying any axis inversion and
- * swapping requested in the passed in touchscreen_properties and then
- * report the resulting coordinates on the input_dev's x and y axis.
- */
-void touchscreen_report_pos(struct input_dev *input,
-                           const struct touchscreen_properties *prop,
-                           unsigned int x, unsigned int y,
-                           bool multitouch)
-{
-       touchscreen_apply_prop_to_x_y(prop, &x, &y);
-       input_report_abs(input, multitouch ? ABS_MT_POSITION_X : ABS_X, x);
-       input_report_abs(input, multitouch ? ABS_MT_POSITION_Y : ABS_Y, y);
-}
-EXPORT_SYMBOL(touchscreen_report_pos);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Device-tree helpers functions for touchscreen devices");
index 8fa2f3b..1ee760b 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/input/mt.h>
 #include <linux/input/touchscreen.h>
 #include <linux/pm.h>
+#include <linux/pm_runtime.h>
 #include <linux/irq.h>
 #include <linux/regulator/consumer.h>
 
@@ -335,10 +336,8 @@ static int silead_ts_get_id(struct i2c_client *client)
 
        error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_ID,
                                              sizeof(chip_id), (u8 *)&chip_id);
-       if (error < 0) {
-               dev_err(&client->dev, "Chip ID read error %d\n", error);
+       if (error < 0)
                return error;
-       }
 
        data->chip_id = le32_to_cpu(chip_id);
        dev_info(&client->dev, "Silead chip ID: 0x%8X", data->chip_id);
@@ -351,12 +350,49 @@ static int silead_ts_setup(struct i2c_client *client)
        int error;
        u32 status;
 
+       /*
+        * Some buggy BIOS-es bring up the chip in a stuck state where it
+        * blocks the I2C bus. The following steps are necessary to
+        * unstuck the chip / bus:
+        * 1. Turn off the Silead chip.
+        * 2. Try to do an I2C transfer with the chip, this will fail in
+        *    response to which the I2C-bus-driver will call:
+        *    i2c_recover_bus() which will unstuck the I2C-bus. Note the
+        *    unstuck-ing of the I2C bus only works if we first drop the
+        *    chip off the bus by turning it off.
+        * 3. Turn the chip back on.
+        *
+        * On the x86/ACPI systems where this problem is seen, step 1. and
+        * 3. require making ACPI calls and dealing with ACPI Power
+        * Resources. The workaround below runtime-suspends the chip to
+        * turn it off, leaving it up to the ACPI subsystem to deal with
+        * this.
+        */
+
+       if (device_property_read_bool(&client->dev,
+                                     "silead,stuck-controller-bug")) {
+               pm_runtime_set_active(&client->dev);
+               pm_runtime_enable(&client->dev);
+               pm_runtime_allow(&client->dev);
+
+               pm_runtime_suspend(&client->dev);
+
+               dev_warn(&client->dev, FW_BUG "Stuck I2C bus: please ignore the next 'controller timed out' error\n");
+               silead_ts_get_id(client);
+
+               /* The forbid will also resume the device */
+               pm_runtime_forbid(&client->dev);
+               pm_runtime_disable(&client->dev);
+       }
+
        silead_ts_set_power(client, SILEAD_POWER_OFF);
        silead_ts_set_power(client, SILEAD_POWER_ON);
 
        error = silead_ts_get_id(client);
-       if (error)
+       if (error) {
+               dev_err(&client->dev, "Chip ID read error %d\n", error);
                return error;
+       }
 
        error = silead_ts_init(client);
        if (error)
@@ -486,7 +522,7 @@ static int silead_ts_probe(struct i2c_client *client,
 
        silead_ts_read_props(client);
 
-       /* We must have the IRQ provided by DT or ACPI subsytem */
+       /* We must have the IRQ provided by DT or ACPI subsystem */
        if (client->irq <= 0)
                return -ENODEV;
 
index 9a64e1d..bc11203 100644 (file)
@@ -691,10 +691,9 @@ static int stmfts_probe(struct i2c_client *client,
         * interrupts. To be on the safe side it's better to not enable
         * the interrupts during their request.
         */
-       irq_set_status_flags(client->irq, IRQ_NOAUTOEN);
        err = devm_request_threaded_irq(&client->dev, client->irq,
                                        NULL, stmfts_irq_handler,
-                                       IRQF_ONESHOT,
+                                       IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                        "stmfts_irq", sdata);
        if (err)
                return err;
index 91c60bf..69b08dd 100644 (file)
@@ -19,6 +19,8 @@
 #ifndef _TSC2007_H
 #define _TSC2007_H
 
+struct gpio_desc;
+
 #define TSC2007_MEASURE_TEMP0          (0x0 << 4)
 #define TSC2007_MEASURE_AUX            (0x2 << 4)
 #define TSC2007_MEASURE_TEMP1          (0x4 << 4)
@@ -69,7 +71,7 @@ struct tsc2007 {
        int                     fuzzy;
        int                     fuzzz;
 
-       unsigned int            gpio;
+       struct gpio_desc        *gpiod;
        int                     irq;
 
        wait_queue_head_t       wait;
index 3b80abf..3e871d1 100644 (file)
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/gpio/consumer.h>
 #include <linux/input.h>
 #include <linux/interrupt.h>
 #include <linux/i2c.h>
-#include <linux/of_device.h>
-#include <linux/of_gpio.h>
+#include <linux/mod_devicetable.h>
+#include <linux/property.h>
 #include <linux/platform_data/tsc2007.h>
 #include "tsc2007.h"
 
@@ -220,71 +221,58 @@ static void tsc2007_close(struct input_dev *input_dev)
        tsc2007_stop(ts);
 }
 
-#ifdef CONFIG_OF
 static int tsc2007_get_pendown_state_gpio(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct tsc2007 *ts = i2c_get_clientdata(client);
 
-       return !gpio_get_value(ts->gpio);
+       return gpiod_get_value(ts->gpiod);
 }
 
-static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts)
+static int tsc2007_probe_properties(struct device *dev, struct tsc2007 *ts)
 {
-       struct device_node *np = client->dev.of_node;
        u32 val32;
        u64 val64;
 
-       if (!np) {
-               dev_err(&client->dev, "missing device tree data\n");
-               return -EINVAL;
-       }
-
-       if (!of_property_read_u32(np, "ti,max-rt", &val32))
+       if (!device_property_read_u32(dev, "ti,max-rt", &val32))
                ts->max_rt = val32;
        else
                ts->max_rt = MAX_12BIT;
 
-       if (!of_property_read_u32(np, "ti,fuzzx", &val32))
+       if (!device_property_read_u32(dev, "ti,fuzzx", &val32))
                ts->fuzzx = val32;
 
-       if (!of_property_read_u32(np, "ti,fuzzy", &val32))
+       if (!device_property_read_u32(dev, "ti,fuzzy", &val32))
                ts->fuzzy = val32;
 
-       if (!of_property_read_u32(np, "ti,fuzzz", &val32))
+       if (!device_property_read_u32(dev, "ti,fuzzz", &val32))
                ts->fuzzz = val32;
 
-       if (!of_property_read_u64(np, "ti,poll-period", &val64))
+       if (!device_property_read_u64(dev, "ti,poll-period", &val64))
                ts->poll_period = msecs_to_jiffies(val64);
        else
                ts->poll_period = msecs_to_jiffies(1);
 
-       if (!of_property_read_u32(np, "ti,x-plate-ohms", &val32)) {
+       if (!device_property_read_u32(dev, "ti,x-plate-ohms", &val32)) {
                ts->x_plate_ohms = val32;
        } else {
-               dev_err(&client->dev, "missing ti,x-plate-ohms devicetree property.");
+               dev_err(dev, "Missing ti,x-plate-ohms device property\n");
                return -EINVAL;
        }
 
-       ts->gpio = of_get_gpio(np, 0);
-       if (gpio_is_valid(ts->gpio))
+       ts->gpiod = devm_gpiod_get_optional(dev, NULL, GPIOD_IN);
+       if (IS_ERR(ts->gpiod))
+               return PTR_ERR(ts->gpiod);
+
+       if (ts->gpiod)
                ts->get_pendown_state = tsc2007_get_pendown_state_gpio;
        else
-               dev_warn(&client->dev,
-                        "GPIO not specified in DT (of_get_gpio returned %d)\n",
-                        ts->gpio);
+               dev_warn(dev, "Pen down GPIO is not specified in properties\n");
 
        return 0;
 }
-#else
-static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts)
-{
-       dev_err(&client->dev, "platform data is required!\n");
-       return -EINVAL;
-}
-#endif
 
-static int tsc2007_probe_pdev(struct i2c_client *client, struct tsc2007 *ts,
+static int tsc2007_probe_pdev(struct device *dev, struct tsc2007 *ts,
                              const struct tsc2007_platform_data *pdata,
                              const struct i2c_device_id *id)
 {
@@ -299,7 +287,7 @@ static int tsc2007_probe_pdev(struct i2c_client *client, struct tsc2007 *ts,
        ts->fuzzz             = pdata->fuzzz;
 
        if (pdata->x_plate_ohms == 0) {
-               dev_err(&client->dev, "x_plate_ohms is not set up in platform data");
+               dev_err(dev, "x_plate_ohms is not set up in platform data\n");
                return -EINVAL;
        }
 
@@ -332,9 +320,9 @@ static int tsc2007_probe(struct i2c_client *client,
                return -ENOMEM;
 
        if (pdata)
-               err = tsc2007_probe_pdev(client, ts, pdata, id);
+               err = tsc2007_probe_pdev(&client->dev, ts, pdata, id);
        else
-               err = tsc2007_probe_dt(client, ts);
+               err = tsc2007_probe_properties(&client->dev, ts);
        if (err)
                return err;
 
@@ -431,18 +419,16 @@ static const struct i2c_device_id tsc2007_idtable[] = {
 
 MODULE_DEVICE_TABLE(i2c, tsc2007_idtable);
 
-#ifdef CONFIG_OF
 static const struct of_device_id tsc2007_of_match[] = {
        { .compatible = "ti,tsc2007" },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, tsc2007_of_match);
-#endif
 
 static struct i2c_driver tsc2007_driver = {
        .driver = {
                .name   = "tsc2007",
-               .of_match_table = of_match_ptr(tsc2007_of_match),
+               .of_match_table = tsc2007_of_match,
        },
        .id_table       = tsc2007_idtable,
        .probe          = tsc2007_probe,
index 1afc6bd..22826c3 100644 (file)
@@ -145,15 +145,16 @@ static void wacom_i2c_close(struct input_dev *dev)
 }
 
 static int wacom_i2c_probe(struct i2c_client *client,
-                                    const struct i2c_device_id *id)
+                          const struct i2c_device_id *id)
 {
+       struct device *dev = &client->dev;
        struct wacom_i2c *wac_i2c;
        struct input_dev *input;
        struct wacom_features features = { 0 };
        int error;
 
        if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-               dev_err(&client->dev, "i2c_check_functionality error\n");
+               dev_err(dev, "i2c_check_functionality error\n");
                return -EIO;
        }
 
@@ -161,21 +162,22 @@ static int wacom_i2c_probe(struct i2c_client *client,
        if (error)
                return error;
 
-       wac_i2c = kzalloc(sizeof(*wac_i2c), GFP_KERNEL);
-       input = input_allocate_device();
-       if (!wac_i2c || !input) {
-               error = -ENOMEM;
-               goto err_free_mem;
-       }
+       wac_i2c = devm_kzalloc(dev, sizeof(*wac_i2c), GFP_KERNEL);
+       if (!wac_i2c)
+               return -ENOMEM;
 
        wac_i2c->client = client;
+
+       input = devm_input_allocate_device(dev);
+       if (!input)
+               return -ENOMEM;
+
        wac_i2c->input = input;
 
        input->name = "Wacom I2C Digitizer";
        input->id.bustype = BUS_I2C;
        input->id.vendor = 0x56a;
        input->id.version = features.fw_version;
-       input->dev.parent = &client->dev;
        input->open = wacom_i2c_open;
        input->close = wacom_i2c_close;
 
@@ -194,13 +196,11 @@ static int wacom_i2c_probe(struct i2c_client *client,
 
        input_set_drvdata(input, wac_i2c);
 
-       error = request_threaded_irq(client->irq, NULL, wacom_i2c_irq,
-                                    IRQF_TRIGGER_LOW | IRQF_ONESHOT,
-                                    "wacom_i2c", wac_i2c);
+       error = devm_request_threaded_irq(dev, client->irq, NULL, wacom_i2c_irq,
+                                         IRQF_ONESHOT, "wacom_i2c", wac_i2c);
        if (error) {
-               dev_err(&client->dev,
-                       "Failed to enable IRQ, error: %d\n", error);
-               goto err_free_mem;
+               dev_err(dev, "Failed to request IRQ: %d\n", error);
+               return error;
        }
 
        /* Disable the IRQ, we'll enable it in wac_i2c_open() */
@@ -208,31 +208,10 @@ static int wacom_i2c_probe(struct i2c_client *client,
 
        error = input_register_device(wac_i2c->input);
        if (error) {
-               dev_err(&client->dev,
-                       "Failed to register input device, error: %d\n", error);
-               goto err_free_irq;
+               dev_err(dev, "Failed to register input device: %d\n", error);
+               return error;
        }
 
-       i2c_set_clientdata(client, wac_i2c);
-       return 0;
-
-err_free_irq:
-       free_irq(client->irq, wac_i2c);
-err_free_mem:
-       input_free_device(input);
-       kfree(wac_i2c);
-
-       return error;
-}
-
-static int wacom_i2c_remove(struct i2c_client *client)
-{
-       struct wacom_i2c *wac_i2c = i2c_get_clientdata(client);
-
-       free_irq(client->irq, wac_i2c);
-       input_unregister_device(wac_i2c->input);
-       kfree(wac_i2c);
-
        return 0;
 }
 
@@ -269,7 +248,6 @@ static struct i2c_driver wacom_i2c_driver = {
        },
 
        .probe          = wacom_i2c_probe,
-       .remove         = wacom_i2c_remove,
        .id_table       = wacom_i2c_id,
 };
 module_i2c_driver(wacom_i2c_driver);
index bb1699e..319f57f 100644 (file)
@@ -317,14 +317,13 @@ static int wm831x_ts_probe(struct platform_device *pdev)
 
        error = request_threaded_irq(wm831x_ts->data_irq,
                                     NULL, wm831x_ts_data_irq,
-                                    irqf | IRQF_ONESHOT,
+                                    irqf | IRQF_ONESHOT | IRQF_NO_AUTOEN,
                                     "Touchscreen data", wm831x_ts);
        if (error) {
                dev_err(&pdev->dev, "Failed to request data IRQ %d: %d\n",
                        wm831x_ts->data_irq, error);
                goto err_alloc;
        }
-       disable_irq(wm831x_ts->data_irq);
 
        if (pdata && pdata->pd_irqf)
                irqf = pdata->pd_irqf;
index 3b636be..b8d9010 100644 (file)
@@ -513,10 +513,10 @@ static int zinitix_ts_probe(struct i2c_client *client)
                return -EINVAL;
        }
 
-       irq_set_status_flags(client->irq, IRQ_NOAUTOEN);
        error = devm_request_threaded_irq(&client->dev, client->irq,
                                          NULL, zinitix_ts_irq_handler,
-                                         IRQF_ONESHOT, client->name, bt541);
+                                         IRQF_ONESHOT | IRQF_NO_AUTOEN,
+                                         client->name, bt541);
        if (error) {
                dev_err(&client->dev, "Failed to request IRQ: %d\n", error);
                return error;
index 192ef8f..1f111b3 100644 (file)
@@ -349,7 +349,7 @@ config S390_AP_IOMMU
          is not implemented as it is not necessary for VFIO.
 
 config MTK_IOMMU
-       bool "MTK IOMMU Support"
+       tristate "MediaTek IOMMU Support"
        depends on ARCH_MEDIATEK || COMPILE_TEST
        select ARM_DMA_USE_IOMMU
        select IOMMU_API
@@ -364,7 +364,7 @@ config MTK_IOMMU
          If unsure, say N here.
 
 config MTK_IOMMU_V1
-       bool "MTK IOMMU Version 1 (M4U gen1) Support"
+       tristate "MediaTek IOMMU Version 1 (M4U gen1) Support"
        depends on ARM
        depends on ARCH_MEDIATEK || COMPILE_TEST
        select ARM_DMA_USE_IOMMU
@@ -408,4 +408,16 @@ config VIRTIO_IOMMU
 
          Say Y here if you intend to run this kernel as a guest.
 
+config SPRD_IOMMU
+       tristate "Unisoc IOMMU Support"
+       depends on ARCH_SPRD || COMPILE_TEST
+       select IOMMU_API
+       help
+         Support for IOMMU on Unisoc's SoCs, this IOMMU can be used by
+         Unisoc's multimedia devices, such as display, Image codec(jpeg)
+         and a few signal processors, including VSP(video), GSP(graphic),
+         ISP(image), and CPP(camera pixel processor), etc.
+
+         Say Y here if you want to use the multimedia devices listed above.
+
 endif # IOMMU_SUPPORT
index 61bd30c..c0fb0ba 100644 (file)
@@ -27,4 +27,5 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
 obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
-obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o
+obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o io-pgfault.o
+obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
index 026ce7f..55dd38d 100644 (file)
@@ -11,7 +11,6 @@
 
 #include "amd_iommu_types.h"
 
-extern int amd_iommu_get_num_iommus(void);
 extern int amd_iommu_init_dma_ops(void);
 extern int amd_iommu_init_passthrough(void);
 extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
@@ -65,7 +64,6 @@ extern int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid);
 extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
                                     unsigned long cr3);
 extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid);
-extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev);
 
 #ifdef CONFIG_IRQ_REMAP
 extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
index 6937e36..94c1a7a 100644 (file)
@@ -693,7 +693,6 @@ struct iommu_dev_data {
        } ats;                            /* ATS state */
        bool pri_tlp;                     /* PASID TLB required for
                                             PPR completions */
-       u32 errata;                       /* Bitmap for errata to apply */
        bool use_vapic;                   /* Enable device to use vapic mode */
        bool defer_attach;
 
index 321f590..d006724 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/acpi.h>
 #include <linux/list.h>
 #include <linux/bitmap.h>
-#include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/syscore_ops.h>
 #include <linux/interrupt.h>
@@ -208,7 +207,6 @@ u16 *amd_iommu_alias_table;
  * for a specific device. It is also indexed by the PCI device id.
  */
 struct amd_iommu **amd_iommu_rlookup_table;
-EXPORT_SYMBOL(amd_iommu_rlookup_table);
 
 /*
  * This table is used to find the irq remapping table for a given device id
@@ -257,8 +255,6 @@ static enum iommu_init_state init_state = IOMMU_START_STATE;
 static int amd_iommu_enable_interrupts(void);
 static int __init iommu_go_to_state(enum iommu_init_state state);
 static void init_device_table_dma(void);
-static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
-                               u8 fxn, u64 *value, bool is_write);
 
 static bool amd_iommu_pre_enabled = true;
 
@@ -268,7 +264,6 @@ bool translation_pre_enabled(struct amd_iommu *iommu)
 {
        return (iommu->flags & AMD_IOMMU_FLAG_TRANS_PRE_ENABLED);
 }
-EXPORT_SYMBOL(translation_pre_enabled);
 
 static void clear_translation_pre_enabled(struct amd_iommu *iommu)
 {
@@ -1717,53 +1712,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
        return 0;
 }
 
-static void __init init_iommu_perf_ctr(struct amd_iommu *iommu)
+static void init_iommu_perf_ctr(struct amd_iommu *iommu)
 {
-       int retry;
+       u64 val;
        struct pci_dev *pdev = iommu->dev;
-       u64 val = 0xabcd, val2 = 0, save_reg, save_src;
 
        if (!iommu_feature(iommu, FEATURE_PC))
                return;
 
        amd_iommu_pc_present = true;
 
-       /* save the value to restore, if writable */
-       if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, false) ||
-           iommu_pc_get_set_reg(iommu, 0, 0, 8, &save_src, false))
-               goto pc_false;
-
-       /*
-        * Disable power gating by programing the performance counter
-        * source to 20 (i.e. counts the reads and writes from/to IOMMU
-        * Reserved Register [MMIO Offset 1FF8h] that are ignored.),
-        * which never get incremented during this init phase.
-        * (Note: The event is also deprecated.)
-        */
-       val = 20;
-       if (iommu_pc_get_set_reg(iommu, 0, 0, 8, &val, true))
-               goto pc_false;
-
-       /* Check if the performance counters can be written to */
-       val = 0xabcd;
-       for (retry = 5; retry; retry--) {
-               if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true) ||
-                   iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false) ||
-                   val2)
-                       break;
-
-               /* Wait about 20 msec for power gating to disable and retry. */
-               msleep(20);
-       }
-
-       /* restore */
-       if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, true) ||
-           iommu_pc_get_set_reg(iommu, 0, 0, 8, &save_src, true))
-               goto pc_false;
-
-       if (val != val2)
-               goto pc_false;
-
        pci_info(pdev, "IOMMU performance counters supported\n");
 
        val = readl(iommu->mmio_base + MMIO_CNTR_CONF_OFFSET);
@@ -1771,11 +1729,6 @@ static void __init init_iommu_perf_ctr(struct amd_iommu *iommu)
        iommu->max_counters = (u8) ((val >> 7) & 0xf);
 
        return;
-
-pc_false:
-       pci_err(pdev, "Unable to read/write to IOMMU perf counter.\n");
-       amd_iommu_pc_present = false;
-       return;
 }
 
 static ssize_t amd_iommu_show_cap(struct device *dev,
@@ -1837,7 +1790,7 @@ static void __init late_iommu_features_init(struct amd_iommu *iommu)
         * IVHD and MMIO conflict.
         */
        if (features != iommu->features)
-               pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx\n).",
+               pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx).\n",
                        features, iommu->features);
 }
 
@@ -1935,8 +1888,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
 
        iommu_device_sysfs_add(&iommu->iommu, &iommu->dev->dev,
                               amd_iommu_groups, "ivhd%d", iommu->index);
-       iommu_device_set_ops(&iommu->iommu, &amd_iommu_ops);
-       iommu_device_register(&iommu->iommu);
+       iommu_device_register(&iommu->iommu, &amd_iommu_ops, NULL);
 
        return pci_enable_device(iommu->dev);
 }
@@ -3277,7 +3229,6 @@ struct amd_iommu *get_amd_iommu(unsigned int idx)
                        return iommu;
        return NULL;
 }
-EXPORT_SYMBOL(get_amd_iommu);
 
 /****************************************************************************
  *
@@ -3359,7 +3310,6 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64
 
        return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, false);
 }
-EXPORT_SYMBOL(amd_iommu_pc_get_reg);
 
 int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
 {
@@ -3368,4 +3318,3 @@ int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64
 
        return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
 }
-EXPORT_SYMBOL(amd_iommu_pc_set_reg);
index a69a8b5..80e8e19 100644 (file)
@@ -290,15 +290,6 @@ static bool pci_iommuv2_capable(struct pci_dev *pdev)
        return true;
 }
 
-static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
-{
-       struct iommu_dev_data *dev_data;
-
-       dev_data = dev_iommu_priv_get(&pdev->dev);
-
-       return dev_data->errata & (1 << erratum) ? true : false;
-}
-
 /*
  * This function checks if the driver got a valid device from the caller to
  * avoid dereferencing invalid pointers.
@@ -861,33 +852,58 @@ static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
        CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
 }
 
-static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
-                                 size_t size, u16 domid, int pde)
+/*
+ * Builds an invalidation address which is suitable for one page or multiple
+ * pages. Sets the size bit (S) as needed if more than one page is flushed.
+ */
+static inline u64 build_inv_address(u64 address, size_t size)
 {
-       u64 pages;
-       bool s;
+       u64 pages, end, msb_diff;
 
        pages = iommu_num_pages(address, size, PAGE_SIZE);
-       s     = false;
 
-       if (pages > 1) {
+       if (pages == 1)
+               return address & PAGE_MASK;
+
+       end = address + size - 1;
+
+       /*
+        * msb_diff would hold the index of the most significant bit that
+        * flipped between the start and end.
+        */
+       msb_diff = fls64(end ^ address) - 1;
+
+       /*
+        * Bits 63:52 are sign extended. If for some reason bit 51 is different
+        * between the start and the end, invalidate everything.
+        */
+       if (unlikely(msb_diff > 51)) {
+               address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+       } else {
                /*
-                * If we have to flush more than one page, flush all
-                * TLB entries for this domain
+                * The msb-bit must be clear on the address. Set all the bits
+                * below it so the encoded range covers both start and end.
                 */
-               address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-               s = true;
+               address |= (1ull << msb_diff) - 1;
        }
 
+       /* Clear bits 11:0 */
        address &= PAGE_MASK;
 
+       /* Set the size bit - we flush more than one 4kb page */
+       return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
+}
+
+static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+                                 size_t size, u16 domid, int pde)
+{
+       u64 inv_address = build_inv_address(address, size);
+
        memset(cmd, 0, sizeof(*cmd));
        cmd->data[1] |= domid;
-       cmd->data[2]  = lower_32_bits(address);
-       cmd->data[3]  = upper_32_bits(address);
+       cmd->data[2]  = lower_32_bits(inv_address);
+       cmd->data[3]  = upper_32_bits(inv_address);
        CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
-       if (s) /* size bit - we flush more than one 4kb page */
-               cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
        if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
                cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 }
@@ -895,32 +911,15 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
 static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
                                  u64 address, size_t size)
 {
-       u64 pages;
-       bool s;
-
-       pages = iommu_num_pages(address, size, PAGE_SIZE);
-       s     = false;
-
-       if (pages > 1) {
-               /*
-                * If we have to flush more than one page, flush all
-                * TLB entries for this domain
-                */
-               address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-               s = true;
-       }
-
-       address &= PAGE_MASK;
+       u64 inv_address = build_inv_address(address, size);
 
        memset(cmd, 0, sizeof(*cmd));
        cmd->data[0]  = devid;
        cmd->data[0] |= (qdep & 0xff) << 24;
        cmd->data[1]  = devid;
-       cmd->data[2]  = lower_32_bits(address);
-       cmd->data[3]  = upper_32_bits(address);
+       cmd->data[2]  = lower_32_bits(inv_address);
+       cmd->data[3]  = upper_32_bits(inv_address);
        CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
-       if (s)
-               cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 }
 
 static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid,
@@ -1531,33 +1530,9 @@ static void pdev_iommuv2_disable(struct pci_dev *pdev)
        pci_disable_pasid(pdev);
 }
 
-/* FIXME: Change generic reset-function to do the same */
-static int pri_reset_while_enabled(struct pci_dev *pdev)
-{
-       u16 control;
-       int pos;
-
-       pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
-       if (!pos)
-               return -EINVAL;
-
-       pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
-       control |= PCI_PRI_CTRL_RESET;
-       pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
-
-       return 0;
-}
-
 static int pdev_iommuv2_enable(struct pci_dev *pdev)
 {
-       bool reset_enable;
-       int reqs, ret;
-
-       /* FIXME: Hardcode number of outstanding requests for now */
-       reqs = 32;
-       if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
-               reqs = 1;
-       reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
+       int ret;
 
        /* Only allow access to user-accessible pages */
        ret = pci_enable_pasid(pdev, 0);
@@ -1570,16 +1545,11 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev)
                goto out_err;
 
        /* Enable PRI */
-       ret = pci_enable_pri(pdev, reqs);
+       /* FIXME: Hardcode number of outstanding requests for now */
+       ret = pci_enable_pri(pdev, 32);
        if (ret)
                goto out_err;
 
-       if (reset_enable) {
-               ret = pri_reset_while_enabled(pdev);
-               if (ret)
-                       goto out_err;
-       }
-
        ret = pci_enable_ats(pdev, PAGE_SHIFT);
        if (ret)
                goto out_err;
@@ -1715,9 +1685,6 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev)
                return ERR_PTR(-ENODEV);
 
        devid = get_device_id(dev);
-       if (devid < 0)
-               return ERR_PTR(devid);
-
        iommu = amd_iommu_rlookup_table[devid];
 
        if (dev_iommu_priv_get(dev))
@@ -1771,26 +1738,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev)
        return acpihid_device_group(dev);
 }
 
-static int amd_iommu_domain_get_attr(struct iommu_domain *domain,
-               enum iommu_attr attr, void *data)
-{
-       switch (domain->type) {
-       case IOMMU_DOMAIN_UNMANAGED:
-               return -ENODEV;
-       case IOMMU_DOMAIN_DMA:
-               switch (attr) {
-               case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-                       *(int *)data = !amd_iommu_unmap_flush;
-                       return 0;
-               default:
-                       return -ENODEV;
-               }
-               break;
-       default:
-               return -EINVAL;
-       }
-}
-
 /*****************************************************************************
  *
  * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1855,7 +1802,7 @@ int __init amd_iommu_init_dma_ops(void)
                pr_info("IO/TLB flush on unmap enabled\n");
        else
                pr_info("Lazy IO/TLB flushing enabled\n");
-
+       iommu_set_dma_strict(amd_iommu_unmap_flush);
        return 0;
 
 }
@@ -2019,16 +1966,12 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
                                    struct device *dev)
 {
        struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+       int devid = get_device_id(dev);
        struct amd_iommu *iommu;
-       int devid;
 
        if (!check_device(dev))
                return;
 
-       devid = get_device_id(dev);
-       if (devid < 0)
-               return;
-
        if (dev_data->domain != NULL)
                detach_device(dev);
 
@@ -2257,7 +2200,6 @@ const struct iommu_ops amd_iommu_ops = {
        .release_device = amd_iommu_release_device,
        .probe_finalize = amd_iommu_probe_finalize,
        .device_group = amd_iommu_device_group,
-       .domain_get_attr = amd_iommu_domain_get_attr,
        .get_resv_regions = amd_iommu_get_resv_regions,
        .put_resv_regions = generic_iommu_put_resv_regions,
        .is_attach_deferred = amd_iommu_is_attach_deferred,
@@ -2310,9 +2252,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
        unsigned long flags;
        int levels, ret;
 
-       if (pasids <= 0 || pasids > (PASID_MASK + 1))
-               return -EINVAL;
-
        /* Number of GCR3 table levels required */
        for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
                levels += 1;
@@ -2563,52 +2502,6 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
 }
 EXPORT_SYMBOL(amd_iommu_complete_ppr);
 
-struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
-{
-       struct protection_domain *pdomain;
-       struct iommu_dev_data *dev_data;
-       struct device *dev = &pdev->dev;
-       struct iommu_domain *io_domain;
-
-       if (!check_device(dev))
-               return NULL;
-
-       dev_data  = dev_iommu_priv_get(&pdev->dev);
-       pdomain   = dev_data->domain;
-       io_domain = iommu_get_domain_for_dev(dev);
-
-       if (pdomain == NULL && dev_data->defer_attach) {
-               dev_data->defer_attach = false;
-               pdomain = to_pdomain(io_domain);
-               attach_device(dev, pdomain);
-       }
-
-       if (pdomain == NULL)
-               return NULL;
-
-       if (io_domain->type != IOMMU_DOMAIN_DMA)
-               return NULL;
-
-       /* Only return IOMMUv2 domains */
-       if (!(pdomain->flags & PD_IOMMUV2_MASK))
-               return NULL;
-
-       return &pdomain->domain;
-}
-EXPORT_SYMBOL(amd_iommu_get_v2_domain);
-
-void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
-{
-       struct iommu_dev_data *dev_data;
-
-       if (!amd_iommu_v2_supported())
-               return;
-
-       dev_data = dev_iommu_priv_get(&pdev->dev);
-       dev_data->errata |= (1 << erratum);
-}
-EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
-
 int amd_iommu_device_info(struct pci_dev *pdev,
                           struct amd_iommu_device_info *info)
 {
index 8594b4a..54b2f27 100644 (file)
@@ -245,8 +245,6 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
                break;
        case CMDQ_OP_PREFETCH_CFG:
                cmd[0] |= FIELD_PREP(CMDQ_PREFETCH_0_SID, ent->prefetch.sid);
-               cmd[1] |= FIELD_PREP(CMDQ_PREFETCH_1_SIZE, ent->prefetch.size);
-               cmd[1] |= ent->prefetch.addr & CMDQ_PREFETCH_1_ADDR_MASK;
                break;
        case CMDQ_OP_CFGI_CD:
                cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SSID, ent->cfgi.ssid);
@@ -909,8 +907,8 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
 
        spin_lock_irqsave(&smmu_domain->devices_lock, flags);
        list_for_each_entry(master, &smmu_domain->devices, domain_head) {
-               for (i = 0; i < master->num_sids; i++) {
-                       cmd.cfgi.sid = master->sids[i];
+               for (i = 0; i < master->num_streams; i++) {
+                       cmd.cfgi.sid = master->streams[i].id;
                        arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
                }
        }
@@ -1355,6 +1353,29 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
        return 0;
 }
 
+/*
+ * Look up the master that owns stream ID @sid in the per-SMMU SID tree.
+ *
+ * The tree is keyed on arm_smmu_stream::id.  The caller must hold
+ * smmu->streams_mutex.  Returns the owning master, or NULL when no
+ * stream with that ID has been inserted.
+ */
+__maybe_unused
+static struct arm_smmu_master *
+arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
+{
+       struct rb_node *cur;
+
+       lockdep_assert_held(&smmu->streams_mutex);
+
+       for (cur = smmu->streams.rb_node; cur; ) {
+               struct arm_smmu_stream *entry =
+                       rb_entry(cur, struct arm_smmu_stream, node);
+
+               if (entry->id == sid)
+                       return entry->master;
+
+               /* Smaller keys live on the left, larger ones on the right */
+               cur = (entry->id < sid) ? cur->rb_right : cur->rb_left;
+       }
+
+       return NULL;
+}
+
 /* IRQ and event handlers */
 static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
 {
@@ -1588,8 +1609,8 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
 
        arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
 
-       for (i = 0; i < master->num_sids; i++) {
-               cmd.atc.sid = master->sids[i];
+       for (i = 0; i < master->num_streams; i++) {
+               cmd.atc.sid = master->streams[i].id;
                arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
        }
 
@@ -1632,8 +1653,8 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
                if (!master->ats_enabled)
                        continue;
 
-               for (i = 0; i < master->num_sids; i++) {
-                       cmd.atc.sid = master->sids[i];
+               for (i = 0; i < master->num_streams; i++) {
+                       cmd.atc.sid = master->streams[i].id;
                        arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
                }
        }
@@ -2017,7 +2038,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
                .iommu_dev      = smmu->dev,
        };
 
-       if (smmu_domain->non_strict)
+       if (!iommu_get_dma_strict(domain))
                pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
        pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
@@ -2065,13 +2086,13 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master)
        int i, j;
        struct arm_smmu_device *smmu = master->smmu;
 
-       for (i = 0; i < master->num_sids; ++i) {
-               u32 sid = master->sids[i];
+       for (i = 0; i < master->num_streams; ++i) {
+               u32 sid = master->streams[i].id;
                __le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
 
                /* Bridged PCI devices may end up with duplicated IDs */
                for (j = 0; j < i; j++)
-                       if (master->sids[j] == sid)
+                       if (master->streams[j].id == sid)
                                break;
                if (j < i)
                        continue;
@@ -2305,6 +2326,9 @@ static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
 {
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
+       if (!gather->pgsize)
+               return;
+
        arm_smmu_tlb_inv_range_domain(gather->start,
                                      gather->end - gather->start + 1,
                                      gather->pgsize, true, smmu_domain);
@@ -2345,11 +2369,101 @@ static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
        return sid < limit;
 }
 
+/*
+ * Register every stream ID of @master in the per-SMMU SID tree.
+ *
+ * Allocates master->streams (one entry per fwspec ID), checks each SID
+ * against the stream table limits, initialises the L2 stream table when
+ * the SMMU uses a 2-level table, and links the entry into smmu->streams
+ * under smmu->streams_mutex.
+ *
+ * A SID that is already present in the tree is tolerated only when it
+ * belongs to this same master (bridged PCI devices may end up with
+ * duplicated IDs, which arm_smmu_install_ste_for_dev() also accepts).
+ * Such an alias is left out of the tree and keeps its node in the
+ * RB_CLEAR_NODE() state so that rollback and removal can skip it.  A SID
+ * owned by a different master is rejected with -EINVAL.
+ *
+ * Returns 0 on success or a negative errno.  On failure every entry that
+ * was linked so far is erased again and master->streams is freed.
+ */
+static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
+                                 struct arm_smmu_master *master)
+{
+       int i;
+       int ret = 0;
+       struct arm_smmu_stream *new_stream, *cur_stream;
+       struct rb_node **new_node, *parent_node;
+       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev);
+
+       master->streams = kcalloc(fwspec->num_ids, sizeof(*master->streams),
+                                 GFP_KERNEL);
+       if (!master->streams)
+               return -ENOMEM;
+       master->num_streams = fwspec->num_ids;
+
+       mutex_lock(&smmu->streams_mutex);
+       for (i = 0; i < fwspec->num_ids; i++) {
+               u32 sid = fwspec->ids[i];
+               bool alias = false;
+
+               new_stream = &master->streams[i];
+               new_stream->id = sid;
+               new_stream->master = master;
+               /* Marked "not in tree" until rb_link_node() below */
+               RB_CLEAR_NODE(&new_stream->node);
+
+               /*
+                * Check the SIDs are in range of the SMMU and our stream table
+                */
+               if (!arm_smmu_sid_in_range(smmu, sid)) {
+                       ret = -ERANGE;
+                       break;
+               }
+
+               /* Ensure l2 strtab is initialised */
+               if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+                       ret = arm_smmu_init_l2_strtab(smmu, sid);
+                       if (ret)
+                               break;
+               }
+
+               /* Insert into SID tree */
+               parent_node = NULL;
+               new_node = &(smmu->streams.rb_node);
+               while (*new_node) {
+                       cur_stream = rb_entry(*new_node, struct arm_smmu_stream,
+                                             node);
+                       parent_node = *new_node;
+                       if (cur_stream->id > new_stream->id) {
+                               new_node = &((*new_node)->rb_left);
+                       } else if (cur_stream->id < new_stream->id) {
+                               new_node = &((*new_node)->rb_right);
+                       } else if (cur_stream->master == master) {
+                               /* Bridged PCI devices may end up with duplicated IDs */
+                               alias = true;
+                               break;
+                       } else {
+                               dev_warn(master->dev,
+                                        "stream %u already in tree\n",
+                                        cur_stream->id);
+                               ret = -EINVAL;
+                               break;
+                       }
+               }
+               if (ret)
+                       break;
+               if (alias)
+                       continue;
+
+               rb_link_node(&new_stream->node, parent_node, new_node);
+               rb_insert_color(&new_stream->node, &smmu->streams);
+       }
+
+       if (ret) {
+               /* Undo the entries linked before the failing one */
+               for (i--; i >= 0; i--) {
+                       if (!RB_EMPTY_NODE(&master->streams[i].node))
+                               rb_erase(&master->streams[i].node,
+                                        &smmu->streams);
+               }
+               kfree(master->streams);
+               /* Do not leave a dangling pointer behind for later teardown */
+               master->streams = NULL;
+               master->num_streams = 0;
+       }
+       mutex_unlock(&smmu->streams_mutex);
+
+       return ret;
+}
+
+/*
+ * Undo arm_smmu_insert_master(): erase every stream of @master that was
+ * linked into the SID tree (aliased SIDs were never linked and are
+ * skipped) and free the stream array.  Does nothing when the master has
+ * no SMMU or no stream array.
+ */
+static void arm_smmu_remove_master(struct arm_smmu_master *master)
+{
+       int i;
+       struct arm_smmu_device *smmu = master->smmu;
+
+       if (!smmu || !master->streams)
+               return;
+
+       mutex_lock(&smmu->streams_mutex);
+       for (i = 0; i < master->num_streams; i++) {
+               struct rb_node *node = &master->streams[i].node;
+
+               if (!RB_EMPTY_NODE(node))
+                       rb_erase(node, &smmu->streams);
+       }
+       mutex_unlock(&smmu->streams_mutex);
+
+       kfree(master->streams);
+       master->streams = NULL;
+       master->num_streams = 0;
+}
+
 static struct iommu_ops arm_smmu_ops;
 
 static struct iommu_device *arm_smmu_probe_device(struct device *dev)
 {
-       int i, ret;
+       int ret;
        struct arm_smmu_device *smmu;
        struct arm_smmu_master *master;
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -2370,29 +2484,15 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
 
        master->dev = dev;
        master->smmu = smmu;
-       master->sids = fwspec->ids;
-       master->num_sids = fwspec->num_ids;
        INIT_LIST_HEAD(&master->bonds);
        dev_iommu_priv_set(dev, master);
 
-       /* Check the SIDs are in range of the SMMU and our stream table */
-       for (i = 0; i < master->num_sids; i++) {
-               u32 sid = master->sids[i];
-
-               if (!arm_smmu_sid_in_range(smmu, sid)) {
-                       ret = -ERANGE;
-                       goto err_free_master;
-               }
-
-               /* Ensure l2 strtab is initialised */
-               if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
-                       ret = arm_smmu_init_l2_strtab(smmu, sid);
-                       if (ret)
-                               goto err_free_master;
-               }
-       }
+       ret = arm_smmu_insert_master(smmu, master);
+       if (ret)
+               goto err_free_master;
 
-       master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits);
+       device_property_read_u32(dev, "pasid-num-bits", &master->ssid_bits);
+       master->ssid_bits = min(smmu->ssid_bits, master->ssid_bits);
 
        /*
         * Note that PASID must be enabled before, and disabled after ATS:
@@ -2428,6 +2528,7 @@ static void arm_smmu_release_device(struct device *dev)
        WARN_ON(arm_smmu_master_sva_enabled(master));
        arm_smmu_detach_dev(master);
        arm_smmu_disable_pasid(master);
+       arm_smmu_remove_master(master);
        kfree(master);
        iommu_fwspec_free(dev);
 }
@@ -2449,76 +2550,18 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
        return group;
 }
 
-static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
-                                   enum iommu_attr attr, void *data)
+static int arm_smmu_enable_nesting(struct iommu_domain *domain)
 {
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-
-       switch (domain->type) {
-       case IOMMU_DOMAIN_UNMANAGED:
-               switch (attr) {
-               case DOMAIN_ATTR_NESTING:
-                       *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
-                       return 0;
-               default:
-                       return -ENODEV;
-               }
-               break;
-       case IOMMU_DOMAIN_DMA:
-               switch (attr) {
-               case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-                       *(int *)data = smmu_domain->non_strict;
-                       return 0;
-               default:
-                       return -ENODEV;
-               }
-               break;
-       default:
-               return -EINVAL;
-       }
-}
-
-static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
-                                   enum iommu_attr attr, void *data)
-{
        int ret = 0;
-       struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
        mutex_lock(&smmu_domain->init_mutex);
-
-       switch (domain->type) {
-       case IOMMU_DOMAIN_UNMANAGED:
-               switch (attr) {
-               case DOMAIN_ATTR_NESTING:
-                       if (smmu_domain->smmu) {
-                               ret = -EPERM;
-                               goto out_unlock;
-                       }
-
-                       if (*(int *)data)
-                               smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-                       else
-                               smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
-                       break;
-               default:
-                       ret = -ENODEV;
-               }
-               break;
-       case IOMMU_DOMAIN_DMA:
-               switch(attr) {
-               case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-                       smmu_domain->non_strict = *(int *)data;
-                       break;
-               default:
-                       ret = -ENODEV;
-               }
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-out_unlock:
+       if (smmu_domain->smmu)
+               ret = -EPERM;
+       else
+               smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
        mutex_unlock(&smmu_domain->init_mutex);
+
        return ret;
 }
 
@@ -2619,8 +2662,7 @@ static struct iommu_ops arm_smmu_ops = {
        .probe_device           = arm_smmu_probe_device,
        .release_device         = arm_smmu_release_device,
        .device_group           = arm_smmu_device_group,
-       .domain_get_attr        = arm_smmu_domain_get_attr,
-       .domain_set_attr        = arm_smmu_domain_set_attr,
+       .enable_nesting         = arm_smmu_enable_nesting,
        .of_xlate               = arm_smmu_of_xlate,
        .get_resv_regions       = arm_smmu_get_resv_regions,
        .put_resv_regions       = generic_iommu_put_resv_regions,
@@ -2632,6 +2674,7 @@ static struct iommu_ops arm_smmu_ops = {
        .sva_unbind             = arm_smmu_sva_unbind,
        .sva_get_pasid          = arm_smmu_sva_get_pasid,
        .pgsize_bitmap          = -1UL, /* Restricted during device attach */
+       .owner                  = THIS_MODULE,
 };
 
 /* Probing and initialisation functions */
@@ -2851,6 +2894,9 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
 {
        int ret;
 
+       mutex_init(&smmu->streams_mutex);
+       smmu->streams = RB_ROOT;
+
        ret = arm_smmu_init_queues(smmu);
        if (ret)
                return ret;
@@ -3620,10 +3666,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
-       iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
-
-       ret = iommu_device_register(&smmu->iommu);
+       ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
        if (ret) {
                dev_err(dev, "Failed to register iommu\n");
                return ret;
index f985817..46e8c49 100644 (file)
 #define GERROR_PRIQ_ABT_ERR            (1 << 3)
 #define GERROR_EVTQ_ABT_ERR            (1 << 2)
 #define GERROR_CMDQ_ERR                        (1 << 0)
-#define GERROR_ERR_MASK                        0xfd
+#define GERROR_ERR_MASK                        0x1fd
 
 #define ARM_SMMU_GERRORN               0x64
 
@@ -410,8 +410,6 @@ struct arm_smmu_cmdq_ent {
                #define CMDQ_OP_PREFETCH_CFG    0x1
                struct {
                        u32                     sid;
-                       u8                      size;
-                       u64                     addr;
                } prefetch;
 
                #define CMDQ_OP_CFGI_STE        0x3
@@ -639,6 +637,15 @@ struct arm_smmu_device {
 
        /* IOMMU core code handle */
        struct iommu_device             iommu;
+
+       struct rb_root                  streams;
+       struct mutex                    streams_mutex;
+};
+
+struct arm_smmu_stream {
+       u32                             id;
+       struct arm_smmu_master          *master;
+       struct rb_node                  node;
 };
 
 /* SMMU private data for each master */
@@ -647,8 +654,8 @@ struct arm_smmu_master {
        struct device                   *dev;
        struct arm_smmu_domain          *domain;
        struct list_head                domain_head;
-       u32                             *sids;
-       unsigned int                    num_sids;
+       struct arm_smmu_stream          *streams;
+       unsigned int                    num_streams;
        bool                            ats_enabled;
        bool                            sva_enabled;
        struct list_head                bonds;
@@ -668,7 +675,6 @@ struct arm_smmu_domain {
        struct mutex                    init_mutex; /* Protects smmu pointer */
 
        struct io_pgtable_ops           *pgtbl_ops;
-       bool                            non_strict;
        atomic_t                        nr_ats_masters;
 
        enum arm_smmu_domain_stage      stage;
index d8c6bfd..6f72c4d 100644 (file)
@@ -761,14 +761,17 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
                .iommu_dev      = smmu->dev,
        };
 
+       if (!iommu_get_dma_strict(domain))
+               pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
+
        if (smmu->impl && smmu->impl->init_context) {
                ret = smmu->impl->init_context(smmu_domain, &pgtbl_cfg, dev);
                if (ret)
                        goto out_clear_smmu;
        }
 
-       if (smmu_domain->pgtbl_cfg.quirks)
-               pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
+       if (smmu_domain->pgtbl_quirks)
+               pgtbl_cfg.quirks |= smmu_domain->pgtbl_quirks;
 
        pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
        if (!pgtbl_ops) {
@@ -1481,98 +1484,34 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
        return group;
 }
 
-static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
-                                   enum iommu_attr attr, void *data)
+static int arm_smmu_enable_nesting(struct iommu_domain *domain)
 {
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+       int ret = 0;
 
-       switch(domain->type) {
-       case IOMMU_DOMAIN_UNMANAGED:
-               switch (attr) {
-               case DOMAIN_ATTR_NESTING:
-                       *(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
-                       return 0;
-               case DOMAIN_ATTR_IO_PGTABLE_CFG: {
-                       struct io_pgtable_domain_attr *pgtbl_cfg = data;
-                       *pgtbl_cfg = smmu_domain->pgtbl_cfg;
-
-                       return 0;
-               }
-               default:
-                       return -ENODEV;
-               }
-               break;
-       case IOMMU_DOMAIN_DMA:
-               switch (attr) {
-               case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: {
-                       bool non_strict = smmu_domain->pgtbl_cfg.quirks &
-                                         IO_PGTABLE_QUIRK_NON_STRICT;
-                       *(int *)data = non_strict;
-                       return 0;
-               }
-               default:
-                       return -ENODEV;
-               }
-               break;
-       default:
-               return -EINVAL;
-       }
+       mutex_lock(&smmu_domain->init_mutex);
+       if (smmu_domain->smmu)
+               ret = -EPERM;
+       else
+               smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+       mutex_unlock(&smmu_domain->init_mutex);
+
+       return ret;
 }
 
-static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
-                                   enum iommu_attr attr, void *data)
+static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
+               unsigned long quirks)
 {
-       int ret = 0;
        struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+       int ret = 0;
 
        mutex_lock(&smmu_domain->init_mutex);
-
-       switch(domain->type) {
-       case IOMMU_DOMAIN_UNMANAGED:
-               switch (attr) {
-               case DOMAIN_ATTR_NESTING:
-                       if (smmu_domain->smmu) {
-                               ret = -EPERM;
-                               goto out_unlock;
-                       }
-
-                       if (*(int *)data)
-                               smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-                       else
-                               smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
-                       break;
-               case DOMAIN_ATTR_IO_PGTABLE_CFG: {
-                       struct io_pgtable_domain_attr *pgtbl_cfg = data;
-
-                       if (smmu_domain->smmu) {
-                               ret = -EPERM;
-                               goto out_unlock;
-                       }
-
-                       smmu_domain->pgtbl_cfg = *pgtbl_cfg;
-                       break;
-               }
-               default:
-                       ret = -ENODEV;
-               }
-               break;
-       case IOMMU_DOMAIN_DMA:
-               switch (attr) {
-               case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-                       if (*(int *)data)
-                               smmu_domain->pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-                       else
-                               smmu_domain->pgtbl_cfg.quirks &= ~IO_PGTABLE_QUIRK_NON_STRICT;
-                       break;
-               default:
-                       ret = -ENODEV;
-               }
-               break;
-       default:
-               ret = -EINVAL;
-       }
-out_unlock:
+       if (smmu_domain->smmu)
+               ret = -EPERM;
+       else
+               smmu_domain->pgtbl_quirks = quirks;
        mutex_unlock(&smmu_domain->init_mutex);
+
        return ret;
 }
 
@@ -1631,13 +1570,14 @@ static struct iommu_ops arm_smmu_ops = {
        .probe_device           = arm_smmu_probe_device,
        .release_device         = arm_smmu_release_device,
        .device_group           = arm_smmu_device_group,
-       .domain_get_attr        = arm_smmu_domain_get_attr,
-       .domain_set_attr        = arm_smmu_domain_set_attr,
+       .enable_nesting         = arm_smmu_enable_nesting,
+       .set_pgtable_quirks     = arm_smmu_set_pgtable_quirks,
        .of_xlate               = arm_smmu_of_xlate,
        .get_resv_regions       = arm_smmu_get_resv_regions,
        .put_resv_regions       = generic_iommu_put_resv_regions,
        .def_domain_type        = arm_smmu_def_domain_type,
        .pgsize_bitmap          = -1UL, /* Restricted during device attach */
+       .owner                  = THIS_MODULE,
 };
 
 static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
@@ -2221,10 +2161,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
                return err;
        }
 
-       iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
-       iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
-
-       err = iommu_device_register(&smmu->iommu);
+       err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
        if (err) {
                dev_err(dev, "Failed to register iommu\n");
                return err;
index d2a2d1b..c31a59d 100644 (file)
@@ -364,7 +364,7 @@ enum arm_smmu_domain_stage {
 struct arm_smmu_domain {
        struct arm_smmu_device          *smmu;
        struct io_pgtable_ops           *pgtbl_ops;
-       struct io_pgtable_domain_attr   pgtbl_cfg;
+       unsigned long                   pgtbl_quirks;
        const struct iommu_flush_ops    *flush_ops;
        struct arm_smmu_cfg             cfg;
        enum arm_smmu_domain_stage      stage;
index 7f280c8..4294abe 100644 (file)
@@ -847,10 +847,7 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
                return ret;
        }
 
-       iommu_device_set_ops(&qcom_iommu->iommu, &qcom_iommu_ops);
-       iommu_device_set_fwnode(&qcom_iommu->iommu, dev->fwnode);
-
-       ret = iommu_device_register(&qcom_iommu->iommu);
+       ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
        if (ret) {
                dev_err(dev, "Failed to register iommu\n");
                return ret;
index af765c8..7bcdd12 100644 (file)
@@ -52,15 +52,17 @@ struct iommu_dma_cookie {
 };
 
 static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled);
+bool iommu_dma_forcedac __read_mostly;
 
-void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
-               struct iommu_domain *domain)
+static int __init iommu_dma_forcedac_setup(char *str)
 {
-       struct iommu_dma_cookie *cookie = domain->iova_cookie;
-       struct iova_domain *iovad = &cookie->iovad;
+       int ret = kstrtobool(str, &iommu_dma_forcedac);
 
-       free_cpu_cached_iovas(cpu, iovad);
+       if (!ret && iommu_dma_forcedac)
+               pr_info("Forcing DAC for PCI devices\n");
+       return ret;
 }
+early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
 static void iommu_dma_entry_dtor(unsigned long data)
 {
@@ -304,10 +306,7 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad)
 
        cookie = container_of(iovad, struct iommu_dma_cookie, iovad);
        domain = cookie->fq_domain;
-       /*
-        * The IOMMU driver supporting DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE
-        * implies that ops->flush_iotlb_all must be non-NULL.
-        */
+
        domain->ops->flush_iotlb_all(domain);
 }
 
@@ -334,7 +333,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
        struct iommu_dma_cookie *cookie = domain->iova_cookie;
        unsigned long order, base_pfn;
        struct iova_domain *iovad;
-       int attr;
 
        if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
                return -EINVAL;
@@ -371,8 +369,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
        init_iova_domain(iovad, 1UL << order, base_pfn);
 
        if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) &&
-           !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) &&
-           attr) {
+           domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) {
                if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all,
                                          iommu_dma_entry_dtor))
                        pr_warn("iova flush queue initialization failed\n");
@@ -444,7 +441,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
                dma_limit = min(dma_limit, (u64)domain->geometry.aperture_end);
 
        /* Try to get PCI devices a SAC address */
-       if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
+       if (dma_limit > DMA_BIT_MASK(32) && !iommu_dma_forcedac && dev_is_pci(dev))
                iova = alloc_iova_fast(iovad, iova_len,
                                       DMA_BIT_MASK(32) >> shift, false);
 
@@ -499,8 +496,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
                unsigned long attrs)
 {
        struct iommu_domain *domain = iommu_get_dma_domain(dev);
-       struct iommu_dma_cookie *cookie = domain->iova_cookie;
-       struct iova_domain *iovad = &cookie->iovad;
        phys_addr_t phys;
 
        phys = iommu_iova_to_phys(domain, dma_addr);
@@ -510,8 +505,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
        __iommu_dma_unmap(dev, dma_addr, size);
 
        if (unlikely(is_swiotlb_buffer(phys)))
-               swiotlb_tbl_unmap_single(dev, phys, size,
-                               iova_align(iovad, size), dir, attrs);
+               swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
@@ -581,10 +575,8 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
        }
 
        iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
-       if ((iova == DMA_MAPPING_ERROR) && is_swiotlb_buffer(phys))
-               swiotlb_tbl_unmap_single(dev, phys, org_size,
-                               aligned_size, dir, attrs);
-
+       if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+               swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
        return iova;
 }
 
@@ -650,23 +642,12 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
        return pages;
 }
 
-/**
- * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
- * @dev: Device to allocate memory for. Must be a real device
- *      attached to an iommu_dma_domain
- * @size: Size of buffer in bytes
- * @dma_handle: Out argument for allocated DMA handle
- * @gfp: Allocation flags
- * @prot: pgprot_t to use for the remapped mapping
- * @attrs: DMA attributes for this allocation
- *
- * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
+/*
+ * If size is less than PAGE_SIZE, then a full CPU page will be allocated,
  * but an IOMMU which supports smaller pages might not map the whole thing.
- *
- * Return: Mapped virtual address, or NULL on failure.
  */
-static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
-               dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
+static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
+               size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot,
                unsigned long attrs)
 {
        struct iommu_domain *domain = iommu_get_dma_domain(dev);
@@ -676,11 +657,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
        int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
        unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
        struct page **pages;
-       struct sg_table sgt;
        dma_addr_t iova;
-       void *vaddr;
-
-       *dma_handle = DMA_MAPPING_ERROR;
 
        if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
            iommu_deferred_attach(dev, domain))
@@ -707,41 +684,91 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
        if (!iova)
                goto out_free_pages;
 
-       if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL))
+       if (sg_alloc_table_from_pages(sgt, pages, count, 0, size, GFP_KERNEL))
                goto out_free_iova;
 
        if (!(ioprot & IOMMU_CACHE)) {
                struct scatterlist *sg;
                int i;
 
-               for_each_sg(sgt.sgl, sg, sgt.orig_nents, i)
+               for_each_sg(sgt->sgl, sg, sgt->orig_nents, i)
                        arch_dma_prep_coherent(sg_page(sg), sg->length);
        }
 
-       if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot)
+       if (iommu_map_sg_atomic(domain, iova, sgt->sgl, sgt->orig_nents, ioprot)
                        < size)
                goto out_free_sg;
 
+       sgt->sgl->dma_address = iova;
+       sgt->sgl->dma_length = size;
+       return pages;
+
+out_free_sg:
+       sg_free_table(sgt);
+out_free_iova:
+       iommu_dma_free_iova(cookie, iova, size, NULL);
+out_free_pages:
+       __iommu_dma_free_pages(pages, count);
+       return NULL;
+}
+
+static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
+               unsigned long attrs)
+{
+       struct page **pages;
+       struct sg_table sgt;
+       void *vaddr;
+
+       pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot,
+                                               attrs);
+       if (!pages)
+               return NULL;
+       *dma_handle = sgt.sgl->dma_address;
+       sg_free_table(&sgt);
        vaddr = dma_common_pages_remap(pages, size, prot,
                        __builtin_return_address(0));
        if (!vaddr)
                goto out_unmap;
-
-       *dma_handle = iova;
-       sg_free_table(&sgt);
        return vaddr;
 
 out_unmap:
-       __iommu_dma_unmap(dev, iova, size);
-out_free_sg:
-       sg_free_table(&sgt);
-out_free_iova:
-       iommu_dma_free_iova(cookie, iova, size, NULL);
-out_free_pages:
-       __iommu_dma_free_pages(pages, count);
+       __iommu_dma_unmap(dev, *dma_handle, size);
+       __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
        return NULL;
 }
 
+#ifdef CONFIG_DMA_REMAP
+static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev,
+               size_t size, enum dma_data_direction dir, gfp_t gfp,
+               unsigned long attrs)
+{
+       struct dma_sgt_handle *sh;
+
+       sh = kmalloc(sizeof(*sh), gfp);
+       if (!sh)
+               return NULL;
+
+       sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp,
+                                                   PAGE_KERNEL, attrs);
+       if (!sh->pages) {
+               kfree(sh);
+               return NULL;
+       }
+       return &sh->sgt;
+}
+
+static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt, enum dma_data_direction dir)
+{
+       struct dma_sgt_handle *sh = sgt_handle(sgt);
+
+       __iommu_dma_unmap(dev, sgt->sgl->dma_address, size);
+       __iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+       sg_free_table(&sh->sgt);
+}
+#endif /* CONFIG_DMA_REMAP */
+
 static void iommu_dma_sync_single_for_cpu(struct device *dev,
                dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
@@ -755,7 +782,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
                arch_sync_dma_for_cpu(phys, size, dir);
 
        if (is_swiotlb_buffer(phys))
-               swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_CPU);
+               swiotlb_sync_single_for_cpu(dev, phys, size, dir);
 }
 
 static void iommu_dma_sync_single_for_device(struct device *dev,
@@ -768,7 +795,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
 
        phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
        if (is_swiotlb_buffer(phys))
-               swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_DEVICE);
+               swiotlb_sync_single_for_device(dev, phys, size, dir);
 
        if (!dev_is_dma_coherent(dev))
                arch_sync_dma_for_device(phys, size, dir);
@@ -789,8 +816,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
                        arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
 
                if (is_swiotlb_buffer(sg_phys(sg)))
-                       swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
-                                               dir, SYNC_FOR_CPU);
+                       swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
+                                                   sg->length, dir);
        }
 }
 
@@ -806,8 +833,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
 
        for_each_sg(sgl, sg, nelems, i) {
                if (is_swiotlb_buffer(sg_phys(sg)))
-                       swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
-                                               dir, SYNC_FOR_DEVICE);
+                       swiotlb_sync_single_for_device(dev, sg_phys(sg),
+                                                      sg->length, dir);
 
                if (!dev_is_dma_coherent(dev))
                        arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
@@ -1258,6 +1285,10 @@ static const struct dma_map_ops iommu_dma_ops = {
        .free                   = iommu_dma_free,
        .alloc_pages            = dma_common_alloc_pages,
        .free_pages             = dma_common_free_pages,
+#ifdef CONFIG_DMA_REMAP
+       .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,
+       .free_noncontiguous     = iommu_dma_free_noncontiguous,
+#endif
        .mmap                   = iommu_dma_mmap,
        .get_sgtable            = iommu_dma_get_sgtable,
        .map_page               = iommu_dma_map_page,
index de324b4..7623d8c 100644 (file)
@@ -407,7 +407,7 @@ static irqreturn_t exynos_sysmmu_irq(int irq, void *dev_id)
        struct sysmmu_drvdata *data = dev_id;
        const struct sysmmu_fault_info *finfo;
        unsigned int i, n, itype;
-       sysmmu_iova_t fault_addr = -1;
+       sysmmu_iova_t fault_addr;
        unsigned short reg_status, reg_clear;
        int ret = -ENOSYS;
 
@@ -630,10 +630,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       iommu_device_set_ops(&data->iommu, &exynos_iommu_ops);
-       iommu_device_set_fwnode(&data->iommu, &dev->of_node->fwnode);
-
-       ret = iommu_device_register(&data->iommu);
+       ret = iommu_device_register(&data->iommu, &exynos_iommu_ops, dev);
        if (ret)
                return ret;
 
index b9a974d..fc38b1f 100644 (file)
@@ -63,19 +63,6 @@ static const struct of_device_id l3_device_ids[] = {
 /* maximum subwindows permitted per liodn */
 static u32 max_subwindow_count;
 
-/* Pool for fspi allocation */
-static struct gen_pool *spaace_pool;
-
-/**
- * pamu_get_max_subwin_cnt() - Return the maximum supported
- * subwindow count per liodn.
- *
- */
-u32 pamu_get_max_subwin_cnt(void)
-{
-       return max_subwindow_count;
-}
-
 /**
  * pamu_get_ppaace() - Return the primary PACCE
  * @liodn: liodn PAACT index for desired PAACE
@@ -155,13 +142,6 @@ static unsigned int map_addrspace_size_to_wse(phys_addr_t addrspace_size)
        return fls64(addrspace_size) - 2;
 }
 
-/* Derive the PAACE window count encoding for the subwindow count */
-static unsigned int map_subwindow_cnt_to_wce(u32 subwindow_cnt)
-{
-       /* window count is 2^(WCE+1) bytes */
-       return __ffs(subwindow_cnt) - 1;
-}
-
 /*
  * Set the PAACE type as primary and set the coherency required domain
  * attribute
@@ -174,89 +154,11 @@ static void pamu_init_ppaace(struct paace *ppaace)
               PAACE_M_COHERENCE_REQ);
 }
 
-/*
- * Set the PAACE type as secondary and set the coherency required domain
- * attribute.
- */
-static void pamu_init_spaace(struct paace *spaace)
-{
-       set_bf(spaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_SECONDARY);
-       set_bf(spaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
-              PAACE_M_COHERENCE_REQ);
-}
-
-/*
- * Return the spaace (corresponding to the secondary window index)
- * for a particular ppaace.
- */
-static struct paace *pamu_get_spaace(struct paace *paace, u32 wnum)
-{
-       u32 subwin_cnt;
-       struct paace *spaace = NULL;
-
-       subwin_cnt = 1UL << (get_bf(paace->impl_attr, PAACE_IA_WCE) + 1);
-
-       if (wnum < subwin_cnt)
-               spaace = &spaact[paace->fspi + wnum];
-       else
-               pr_debug("secondary paace out of bounds\n");
-
-       return spaace;
-}
-
-/**
- * pamu_get_fspi_and_allocate() - Allocates fspi index and reserves subwindows
- *                                required for primary PAACE in the secondary
- *                                PAACE table.
- * @subwin_cnt: Number of subwindows to be reserved.
- *
- * A PPAACE entry may have a number of associated subwindows. A subwindow
- * corresponds to a SPAACE entry in the SPAACT table. Each PAACE entry stores
- * the index (fspi) of the first SPAACE entry in the SPAACT table. This
- * function returns the index of the first SPAACE entry. The remaining
- * SPAACE entries are reserved contiguously from that index.
- *
- * Returns a valid fspi index in the range of 0 - SPAACE_NUMBER_ENTRIES on success.
- * If no SPAACE entry is available or the allocator can not reserve the required
- * number of contiguous entries function returns ULONG_MAX indicating a failure.
- *
- */
-static unsigned long pamu_get_fspi_and_allocate(u32 subwin_cnt)
-{
-       unsigned long spaace_addr;
-
-       spaace_addr = gen_pool_alloc(spaace_pool, subwin_cnt * sizeof(struct paace));
-       if (!spaace_addr)
-               return ULONG_MAX;
-
-       return (spaace_addr - (unsigned long)spaact) / (sizeof(struct paace));
-}
-
-/* Release the subwindows reserved for a particular LIODN */
-void pamu_free_subwins(int liodn)
-{
-       struct paace *ppaace;
-       u32 subwin_cnt, size;
-
-       ppaace = pamu_get_ppaace(liodn);
-       if (!ppaace) {
-               pr_debug("Invalid liodn entry\n");
-               return;
-       }
-
-       if (get_bf(ppaace->addr_bitfields, PPAACE_AF_MW)) {
-               subwin_cnt = 1UL << (get_bf(ppaace->impl_attr, PAACE_IA_WCE) + 1);
-               size = (subwin_cnt - 1) * sizeof(struct paace);
-               gen_pool_free(spaace_pool, (unsigned long)&spaact[ppaace->fspi], size);
-               set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
-       }
-}
-
 /*
  * Function used for updating stash destination for the coressponding
  * LIODN.
  */
-int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
+int pamu_update_paace_stash(int liodn, u32 value)
 {
        struct paace *paace;
 
@@ -265,11 +167,6 @@ int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
                pr_debug("Invalid liodn entry\n");
                return -ENOENT;
        }
-       if (subwin) {
-               paace = pamu_get_spaace(paace, subwin - 1);
-               if (!paace)
-                       return -ENOENT;
-       }
        set_bf(paace->impl_attr, PAACE_IA_CID, value);
 
        mb();
@@ -277,65 +174,20 @@ int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
        return 0;
 }
 
-/* Disable a subwindow corresponding to the LIODN */
-int pamu_disable_spaace(int liodn, u32 subwin)
-{
-       struct paace *paace;
-
-       paace = pamu_get_ppaace(liodn);
-       if (!paace) {
-               pr_debug("Invalid liodn entry\n");
-               return -ENOENT;
-       }
-       if (subwin) {
-               paace = pamu_get_spaace(paace, subwin - 1);
-               if (!paace)
-                       return -ENOENT;
-               set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
-       } else {
-               set_bf(paace->addr_bitfields, PAACE_AF_AP,
-                      PAACE_AP_PERMS_DENIED);
-       }
-
-       mb();
-
-       return 0;
-}
-
 /**
  * pamu_config_paace() - Sets up PPAACE entry for specified liodn
  *
  * @liodn: Logical IO device number
- * @win_addr: starting address of DSA window
- * @win-size: size of DSA window
  * @omi: Operation mapping index -- if ~omi == 0 then omi not defined
- * @rpn: real (true physical) page number
  * @stashid: cache stash id for associated cpu -- if ~stashid == 0 then
  *          stashid not defined
- * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
- *          snoopid not defined
- * @subwin_cnt: number of sub-windows
  * @prot: window permissions
  *
  * Returns 0 upon success else error code < 0 returned
  */
-int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
-                      u32 omi, unsigned long rpn, u32 snoopid, u32 stashid,
-                      u32 subwin_cnt, int prot)
+int pamu_config_ppaace(int liodn, u32 omi, u32 stashid, int prot)
 {
        struct paace *ppaace;
-       unsigned long fspi;
-
-       if ((win_size & (win_size - 1)) || win_size < PAMU_PAGE_SIZE) {
-               pr_debug("window size too small or not a power of two %pa\n",
-                        &win_size);
-               return -EINVAL;
-       }
-
-       if (win_addr & (win_size - 1)) {
-               pr_debug("window address is not aligned with window size\n");
-               return -EINVAL;
-       }
 
        ppaace = pamu_get_ppaace(liodn);
        if (!ppaace)
@@ -343,13 +195,12 @@ int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
 
        /* window size is 2^(WSE+1) bytes */
        set_bf(ppaace->addr_bitfields, PPAACE_AF_WSE,
-              map_addrspace_size_to_wse(win_size));
+              map_addrspace_size_to_wse(1ULL << 36));
 
        pamu_init_ppaace(ppaace);
 
-       ppaace->wbah = win_addr >> (PAMU_PAGE_SHIFT + 20);
-       set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL,
-              (win_addr >> PAMU_PAGE_SHIFT));
+       ppaace->wbah = 0;
+       set_bf(ppaace->addr_bitfields, PPAACE_AF_WBAL, 0);
 
        /* set up operation mapping if it's configured */
        if (omi < OME_NUMBER_ENTRIES) {
@@ -364,120 +215,12 @@ int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
        if (~stashid != 0)
                set_bf(ppaace->impl_attr, PAACE_IA_CID, stashid);
 
-       /* configure snoop id */
-       if (~snoopid != 0)
-               ppaace->domain_attr.to_host.snpid = snoopid;
-
-       if (subwin_cnt) {
-               /* The first entry is in the primary PAACE instead */
-               fspi = pamu_get_fspi_and_allocate(subwin_cnt - 1);
-               if (fspi == ULONG_MAX) {
-                       pr_debug("spaace indexes exhausted\n");
-                       return -EINVAL;
-               }
-
-               /* window count is 2^(WCE+1) bytes */
-               set_bf(ppaace->impl_attr, PAACE_IA_WCE,
-                      map_subwindow_cnt_to_wce(subwin_cnt));
-               set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0x1);
-               ppaace->fspi = fspi;
-       } else {
-               set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
-               ppaace->twbah = rpn >> 20;
-               set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, rpn);
-               set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
-               set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
-               set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
-       }
-       mb();
-
-       return 0;
-}
-
-/**
- * pamu_config_spaace() - Sets up SPAACE entry for specified subwindow
- *
- * @liodn:  Logical IO device number
- * @subwin_cnt:  number of sub-windows associated with dma-window
- * @subwin: subwindow index
- * @subwin_size: size of subwindow
- * @omi: Operation mapping index
- * @rpn: real (true physical) page number
- * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
- *                       snoopid not defined
- * @stashid: cache stash id for associated cpu
- * @enable: enable/disable subwindow after reconfiguration
- * @prot: sub window permissions
- *
- * Returns 0 upon success else error code < 0 returned
- */
-int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin,
-                      phys_addr_t subwin_size, u32 omi, unsigned long rpn,
-                      u32 snoopid, u32 stashid, int enable, int prot)
-{
-       struct paace *paace;
-
-       /* setup sub-windows */
-       if (!subwin_cnt) {
-               pr_debug("Invalid subwindow count\n");
-               return -EINVAL;
-       }
-
-       paace = pamu_get_ppaace(liodn);
-       if (subwin > 0 && subwin < subwin_cnt && paace) {
-               paace = pamu_get_spaace(paace, subwin - 1);
-
-               if (paace && !(paace->addr_bitfields & PAACE_V_VALID)) {
-                       pamu_init_spaace(paace);
-                       set_bf(paace->addr_bitfields, SPAACE_AF_LIODN, liodn);
-               }
-       }
-
-       if (!paace) {
-               pr_debug("Invalid liodn entry\n");
-               return -ENOENT;
-       }
-
-       if ((subwin_size & (subwin_size - 1)) || subwin_size < PAMU_PAGE_SIZE) {
-               pr_debug("subwindow size out of range, or not a power of 2\n");
-               return -EINVAL;
-       }
-
-       if (rpn == ULONG_MAX) {
-               pr_debug("real page number out of range\n");
-               return -EINVAL;
-       }
-
-       /* window size is 2^(WSE+1) bytes */
-       set_bf(paace->win_bitfields, PAACE_WIN_SWSE,
-              map_addrspace_size_to_wse(subwin_size));
-
-       set_bf(paace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
-       paace->twbah = rpn >> 20;
-       set_bf(paace->win_bitfields, PAACE_WIN_TWBAL, rpn);
-       set_bf(paace->addr_bitfields, PAACE_AF_AP, prot);
-
-       /* configure snoop id */
-       if (~snoopid != 0)
-               paace->domain_attr.to_host.snpid = snoopid;
-
-       /* set up operation mapping if it's configured */
-       if (omi < OME_NUMBER_ENTRIES) {
-               set_bf(paace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
-               paace->op_encode.index_ot.omi = omi;
-       } else if (~omi != 0) {
-               pr_debug("bad operation mapping index: %d\n", omi);
-               return -EINVAL;
-       }
-
-       if (~stashid != 0)
-               set_bf(paace->impl_attr, PAACE_IA_CID, stashid);
-
-       smp_wmb();
-
-       if (enable)
-               set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
-
+       set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+       ppaace->twbah = 0;
+       set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, 0);
+       set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
+       set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
+       set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
        mb();
 
        return 0;
@@ -1129,17 +872,6 @@ static int fsl_pamu_probe(struct platform_device *pdev)
        spaact_phys = virt_to_phys(spaact);
        omt_phys = virt_to_phys(omt);
 
-       spaace_pool = gen_pool_create(ilog2(sizeof(struct paace)), -1);
-       if (!spaace_pool) {
-               ret = -ENOMEM;
-               dev_err(dev, "Failed to allocate spaace gen pool\n");
-               goto error;
-       }
-
-       ret = gen_pool_add(spaace_pool, (unsigned long)spaact, SPAACT_SIZE, -1);
-       if (ret)
-               goto error_genpool;
-
        pamubypenr = in_be32(&guts_regs->pamubypenr);
 
        for (pamu_reg_off = 0, pamu_counter = 0x80000000; pamu_reg_off < size;
@@ -1167,9 +899,6 @@ static int fsl_pamu_probe(struct platform_device *pdev)
 
        return 0;
 
-error_genpool:
-       gen_pool_destroy(spaace_pool);
-
 error:
        if (irq != NO_IRQ)
                free_irq(irq, data);
index e1496ba..36df797 100644 (file)
@@ -383,18 +383,10 @@ struct ome {
 int pamu_domain_init(void);
 int pamu_enable_liodn(int liodn);
 int pamu_disable_liodn(int liodn);
-void pamu_free_subwins(int liodn);
-int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
-                      u32 omi, unsigned long rpn, u32 snoopid, uint32_t stashid,
-                      u32 subwin_cnt, int prot);
-int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin_addr,
-                      phys_addr_t subwin_size, u32 omi, unsigned long rpn,
-                      uint32_t snoopid, u32 stashid, int enable, int prot);
+int pamu_config_ppaace(int liodn, u32 omi, uint32_t stashid, int prot);
 
 u32 get_stash_id(u32 stash_dest_hint, u32 vcpu);
 void get_ome_index(u32 *omi_index, struct device *dev);
-int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value);
-int pamu_disable_spaace(int liodn, u32 subwin);
-u32 pamu_get_max_subwin_cnt(void);
+int  pamu_update_paace_stash(int liodn, u32 value);
 
 #endif  /* __FSL_PAMU_H */
index b211076..a47f473 100644 (file)
@@ -54,159 +54,18 @@ static int __init iommu_init_mempool(void)
        return 0;
 }
 
-static phys_addr_t get_phys_addr(struct fsl_dma_domain *dma_domain, dma_addr_t iova)
-{
-       u32 win_cnt = dma_domain->win_cnt;
-       struct dma_window *win_ptr = &dma_domain->win_arr[0];
-       struct iommu_domain_geometry *geom;
-
-       geom = &dma_domain->iommu_domain.geometry;
-
-       if (!win_cnt || !dma_domain->geom_size) {
-               pr_debug("Number of windows/geometry not configured for the domain\n");
-               return 0;
-       }
-
-       if (win_cnt > 1) {
-               u64 subwin_size;
-               dma_addr_t subwin_iova;
-               u32 wnd;
-
-               subwin_size = dma_domain->geom_size >> ilog2(win_cnt);
-               subwin_iova = iova & ~(subwin_size - 1);
-               wnd = (subwin_iova - geom->aperture_start) >> ilog2(subwin_size);
-               win_ptr = &dma_domain->win_arr[wnd];
-       }
-
-       if (win_ptr->valid)
-               return win_ptr->paddr + (iova & (win_ptr->size - 1));
-
-       return 0;
-}
-
-static int map_subwins(int liodn, struct fsl_dma_domain *dma_domain)
-{
-       struct dma_window *sub_win_ptr = &dma_domain->win_arr[0];
-       int i, ret;
-       unsigned long rpn, flags;
-
-       for (i = 0; i < dma_domain->win_cnt; i++) {
-               if (sub_win_ptr[i].valid) {
-                       rpn = sub_win_ptr[i].paddr >> PAMU_PAGE_SHIFT;
-                       spin_lock_irqsave(&iommu_lock, flags);
-                       ret = pamu_config_spaace(liodn, dma_domain->win_cnt, i,
-                                                sub_win_ptr[i].size,
-                                                ~(u32)0,
-                                                rpn,
-                                                dma_domain->snoop_id,
-                                                dma_domain->stash_id,
-                                                (i > 0) ? 1 : 0,
-                                                sub_win_ptr[i].prot);
-                       spin_unlock_irqrestore(&iommu_lock, flags);
-                       if (ret) {
-                               pr_debug("SPAACE configuration failed for liodn %d\n",
-                                        liodn);
-                               return ret;
-                       }
-               }
-       }
-
-       return ret;
-}
-
-static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
-{
-       int ret;
-       struct dma_window *wnd = &dma_domain->win_arr[0];
-       phys_addr_t wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
-       unsigned long flags;
-
-       spin_lock_irqsave(&iommu_lock, flags);
-       ret = pamu_config_ppaace(liodn, wnd_addr,
-                                wnd->size,
-                                ~(u32)0,
-                                wnd->paddr >> PAMU_PAGE_SHIFT,
-                                dma_domain->snoop_id, dma_domain->stash_id,
-                                0, wnd->prot);
-       spin_unlock_irqrestore(&iommu_lock, flags);
-       if (ret)
-               pr_debug("PAACE configuration failed for liodn %d\n", liodn);
-
-       return ret;
-}
-
-/* Map the DMA window corresponding to the LIODN */
-static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
-{
-       if (dma_domain->win_cnt > 1)
-               return map_subwins(liodn, dma_domain);
-       else
-               return map_win(liodn, dma_domain);
-}
-
-/* Update window/subwindow mapping for the LIODN */
-static int update_liodn(int liodn, struct fsl_dma_domain *dma_domain, u32 wnd_nr)
-{
-       int ret;
-       struct dma_window *wnd = &dma_domain->win_arr[wnd_nr];
-       unsigned long flags;
-
-       spin_lock_irqsave(&iommu_lock, flags);
-       if (dma_domain->win_cnt > 1) {
-               ret = pamu_config_spaace(liodn, dma_domain->win_cnt, wnd_nr,
-                                        wnd->size,
-                                        ~(u32)0,
-                                        wnd->paddr >> PAMU_PAGE_SHIFT,
-                                        dma_domain->snoop_id,
-                                        dma_domain->stash_id,
-                                        (wnd_nr > 0) ? 1 : 0,
-                                        wnd->prot);
-               if (ret)
-                       pr_debug("Subwindow reconfiguration failed for liodn %d\n",
-                                liodn);
-       } else {
-               phys_addr_t wnd_addr;
-
-               wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
-
-               ret = pamu_config_ppaace(liodn, wnd_addr,
-                                        wnd->size,
-                                        ~(u32)0,
-                                        wnd->paddr >> PAMU_PAGE_SHIFT,
-                                        dma_domain->snoop_id, dma_domain->stash_id,
-                                        0, wnd->prot);
-               if (ret)
-                       pr_debug("Window reconfiguration failed for liodn %d\n",
-                                liodn);
-       }
-
-       spin_unlock_irqrestore(&iommu_lock, flags);
-
-       return ret;
-}
-
 static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
                              u32 val)
 {
-       int ret = 0, i;
+       int ret = 0;
        unsigned long flags;
 
        spin_lock_irqsave(&iommu_lock, flags);
-       if (!dma_domain->win_arr) {
-               pr_debug("Windows not configured, stash destination update failed for liodn %d\n",
-                        liodn);
+       ret = pamu_update_paace_stash(liodn, val);
+       if (ret) {
+               pr_debug("Failed to update SPAACE for liodn %d\n ", liodn);
                spin_unlock_irqrestore(&iommu_lock, flags);
-               return -EINVAL;
-       }
-
-       for (i = 0; i < dma_domain->win_cnt; i++) {
-               ret = pamu_update_paace_stash(liodn, i, val);
-               if (ret) {
-                       pr_debug("Failed to update SPAACE %d field for liodn %d\n ",
-                                i, liodn);
-                       spin_unlock_irqrestore(&iommu_lock, flags);
-                       return ret;
-               }
+               return ret;
        }
 
        spin_unlock_irqrestore(&iommu_lock, flags);
@@ -215,16 +74,12 @@ static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
 }
 
 /* Set the geometry parameters for a LIODN */
-static int pamu_set_liodn(int liodn, struct device *dev,
-                         struct fsl_dma_domain *dma_domain,
-                         struct iommu_domain_geometry *geom_attr,
-                         u32 win_cnt)
+static int pamu_set_liodn(struct fsl_dma_domain *dma_domain, struct device *dev,
+                         int liodn)
 {
-       phys_addr_t window_addr, window_size;
-       phys_addr_t subwin_size;
-       int ret = 0, i;
        u32 omi_index = ~(u32)0;
        unsigned long flags;
+       int ret;
 
        /*
         * Configure the omi_index at the geometry setup time.
@@ -233,93 +88,30 @@ static int pamu_set_liodn(int liodn, struct device *dev,
         */
        get_ome_index(&omi_index, dev);
 
-       window_addr = geom_attr->aperture_start;
-       window_size = dma_domain->geom_size;
-
        spin_lock_irqsave(&iommu_lock, flags);
        ret = pamu_disable_liodn(liodn);
-       if (!ret)
-               ret = pamu_config_ppaace(liodn, window_addr, window_size, omi_index,
-                                        0, dma_domain->snoop_id,
-                                        dma_domain->stash_id, win_cnt, 0);
+       if (ret)
+               goto out_unlock;
+       ret = pamu_config_ppaace(liodn, omi_index, dma_domain->stash_id, 0);
+       if (ret)
+               goto out_unlock;
+       ret = pamu_config_ppaace(liodn, ~(u32)0, dma_domain->stash_id,
+                                PAACE_AP_PERMS_QUERY | PAACE_AP_PERMS_UPDATE);
+out_unlock:
        spin_unlock_irqrestore(&iommu_lock, flags);
        if (ret) {
-               pr_debug("PAACE configuration failed for liodn %d, win_cnt =%d\n",
-                        liodn, win_cnt);
-               return ret;
-       }
-
-       if (win_cnt > 1) {
-               subwin_size = window_size >> ilog2(win_cnt);
-               for (i = 0; i < win_cnt; i++) {
-                       spin_lock_irqsave(&iommu_lock, flags);
-                       ret = pamu_disable_spaace(liodn, i);
-                       if (!ret)
-                               ret = pamu_config_spaace(liodn, win_cnt, i,
-                                                        subwin_size, omi_index,
-                                                        0, dma_domain->snoop_id,
-                                                        dma_domain->stash_id,
-                                                        0, 0);
-                       spin_unlock_irqrestore(&iommu_lock, flags);
-                       if (ret) {
-                               pr_debug("SPAACE configuration failed for liodn %d\n",
-                                        liodn);
-                               return ret;
-                       }
-               }
+               pr_debug("PAACE configuration failed for liodn %d\n",
+                        liodn);
        }
-
        return ret;
 }
 
-static int check_size(u64 size, dma_addr_t iova)
-{
-       /*
-        * Size must be a power of two and at least be equal
-        * to PAMU page size.
-        */
-       if ((size & (size - 1)) || size < PAMU_PAGE_SIZE) {
-               pr_debug("Size too small or not a power of two\n");
-               return -EINVAL;
-       }
-
-       /* iova must be page size aligned */
-       if (iova & (size - 1)) {
-               pr_debug("Address is not aligned with window size\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static struct fsl_dma_domain *iommu_alloc_dma_domain(void)
-{
-       struct fsl_dma_domain *domain;
-
-       domain = kmem_cache_zalloc(fsl_pamu_domain_cache, GFP_KERNEL);
-       if (!domain)
-               return NULL;
-
-       domain->stash_id = ~(u32)0;
-       domain->snoop_id = ~(u32)0;
-       domain->win_cnt = pamu_get_max_subwin_cnt();
-       domain->geom_size = 0;
-
-       INIT_LIST_HEAD(&domain->devices);
-
-       spin_lock_init(&domain->domain_lock);
-
-       return domain;
-}
-
-static void remove_device_ref(struct device_domain_info *info, u32 win_cnt)
+static void remove_device_ref(struct device_domain_info *info)
 {
        unsigned long flags;
 
        list_del(&info->link);
        spin_lock_irqsave(&iommu_lock, flags);
-       if (win_cnt > 1)
-               pamu_free_subwins(info->liodn);
        pamu_disable_liodn(info->liodn);
        spin_unlock_irqrestore(&iommu_lock, flags);
        spin_lock_irqsave(&device_domain_lock, flags);
@@ -337,7 +129,7 @@ static void detach_device(struct device *dev, struct fsl_dma_domain *dma_domain)
        /* Remove the device from the domain device list */
        list_for_each_entry_safe(info, tmp, &dma_domain->devices, link) {
                if (!dev || (info->dev == dev))
-                       remove_device_ref(info, dma_domain->win_cnt);
+                       remove_device_ref(info);
        }
        spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 }
@@ -379,13 +171,10 @@ static void attach_device(struct fsl_dma_domain *dma_domain, int liodn, struct d
 static phys_addr_t fsl_pamu_iova_to_phys(struct iommu_domain *domain,
                                         dma_addr_t iova)
 {
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-
        if (iova < domain->geometry.aperture_start ||
            iova > domain->geometry.aperture_end)
                return 0;
-
-       return get_phys_addr(dma_domain, iova);
+       return iova;
 }
 
 static bool fsl_pamu_capable(enum iommu_cap cap)
@@ -399,10 +188,6 @@ static void fsl_pamu_domain_free(struct iommu_domain *domain)
 
        /* remove all the devices from the device list */
        detach_device(NULL, dma_domain);
-
-       dma_domain->enabled = 0;
-       dma_domain->mapped = 0;
-
        kmem_cache_free(fsl_pamu_domain_cache, dma_domain);
 }
 
@@ -413,12 +198,15 @@ static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type)
        if (type != IOMMU_DOMAIN_UNMANAGED)
                return NULL;
 
-       dma_domain = iommu_alloc_dma_domain();
-       if (!dma_domain) {
-               pr_debug("dma_domain allocation failed\n");
+       dma_domain = kmem_cache_zalloc(fsl_pamu_domain_cache, GFP_KERNEL);
+       if (!dma_domain)
                return NULL;
-       }
-       /* defaul geometry 64 GB i.e. maximum system address */
+
+       dma_domain->stash_id = ~(u32)0;
+       INIT_LIST_HEAD(&dma_domain->devices);
+       spin_lock_init(&dma_domain->domain_lock);
+
+       /* default geometry 64 GB i.e. maximum system address */
        dma_domain->iommu_domain. geometry.aperture_start = 0;
        dma_domain->iommu_domain.geometry.aperture_end = (1ULL << 36) - 1;
        dma_domain->iommu_domain.geometry.force_aperture = true;
@@ -426,24 +214,6 @@ static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type)
        return &dma_domain->iommu_domain;
 }
 
-/* Configure geometry settings for all LIODNs associated with domain */
-static int pamu_set_domain_geometry(struct fsl_dma_domain *dma_domain,
-                                   struct iommu_domain_geometry *geom_attr,
-                                   u32 win_cnt)
-{
-       struct device_domain_info *info;
-       int ret = 0;
-
-       list_for_each_entry(info, &dma_domain->devices, link) {
-               ret = pamu_set_liodn(info->liodn, info->dev, dma_domain,
-                                    geom_attr, win_cnt);
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
-
 /* Update stash destination for all LIODNs associated with the domain */
 static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
 {
@@ -459,198 +229,13 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
        return ret;
 }
 
-/* Update domain mappings for all LIODNs associated with the domain */
-static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
-{
-       struct device_domain_info *info;
-       int ret = 0;
-
-       list_for_each_entry(info, &dma_domain->devices, link) {
-               ret = update_liodn(info->liodn, dma_domain, wnd_nr);
-               if (ret)
-                       break;
-       }
-       return ret;
-}
-
-static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
-{
-       struct device_domain_info *info;
-       int ret = 0;
-
-       list_for_each_entry(info, &dma_domain->devices, link) {
-               if (dma_domain->win_cnt == 1 && dma_domain->enabled) {
-                       ret = pamu_disable_liodn(info->liodn);
-                       if (!ret)
-                               dma_domain->enabled = 0;
-               } else {
-                       ret = pamu_disable_spaace(info->liodn, wnd_nr);
-               }
-       }
-
-       return ret;
-}
-
-static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr)
-{
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&dma_domain->domain_lock, flags);
-       if (!dma_domain->win_arr) {
-               pr_debug("Number of windows not configured\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return;
-       }
-
-       if (wnd_nr >= dma_domain->win_cnt) {
-               pr_debug("Invalid window index\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return;
-       }
-
-       if (dma_domain->win_arr[wnd_nr].valid) {
-               ret = disable_domain_win(dma_domain, wnd_nr);
-               if (!ret) {
-                       dma_domain->win_arr[wnd_nr].valid = 0;
-                       dma_domain->mapped--;
-               }
-       }
-
-       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-}
-
-static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-                                 phys_addr_t paddr, u64 size, int prot)
-{
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-       struct dma_window *wnd;
-       int pamu_prot = 0;
-       int ret;
-       unsigned long flags;
-       u64 win_size;
-
-       if (prot & IOMMU_READ)
-               pamu_prot |= PAACE_AP_PERMS_QUERY;
-       if (prot & IOMMU_WRITE)
-               pamu_prot |= PAACE_AP_PERMS_UPDATE;
-
-       spin_lock_irqsave(&dma_domain->domain_lock, flags);
-       if (!dma_domain->win_arr) {
-               pr_debug("Number of windows not configured\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return -ENODEV;
-       }
-
-       if (wnd_nr >= dma_domain->win_cnt) {
-               pr_debug("Invalid window index\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return -EINVAL;
-       }
-
-       win_size = dma_domain->geom_size >> ilog2(dma_domain->win_cnt);
-       if (size > win_size) {
-               pr_debug("Invalid window size\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return -EINVAL;
-       }
-
-       if (dma_domain->win_cnt == 1) {
-               if (dma_domain->enabled) {
-                       pr_debug("Disable the window before updating the mapping\n");
-                       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-                       return -EBUSY;
-               }
-
-               ret = check_size(size, domain->geometry.aperture_start);
-               if (ret) {
-                       pr_debug("Aperture start not aligned to the size\n");
-                       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-                       return -EINVAL;
-               }
-       }
-
-       wnd = &dma_domain->win_arr[wnd_nr];
-       if (!wnd->valid) {
-               wnd->paddr = paddr;
-               wnd->size = size;
-               wnd->prot = pamu_prot;
-
-               ret = update_domain_mapping(dma_domain, wnd_nr);
-               if (!ret) {
-                       wnd->valid = 1;
-                       dma_domain->mapped++;
-               }
-       } else {
-               pr_debug("Disable the window before updating the mapping\n");
-               ret = -EBUSY;
-       }
-
-       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-       return ret;
-}
-
-/*
- * Attach the LIODN to the DMA domain and configure the geometry
- * and window mappings.
- */
-static int handle_attach_device(struct fsl_dma_domain *dma_domain,
-                               struct device *dev, const u32 *liodn,
-                               int num)
-{
-       unsigned long flags;
-       struct iommu_domain *domain = &dma_domain->iommu_domain;
-       int ret = 0;
-       int i;
-
-       spin_lock_irqsave(&dma_domain->domain_lock, flags);
-       for (i = 0; i < num; i++) {
-               /* Ensure that LIODN value is valid */
-               if (liodn[i] >= PAACE_NUMBER_ENTRIES) {
-                       pr_debug("Invalid liodn %d, attach device failed for %pOF\n",
-                                liodn[i], dev->of_node);
-                       ret = -EINVAL;
-                       break;
-               }
-
-               attach_device(dma_domain, liodn[i], dev);
-               /*
-                * Check if geometry has already been configured
-                * for the domain. If yes, set the geometry for
-                * the LIODN.
-                */
-               if (dma_domain->win_arr) {
-                       u32 win_cnt = dma_domain->win_cnt > 1 ? dma_domain->win_cnt : 0;
-
-                       ret = pamu_set_liodn(liodn[i], dev, dma_domain,
-                                            &domain->geometry, win_cnt);
-                       if (ret)
-                               break;
-                       if (dma_domain->mapped) {
-                               /*
-                                * Create window/subwindow mapping for
-                                * the LIODN.
-                                */
-                               ret = map_liodn(liodn[i], dma_domain);
-                               if (ret)
-                                       break;
-                       }
-               }
-       }
-       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-       return ret;
-}
-
 static int fsl_pamu_attach_device(struct iommu_domain *domain,
                                  struct device *dev)
 {
        struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
+       unsigned long flags;
+       int len, ret = 0, i;
        const u32 *liodn;
-       u32 liodn_cnt;
-       int len, ret = 0;
        struct pci_dev *pdev = NULL;
        struct pci_controller *pci_ctl;
 
@@ -670,14 +255,30 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
        }
 
        liodn = of_get_property(dev->of_node, "fsl,liodn", &len);
-       if (liodn) {
-               liodn_cnt = len / sizeof(u32);
-               ret = handle_attach_device(dma_domain, dev, liodn, liodn_cnt);
-       } else {
+       if (!liodn) {
                pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node);
-               ret = -EINVAL;
+               return -EINVAL;
        }
 
+       spin_lock_irqsave(&dma_domain->domain_lock, flags);
+       for (i = 0; i < len / sizeof(u32); i++) {
+               /* Ensure that LIODN value is valid */
+               if (liodn[i] >= PAACE_NUMBER_ENTRIES) {
+                       pr_debug("Invalid liodn %d, attach device failed for %pOF\n",
+                                liodn[i], dev->of_node);
+                       ret = -EINVAL;
+                       break;
+               }
+
+               attach_device(dma_domain, liodn[i], dev);
+               ret = pamu_set_liodn(dma_domain, dev, liodn[i]);
+               if (ret)
+                       break;
+               ret = pamu_enable_liodn(liodn[i]);
+               if (ret)
+                       break;
+       }
+       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
        return ret;
 }
 
@@ -712,202 +313,26 @@ static void fsl_pamu_detach_device(struct iommu_domain *domain,
                pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node);
 }
 
-static  int configure_domain_geometry(struct iommu_domain *domain, void *data)
-{
-       struct iommu_domain_geometry *geom_attr = data;
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-       dma_addr_t geom_size;
-       unsigned long flags;
-
-       geom_size = geom_attr->aperture_end - geom_attr->aperture_start + 1;
-       /*
-        * Sanity check the geometry size. Also, we do not support
-        * DMA outside of the geometry.
-        */
-       if (check_size(geom_size, geom_attr->aperture_start) ||
-           !geom_attr->force_aperture) {
-               pr_debug("Invalid PAMU geometry attributes\n");
-               return -EINVAL;
-       }
-
-       spin_lock_irqsave(&dma_domain->domain_lock, flags);
-       if (dma_domain->enabled) {
-               pr_debug("Can't set geometry attributes as domain is active\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return  -EBUSY;
-       }
-
-       /* Copy the domain geometry information */
-       memcpy(&domain->geometry, geom_attr,
-              sizeof(struct iommu_domain_geometry));
-       dma_domain->geom_size = geom_size;
-
-       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-       return 0;
-}
-
 /* Set the domain stash attribute */
-static int configure_domain_stash(struct fsl_dma_domain *dma_domain, void *data)
+int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu)
 {
-       struct pamu_stash_attribute *stash_attr = data;
+       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
        unsigned long flags;
        int ret;
 
        spin_lock_irqsave(&dma_domain->domain_lock, flags);
-
-       memcpy(&dma_domain->dma_stash, stash_attr,
-              sizeof(struct pamu_stash_attribute));
-
-       dma_domain->stash_id = get_stash_id(stash_attr->cache,
-                                           stash_attr->cpu);
+       dma_domain->stash_id = get_stash_id(PAMU_ATTR_CACHE_L1, cpu);
        if (dma_domain->stash_id == ~(u32)0) {
                pr_debug("Invalid stash attributes\n");
                spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
                return -EINVAL;
        }
-
        ret = update_domain_stash(dma_domain, dma_domain->stash_id);
-
        spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 
        return ret;
 }
 
-/* Configure domain dma state i.e. enable/disable DMA */
-static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool enable)
-{
-       struct device_domain_info *info;
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&dma_domain->domain_lock, flags);
-
-       if (enable && !dma_domain->mapped) {
-               pr_debug("Can't enable DMA domain without valid mapping\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return -ENODEV;
-       }
-
-       dma_domain->enabled = enable;
-       list_for_each_entry(info, &dma_domain->devices, link) {
-               ret = (enable) ? pamu_enable_liodn(info->liodn) :
-                       pamu_disable_liodn(info->liodn);
-               if (ret)
-                       pr_debug("Unable to set dma state for liodn %d",
-                                info->liodn);
-       }
-       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-       return 0;
-}
-
-static int fsl_pamu_set_windows(struct iommu_domain *domain, u32 w_count)
-{
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&dma_domain->domain_lock, flags);
-       /* Ensure domain is inactive i.e. DMA should be disabled for the domain */
-       if (dma_domain->enabled) {
-               pr_debug("Can't set geometry attributes as domain is active\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return  -EBUSY;
-       }
-
-       /* Ensure that the geometry has been set for the domain */
-       if (!dma_domain->geom_size) {
-               pr_debug("Please configure geometry before setting the number of windows\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return -EINVAL;
-       }
-
-       /*
-        * Ensure we have valid window count i.e. it should be less than
-        * maximum permissible limit and should be a power of two.
-        */
-       if (w_count > pamu_get_max_subwin_cnt() || !is_power_of_2(w_count)) {
-               pr_debug("Invalid window count\n");
-               spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-               return -EINVAL;
-       }
-
-       ret = pamu_set_domain_geometry(dma_domain, &domain->geometry,
-                                      w_count > 1 ? w_count : 0);
-       if (!ret) {
-               kfree(dma_domain->win_arr);
-               dma_domain->win_arr = kcalloc(w_count,
-                                             sizeof(*dma_domain->win_arr),
-                                             GFP_ATOMIC);
-               if (!dma_domain->win_arr) {
-                       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-                       return -ENOMEM;
-               }
-               dma_domain->win_cnt = w_count;
-       }
-       spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-       return ret;
-}
-
-static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
-                                   enum iommu_attr attr_type, void *data)
-{
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-       int ret = 0;
-
-       switch (attr_type) {
-       case DOMAIN_ATTR_GEOMETRY:
-               ret = configure_domain_geometry(domain, data);
-               break;
-       case DOMAIN_ATTR_FSL_PAMU_STASH:
-               ret = configure_domain_stash(dma_domain, data);
-               break;
-       case DOMAIN_ATTR_FSL_PAMU_ENABLE:
-               ret = configure_domain_dma_state(dma_domain, *(int *)data);
-               break;
-       case DOMAIN_ATTR_WINDOWS:
-               ret = fsl_pamu_set_windows(domain, *(u32 *)data);
-               break;
-       default:
-               pr_debug("Unsupported attribute type\n");
-               ret = -EINVAL;
-               break;
-       }
-
-       return ret;
-}
-
-static int fsl_pamu_get_domain_attr(struct iommu_domain *domain,
-                                   enum iommu_attr attr_type, void *data)
-{
-       struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-       int ret = 0;
-
-       switch (attr_type) {
-       case DOMAIN_ATTR_FSL_PAMU_STASH:
-               memcpy(data, &dma_domain->dma_stash,
-                      sizeof(struct pamu_stash_attribute));
-               break;
-       case DOMAIN_ATTR_FSL_PAMU_ENABLE:
-               *(int *)data = dma_domain->enabled;
-               break;
-       case DOMAIN_ATTR_FSL_PAMUV1:
-               *(int *)data = DOMAIN_ATTR_FSL_PAMUV1;
-               break;
-       case DOMAIN_ATTR_WINDOWS:
-               *(u32 *)data = dma_domain->win_cnt;
-               break;
-       default:
-               pr_debug("Unsupported attribute type\n");
-               ret = -EINVAL;
-               break;
-       }
-
-       return ret;
-}
-
 static struct iommu_group *get_device_iommu_group(struct device *dev)
 {
        struct iommu_group *group;
@@ -1031,11 +456,7 @@ static const struct iommu_ops fsl_pamu_ops = {
        .domain_free    = fsl_pamu_domain_free,
        .attach_dev     = fsl_pamu_attach_device,
        .detach_dev     = fsl_pamu_detach_device,
-       .domain_window_enable = fsl_pamu_window_enable,
-       .domain_window_disable = fsl_pamu_window_disable,
        .iova_to_phys   = fsl_pamu_iova_to_phys,
-       .domain_set_attr = fsl_pamu_set_domain_attr,
-       .domain_get_attr = fsl_pamu_get_domain_attr,
        .probe_device   = fsl_pamu_probe_device,
        .release_device = fsl_pamu_release_device,
        .device_group   = fsl_pamu_device_group,
@@ -1053,9 +474,7 @@ int __init pamu_domain_init(void)
        if (ret)
                return ret;
 
-       iommu_device_set_ops(&pamu_iommu, &fsl_pamu_ops);
-
-       ret = iommu_device_register(&pamu_iommu);
+       ret = iommu_device_register(&pamu_iommu, &fsl_pamu_ops, NULL);
        if (ret) {
                iommu_device_sysfs_remove(&pamu_iommu);
                pr_err("Can't register iommu device\n");
index 2865d42..95ac1b3 100644 (file)
@@ -9,56 +9,10 @@
 
 #include "fsl_pamu.h"
 
-struct dma_window {
-       phys_addr_t paddr;
-       u64 size;
-       int valid;
-       int prot;
-};
-
 struct fsl_dma_domain {
-       /*
-        * Indicates the geometry size for the domain.
-        * This would be set when the geometry is
-        * configured for the domain.
-        */
-       dma_addr_t                      geom_size;
-       /*
-        * Number of windows assocaited with this domain.
-        * During domain initialization, it is set to the
-        * the maximum number of subwindows allowed for a LIODN.
-        * Minimum value for this is 1 indicating a single PAMU
-        * window, without any sub windows. Value can be set/
-        * queried by set_attr/get_attr API for DOMAIN_ATTR_WINDOWS.
-        * Value can only be set once the geometry has been configured.
-        */
-       u32                             win_cnt;
-       /*
-        * win_arr contains information of the configured
-        * windows for a domain. This is allocated only
-        * when the number of windows for the domain are
-        * set.
-        */
-       struct dma_window               *win_arr;
        /* list of devices associated with the domain */
        struct list_head                devices;
-       /* dma_domain states:
-        * mapped - A particular mapping has been created
-        * within the configured geometry.
-        * enabled - DMA has been enabled for the given
-        * domain. This translates to setting of the
-        * valid bit for the primary PAACE in the PAMU
-        * PAACT table. Domain geometry should be set and
-        * it must have a valid mapping before DMA can be
-        * enabled for it.
-        *
-        */
-       int                             mapped;
-       int                             enabled;
-       /* stash_id obtained from the stash attribute details */
        u32                             stash_id;
-       struct pamu_stash_attribute     dma_stash;
-       u32                             snoop_id;
        struct iommu_domain             iommu_domain;
        spinlock_t                      domain_lock;
 };
index d5c51b5..1757ac1 100644 (file)
@@ -1140,9 +1140,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
                if (err)
                        goto err_unmap;
 
-               iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
-
-               err = iommu_device_register(&iommu->iommu);
+               err = iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
                if (err)
                        goto err_unmap;
        }
@@ -1205,6 +1203,63 @@ static inline void reclaim_free_desc(struct q_inval *qi)
        }
 }
 
+static const char *qi_type_string(u8 type)
+{
+       switch (type) {
+       case QI_CC_TYPE:
+               return "Context-cache Invalidation";
+       case QI_IOTLB_TYPE:
+               return "IOTLB Invalidation";
+       case QI_DIOTLB_TYPE:
+               return "Device-TLB Invalidation";
+       case QI_IEC_TYPE:
+               return "Interrupt Entry Cache Invalidation";
+       case QI_IWD_TYPE:
+               return "Invalidation Wait";
+       case QI_EIOTLB_TYPE:
+               return "PASID-based IOTLB Invalidation";
+       case QI_PC_TYPE:
+               return "PASID-cache Invalidation";
+       case QI_DEIOTLB_TYPE:
+               return "PASID-based Device-TLB Invalidation";
+       case QI_PGRP_RESP_TYPE:
+               return "Page Group Response";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+static void qi_dump_fault(struct intel_iommu *iommu, u32 fault)
+{
+       unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG);
+       u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG);
+       struct qi_desc *desc = iommu->qi->desc + head;
+
+       if (fault & DMA_FSTS_IQE)
+               pr_err("VT-d detected Invalidation Queue Error: Reason %llx",
+                      DMAR_IQER_REG_IQEI(iqe_err));
+       if (fault & DMA_FSTS_ITE)
+               pr_err("VT-d detected Invalidation Time-out Error: SID %llx",
+                      DMAR_IQER_REG_ITESID(iqe_err));
+       if (fault & DMA_FSTS_ICE)
+               pr_err("VT-d detected Invalidation Completion Error: SID %llx",
+                      DMAR_IQER_REG_ICESID(iqe_err));
+
+       pr_err("QI HEAD: %s qw0 = 0x%llx, qw1 = 0x%llx\n",
+              qi_type_string(desc->qw0 & 0xf),
+              (unsigned long long)desc->qw0,
+              (unsigned long long)desc->qw1);
+
+       head = ((head >> qi_shift(iommu)) + QI_LENGTH - 1) % QI_LENGTH;
+       head <<= qi_shift(iommu);
+       desc = iommu->qi->desc + head;
+
+       pr_err("QI PRIOR: %s qw0 = 0x%llx, qw1 = 0x%llx\n",
+              qi_type_string(desc->qw0 & 0xf),
+              (unsigned long long)desc->qw0,
+              (unsigned long long)desc->qw1);
+}
+
 static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 {
        u32 fault;
@@ -1216,6 +1271,8 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
                return -EAGAIN;
 
        fault = readl(iommu->reg + DMAR_FSTS_REG);
+       if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE))
+               qi_dump_fault(iommu, fault);
 
        /*
         * If IQE happens, the head points to the descriptor associated
@@ -1232,12 +1289,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
                         * used by software as private data. We won't print
                         * out these two qw's for security consideration.
                         */
-                       pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n",
-                              (unsigned long long)desc->qw0,
-                              (unsigned long long)desc->qw1);
                        memcpy(desc, qi->desc + (wait_index << shift),
                               1 << shift);
                        writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+                       pr_info("Invalidation Queue Error (IQE) cleared\n");
                        return -EINVAL;
                }
        }
@@ -1254,6 +1309,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
                tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
 
                writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+               pr_info("Invalidation Time-out Error (ITE) cleared\n");
 
                do {
                        if (qi->desc_status[head] == QI_IN_USE)
@@ -1265,8 +1321,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
                        return -EAGAIN;
        }
 
-       if (fault & DMA_FSTS_ICE)
+       if (fault & DMA_FSTS_ICE) {
                writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+               pr_info("Invalidation Completion Error (ICE) cleared\n");
+       }
 
        return 0;
 }
index ee09323..708f430 100644 (file)
@@ -360,7 +360,6 @@ int intel_iommu_enabled = 0;
 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 
 static int dmar_map_gfx = 1;
-static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
 static int iommu_identity_mapping;
@@ -451,8 +450,8 @@ static int __init intel_iommu_setup(char *str)
                        dmar_map_gfx = 0;
                        pr_info("Disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
-                       pr_info("Forcing DAC for PCI devices\n");
-                       dmar_forcedac = 1;
+                       pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
+                       iommu_dma_forcedac = true;
                } else if (!strncmp(str, "strict", 6)) {
                        pr_info("Disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
@@ -658,7 +657,14 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip)
        rcu_read_lock();
        for_each_active_iommu(iommu, drhd) {
                if (iommu != skip) {
-                       if (!ecap_sc_support(iommu->ecap)) {
+                       /*
+                        * If the hardware is operating in the scalable mode,
+                        * the snooping control is always supported since we
+                        * always set PASID-table-entry.PGSNP bit if the domain
+                        * is managed outside (UNMANAGED).
+                        */
+                       if (!sm_supported(iommu) &&
+                           !ecap_sc_support(iommu->ecap)) {
                                ret = 0;
                                break;
                        }
@@ -1340,6 +1346,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu)
                      readl, (sts & DMA_GSTS_RTPS), sts);
 
        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+       iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+       if (sm_supported(iommu))
+               qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
+       iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 }
 
 void iommu_flush_write_buffer(struct intel_iommu *iommu)
@@ -2289,6 +2300,41 @@ static inline int hardware_largepage_caps(struct dmar_domain *domain,
        return level;
 }
 
+/*
+ * Ensure that old small page tables are removed to make room for superpage(s).
+ * We're going to add new large pages, so make sure we don't remove their parent
+ * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
+ */
+static void switch_to_super_page(struct dmar_domain *domain,
+                                unsigned long start_pfn,
+                                unsigned long end_pfn, int level)
+{
+       unsigned long lvl_pages = lvl_to_nr_pages(level);
+       struct dma_pte *pte = NULL;
+       int i;
+
+       while (start_pfn <= end_pfn) {
+               if (!pte)
+                       pte = pfn_to_dma_pte(domain, start_pfn, &level);
+
+               if (dma_pte_present(pte)) {
+                       dma_pte_free_pagetable(domain, start_pfn,
+                                              start_pfn + lvl_pages - 1,
+                                              level + 1);
+
+                       for_each_domain_iommu(i, domain)
+                               iommu_flush_iotlb_psi(g_iommus[i], domain,
+                                                     start_pfn, lvl_pages,
+                                                     0, 0);
+               }
+
+               pte++;
+               start_pfn += lvl_pages;
+               if (first_pte_in_page(pte))
+                       pte = NULL;
+       }
+}
+
 static int
 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                 unsigned long phys_pfn, unsigned long nr_pages, int prot)
@@ -2305,8 +2351,9 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                return -EINVAL;
 
        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
+       attr |= DMA_FL_PTE_PRESENT;
        if (domain_use_first_level(domain)) {
-               attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
+               attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
 
                if (domain->domain.type == IOMMU_DOMAIN_DMA) {
                        attr |= DMA_FL_PTE_ACCESS;
@@ -2329,22 +2376,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                return -ENOMEM;
                        /* It is large page*/
                        if (largepage_lvl > 1) {
-                               unsigned long nr_superpages, end_pfn;
+                               unsigned long end_pfn;
 
                                pteval |= DMA_PTE_LARGE_PAGE;
-                               lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
-                               nr_superpages = nr_pages / lvl_pages;
-                               end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
-
-                               /*
-                                * Ensure that old small page tables are
-                                * removed to make room for superpage(s).
-                                * We're adding new large pages, so make sure
-                                * we don't remove their parent tables.
-                                */
-                               dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
-                                                      largepage_lvl + 1);
+                               end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
+                               switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }
@@ -2422,6 +2458,10 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn
                                   (((u16)bus) << 8) | devfn,
                                   DMA_CCMD_MASK_NOBIT,
                                   DMA_CCMD_DEVICE_INVL);
+
+       if (sm_supported(iommu))
+               qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
+
        iommu->flush.flush_iotlb(iommu,
                                 did_old,
                                 0,
@@ -2505,6 +2545,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu,
 
        flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
 
+       if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
+               flags |= PASID_FLAG_PAGE_SNOOP;
+
        return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
                                             domain->iommu_did[iommu->seq_id],
                                             flags);
@@ -3267,8 +3310,6 @@ static int __init init_dmars(void)
                register_pasid_allocator(iommu);
 #endif
                iommu_set_root_entry(iommu);
-               iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
-               iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        }
 
 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
@@ -3458,12 +3499,7 @@ static int init_iommu_hw(void)
                }
 
                iommu_flush_write_buffer(iommu);
-
                iommu_set_root_entry(iommu);
-
-               iommu->flush.flush_context(iommu, 0, 0, 0,
-                                          DMA_CCMD_GLOBAL_INVL);
-               iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
                iommu_enable_translation(iommu);
                iommu_disable_protect_mem_regions(iommu);
        }
@@ -3846,8 +3882,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
                goto disable_iommu;
 
        iommu_set_root_entry(iommu);
-       iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
-       iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        iommu_enable_translation(iommu);
 
        iommu_disable_protect_mem_regions(iommu);
@@ -4065,35 +4099,6 @@ static struct notifier_block intel_iommu_memory_nb = {
        .priority = 0
 };
 
-static void free_all_cpu_cached_iovas(unsigned int cpu)
-{
-       int i;
-
-       for (i = 0; i < g_num_of_iommus; i++) {
-               struct intel_iommu *iommu = g_iommus[i];
-               struct dmar_domain *domain;
-               int did;
-
-               if (!iommu)
-                       continue;
-
-               for (did = 0; did < cap_ndoms(iommu->cap); did++) {
-                       domain = get_iommu_domain(iommu, (u16)did);
-
-                       if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
-                               continue;
-
-                       iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
-               }
-       }
-}
-
-static int intel_iommu_cpu_dead(unsigned int cpu)
-{
-       free_all_cpu_cached_iovas(cpu);
-       return 0;
-}
-
 static void intel_disable_iommus(void)
 {
        struct intel_iommu *iommu = NULL;
@@ -4377,19 +4382,28 @@ int __init intel_iommu_init(void)
 
        down_read(&dmar_global_lock);
        for_each_active_iommu(iommu, drhd) {
+               /*
+                * The flush queue implementation does not perform
+                * page-selective invalidations that are required for efficient
+                * TLB flushes in virtual environments.  The benefit of batching
+                * is likely to be much lower than the overhead of synchronizing
+                * the virtual and physical IOMMU page-tables.
+                */
+               if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
+                       pr_warn("IOMMU batching is disabled due to virtualization");
+                       intel_iommu_strict = 1;
+               }
                iommu_device_sysfs_add(&iommu->iommu, NULL,
                                       intel_iommu_groups,
                                       "%s", iommu->name);
-               iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
-               iommu_device_register(&iommu->iommu);
+               iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
        }
        up_read(&dmar_global_lock);
 
+       iommu_set_dma_strict(intel_iommu_strict);
        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);
-       cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
-                         intel_iommu_cpu_dead);
 
        down_read(&dmar_global_lock);
        if (probe_acpi_namespace_devices())
@@ -5343,6 +5357,8 @@ static int siov_find_pci_dvsec(struct pci_dev *pdev)
 static bool
 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
 {
+       struct device_domain_info *info = get_domain_info(dev);
+
        if (feat == IOMMU_DEV_FEAT_AUX) {
                int ret;
 
@@ -5357,13 +5373,13 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
                return !!siov_find_pci_dvsec(to_pci_dev(dev));
        }
 
-       if (feat == IOMMU_DEV_FEAT_SVA) {
-               struct device_domain_info *info = get_domain_info(dev);
+       if (feat == IOMMU_DEV_FEAT_IOPF)
+               return info && info->pri_supported;
 
+       if (feat == IOMMU_DEV_FEAT_SVA)
                return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
                        info->pasid_supported && info->pri_supported &&
                        info->ats_supported;
-       }
 
        return false;
 }
@@ -5374,12 +5390,18 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
        if (feat == IOMMU_DEV_FEAT_AUX)
                return intel_iommu_enable_auxd(dev);
 
+       if (feat == IOMMU_DEV_FEAT_IOPF)
+               return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
+
        if (feat == IOMMU_DEV_FEAT_SVA) {
                struct device_domain_info *info = get_domain_info(dev);
 
                if (!info)
                        return -EINVAL;
 
+               if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
+                       return -EINVAL;
+
                if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
                        return 0;
        }
@@ -5423,87 +5445,23 @@ static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
 }
 
 static int
-intel_iommu_domain_set_attr(struct iommu_domain *domain,
-                           enum iommu_attr attr, void *data)
+intel_iommu_enable_nesting(struct iommu_domain *domain)
 {
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long flags;
-       int ret = 0;
-
-       if (domain->type != IOMMU_DOMAIN_UNMANAGED)
-               return -EINVAL;
+       int ret = -ENODEV;
 
-       switch (attr) {
-       case DOMAIN_ATTR_NESTING:
-               spin_lock_irqsave(&device_domain_lock, flags);
-               if (nested_mode_support() &&
-                   list_empty(&dmar_domain->devices)) {
-                       dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
-                       dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
-               } else {
-                       ret = -ENODEV;
-               }
-               spin_unlock_irqrestore(&device_domain_lock, flags);
-               break;
-       default:
-               ret = -EINVAL;
-               break;
+       spin_lock_irqsave(&device_domain_lock, flags);
+       if (nested_mode_support() && list_empty(&dmar_domain->devices)) {
+               dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
+               dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
+               ret = 0;
        }
+       spin_unlock_irqrestore(&device_domain_lock, flags);
 
        return ret;
 }
 
-static bool domain_use_flush_queue(void)
-{
-       struct dmar_drhd_unit *drhd;
-       struct intel_iommu *iommu;
-       bool r = true;
-
-       if (intel_iommu_strict)
-               return false;
-
-       /*
-        * The flush queue implementation does not perform page-selective
-        * invalidations that are required for efficient TLB flushes in virtual
-        * environments. The benefit of batching is likely to be much lower than
-        * the overhead of synchronizing the virtual and physical IOMMU
-        * page-tables.
-        */
-       rcu_read_lock();
-       for_each_active_iommu(iommu, drhd) {
-               if (!cap_caching_mode(iommu->cap))
-                       continue;
-
-               pr_warn_once("IOMMU batching is disabled due to virtualization");
-               r = false;
-               break;
-       }
-       rcu_read_unlock();
-
-       return r;
-}
-
-static int
-intel_iommu_domain_get_attr(struct iommu_domain *domain,
-                           enum iommu_attr attr, void *data)
-{
-       switch (domain->type) {
-       case IOMMU_DOMAIN_UNMANAGED:
-               return -ENODEV;
-       case IOMMU_DOMAIN_DMA:
-               switch (attr) {
-               case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-                       *(int *)data = domain_use_flush_queue();
-                       return 0;
-               default:
-                       return -ENODEV;
-               }
-               break;
-       default:
-               return -EINVAL;
-       }
-}
-
 /*
  * Check that the device does not live on an external facing PCI port that is
  * marked as untrusted. Such devices should not be able to apply quirks and
@@ -5576,8 +5534,7 @@ const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .domain_alloc           = intel_iommu_domain_alloc,
        .domain_free            = intel_iommu_domain_free,
-       .domain_get_attr        = intel_iommu_domain_get_attr,
-       .domain_set_attr        = intel_iommu_domain_set_attr,
+       .enable_nesting         = intel_iommu_enable_nesting,
        .attach_dev             = intel_iommu_attach_device,
        .detach_dev             = intel_iommu_detach_device,
        .aux_attach_dev         = intel_iommu_aux_attach_device,
index 611ef52..f912fe4 100644 (file)
@@ -736,7 +736,7 @@ static int __init intel_prepare_irq_remapping(void)
                return -ENODEV;
 
        if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL))
-               goto error;
+               return -ENODEV;
 
        if (!dmar_ir_support())
                return -ENODEV;
@@ -1280,7 +1280,8 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
                break;
        case X86_IRQ_ALLOC_TYPE_PCI_MSI:
        case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
-               set_msi_sid(irte, msi_desc_to_pci_dev(info->desc));
+               set_msi_sid(irte,
+                           pci_real_dma_dev(msi_desc_to_pci_dev(info->desc)));
                break;
        default:
                BUG_ON(1);
index f26cb61..72646ba 100644 (file)
@@ -24,7 +24,6 @@
 /*
  * Intel IOMMU system wide PASID name space:
  */
-static DEFINE_SPINLOCK(pasid_lock);
 u32 intel_pasid_max_id = PASID_MAX;
 
 int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid)
@@ -231,7 +230,7 @@ struct pasid_table *intel_pasid_get_table(struct device *dev)
        return info->pasid_table;
 }
 
-int intel_pasid_get_dev_max_id(struct device *dev)
+static int intel_pasid_get_dev_max_id(struct device *dev)
 {
        struct device_domain_info *info;
 
@@ -242,7 +241,7 @@ int intel_pasid_get_dev_max_id(struct device *dev)
        return info->pasid_table->max_pasid;
 }
 
-struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
+static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
 {
        struct device_domain_info *info;
        struct pasid_table *pasid_table;
@@ -259,19 +258,25 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
        dir_index = pasid >> PASID_PDE_SHIFT;
        index = pasid & PASID_PTE_MASK;
 
-       spin_lock(&pasid_lock);
+retry:
        entries = get_pasid_table_from_pde(&dir[dir_index]);
        if (!entries) {
                entries = alloc_pgtable_page(info->iommu->node);
-               if (!entries) {
-                       spin_unlock(&pasid_lock);
+               if (!entries)
                        return NULL;
-               }
 
-               WRITE_ONCE(dir[dir_index].val,
-                          (u64)virt_to_phys(entries) | PASID_PTE_PRESENT);
+               /*
+                * The pasid directory table entry won't be freed after
+                * allocation. No worry about the race with free and
+                * clear. However, this entry might be populated by others
+                * while we are preparing it. Use theirs with a retry.
+                */
+               if (cmpxchg64(&dir[dir_index].val, 0ULL,
+                             (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
+                       free_pgtable_page(entries);
+                       goto retry;
+               }
        }
-       spin_unlock(&pasid_lock);
 
        return &entries[index];
 }
@@ -393,6 +398,15 @@ static inline void pasid_set_sre(struct pasid_entry *pe)
        pasid_set_bits(&pe->val[2], 1 << 0, 1);
 }
 
+/*
+ * Setup the WPE(Write Protect Enable) field (Bit 132) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_wpe(struct pasid_entry *pe)
+{
+       pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4);
+}
+
 /*
  * Setup the P(Present) field (Bit 0) of a scalable mode PASID
  * entry.
@@ -411,6 +425,16 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
        pasid_set_bits(&pe->val[1], 1 << 23, value << 23);
 }
 
+/*
+ * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode
+ * PASID entry.
+ */
+static inline void
+pasid_set_pgsnp(struct pasid_entry *pe)
+{
+       pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24);
+}
+
 /*
  * Setup the First Level Page table Pointer field (Bit 140~191)
  * of a scalable mode PASID entry.
@@ -493,6 +517,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
        if (WARN_ON(!pte))
                return;
 
+       if (!(pte->val[0] & PASID_PTE_PRESENT))
+               return;
+
        did = pasid_get_domain_id(pte);
        intel_pasid_clear_entry(dev, pasid, fault_ignore);
 
@@ -522,6 +549,22 @@ static void pasid_flush_caches(struct intel_iommu *iommu,
        }
 }
 
+static inline int pasid_enable_wpe(struct pasid_entry *pte)
+{
+#ifdef CONFIG_X86
+       unsigned long cr0 = read_cr0();
+
+       /* CR0.WP is normally set but just to be sure */
+       if (unlikely(!(cr0 & X86_CR0_WP))) {
+               pr_err_ratelimited("No CPU write protect!\n");
+               return -EINVAL;
+       }
+#endif
+       pasid_set_wpe(pte);
+
+       return 0;
+}
+
 /*
  * Set up the scalable mode pasid table entry for first only
  * translation type.
@@ -553,6 +596,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
                        return -EINVAL;
                }
                pasid_set_sre(pte);
+               if (pasid_enable_wpe(pte))
+                       return -EINVAL;
+
        }
 
        if (flags & PASID_FLAG_FL5LP) {
@@ -565,6 +611,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
                }
        }
 
+       if (flags & PASID_FLAG_PAGE_SNOOP)
+               pasid_set_pgsnp(pte);
+
        pasid_set_domain_id(pte, did);
        pasid_set_address_width(pte, iommu->agaw);
        pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
@@ -643,6 +692,9 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
        pasid_set_fault_enable(pte);
        pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
 
+       if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
+               pasid_set_pgsnp(pte);
+
        /*
         * Since it is a second level only translation setup, we should
         * set SRE bit as well (addresses are expected to be GPAs).
@@ -706,6 +758,9 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
                        return -EINVAL;
                }
                pasid_set_sre(pte);
+               /* Enable write protect WP if guest requested */
+               if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE)
+                       pasid_set_wpe(pte);
        }
 
        if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
index 444c0be..5ff61c3 100644 (file)
@@ -48,6 +48,7 @@
  */
 #define PASID_FLAG_SUPERVISOR_MODE     BIT(0)
 #define PASID_FLAG_NESTED              BIT(1)
+#define PASID_FLAG_PAGE_SNOOP          BIT(2)
 
 /*
  * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
@@ -99,14 +100,9 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte)
 }
 
 extern unsigned int intel_pasid_max_id;
-int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
-void intel_pasid_free_id(u32 pasid);
-void *intel_pasid_lookup_id(u32 pasid);
 int intel_pasid_alloc_table(struct device *dev);
 void intel_pasid_free_table(struct device *dev);
 struct pasid_table *intel_pasid_get_table(struct device *dev);
-int intel_pasid_get_dev_max_id(struct device *dev);
-struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid);
 int intel_pasid_setup_first_level(struct intel_iommu *iommu,
                                  struct device *dev, pgd_t *pgd,
                                  u32 pasid, u16 did, int flags);
index 574a7e6..5165cea 100644 (file)
@@ -462,13 +462,12 @@ static void load_pasid(struct mm_struct *mm, u32 pasid)
 /* Caller must hold pasid_mutex, mm reference */
 static int
 intel_svm_bind_mm(struct device *dev, unsigned int flags,
-                 struct svm_dev_ops *ops,
                  struct mm_struct *mm, struct intel_svm_dev **sd)
 {
        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+       struct intel_svm *svm = NULL, *t;
        struct device_domain_info *info;
        struct intel_svm_dev *sdev;
-       struct intel_svm *svm = NULL;
        unsigned long iflags;
        int pasid_max;
        int ret;
@@ -494,34 +493,26 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
                }
        }
 
-       if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
-               struct intel_svm *t;
-
-               list_for_each_entry(t, &global_svm_list, list) {
-                       if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
-                               continue;
-
-                       svm = t;
-                       if (svm->pasid >= pasid_max) {
-                               dev_warn(dev,
-                                        "Limited PASID width. Cannot use existing PASID %d\n",
-                                        svm->pasid);
-                               ret = -ENOSPC;
-                               goto out;
-                       }
+       list_for_each_entry(t, &global_svm_list, list) {
+               if (t->mm != mm)
+                       continue;
 
-                       /* Find the matching device in svm list */
-                       for_each_svm_dev(sdev, svm, dev) {
-                               if (sdev->ops != ops) {
-                                       ret = -EBUSY;
-                                       goto out;
-                               }
-                               sdev->users++;
-                               goto success;
-                       }
+               svm = t;
+               if (svm->pasid >= pasid_max) {
+                       dev_warn(dev,
+                                "Limited PASID width. Cannot use existing PASID %d\n",
+                                svm->pasid);
+                       ret = -ENOSPC;
+                       goto out;
+               }
 
-                       break;
+               /* Find the matching device in svm list */
+               for_each_svm_dev(sdev, svm, dev) {
+                       sdev->users++;
+                       goto success;
                }
+
+               break;
        }
 
        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
@@ -550,7 +541,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
 
        /* Finish the setup now we know we're keeping it */
        sdev->users = 1;
-       sdev->ops = ops;
        init_rcu_head(&sdev->rcu);
 
        if (!svm) {
@@ -862,7 +852,7 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
        /* Fill in event data for device specific processing */
        memset(&event, 0, sizeof(struct iommu_fault_event));
        event.fault.type = IOMMU_FAULT_PAGE_REQ;
-       event.fault.prm.addr = desc->addr;
+       event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
        event.fault.prm.pasid = desc->pasid;
        event.fault.prm.grpid = desc->prg_index;
        event.fault.prm.perm = prq_to_iommu_prot(desc);
@@ -895,6 +885,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
        struct intel_iommu *iommu = d;
        struct intel_svm *svm = NULL;
        int head, tail, handled = 0;
+       unsigned int flags = 0;
 
        /* Clear PPR bit before reading head/tail registers, to
         * ensure that we get a new interrupt if needed. */
@@ -920,7 +911,17 @@ static irqreturn_t prq_event_thread(int irq, void *d)
                               ((unsigned long long *)req)[1]);
                        goto no_pasid;
                }
-
+               /* We shall not receive page request for supervisor SVM */
+               if (req->pm_req && (req->rd_req | req->wr_req)) {
+                       pr_err("Unexpected page request in Privilege Mode");
+                       /* No need to find the matching sdev as for bad_req */
+                       goto no_pasid;
+               }
+               /* DMA read with exec request is not supported. */
+               if (req->exe_req && req->rd_req) {
+                       pr_err("Execution request not supported\n");
+                       goto no_pasid;
+               }
                if (!svm || svm->pasid != req->pasid) {
                        rcu_read_lock();
                        svm = ioasid_find(NULL, req->pasid, NULL);
@@ -982,9 +983,11 @@ static irqreturn_t prq_event_thread(int irq, void *d)
                if (access_error(vma, req))
                        goto invalid;
 
-               ret = handle_mm_fault(vma, address,
-                                     req->wr_req ? FAULT_FLAG_WRITE : 0,
-                                     NULL);
+               flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
+               if (req->wr_req)
+                       flags |= FAULT_FLAG_WRITE;
+
+               ret = handle_mm_fault(vma, address, flags, NULL);
                if (ret & VM_FAULT_ERROR)
                        goto invalid;
 
@@ -993,13 +996,6 @@ invalid:
                mmap_read_unlock(svm->mm);
                mmput(svm->mm);
 bad_req:
-               WARN_ON(!sdev);
-               if (sdev && sdev->ops && sdev->ops->fault_cb) {
-                       int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
-                               (req->exe_req << 1) | (req->pm_req);
-                       sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
-                                           req->priv_data, rwxp, result);
-               }
                /* We get here in the error case where the PASID lookup failed,
                   and these can be NULL. Do not use them below this point! */
                sdev = NULL;
@@ -1021,12 +1017,12 @@ no_pasid:
                                QI_PGRP_RESP_TYPE;
                        resp.qw1 = QI_PGRP_IDX(req->prg_index) |
                                QI_PGRP_LPIG(req->lpig);
+                       resp.qw2 = 0;
+                       resp.qw3 = 0;
 
                        if (req->priv_data_present)
                                memcpy(&resp.qw2, req->priv_data,
                                       sizeof(req->priv_data));
-                       resp.qw2 = 0;
-                       resp.qw3 = 0;
                        qi_submit_sync(iommu, &resp, 1, 0);
                }
 prq_advance:
@@ -1074,7 +1070,7 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
        if (drvdata)
                flags = *(unsigned int *)drvdata;
        mutex_lock(&pasid_mutex);
-       ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
+       ret = intel_svm_bind_mm(dev, flags, mm, &sdev);
        if (ret)
                sva = ERR_PTR(ret);
        else if (sdev)
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
new file mode 100644 (file)
index 0000000..1df8c1d
--- /dev/null
@@ -0,0 +1,461 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handle device page faults
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ */
+
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "iommu-sva-lib.h"
+
+/**
+ * struct iopf_queue - IO Page Fault queue
+ * @wq: the fault workqueue, on which complete fault groups are handled
+ * @devices: devices attached to this queue, as a list of
+ *           &struct iopf_device_param linked through their @queue_list
+ * @lock: protects the device list
+ *
+ * Allocated by iopf_queue_alloc() and released by iopf_queue_free().
+ */
+struct iopf_queue {
+       struct workqueue_struct         *wq;
+       struct list_head                devices;
+       struct mutex                    lock;
+};
+
+/**
+ * struct iopf_device_param - IO Page Fault data attached to a device
+ * @dev: the device that owns this param
+ * @queue: IOPF queue
+ * @queue_list: entry in queue->devices
+ * @partial: faults that are part of a Page Request Group for which the last
+ *           request hasn't been submitted yet.
+ *
+ * Allocated by iopf_queue_add_device(), which stores it in
+ * dev->iommu->iopf_param, and freed by iopf_queue_remove_device().
+ */
+struct iopf_device_param {
+       struct device                   *dev;
+       struct iopf_queue               *queue;
+       struct list_head                queue_list;
+       struct list_head                partial;
+};
+
+struct iopf_fault {
+       struct iommu_fault              fault;  /* copy of the reported fault */
+       struct list_head                list;   /* entry in a partial or group list */
+};
+
+struct iopf_group {
+       struct iopf_fault               last_fault;     /* embedded, freed with the group */
+       struct list_head                faults;         /* all faults, last_fault included */
+       struct work_struct              work;           /* runs iopf_handle_group() */
+       struct device                   *dev;           /* device that reported the faults */
+};
+
+/*
+ * Send the page response for a group of faults.
+ *
+ * The response carries the PASID and group ID of @iopf and the given @status.
+ * It is flagged as having a valid PASID only when the request both carried a
+ * valid PASID and asked for the PASID in the response.
+ *
+ * Returns whatever iommu_page_response() returns.
+ */
+static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf,
+                              enum iommu_page_response_code status)
+{
+       const struct iommu_fault_page_request *prm = &iopf->fault.prm;
+       bool pasid_valid = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+       bool needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+       struct iommu_page_response resp = {
+               .version                = IOMMU_PAGE_RESP_VERSION_1,
+               .pasid                  = prm->pasid,
+               .grpid                  = prm->grpid,
+               .code                   = status,
+       };
+
+       if (pasid_valid && needs_pasid)
+               resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
+
+       return iommu_page_response(dev, &resp);
+}
+
+/*
+ * Resolve one page request against the mm found for its PASID.
+ *
+ * Returns IOMMU_PAGE_RESP_SUCCESS if handle_mm_fault() reported no error, and
+ * IOMMU_PAGE_RESP_INVALID in every other case: the request has no valid
+ * PASID, no mm is found for the PASID, no VMA is found for the address, the
+ * VMA doesn't allow the requested access, or handling the fault failed.
+ */
+static enum iommu_page_response_code
+iopf_handle_single(struct iopf_fault *iopf)
+{
+       struct iommu_fault_page_request *prm = &iopf->fault.prm;
+       enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+       unsigned int fault_flags = FAULT_FLAG_REMOTE;
+       unsigned int access_flags = 0;
+       struct vm_area_struct *vma;
+       struct mm_struct *mm;
+
+       if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID))
+               return status;
+
+       /* Translate the requested permissions into VMA and fault flags */
+       if (prm->perm & IOMMU_FAULT_PERM_READ)
+               access_flags |= VM_READ;
+
+       if (prm->perm & IOMMU_FAULT_PERM_WRITE) {
+               access_flags |= VM_WRITE;
+               fault_flags |= FAULT_FLAG_WRITE;
+       }
+
+       if (prm->perm & IOMMU_FAULT_PERM_EXEC) {
+               access_flags |= VM_EXEC;
+               fault_flags |= FAULT_FLAG_INSTRUCTION;
+       }
+
+       if (!(prm->perm & IOMMU_FAULT_PERM_PRIV))
+               fault_flags |= FAULT_FLAG_USER;
+
+       mm = iommu_sva_find(prm->pasid);
+       if (IS_ERR_OR_NULL(mm))
+               return status;
+
+       mmap_read_lock(mm);
+
+       vma = find_extend_vma(mm, prm->addr);
+       /* Give up on an unmapped area or on an access fault */
+       if (vma && !(access_flags & ~vma->vm_flags)) {
+               vm_fault_t ret = handle_mm_fault(vma, prm->addr, fault_flags,
+                                                NULL);
+
+               if (!(ret & VM_FAULT_ERROR))
+                       status = IOMMU_PAGE_RESP_SUCCESS;
+       }
+
+       mmap_read_unlock(mm);
+       mmput(mm);
+
+       return status;
+}
+
+/*
+ * Work function: handle every fault of a group, send a single response for
+ * the whole group, then free the group.
+ */
+static void iopf_handle_group(struct work_struct *work)
+{
+       struct iopf_group *group = container_of(work, struct iopf_group, work);
+       enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS;
+       struct iopf_fault *cur, *tmp;
+
+       list_for_each_entry_safe(cur, tmp, &group->faults, list) {
+               /*
+                * For the moment, errors are sticky: once a fault of the
+                * group has failed, the following ones aren't handled.
+                */
+               if (status == IOMMU_PAGE_RESP_SUCCESS)
+                       status = iopf_handle_single(cur);
+
+               /* The last fault is embedded in the group, don't free it */
+               if (cur->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)
+                       continue;
+
+               kfree(cur);
+       }
+
+       iopf_complete_group(group->dev, &group->last_fault, status);
+       kfree(group);
+}
+
+/**
+ * iommu_queue_iopf - IO Page Fault handler
+ * @fault: fault event
+ * @cookie: struct device, passed to iommu_register_device_fault_handler.
+ *
+ * Add a fault to the device workqueue, to be handled by mm.
+ *
+ * A fault that isn't the last of its Page Request Group is copied and parked
+ * on the device's partial list. When the last fault of a group arrives, a
+ * group is allocated, the parked faults with the same group ID are moved into
+ * it, and the group is queued on the workqueue of the device's IOPF queue.
+ *
+ * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard
+ * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't
+ * expect a response. It may be generated when disabling a PASID (issuing a
+ * PASID stop request) by some PCI devices.
+ *
+ * The PASID stop request is issued by the device driver before unbind(). Once
+ * it completes, no page request is generated for this PASID anymore and
+ * outstanding ones have been pushed to the IOMMU (as per PCIe 4.0r1.0 - 6.20.1
+ * and 10.4.1.2 - Managing PASID TLP Prefix Usage). Some PCI devices will wait
+ * for all outstanding page requests to come back with a response before
+ * completing the PASID stop request. Others do not wait for page responses, and
+ * instead issue this Stop Marker that tells us when the PASID can be
+ * reallocated.
+ *
+ * It is safe to discard the Stop Marker because it is an optimization.
+ * a. Page requests, which are posted requests, have been flushed to the IOMMU
+ *    when the stop request completes.
+ * b. The IOMMU driver flushes all fault queues on unbind() before freeing the
+ *    PASID.
+ *
+ * So even though the Stop Marker might be issued by the device *after* the stop
+ * request completes, outstanding faults will have been dealt with by the time
+ * the PASID is freed.
+ *
+ * Context: the caller must hold dev->iommu->lock (checked with lockdep).
+ *
+ * Return: 0 on success and <0 on error: -EOPNOTSUPP if @fault is not a page
+ * request, -ENODEV if the device isn't attached to an IOPF queue, -ENOMEM if
+ * an allocation failed.
+ */
+int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
+{
+       int ret;
+       struct iopf_group *group;
+       struct iopf_fault *iopf, *next;
+       struct iopf_device_param *iopf_param;
+
+       struct device *dev = cookie;
+       struct dev_iommu *param = dev->iommu;
+
+       lockdep_assert_held(&param->lock);
+
+       if (fault->type != IOMMU_FAULT_PAGE_REQ)
+               /* Not a recoverable page fault */
+               return -EOPNOTSUPP;
+
+       /*
+        * As long as we're holding param->lock, the queue can't be unlinked
+        * from the device and therefore cannot disappear.
+        */
+       iopf_param = param->iopf_param;
+       if (!iopf_param)
+               return -ENODEV;
+
+       if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
+               iopf = kzalloc(sizeof(*iopf), GFP_KERNEL);
+               if (!iopf)
+                       return -ENOMEM;
+
+               iopf->fault = *fault;
+
+               /* Non-last request of a group. Postpone until the last one */
+               list_add(&iopf->list, &iopf_param->partial);
+
+               return 0;
+       }
+
+       group = kzalloc(sizeof(*group), GFP_KERNEL);
+       if (!group) {
+               /*
+                * The caller will send a response to the hardware. But we do
+                * need to clean up before leaving, otherwise partial faults
+                * will be stuck.
+                */
+               ret = -ENOMEM;
+               goto cleanup_partial;
+       }
+
+       group->dev = dev;
+       group->last_fault.fault = *fault;
+       INIT_LIST_HEAD(&group->faults);
+       list_add(&group->last_fault.list, &group->faults);
+       INIT_WORK(&group->work, iopf_handle_group);
+
+       /* See if we have partial faults for this group */
+       list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
+               if (iopf->fault.prm.grpid == fault->prm.grpid)
+                       /* Insert *before* the last fault */
+                       list_move(&iopf->list, &group->faults);
+       }
+
+       queue_work(iopf_param->queue->wq, &group->work);
+       return 0;
+
+cleanup_partial:
+       list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
+               if (iopf->fault.prm.grpid == fault->prm.grpid) {
+                       list_del(&iopf->list);
+                       kfree(iopf);
+               }
+       }
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_queue_iopf);
+
+/**
+ * iopf_queue_flush_dev - Ensure that all queued faults have been processed
+ * @dev: the endpoint whose faults need to be flushed.
+ *
+ * Called by the IOMMU driver before it releases a PASID, so that every pending
+ * fault for that PASID has been handled and cannot hit the address space of
+ * the next process that uses the PASID. The driver must make sure that no new
+ * fault is added to the queue meanwhile. In particular it must flush its
+ * low-level queue before calling this function.
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iopf_queue_flush_dev(struct device *dev)
+{
+       struct dev_iommu *param = dev->iommu;
+       struct iopf_device_param *iopf_param;
+       int ret = -ENODEV;
+
+       if (!param)
+               return -ENODEV;
+
+       mutex_lock(&param->lock);
+       iopf_param = param->iopf_param;
+       if (iopf_param) {
+               /* Wait for every group queued so far to be handled */
+               flush_workqueue(iopf_param->queue->wq);
+               ret = 0;
+       }
+       mutex_unlock(&param->lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);
+
+/**
+ * iopf_queue_discard_partial - Remove all pending partial faults
+ * @queue: the queue whose partial faults need to be discarded
+ *
+ * When the hardware queue overflows, the last page fault of a group may have
+ * been lost, so the IOMMU driver calls this to drop every partial fault of
+ * every device attached to @queue. The driver shouldn't be adding new faults
+ * to this queue concurrently.
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iopf_queue_discard_partial(struct iopf_queue *queue)
+{
+       struct iopf_device_param *iopf_param;
+       struct iopf_fault *iopf;
+
+       if (!queue)
+               return -EINVAL;
+
+       mutex_lock(&queue->lock);
+       list_for_each_entry(iopf_param, &queue->devices, queue_list) {
+               /* Pop and free partial faults until the list is empty */
+               while (!list_empty(&iopf_param->partial)) {
+                       iopf = list_first_entry(&iopf_param->partial,
+                                               struct iopf_fault, list);
+                       list_del(&iopf->list);
+                       kfree(iopf);
+               }
+       }
+       mutex_unlock(&queue->lock);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_discard_partial);
+
+/**
+ * iopf_queue_add_device - Add producer to the fault queue
+ * @queue: IOPF queue
+ * @dev: device to add
+ *
+ * Allocate the per-device IOPF data and, unless the device already has some,
+ * link it to @queue and to dev->iommu->iopf_param. Both queue->lock and
+ * dev->iommu->lock are held while linking.
+ *
+ * Return: 0 on success and <0 on error: -ENODEV if the device has no IOMMU
+ * parameters, -EINVAL if @queue is NULL, -ENOMEM if the allocation failed,
+ * -EBUSY if the device is already attached to an IOPF queue.
+ */
+int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev)
+{
+       int ret = -EBUSY;
+       struct iopf_device_param *iopf_param;
+       struct dev_iommu *param = dev->iommu;
+
+       if (!param)
+               return -ENODEV;
+
+       /* Like iopf_queue_remove_device(), don't dereference a NULL queue */
+       if (!queue)
+               return -EINVAL;
+
+       iopf_param = kzalloc(sizeof(*iopf_param), GFP_KERNEL);
+       if (!iopf_param)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&iopf_param->partial);
+       iopf_param->queue = queue;
+       iopf_param->dev = dev;
+
+       mutex_lock(&queue->lock);
+       mutex_lock(&param->lock);
+       if (!param->iopf_param) {
+               list_add(&iopf_param->queue_list, &queue->devices);
+               param->iopf_param = iopf_param;
+               ret = 0;
+       }
+       mutex_unlock(&param->lock);
+       mutex_unlock(&queue->lock);
+
+       /* The device already had IOPF data: ours was never published */
+       if (ret)
+               kfree(iopf_param);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_add_device);
+
+/**
+ * iopf_queue_remove_device - Remove producer from fault queue
+ * @queue: IOPF queue
+ * @dev: device to remove
+ *
+ * Caller makes sure that no more faults are reported for this device.
+ *
+ * The device is unlinked from @queue with both queue->lock and
+ * dev->iommu->lock held. Partial faults still parked on the device are then
+ * freed without having been handled, along with the per-device IOPF data.
+ *
+ * Return: 0 on success and <0 on error: -EINVAL if @queue is NULL, if the
+ * device has no IOMMU parameters, or if it isn't attached to @queue.
+ */
+int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
+{
+       int ret = -EINVAL;
+       struct iopf_fault *iopf, *next;
+       struct iopf_device_param *iopf_param;
+       struct dev_iommu *param = dev->iommu;
+
+       if (!param || !queue)
+               return -EINVAL;
+
+       mutex_lock(&queue->lock);
+       mutex_lock(&param->lock);
+       iopf_param = param->iopf_param;
+       if (iopf_param && iopf_param->queue == queue) {
+               list_del(&iopf_param->queue_list);
+               param->iopf_param = NULL;
+               ret = 0;
+       }
+       mutex_unlock(&param->lock);
+       mutex_unlock(&queue->lock);
+       if (ret)
+               return ret;
+
+       /* Just in case some faults are still stuck */
+       list_for_each_entry_safe(iopf, next, &iopf_param->partial, list)
+               kfree(iopf);
+
+       kfree(iopf_param);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_remove_device);
+
+/**
+ * iopf_queue_alloc - Allocate and initialize a fault queue
+ * @name: a unique string identifying the queue (for workqueue)
+ *
+ * The queue starts with an empty device list and its own workqueue, named
+ * after @name.
+ *
+ * Return: the queue on success and NULL on error.
+ */
+struct iopf_queue *iopf_queue_alloc(const char *name)
+{
+       struct iopf_queue *queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+
+       if (!queue)
+               return NULL;
+
+       /*
+        * An unordered WQ is enough: the low-level handler enqueues faults by
+        * group. PRI requests within a group have to be ordered, but once
+        * that's dealt with, the high-level function can handle groups out of
+        * order.
+        */
+       queue->wq = alloc_workqueue("iopf_queue/%s", WQ_UNBOUND, 0, name);
+       if (!queue->wq)
+               goto err_free_queue;
+
+       INIT_LIST_HEAD(&queue->devices);
+       mutex_init(&queue->lock);
+
+       return queue;
+
+err_free_queue:
+       kfree(queue);
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_alloc);
+
+/**
+ * iopf_queue_free - Free IOPF queue
+ * @queue: queue to free
+ *
+ * Counterpart to iopf_queue_alloc(). The driver must not be queuing faults or
+ * adding/removing devices on this queue anymore.
+ *
+ * Every device still attached is detached with iopf_queue_remove_device(),
+ * which unlinks and frees the list entry being visited, hence the _safe
+ * iteration. The workqueue is destroyed afterwards. A NULL @queue is ignored.
+ */
+void iopf_queue_free(struct iopf_queue *queue)
+{
+       struct iopf_device_param *iopf_param, *next;
+
+       if (!queue)
+               return;
+
+       list_for_each_entry_safe(iopf_param, next, &queue->devices, queue_list)
+               iopf_queue_remove_device(queue, iopf_param->dev);
+
+       destroy_workqueue(queue->wq);
+       kfree(queue);
+}
+EXPORT_SYMBOL_GPL(iopf_queue_free);
index b40990a..0311550 100644 (file)
@@ -12,4 +12,57 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max);
 void iommu_sva_free_pasid(struct mm_struct *mm);
 struct mm_struct *iommu_sva_find(ioasid_t pasid);
 
+/*
+ * I/O Page fault
+ *
+ * Without CONFIG_IOMMU_SVA_LIB the functions below are replaced by inline
+ * stubs: those returning int fail with -ENODEV, iopf_queue_alloc() returns
+ * NULL and iopf_queue_free() does nothing.
+ */
+struct device;
+struct iommu_fault;
+struct iopf_queue;
+
+#ifdef CONFIG_IOMMU_SVA_LIB
+int iommu_queue_iopf(struct iommu_fault *fault, void *cookie);
+
+int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
+int iopf_queue_remove_device(struct iopf_queue *queue,
+                            struct device *dev);
+int iopf_queue_flush_dev(struct device *dev);
+struct iopf_queue *iopf_queue_alloc(const char *name);
+void iopf_queue_free(struct iopf_queue *queue);
+int iopf_queue_discard_partial(struct iopf_queue *queue);
+
+#else /* CONFIG_IOMMU_SVA_LIB */
+static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
+{
+       return -ENODEV;
+}
+
+static inline int iopf_queue_add_device(struct iopf_queue *queue,
+                                       struct device *dev)
+{
+       return -ENODEV;
+}
+
+static inline int iopf_queue_remove_device(struct iopf_queue *queue,
+                                          struct device *dev)
+{
+       return -ENODEV;
+}
+
+static inline int iopf_queue_flush_dev(struct device *dev)
+{
+       return -ENODEV;
+}
+
+static inline struct iopf_queue *iopf_queue_alloc(const char *name)
+{
+       return NULL;
+}
+
+static inline void iopf_queue_free(struct iopf_queue *queue)
+{
+}
+
+static inline int iopf_queue_discard_partial(struct iopf_queue *queue)
+{
+       return -ENODEV;
+}
+#endif /* CONFIG_IOMMU_SVA_LIB */
 #endif /* _IOMMU_SVA_LIB_H */
index d0b0a15..808ab70 100644 (file)
@@ -69,16 +69,7 @@ static const char * const iommu_group_resv_type_string[] = {
 };
 
 #define IOMMU_CMD_LINE_DMA_API         BIT(0)
-
-static void iommu_set_cmd_line_dma_api(void)
-{
-       iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API;
-}
-
-static bool iommu_cmd_line_dma_api(void)
-{
-       return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API);
-}
+#define IOMMU_CMD_LINE_STRICT          BIT(1)
 
 static int iommu_alloc_default_domain(struct iommu_group *group,
                                      struct device *dev);
@@ -130,9 +121,7 @@ static const char *iommu_domain_type_str(unsigned int t)
 
 static int __init iommu_subsys_init(void)
 {
-       bool cmd_line = iommu_cmd_line_dma_api();
-
-       if (!cmd_line) {
+       if (!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API)) {
                if (IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH))
                        iommu_set_default_passthrough(false);
                else
@@ -146,14 +135,32 @@ static int __init iommu_subsys_init(void)
 
        pr_info("Default domain type: %s %s\n",
                iommu_domain_type_str(iommu_def_domain_type),
-               cmd_line ? "(set via kernel command line)" : "");
+               (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ?
+                       "(set via kernel command line)" : "");
 
        return 0;
 }
 subsys_initcall(iommu_subsys_init);
 
-int iommu_device_register(struct iommu_device *iommu)
+/**
+ * iommu_device_register() - Register an IOMMU hardware instance
+ * @iommu: IOMMU handle for the instance
+ * @ops:   IOMMU ops to associate with the instance
+ * @hwdev: (optional) actual instance device, used for fwnode lookup
+ *
+ * Return: 0 on success, or an error.
+ */
+int iommu_device_register(struct iommu_device *iommu,
+                         const struct iommu_ops *ops, struct device *hwdev)
 {
+       /* We need to be able to take module references appropriately */
+       if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner))
+               return -EINVAL;
+
+       iommu->ops = ops;
+       if (hwdev)
+               iommu->fwnode = hwdev->fwnode;
+
        spin_lock(&iommu_device_lock);
        list_add_tail(&iommu->list, &iommu_device_list);
        spin_unlock(&iommu_device_lock);
@@ -329,10 +336,29 @@ early_param("iommu.passthrough", iommu_set_def_domain_type);
 
 static int __init iommu_dma_setup(char *str)
 {
-       return kstrtobool(str, &iommu_dma_strict);
+       int ret = kstrtobool(str, &iommu_dma_strict);
+
+       if (!ret)
+               iommu_cmd_line |= IOMMU_CMD_LINE_STRICT;
+       return ret;
 }
 early_param("iommu.strict", iommu_dma_setup);
 
+/*
+ * Set the DMA strictness. Once "iommu.strict" has been parsed successfully
+ * from the command line, a request for non-strict mode is ignored; a request
+ * for strict mode is always honoured.
+ */
+void iommu_set_dma_strict(bool strict)
+{
+       if (!strict && (iommu_cmd_line & IOMMU_CMD_LINE_STRICT))
+               return;
+
+       iommu_dma_strict = strict;
+}
+
+/*
+ * Report whether @domain must use strict mode: always true, except for DMA
+ * domains, which follow the current iommu_dma_strict setting.
+ */
+bool iommu_get_dma_strict(struct iommu_domain *domain)
+{
+       /* only allow lazy flushing for DMA domains */
+       return domain->type != IOMMU_DOMAIN_DMA || iommu_dma_strict;
+}
+EXPORT_SYMBOL_GPL(iommu_get_dma_strict);
+
 static ssize_t iommu_group_attr_show(struct kobject *kobj,
                                     struct attribute *__attr, char *buf)
 {
@@ -1511,14 +1537,6 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus,
        group->default_domain = dom;
        if (!group->domain)
                group->domain = dom;
-
-       if (!iommu_dma_strict) {
-               int attr = 1;
-               iommu_domain_set_attr(dom,
-                                     DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
-                                     &attr);
-       }
-
        return 0;
 }
 
@@ -2610,17 +2628,6 @@ size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
        return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
 }
 
-int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-                              phys_addr_t paddr, u64 size, int prot)
-{
-       if (unlikely(domain->ops->domain_window_enable == NULL))
-               return -ENODEV;
-
-       return domain->ops->domain_window_enable(domain, wnd_nr, paddr, size,
-                                                prot);
-}
-EXPORT_SYMBOL_GPL(iommu_domain_window_enable);
-
 /**
  * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
  * @domain: the iommu domain where the fault has happened
@@ -2675,50 +2682,26 @@ static int __init iommu_init(void)
 }
 core_initcall(iommu_init);
 
-int iommu_domain_get_attr(struct iommu_domain *domain,
-                         enum iommu_attr attr, void *data)
+int iommu_enable_nesting(struct iommu_domain *domain)
 {
-       struct iommu_domain_geometry *geometry;
-       bool *paging;
-       int ret = 0;
-
-       switch (attr) {
-       case DOMAIN_ATTR_GEOMETRY:
-               geometry  = data;
-               *geometry = domain->geometry;
-
-               break;
-       case DOMAIN_ATTR_PAGING:
-               paging  = data;
-               *paging = (domain->pgsize_bitmap != 0UL);
-               break;
-       default:
-               if (!domain->ops->domain_get_attr)
-                       return -EINVAL;
-
-               ret = domain->ops->domain_get_attr(domain, attr, data);
-       }
-
-       return ret;
+       if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+               return -EINVAL;
+       if (!domain->ops->enable_nesting)
+               return -EINVAL;
+       return domain->ops->enable_nesting(domain);
 }
-EXPORT_SYMBOL_GPL(iommu_domain_get_attr);
+EXPORT_SYMBOL_GPL(iommu_enable_nesting);
 
-int iommu_domain_set_attr(struct iommu_domain *domain,
-                         enum iommu_attr attr, void *data)
+int iommu_set_pgtable_quirks(struct iommu_domain *domain,
+               unsigned long quirk)
 {
-       int ret = 0;
-
-       switch (attr) {
-       default:
-               if (domain->ops->domain_set_attr == NULL)
-                       return -EINVAL;
-
-               ret = domain->ops->domain_set_attr(domain, attr, data);
-       }
-
-       return ret;
+       if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+               return -EINVAL;
+       if (!domain->ops->set_pgtable_quirks)
+               return -EINVAL;
+       return domain->ops->set_pgtable_quirks(domain, quirk);
 }
-EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
+EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
 
 void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 {
@@ -2777,16 +2760,14 @@ EXPORT_SYMBOL_GPL(iommu_alloc_resv_region);
 void iommu_set_default_passthrough(bool cmd_line)
 {
        if (cmd_line)
-               iommu_set_cmd_line_dma_api();
-
+               iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API;
        iommu_def_domain_type = IOMMU_DOMAIN_IDENTITY;
 }
 
 void iommu_set_default_translated(bool cmd_line)
 {
        if (cmd_line)
-               iommu_set_cmd_line_dma_api();
-
+               iommu_cmd_line |= IOMMU_CMD_LINE_DMA_API;
        iommu_def_domain_type = IOMMU_DOMAIN_DMA;
 }
 
@@ -2878,10 +2859,12 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
  */
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-       const struct iommu_ops *ops = dev->bus->iommu_ops;
+       if (dev->iommu && dev->iommu->iommu_dev) {
+               const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
 
-       if (ops && ops->dev_enable_feat)
-               return ops->dev_enable_feat(dev, feat);
+               if (ops->dev_enable_feat)
+                       return ops->dev_enable_feat(dev, feat);
+       }
 
        return -ENODEV;
 }
@@ -2894,10 +2877,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
  */
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-       const struct iommu_ops *ops = dev->bus->iommu_ops;
+       if (dev->iommu && dev->iommu->iommu_dev) {
+               const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
 
-       if (ops && ops->dev_disable_feat)
-               return ops->dev_disable_feat(dev, feat);
+               if (ops->dev_disable_feat)
+                       return ops->dev_disable_feat(dev, feat);
+       }
 
        return -EBUSY;
 }
@@ -2905,10 +2890,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
 
 bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
 {
-       const struct iommu_ops *ops = dev->bus->iommu_ops;
+       if (dev->iommu && dev->iommu->iommu_dev) {
+               const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
 
-       if (ops && ops->dev_feat_enabled)
-               return ops->dev_feat_enabled(dev, feat);
+               if (ops->dev_feat_enabled)
+                       return ops->dev_feat_enabled(dev, feat);
+       }
 
        return false;
 }
index e6e2fa8..b7ecd5b 100644 (file)
@@ -22,11 +22,28 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
                                     unsigned long size,
                                     unsigned long limit_pfn);
 static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 static void fq_destroy_all_entries(struct iova_domain *iovad);
 static void fq_flush_timeout(struct timer_list *t);
+
+/*
+ * Callback for the CPUHP_IOMMU_IOVA_DEAD hotplug state: pass the IOVA domain
+ * that embeds @node, together with @cpu, to free_cpu_cached_iovas().
+ */
+static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
+{
+       struct iova_domain *iovad = hlist_entry_safe(node, struct iova_domain,
+                                                    cpuhp_dead);
+
+       free_cpu_cached_iovas(cpu, iovad);
+
+       return 0;
+}
+
 static void free_global_cached_iovas(struct iova_domain *iovad);
 
+/* Map an rbtree node back to the struct iova that embeds it */
+static struct iova *to_iova(struct rb_node *node)
+{
+       return container_of(node, struct iova, node);
+}
+
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
        unsigned long start_pfn)
@@ -51,6 +68,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
        iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
        rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
        rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
+       cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, &iovad->cpuhp_dead);
        init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
@@ -136,7 +154,7 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
 {
        struct iova *cached_iova;
 
-       cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+       cached_iova = to_iova(iovad->cached32_node);
        if (free == cached_iova ||
            (free->pfn_hi < iovad->dma_32bit_pfn &&
             free->pfn_lo >= cached_iova->pfn_lo)) {
@@ -144,11 +162,48 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
                iovad->max32_alloc_size = iovad->dma_32bit_pfn;
        }
 
-       cached_iova = rb_entry(iovad->cached_node, struct iova, node);
+       cached_iova = to_iova(iovad->cached_node);
        if (free->pfn_lo >= cached_iova->pfn_lo)
                iovad->cached_node = rb_next(&free->node);
 }
 
+static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn)
+{
+       struct rb_node *node, *next;
+       /*
+        * Ideally what we'd like to judge here is whether limit_pfn is close
+        * enough to the highest-allocated IOVA that starting the allocation
+        * walk from the anchor node will be quicker than this initial work to
+        * find an exact starting point (especially if that ends up being the
+        * anchor node anyway). This is an incredibly crude approximation which
+        * only really helps the most likely case, but is at least trivially easy.
+        *
+        * Otherwise walk down from the root: go right until reaching a node
+        * whose pfn_hi isn't below limit_pfn, then repeatedly step to a left
+        * child, or to a node on the right spine of that left child, whose
+        * pfn_lo is still at or above limit_pfn. The node where no such step
+        * is possible is returned.
+        *
+        * NOTE(review): the first loop doesn't check for a NULL node; it
+        * presumably relies on the anchor node (pfn_hi == IOVA_ANCHOR) always
+        * being in the tree - confirm.
+        */
+       if (limit_pfn > iovad->dma_32bit_pfn)
+               return &iovad->anchor.node;
+
+       node = iovad->rbroot.rb_node;
+       while (to_iova(node)->pfn_hi < limit_pfn)
+               node = node->rb_right;
+
+search_left:
+       while (node->rb_left && to_iova(node->rb_left)->pfn_lo >= limit_pfn)
+               node = node->rb_left;
+
+       if (!node->rb_left)
+               return node;
+
+       next = node->rb_left;   /* starts below limit_pfn, check its right spine */
+       while (next->rb_right) {
+               next = next->rb_right;
+               if (to_iova(next)->pfn_lo >= limit_pfn) {
+                       node = next;
+                       goto search_left;
+               }
+       }
+
+       return node;
+}
+
 /* Insert the iova into domain rbtree by holding writer lock */
 static void
 iova_insert_rbtree(struct rb_root *root, struct iova *iova,
@@ -159,7 +214,7 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
        new = (start) ? &start : &(root->rb_node);
        /* Figure out where to put new node */
        while (*new) {
-               struct iova *this = rb_entry(*new, struct iova, node);
+               struct iova *this = to_iova(*new);
 
                parent = *new;
 
@@ -198,7 +253,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
                goto iova32_full;
 
        curr = __get_cached_rbnode(iovad, limit_pfn);
-       curr_iova = rb_entry(curr, struct iova, node);
+       curr_iova = to_iova(curr);
        retry_pfn = curr_iova->pfn_hi + 1;
 
 retry:
@@ -207,15 +262,15 @@ retry:
                new_pfn = (high_pfn - size) & align_mask;
                prev = curr;
                curr = rb_prev(curr);
-               curr_iova = rb_entry(curr, struct iova, node);
+               curr_iova = to_iova(curr);
        } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn);
 
        if (high_pfn < size || new_pfn < low_pfn) {
                if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
                        high_pfn = limit_pfn;
                        low_pfn = retry_pfn;
-                       curr = &iovad->anchor.node;
-                       curr_iova = rb_entry(curr, struct iova, node);
+                       curr = iova_find_limit(iovad, limit_pfn);
+                       curr_iova = to_iova(curr);
                        goto retry;
                }
                iovad->max32_alloc_size = size;
@@ -257,10 +312,21 @@ int iova_cache_get(void)
 {
        mutex_lock(&iova_cache_mutex);
        if (!iova_cache_users) {
+               int ret;
+
+               ret = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL,
+                                       iova_cpuhp_dead);
+               if (ret) {
+                       mutex_unlock(&iova_cache_mutex);
+                       pr_err("Couldn't register cpuhp handler\n");
+                       return ret;
+               }
+
                iova_cache = kmem_cache_create(
                        "iommu_iova", sizeof(struct iova), 0,
                        SLAB_HWCACHE_ALIGN, NULL);
                if (!iova_cache) {
+                       cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD);
                        mutex_unlock(&iova_cache_mutex);
                        pr_err("Couldn't create iova cache\n");
                        return -ENOMEM;
@@ -282,8 +348,10 @@ void iova_cache_put(void)
                return;
        }
        iova_cache_users--;
-       if (!iova_cache_users)
+       if (!iova_cache_users) {
+               cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD);
                kmem_cache_destroy(iova_cache);
+       }
        mutex_unlock(&iova_cache_mutex);
 }
 EXPORT_SYMBOL_GPL(iova_cache_put);
@@ -331,7 +399,7 @@ private_find_iova(struct iova_domain *iovad, unsigned long pfn)
        assert_spin_locked(&iovad->iova_rbtree_lock);
 
        while (node) {
-               struct iova *iova = rb_entry(node, struct iova, node);
+               struct iova *iova = to_iova(node);
 
                if (pfn < iova->pfn_lo)
                        node = node->rb_left;
@@ -467,7 +535,6 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
 
        free_iova(iovad, pfn);
 }
-EXPORT_SYMBOL_GPL(free_iova_fast);
 
 #define fq_ring_for_each(i, fq) \
        for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE)
@@ -606,6 +673,9 @@ void put_iova_domain(struct iova_domain *iovad)
 {
        struct iova *iova, *tmp;
 
+       cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,
+                                           &iovad->cpuhp_dead);
+
        free_iova_flush_queue(iovad);
        free_iova_rcaches(iovad);
        rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node)
@@ -617,7 +687,7 @@ static int
 __is_range_overlap(struct rb_node *node,
        unsigned long pfn_lo, unsigned long pfn_hi)
 {
-       struct iova *iova = rb_entry(node, struct iova, node);
+       struct iova *iova = to_iova(node);
 
        if ((pfn_lo <= iova->pfn_hi) && (pfn_hi >= iova->pfn_lo))
                return 1;
@@ -685,7 +755,7 @@ reserve_iova(struct iova_domain *iovad,
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
        for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) {
                if (__is_range_overlap(node, pfn_lo, pfn_hi)) {
-                       iova = rb_entry(node, struct iova, node);
+                       iova = to_iova(node);
                        __adjust_overlap_range(iova, &pfn_lo, &pfn_hi);
                        if ((pfn_lo >= iova->pfn_lo) &&
                                (pfn_hi <= iova->pfn_hi))
@@ -970,7 +1040,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 /*
  * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
  */
-void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
 {
        struct iova_cpu_rcache *cpu_rcache;
        struct iova_rcache *rcache;
index eaaec0a..aaa6a4d 100644 (file)
@@ -1076,11 +1076,7 @@ static int ipmmu_probe(struct platform_device *pdev)
                if (ret)
                        return ret;
 
-               iommu_device_set_ops(&mmu->iommu, &ipmmu_ops);
-               iommu_device_set_fwnode(&mmu->iommu,
-                                       &pdev->dev.of_node->fwnode);
-
-               ret = iommu_device_register(&mmu->iommu);
+               ret = iommu_device_register(&mmu->iommu, &ipmmu_ops, &pdev->dev);
                if (ret)
                        return ret;
 
index f0ba6a0..7880f30 100644 (file)
@@ -792,10 +792,7 @@ static int msm_iommu_probe(struct platform_device *pdev)
                goto fail;
        }
 
-       iommu_device_set_ops(&iommu->iommu, &msm_iommu_ops);
-       iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
-
-       ret = iommu_device_register(&iommu->iommu);
+       ret = iommu_device_register(&iommu->iommu, &msm_iommu_ops, &pdev->dev);
        if (ret) {
                pr_err("Could not register msm-smmu at %pa\n", &ioaddr);
                goto fail;
index 6ecc007..e06b8a0 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/iopoll.h>
 #include <linux/list.h>
 #include <linux/mfd/syscon.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_iommu.h>
 #include <linux/of_irq.h>
@@ -683,18 +684,12 @@ static const struct iommu_ops mtk_iommu_ops = {
        .get_resv_regions = mtk_iommu_get_resv_regions,
        .put_resv_regions = generic_iommu_put_resv_regions,
        .pgsize_bitmap  = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+       .owner          = THIS_MODULE,
 };
 
 static int mtk_iommu_hw_init(const struct mtk_iommu_data *data)
 {
        u32 regval;
-       int ret;
-
-       ret = clk_prepare_enable(data->bclk);
-       if (ret) {
-               dev_err(data->dev, "Failed to enable iommu bclk(%d)\n", ret);
-               return ret;
-       }
 
        if (data->plat_data->m4u_plat == M4U_MT8173) {
                regval = F_MMU_PREFETCH_RT_REPLACE_MOD |
@@ -760,7 +755,6 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data)
        if (devm_request_irq(data->dev, data->irq, mtk_iommu_isr, 0,
                             dev_name(data->dev), (void *)data)) {
                writel_relaxed(0, data->base + REG_MMU_PT_BASE_ADDR);
-               clk_disable_unprepare(data->bclk);
                dev_err(data->dev, "Failed @ IRQ-%d Request\n", data->irq);
                return -ENODEV;
        }
@@ -898,10 +892,7 @@ static int mtk_iommu_probe(struct platform_device *pdev)
        if (ret)
                goto out_link_remove;
 
-       iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
-       iommu_device_set_fwnode(&data->iommu, &pdev->dev.of_node->fwnode);
-
-       ret = iommu_device_register(&data->iommu);
+       ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
        if (ret)
                goto out_sysfs_remove;
 
@@ -977,14 +968,19 @@ static int __maybe_unused mtk_iommu_runtime_resume(struct device *dev)
        void __iomem *base = data->base;
        int ret;
 
-       /* Avoid first resume to affect the default value of registers below. */
-       if (!m4u_dom)
-               return 0;
        ret = clk_prepare_enable(data->bclk);
        if (ret) {
                dev_err(data->dev, "Failed to enable clk(%d) in resume\n", ret);
                return ret;
        }
+
+       /*
+        * Upon first resume, only enable the clk and return, since the values of the
+        * registers are not yet set.
+        */
+       if (!m4u_dom)
+               return 0;
+
        writel_relaxed(reg->wr_len_ctrl, base + REG_MMU_WR_LEN_CTRL);
        writel_relaxed(reg->misc_ctrl, base + REG_MMU_MISC_CTRL);
        writel_relaxed(reg->dcm_dis, base + REG_MMU_DCM_DIS);
@@ -1079,16 +1075,7 @@ static struct platform_driver mtk_iommu_driver = {
                .pm = &mtk_iommu_pm_ops,
        }
 };
+module_platform_driver(mtk_iommu_driver);
 
-static int __init mtk_iommu_init(void)
-{
-       int ret;
-
-       ret = platform_driver_register(&mtk_iommu_driver);
-       if (ret != 0)
-               pr_err("Failed to register MTK IOMMU driver\n");
-
-       return ret;
-}
-
-subsys_initcall(mtk_iommu_init)
+MODULE_DESCRIPTION("IOMMU API for MediaTek M4U implementations");
+MODULE_LICENSE("GPL v2");
index 82ddfe9..5915d7b 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/iommu.h>
 #include <linux/iopoll.h>
 #include <linux/list.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_iommu.h>
 #include <linux/of_irq.h>
@@ -423,23 +424,21 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
 {
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        struct of_phandle_args iommu_spec;
-       struct of_phandle_iterator it;
        struct mtk_iommu_data *data;
-       int err;
+       int err, idx = 0;
 
-       of_for_each_phandle(&it, err, dev->of_node, "iommus",
-                       "#iommu-cells", -1) {
-               int count = of_phandle_iterator_args(&it, iommu_spec.args,
-                                       MAX_PHANDLE_ARGS);
-               iommu_spec.np = of_node_get(it.node);
-               iommu_spec.args_count = count;
+       while (!of_parse_phandle_with_args(dev->of_node, "iommus",
+                                          "#iommu-cells",
+                                          idx, &iommu_spec)) {
 
-               mtk_iommu_create_mapping(dev, &iommu_spec);
+               err = mtk_iommu_create_mapping(dev, &iommu_spec);
+               of_node_put(iommu_spec.np);
+               if (err)
+                       return ERR_PTR(err);
 
                /* dev->iommu_fwspec might have changed */
                fwspec = dev_iommu_fwspec_get(dev);
-
-               of_node_put(iommu_spec.np);
+               idx++;
        }
 
        if (!fwspec || fwspec->ops != &mtk_iommu_ops)
@@ -529,6 +528,7 @@ static const struct iommu_ops mtk_iommu_ops = {
        .def_domain_type = mtk_iommu_def_domain_type,
        .device_group   = generic_device_group,
        .pgsize_bitmap  = ~0UL << MT2701_IOMMU_PAGE_SHIFT,
+       .owner          = THIS_MODULE,
 };
 
 static const struct of_device_id mtk_iommu_of_ids[] = {
@@ -547,10 +547,8 @@ static int mtk_iommu_probe(struct platform_device *pdev)
        struct device                   *dev = &pdev->dev;
        struct resource                 *res;
        struct component_match          *match = NULL;
-       struct of_phandle_args          larb_spec;
-       struct of_phandle_iterator      it;
        void                            *protect;
-       int                             larb_nr, ret, err;
+       int                             larb_nr, ret, i;
 
        data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
        if (!data)
@@ -578,35 +576,33 @@ static int mtk_iommu_probe(struct platform_device *pdev)
        if (IS_ERR(data->bclk))
                return PTR_ERR(data->bclk);
 
-       larb_nr = 0;
-       of_for_each_phandle(&it, err, dev->of_node,
-                       "mediatek,larbs", NULL, 0) {
+       larb_nr = of_count_phandle_with_args(dev->of_node,
+                                            "mediatek,larbs", NULL);
+       if (larb_nr < 0)
+               return larb_nr;
+
+       for (i = 0; i < larb_nr; i++) {
+               struct device_node *larbnode;
                struct platform_device *plarbdev;
-               int count = of_phandle_iterator_args(&it, larb_spec.args,
-                                       MAX_PHANDLE_ARGS);
 
-               if (count)
-                       continue;
+               larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
+               if (!larbnode)
+                       return -EINVAL;
 
-               larb_spec.np = of_node_get(it.node);
-               if (!of_device_is_available(larb_spec.np))
+               if (!of_device_is_available(larbnode)) {
+                       of_node_put(larbnode);
                        continue;
+               }
 
-               plarbdev = of_find_device_by_node(larb_spec.np);
+               plarbdev = of_find_device_by_node(larbnode);
                if (!plarbdev) {
-                       plarbdev = of_platform_device_create(
-                                               larb_spec.np, NULL,
-                                               platform_bus_type.dev_root);
-                       if (!plarbdev) {
-                               of_node_put(larb_spec.np);
-                               return -EPROBE_DEFER;
-                       }
+                       of_node_put(larbnode);
+                       return -EPROBE_DEFER;
                }
+               data->larb_imu[i].dev = &plarbdev->dev;
 
-               data->larb_imu[larb_nr].dev = &plarbdev->dev;
                component_match_add_release(dev, &match, release_of,
-                                           compare_of, larb_spec.np);
-               larb_nr++;
+                                           compare_of, larbnode);
        }
 
        platform_set_drvdata(pdev, data);
@@ -620,16 +616,28 @@ static int mtk_iommu_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
-
-       ret = iommu_device_register(&data->iommu);
+       ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
        if (ret)
-               return ret;
+               goto out_sysfs_remove;
 
-       if (!iommu_present(&platform_bus_type))
-               bus_set_iommu(&platform_bus_type,  &mtk_iommu_ops);
+       if (!iommu_present(&platform_bus_type)) {
+               ret = bus_set_iommu(&platform_bus_type,  &mtk_iommu_ops);
+               if (ret)
+                       goto out_dev_unreg;
+       }
+
+       ret = component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+       if (ret)
+               goto out_bus_set_null;
+       return ret;
 
-       return component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
+out_bus_set_null:
+       bus_set_iommu(&platform_bus_type, NULL);
+out_dev_unreg:
+       iommu_device_unregister(&data->iommu);
+out_sysfs_remove:
+       iommu_device_sysfs_remove(&data->iommu);
+       return ret;
 }
 
 static int mtk_iommu_remove(struct platform_device *pdev)
@@ -691,9 +699,7 @@ static struct platform_driver mtk_iommu_driver = {
                .pm = &mtk_iommu_pm_ops,
        }
 };
+module_platform_driver(mtk_iommu_driver);
 
-static int __init m4u_init(void)
-{
-       return platform_driver_register(&mtk_iommu_driver);
-}
-subsys_initcall(m4u_init);
+MODULE_DESCRIPTION("IOMMU API for MediaTek M4U v1 implementations");
+MODULE_LICENSE("GPL v2");
index e505b91..a9d2df0 100644 (file)
@@ -210,11 +210,6 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
                                             of_pci_iommu_init, &info);
        } else {
                err = of_iommu_configure_device(master_np, dev, id);
-
-               fwspec = dev_iommu_fwspec_get(dev);
-               if (!err && fwspec)
-                       of_property_read_u32(master_np, "pasid-num-bits",
-                                            &fwspec->num_pasid_bits);
        }
 
        /*
index 71f29c0..26e517e 100644 (file)
@@ -1235,10 +1235,7 @@ static int omap_iommu_probe(struct platform_device *pdev)
                if (err)
                        goto out_group;
 
-               iommu_device_set_ops(&obj->iommu, &omap_iommu_ops);
-               iommu_device_set_fwnode(&obj->iommu, &of->fwnode);
-
-               err = iommu_device_register(&obj->iommu);
+               err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev);
                if (err)
                        goto out_sysfs;
        }
index e5d86b7..7a29327 100644 (file)
@@ -1196,10 +1196,7 @@ static int rk_iommu_probe(struct platform_device *pdev)
        if (err)
                goto err_put_group;
 
-       iommu_device_set_ops(&iommu->iommu, &rk_iommu_ops);
-       iommu_device_set_fwnode(&iommu->iommu, &dev->of_node->fwnode);
-
-       err = iommu_device_register(&iommu->iommu);
+       err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev);
        if (err)
                goto err_remove_sysfs;
 
index 8895dbb..6019e58 100644 (file)
@@ -333,9 +333,7 @@ int zpci_init_iommu(struct zpci_dev *zdev)
        if (rc)
                goto out_err;
 
-       iommu_device_set_ops(&zdev->iommu_dev, &s390_iommu_ops);
-
-       rc = iommu_device_register(&zdev->iommu_dev);
+       rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, NULL);
        if (rc)
                goto out_sysfs;
 
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
new file mode 100644 (file)
index 0000000..73dfd99
--- /dev/null
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Unisoc IOMMU driver
+ *
+ * Copyright (C) 2020 Unisoc, Inc.
+ * Author: Chunyan Zhang <chunyan.zhang@unisoc.com>
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/errno.h>
+#include <linux/iommu.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+#define SPRD_IOMMU_PAGE_SHIFT  12      /* shift between addresses and page numbers */
+#define SPRD_IOMMU_PAGE_SIZE   SZ_4K   /* size of one IOMMU page */
+
+#define SPRD_EX_CFG            0x0     /* EX: config register holding the enable bits */
+#define SPRD_IOMMU_VAOR_BYPASS BIT(4)
+#define SPRD_IOMMU_GATE_EN     BIT(1)  /* toggled together with SPRD_IOMMU_EN */
+#define SPRD_IOMMU_EN          BIT(0)  /* toggled by sprd_iommu_hw_en() */
+#define SPRD_EX_UPDATE         0x4     /* EX: written to clear the TLB buffer */
+#define SPRD_EX_FIRST_VPN      0x8     /* EX: aperture start, as a page number */
+#define SPRD_EX_VPN_RANGE      0xc     /* EX: aperture span (end - start), in pages */
+#define SPRD_EX_FIRST_PPN      0x10    /* EX: page table base, as a page number */
+#define SPRD_EX_DEFAULT_PPN    0x14    /* EX: protect page, as a page number */
+
+#define SPRD_IOMMU_VERSION     0x0     /* register read by sprd_iommu_get_version() */
+#define SPRD_VERSION_MASK      GENMASK(15, 8)  /* version field within that register */
+#define SPRD_VERSION_SHIFT     0x8     /* right shift applied to the masked field */
+#define SPRD_VAU_CFG           0x4     /* VAU: config register holding the enable bits */
+#define SPRD_VAU_UPDATE                0x8     /* VAU: written to clear the TLB buffer */
+#define SPRD_VAU_AUTH_CFG      0xc
+#define SPRD_VAU_FIRST_PPN     0x10    /* VAU: page table base, as a page number */
+#define SPRD_VAU_DEFAULT_PPN_RD        0x14    /* VAU: protect page, as a page number */
+#define SPRD_VAU_DEFAULT_PPN_WR        0x18    /* VAU: protect page, as a page number */
+#define SPRD_VAU_FIRST_VPN     0x1c    /* VAU: aperture start, as a page number */
+#define SPRD_VAU_VPN_RANGE     0x20    /* VAU: aperture span (end - start), in pages */
+
+/* IP flavours; sprd_iommu_get_version() compares the hardware field to these. */
+enum sprd_iommu_version {
+       SPRD_IOMMU_EX = 0,
+       SPRD_IOMMU_VAU = 1,
+};
+
+/**
+ * struct sprd_iommu_device - per-instance state of one sprd IOMMU
+ *
+ * Bundles the hardware information, its configuration and the driver's own
+ * bookkeeping for a single IOMMU.
+ *
+ * @ver:          sprd IOMMU IP version
+ * @prot_page_va: virtual base address of the protect page
+ * @prot_page_pa: physical base address of the protect page; data is written
+ *                here when a translation fault happens
+ * @base:         mapped base address used for register access
+ * @dev:          the basic device structure behind this IOMMU
+ * @iommu:        representation handed to the IOMMU core
+ * @group:        IOMMU group
+ * @eb:           gate clock that controls access to the IOMMU
+ */
+struct sprd_iommu_device {
+       enum sprd_iommu_version ver;
+       u32 *prot_page_va;
+       dma_addr_t prot_page_pa;
+       void __iomem *base;
+       struct device *dev;
+       struct iommu_device iommu;
+       struct iommu_group *group;
+       struct clk *eb;
+};
+
+/*
+ * struct sprd_iommu_domain - driver-private wrapper around an iommu_domain
+ *
+ * @pgtlock: lock for the page table
+ * @domain:  embedded core domain, see to_sprd_domain()
+ * @pgt_va:  page table virtual address base
+ * @pgt_pa:  page table physical address base
+ * @sdev:    IOMMU this domain is attached to, NULL while unattached
+ */
+struct sprd_iommu_domain {
+       spinlock_t pgtlock;
+       struct iommu_domain domain;
+       u32 *pgt_va;
+       dma_addr_t pgt_pa;
+       struct sprd_iommu_device *sdev;
+};
+
+static const struct iommu_ops sprd_iommu_ops;
+
+/* Map an embedded core iommu_domain back to its enclosing sprd domain. */
+static struct sprd_iommu_domain *to_sprd_domain(struct iommu_domain *dom)
+{
+       struct sprd_iommu_domain *sprd_dom;
+
+       sprd_dom = container_of(dom, struct sprd_iommu_domain, domain);
+       return sprd_dom;
+}
+
+/* Relaxed 32-bit store of @val to the register at byte offset @reg. */
+static inline void
+sprd_iommu_write(struct sprd_iommu_device *sdev, unsigned int reg, u32 val)
+{
+       void __iomem *addr = sdev->base + reg;
+
+       writel_relaxed(val, addr);
+}
+
+/* Relaxed 32-bit load from the register at byte offset @reg. */
+static inline u32
+sprd_iommu_read(struct sprd_iommu_device *sdev, unsigned int reg)
+{
+       void __iomem *addr = sdev->base + reg;
+
+       return readl_relaxed(addr);
+}
+
+/*
+ * Read-modify-write of register @reg: the field selected by @mask << @shift
+ * is replaced with (@val & @mask) << @shift, all other bits are kept.
+ */
+static inline void
+sprd_iommu_update_bits(struct sprd_iommu_device *sdev, unsigned int reg,
+                      u32 mask, u32 shift, u32 val)
+{
+       const u32 field = mask << shift;
+       u32 cur = sprd_iommu_read(sdev, reg);
+
+       cur &= ~field;
+       cur |= (val & mask) << shift;
+       sprd_iommu_write(sdev, reg, cur);
+}
+
+/*
+ * Extract the version field from the SPRD_IOMMU_VERSION register.
+ * Returns SPRD_IOMMU_EX or SPRD_IOMMU_VAU, or -EINVAL for any other value.
+ */
+static inline int
+sprd_iommu_get_version(struct sprd_iommu_device *sdev)
+{
+       u32 raw = sprd_iommu_read(sdev, SPRD_IOMMU_VERSION);
+       int ver = (raw & SPRD_VERSION_MASK) >> SPRD_VERSION_SHIFT;
+
+       if (ver == SPRD_IOMMU_EX || ver == SPRD_IOMMU_VAU)
+               return ver;
+
+       return -EINVAL;
+}
+
+/* Size in bytes of the page table: one u32 entry per page of the aperture. */
+static size_t
+sprd_iommu_pgt_size(struct iommu_domain *domain)
+{
+       dma_addr_t span = domain->geometry.aperture_end -
+                         domain->geometry.aperture_start + 1;
+
+       return (span >> SPRD_IOMMU_PAGE_SHIFT) * sizeof(u32);
+}
+
+/*
+ * Allocate a domain of type IOMMU_DOMAIN_DMA or IOMMU_DOMAIN_UNMANAGED with a
+ * 256M aperture starting at 0. Returns NULL for any other type, or when the
+ * allocation or the DMA cookie setup fails.
+ */
+static struct iommu_domain *sprd_iommu_domain_alloc(unsigned int domain_type)
+{
+       struct sprd_iommu_domain *dom;
+
+       switch (domain_type) {
+       case IOMMU_DOMAIN_DMA:
+       case IOMMU_DOMAIN_UNMANAGED:
+               break;
+       default:
+               return NULL;
+       }
+
+       dom = kzalloc(sizeof(*dom), GFP_KERNEL);
+       if (!dom)
+               return NULL;
+
+       if (iommu_get_dma_cookie(&dom->domain))
+               goto err_free;
+
+       spin_lock_init(&dom->pgtlock);
+
+       dom->domain.geometry.aperture_end = SZ_256M - 1;
+       dom->domain.geometry.aperture_start = 0;
+
+       return &dom->domain;
+
+err_free:
+       kfree(dom);
+       return NULL;
+}
+
+/* Drop the DMA cookie, then free the wrapper that embeds @domain. */
+static void sprd_iommu_domain_free(struct iommu_domain *domain)
+{
+       iommu_put_dma_cookie(domain);
+       kfree(to_sprd_domain(domain));
+}
+
+/* Program the aperture start, as a page number, into the FIRST_VPN register. */
+static void sprd_iommu_first_vpn(struct sprd_iommu_domain *dom)
+{
+       struct sprd_iommu_device *sdev = dom->sdev;
+       unsigned int reg = (sdev->ver == SPRD_IOMMU_EX) ? SPRD_EX_FIRST_VPN :
+                                                         SPRD_VAU_FIRST_VPN;
+       u32 first_page = dom->domain.geometry.aperture_start >> SPRD_IOMMU_PAGE_SHIFT;
+
+       sprd_iommu_write(sdev, reg, first_page);
+}
+
+/* Program the aperture span (end - start), in pages, into VPN_RANGE. */
+static void sprd_iommu_vpn_range(struct sprd_iommu_domain *dom)
+{
+       struct sprd_iommu_device *sdev = dom->sdev;
+       unsigned int reg = (sdev->ver == SPRD_IOMMU_EX) ? SPRD_EX_VPN_RANGE :
+                                                         SPRD_VAU_VPN_RANGE;
+       u32 nr_pages = (dom->domain.geometry.aperture_end -
+                       dom->domain.geometry.aperture_start) >> SPRD_IOMMU_PAGE_SHIFT;
+
+       sprd_iommu_write(sdev, reg, nr_pages);
+}
+
+/* Program the page table base, as a page number, into FIRST_PPN. */
+static void sprd_iommu_first_ppn(struct sprd_iommu_domain *dom)
+{
+       struct sprd_iommu_device *sdev = dom->sdev;
+       u32 table_page = dom->pgt_pa >> SPRD_IOMMU_PAGE_SHIFT;
+       unsigned int reg = SPRD_VAU_FIRST_PPN;
+
+       if (sdev->ver == SPRD_IOMMU_EX)
+               reg = SPRD_EX_FIRST_PPN;
+
+       sprd_iommu_write(sdev, reg, table_page);
+}
+
+/*
+ * Program the protect page, as a page number, into the DEFAULT_PPN
+ * register(s). The VAU version has separate read and write registers.
+ */
+static void sprd_iommu_default_ppn(struct sprd_iommu_device *sdev)
+{
+       u32 prot_page = sdev->prot_page_pa >> SPRD_IOMMU_PAGE_SHIFT;
+
+       switch (sdev->ver) {
+       case SPRD_IOMMU_EX:
+               sprd_iommu_write(sdev, SPRD_EX_DEFAULT_PPN, prot_page);
+               break;
+       case SPRD_IOMMU_VAU:
+               sprd_iommu_write(sdev, SPRD_VAU_DEFAULT_PPN_RD, prot_page);
+               sprd_iommu_write(sdev, SPRD_VAU_DEFAULT_PPN_WR, prot_page);
+               break;
+       default:
+               break;
+       }
+}
+
+/* Set (@en true) or clear both SPRD_IOMMU_EN and SPRD_IOMMU_GATE_EN. */
+static void sprd_iommu_hw_en(struct sprd_iommu_device *sdev, bool en)
+{
+       const u32 mask = SPRD_IOMMU_EN | SPRD_IOMMU_GATE_EN;
+       unsigned int reg_cfg = SPRD_VAU_CFG;
+
+       if (sdev->ver == SPRD_IOMMU_EX)
+               reg_cfg = SPRD_EX_CFG;
+
+       sprd_iommu_update_bits(sdev, reg_cfg, mask, 0, en ? mask : 0);
+}
+
+/*
+ * Bind @domain to the IOMMU serving @dev: allocate the page table, program
+ * the table base, aperture and protect page, then enable the hardware.
+ * Returns -EINVAL if the domain already has an IOMMU attached and -ENOMEM if
+ * the page table cannot be allocated.
+ */
+static int sprd_iommu_attach_device(struct iommu_domain *domain,
+                                   struct device *dev)
+{
+       struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev);
+       struct sprd_iommu_domain *dom = to_sprd_domain(domain);
+       size_t table_bytes;
+
+       if (dom->sdev) {
+               pr_err("There's already a device attached to this domain.\n");
+               return -EINVAL;
+       }
+
+       table_bytes = sprd_iommu_pgt_size(domain);
+       dom->pgt_va = dma_alloc_coherent(sdev->dev, table_bytes,
+                                        &dom->pgt_pa, GFP_KERNEL);
+       if (!dom->pgt_va)
+               return -ENOMEM;
+
+       dom->sdev = sdev;
+
+       /* Registers are fully programmed before translation is switched on. */
+       sprd_iommu_first_ppn(dom);
+       sprd_iommu_first_vpn(dom);
+       sprd_iommu_vpn_range(dom);
+       sprd_iommu_default_ppn(sdev);
+       sprd_iommu_hw_en(sdev, true);
+
+       return 0;
+}
+
+/*
+ * Undo sprd_iommu_attach_device(): disable the hardware, release the page
+ * table and mark the domain as unattached. Does nothing when the domain has
+ * no IOMMU attached. @dev is not used.
+ */
+static void sprd_iommu_detach_device(struct iommu_domain *domain,
+                                            struct device *dev)
+{
+       struct sprd_iommu_domain *dom = to_sprd_domain(domain);
+       struct sprd_iommu_device *sdev = dom->sdev;
+       size_t pgt_size = sprd_iommu_pgt_size(domain);
+
+       if (!sdev)
+               return;
+
+       /*
+        * Switch translation off before the table memory is released, so the
+        * hardware is never left enabled with a table base that points at
+        * freed memory.
+        */
+       sprd_iommu_hw_en(sdev, false);
+       dma_free_coherent(sdev->dev, pgt_size, dom->pgt_va, dom->pgt_pa);
+       dom->sdev = NULL;
+}
+
+/*
+ * Install page-table entries for [@iova, @iova + @size) that point at the
+ * physically contiguous range starting at @paddr. Returns 0 on success, or
+ * -EINVAL when no IOMMU is attached or the range leaves the aperture.
+ * @prot and @gfp are not used.
+ */
+static int sprd_iommu_map(struct iommu_domain *domain, unsigned long iova,
+                         phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+       struct sprd_iommu_domain *dom = to_sprd_domain(domain);
+       const unsigned long aper_lo = domain->geometry.aperture_start;
+       const unsigned long aper_hi = domain->geometry.aperture_end;
+       unsigned int nr_pages = size >> SPRD_IOMMU_PAGE_SHIFT;
+       u32 phys = (u32)paddr;
+       unsigned long irqflags;
+       unsigned int idx;
+       u32 *pte;
+
+       if (!dom->sdev) {
+               pr_err("No sprd_iommu_device attached to the domain\n");
+               return -EINVAL;
+       }
+
+       if (iova < aper_lo || (iova + size) > (aper_hi + 1)) {
+               dev_err(dom->sdev->dev, "(iova(0x%lx) + size(%zx)) are not in the range!\n",
+                       iova, size);
+               return -EINVAL;
+       }
+
+       /* First entry covering @iova; entries are indexed by page offset. */
+       pte = dom->pgt_va + ((iova - aper_lo) >> SPRD_IOMMU_PAGE_SHIFT);
+
+       spin_lock_irqsave(&dom->pgtlock, irqflags);
+       for (idx = 0; idx < nr_pages; idx++, phys += SPRD_IOMMU_PAGE_SIZE)
+               pte[idx] = phys >> SPRD_IOMMU_PAGE_SHIFT;
+       spin_unlock_irqrestore(&dom->pgtlock, irqflags);
+
+       return 0;
+}
+
+/*
+ * Clear the page-table entries covering [@iova, @iova + @size).
+ *
+ * Returns the number of bytes unmapped: @size on success, 0 when the range
+ * lies outside the domain aperture. The return type is size_t, so a negative
+ * errno must not be returned here; it would be read as a huge byte count.
+ * @iotlb_gather is not used.
+ */
+static size_t sprd_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+                       size_t size, struct iommu_iotlb_gather *iotlb_gather)
+{
+       struct sprd_iommu_domain *dom = to_sprd_domain(domain);
+       unsigned long flags;
+       u32 *pgt_base_iova;
+       unsigned int page_num = size >> SPRD_IOMMU_PAGE_SHIFT;
+       unsigned long start = domain->geometry.aperture_start;
+       unsigned long end = domain->geometry.aperture_end;
+
+       if (iova < start || (iova + size) > (end + 1))
+               return 0;
+
+       pgt_base_iova = dom->pgt_va + ((iova - start) >> SPRD_IOMMU_PAGE_SHIFT);
+
+       spin_lock_irqsave(&dom->pgtlock, flags);
+       memset(pgt_base_iova, 0, page_num * sizeof(u32));
+       spin_unlock_irqrestore(&dom->pgtlock, flags);
+
+       return size;
+}
+
+/*
+ * Write 0xffffffff to the UPDATE register of the attached IOMMU to clear its
+ * TLB buffer after the page table was updated. @iova and @size are not used.
+ */
+static void sprd_iommu_sync_map(struct iommu_domain *domain,
+                               unsigned long iova, size_t size)
+{
+       struct sprd_iommu_device *sdev = to_sprd_domain(domain)->sdev;
+       unsigned int reg = SPRD_VAU_UPDATE;
+
+       if (sdev->ver == SPRD_IOMMU_EX)
+               reg = SPRD_EX_UPDATE;
+
+       sprd_iommu_write(sdev, reg, 0xffffffff);
+}
+
+/* iotlb_sync hook: same TLB clear as sprd_iommu_sync_map(); gather is unused. */
+static void sprd_iommu_sync(struct iommu_domain *domain,
+                           struct iommu_iotlb_gather *iotlb_gather)
+{
+       const unsigned long unused_iova = 0;
+       const size_t unused_size = 0;
+
+       sprd_iommu_sync_map(domain, unused_iova, unused_size);
+}
+
+/*
+ * Look up @iova in the page table and return the physical address built from
+ * the stored page number plus the in-page offset. Warns and returns 0 when
+ * @iova is outside the aperture.
+ */
+static phys_addr_t sprd_iommu_iova_to_phys(struct iommu_domain *domain,
+                                          dma_addr_t iova)
+{
+       struct sprd_iommu_domain *dom = to_sprd_domain(domain);
+       const unsigned long aper_lo = domain->geometry.aperture_start;
+       const unsigned long aper_hi = domain->geometry.aperture_end;
+       unsigned long irqflags;
+       dma_addr_t offset;
+       phys_addr_t phys;
+
+       if (WARN_ON(iova < aper_lo || iova > aper_hi))
+               return 0;
+
+       offset = iova - aper_lo;
+
+       spin_lock_irqsave(&dom->pgtlock, irqflags);
+       phys = dom->pgt_va[offset >> SPRD_IOMMU_PAGE_SHIFT];
+       phys <<= SPRD_IOMMU_PAGE_SHIFT;
+       phys += offset & (SPRD_IOMMU_PAGE_SIZE - 1);
+       spin_unlock_irqrestore(&dom->pgtlock, irqflags);
+
+       return phys;
+}
+
+/*
+ * Return the iommu_device serving @dev, or ERR_PTR(-ENODEV) when @dev has no
+ * fwspec or its fwspec belongs to a different IOMMU driver.
+ */
+static struct iommu_device *sprd_iommu_probe_device(struct device *dev)
+{
+       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+       struct sprd_iommu_device *sdev;
+
+       if (fwspec && fwspec->ops == &sprd_iommu_ops) {
+               sdev = dev_iommu_priv_get(dev);
+               return &sdev->iommu;
+       }
+
+       return ERR_PTR(-ENODEV);
+}
+
+/* Free the fwspec of @dev, but only if it belongs to this driver. */
+static void sprd_iommu_release_device(struct device *dev)
+{
+       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+       if (fwspec && fwspec->ops == &sprd_iommu_ops)
+               iommu_fwspec_free(dev);
+}
+
+/* Hand out a new reference to the group stored in the IOMMU serving @dev. */
+static struct iommu_group *sprd_iommu_device_group(struct device *dev)
+{
+       struct sprd_iommu_device *sdev;
+
+       sdev = dev_iommu_priv_get(dev);
+       return iommu_group_ref_get(sdev->group);
+}
+
+/*
+ * of_xlate hook: on the first call for @dev, look up the platform device of
+ * the IOMMU node in @args and store its driver data as @dev's IOMMU private
+ * data. Later calls for the same @dev are no-ops.
+ *
+ * Returns 0 on success, or -ENODEV when no platform device exists for the
+ * IOMMU node.
+ */
+static int sprd_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+       struct platform_device *pdev;
+
+       if (dev_iommu_priv_get(dev))
+               return 0;
+
+       /* The lookup may fail; never dereference a NULL platform device. */
+       pdev = of_find_device_by_node(args->np);
+       if (!pdev)
+               return -ENODEV;
+
+       dev_iommu_priv_set(dev, platform_get_drvdata(pdev));
+       /* Drop the reference taken by of_find_device_by_node(). */
+       platform_device_put(pdev);
+
+       return 0;
+}
+
+
+/* Callbacks and capabilities this driver hands to the IOMMU core. */
+static const struct iommu_ops sprd_iommu_ops = {
+       .owner          = THIS_MODULE,
+       .pgsize_bitmap  = ~0UL << SPRD_IOMMU_PAGE_SHIFT,
+
+       /* domain lifetime and attachment */
+       .domain_alloc   = sprd_iommu_domain_alloc,
+       .domain_free    = sprd_iommu_domain_free,
+       .attach_dev     = sprd_iommu_attach_device,
+       .detach_dev     = sprd_iommu_detach_device,
+
+       /* page table maintenance */
+       .map            = sprd_iommu_map,
+       .unmap          = sprd_iommu_unmap,
+       .iotlb_sync_map = sprd_iommu_sync_map,
+       .iotlb_sync     = sprd_iommu_sync,
+       .iova_to_phys   = sprd_iommu_iova_to_phys,
+
+       /* client device handling */
+       .probe_device   = sprd_iommu_probe_device,
+       .release_device = sprd_iommu_release_device,
+       .device_group   = sprd_iommu_device_group,
+       .of_xlate       = sprd_iommu_of_xlate,
+};
+
+/* Device-tree compatibles handled by this driver. */
+static const struct of_device_id sprd_iommu_of_match[] = {
+       {
+               .compatible = "sprd,iommu-v1",
+       },
+       { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, sprd_iommu_of_match);
+
+/*
+ * The clock is optional: only some IOMMUs sit behind a gate clock, and for
+ * those the clock has to be enabled before the registers are accessed.
+ * Returns 0 on success or when no clock is described, otherwise the error
+ * from the clock lookup or from enabling the clock.
+ */
+static int sprd_iommu_clk_enable(struct sprd_iommu_device *sdev)
+{
+       struct clk *gate = devm_clk_get_optional(sdev->dev, NULL);
+
+       if (IS_ERR(gate))
+               return PTR_ERR(gate);
+
+       if (!gate)
+               return 0;
+
+       sdev->eb = gate;
+       return clk_prepare_enable(gate);
+}
+
+/* Disable and unprepare the gate clock, if this IOMMU has one. */
+static void sprd_iommu_clk_disable(struct sprd_iommu_device *sdev)
+{
+       if (!sdev->eb)
+               return;
+
+       clk_disable_unprepare(sdev->eb);
+}
+
+/*
+ * Bind one IOMMU instance: map its registers, allocate the protect page and
+ * the shared iommu group, register with the IOMMU core (and with the platform
+ * bus if no IOMMU is installed there yet), enable the optional gate clock and
+ * read the IP version.
+ *
+ * Returns 0 on success or a negative errno. On failure every step taken so
+ * far is undone in reverse order, including the platform bus ops when this
+ * call installed them.
+ */
+static int sprd_iommu_probe(struct platform_device *pdev)
+{
+       struct sprd_iommu_device *sdev;
+       struct device *dev = &pdev->dev;
+       bool bus_ops_installed = false;
+       void __iomem *base;
+       int ret;
+
+       sdev = devm_kzalloc(dev, sizeof(*sdev), GFP_KERNEL);
+       if (!sdev)
+               return -ENOMEM;
+
+       base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(base)) {
+               dev_err(dev, "Failed to get ioremap resource.\n");
+               return PTR_ERR(base);
+       }
+       sdev->base = base;
+
+       sdev->prot_page_va = dma_alloc_coherent(dev, SPRD_IOMMU_PAGE_SIZE,
+                                               &sdev->prot_page_pa, GFP_KERNEL);
+       if (!sdev->prot_page_va)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, sdev);
+       sdev->dev = dev;
+
+       /* All the client devices are in the same iommu-group */
+       sdev->group = iommu_group_alloc();
+       if (IS_ERR(sdev->group)) {
+               ret = PTR_ERR(sdev->group);
+               goto free_page;
+       }
+
+       ret = iommu_device_sysfs_add(&sdev->iommu, dev, NULL, dev_name(dev));
+       if (ret)
+               goto put_group;
+
+       ret = iommu_device_register(&sdev->iommu, &sprd_iommu_ops, dev);
+       if (ret)
+               goto remove_sysfs;
+
+       /*
+        * Only the first IOMMU to probe installs the bus ops. Remember that
+        * we did so, so that a later failure rolls back only our own change.
+        */
+       if (!iommu_present(&platform_bus_type)) {
+               ret = bus_set_iommu(&platform_bus_type, &sprd_iommu_ops);
+               if (ret)
+                       goto unregister_iommu;
+               bus_ops_installed = true;
+       }
+
+       ret = sprd_iommu_clk_enable(sdev);
+       if (ret)
+               goto clear_bus_ops;
+
+       ret = sprd_iommu_get_version(sdev);
+       if (ret < 0) {
+               dev_err(dev, "IOMMU version(%d) is invalid.\n", ret);
+               goto disable_clk;
+       }
+       sdev->ver = ret;
+
+       return 0;
+
+disable_clk:
+       sprd_iommu_clk_disable(sdev);
+clear_bus_ops:
+       if (bus_ops_installed)
+               bus_set_iommu(&platform_bus_type, NULL);
+unregister_iommu:
+       iommu_device_unregister(&sdev->iommu);
+remove_sysfs:
+       iommu_device_sysfs_remove(&sdev->iommu);
+put_group:
+       iommu_group_put(sdev->group);
+free_page:
+       dma_free_coherent(sdev->dev, SPRD_IOMMU_PAGE_SIZE, sdev->prot_page_va, sdev->prot_page_pa);
+       return ret;
+}
+
+/*
+ * Tear down one IOMMU instance, in the same order as the error unwinding in
+ * sprd_iommu_probe(): gate clock, bus ops, core registration, sysfs entry,
+ * group reference and finally the protect page. Always returns 0.
+ */
+static int sprd_iommu_remove(struct platform_device *pdev)
+{
+       struct sprd_iommu_device *sdev = platform_get_drvdata(pdev);
+
+       /* Balance the clk_prepare_enable() done at probe time. */
+       sprd_iommu_clk_disable(sdev);
+
+       bus_set_iommu(&platform_bus_type, NULL);
+
+       iommu_device_unregister(&sdev->iommu);
+       iommu_device_sysfs_remove(&sdev->iommu);
+
+       iommu_group_put(sdev->group);
+       sdev->group = NULL;
+
+       dma_free_coherent(sdev->dev, SPRD_IOMMU_PAGE_SIZE, sdev->prot_page_va, sdev->prot_page_pa);
+
+       platform_set_drvdata(pdev, NULL);
+
+       return 0;
+}
+
+/* Platform driver glue; manual bind/unbind through sysfs is suppressed. */
+static struct platform_driver sprd_iommu_driver = {
+       .probe  = sprd_iommu_probe,
+       .remove = sprd_iommu_remove,
+       .driver = {
+               .name                   = "sprd-iommu",
+               .of_match_table         = sprd_iommu_of_match,
+               .suppress_bind_attrs    = true,
+       },
+};
+module_platform_driver(sprd_iommu_driver);
+
+MODULE_DESCRIPTION("IOMMU driver for Unisoc SoCs");
+MODULE_ALIAS("platform:sprd-iommu");
+MODULE_LICENSE("GPL");
index ea6db13..181bb1c 100644 (file)
@@ -968,10 +968,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev)
        if (ret)
                goto err_free_group;
 
-       iommu_device_set_ops(&iommu->iommu, &sun50i_iommu_ops);
-       iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
-
-       ret = iommu_device_register(&iommu->iommu);
+       ret = iommu_device_register(&iommu->iommu, &sun50i_iommu_ops, &pdev->dev);
        if (ret)
                goto err_remove_sysfs;
 
index 6f130e5..6a358f9 100644 (file)
@@ -353,10 +353,7 @@ struct gart_device *tegra_gart_probe(struct device *dev, struct tegra_mc *mc)
        if (err)
                goto free_gart;
 
-       iommu_device_set_ops(&gart->iommu, &gart_iommu_ops);
-       iommu_device_set_fwnode(&gart->iommu, dev->fwnode);
-
-       err = iommu_device_register(&gart->iommu);
+       err = iommu_device_register(&gart->iommu, &gart_iommu_ops, dev);
        if (err)
                goto remove_sysfs;
 
index 602aab9..1e98dc6 100644 (file)
@@ -1145,10 +1145,7 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev,
        if (err)
                return ERR_PTR(err);
 
-       iommu_device_set_ops(&smmu->iommu, &tegra_smmu_ops);
-       iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
-
-       err = iommu_device_register(&smmu->iommu);
+       err = iommu_device_register(&smmu->iommu, &tegra_smmu_ops, dev);
        if (err)
                goto remove_sysfs;
 
index 2bfdd57..7c02481 100644 (file)
@@ -945,6 +945,7 @@ static struct iommu_ops viommu_ops = {
        .get_resv_regions       = viommu_get_resv_regions,
        .put_resv_regions       = generic_iommu_put_resv_regions,
        .of_xlate               = viommu_of_xlate,
+       .owner                  = THIS_MODULE,
 };
 
 static int viommu_init_vqs(struct viommu_dev *viommu)
@@ -1065,10 +1066,7 @@ static int viommu_probe(struct virtio_device *vdev)
        if (ret)
                goto err_free_vqs;
 
-       iommu_device_set_ops(&viommu->iommu, &viommu_ops);
-       iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
-
-       iommu_device_register(&viommu->iommu);
+       iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
 
 #ifdef CONFIG_PCI
        if (pci_bus_type.iommu_ops != &viommu_ops) {
index c3485b2..2e6923c 100644 (file)
@@ -794,8 +794,13 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its,
 
        its_encode_alloc(cmd, alloc);
 
-       /* We can only signal PTZ when alloc==1. Why do we have two bits? */
-       its_encode_ptz(cmd, alloc);
+       /*
+        * GICv4.1 provides a way to get the VLPI state, which needs the vPE
+        * to be unmapped first, and in this case, we may remap the vPE
+        * back while the VPT is not empty. So we can't assume that the
+        * VPT is empty on map. This is why we never advertise PTZ.
+        */
+       its_encode_ptz(cmd, false);
        its_encode_vconf_addr(cmd, vconf_addr);
        its_encode_vmapp_default_db(cmd, desc->its_vmapp_cmd.vpe->vpe_db_lpi);
 
@@ -4554,6 +4559,15 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain,
 
                its_send_vmapp(its, vpe, false);
        }
+
+       /*
+        * There may be a direct read to the VPT after unmapping the
+        * vPE, to guarantee the validity of this, we make the VPT
+        * memory coherent with the CPU caches here.
+        */
+       if (find_4_1_its() && !atomic_read(&vpe->vmapp_count))
+               gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+                                       LPI_PENDBASE_SZ);
 }
 
 static const struct irq_domain_ops its_vpe_domain_ops = {
index b5ed4ea..77e9512 100644 (file)
@@ -201,6 +201,7 @@ static ssize_t empty_read(struct file *file, char __user *buf,
 
 static const struct proc_ops empty_proc_ops = {
        .proc_read      = empty_read,
+       .proc_lseek     = default_llseek,
 };
 
 // ---------------------------------------------------------------------------
index b6742b4..49d99cb 100644 (file)
@@ -18,7 +18,7 @@ config LEDS_CLASS
        tristate "LED Class Support"
        help
          This option enables the LED sysfs class in /sys/class/leds.  You'll
-         need this to do anything useful with LEDs.  If unsure, say N.
+         need this to do anything useful with LEDs.  If unsure, say Y.
 
 config LEDS_CLASS_FLASH
        tristate "LED Flash Class Support"
@@ -928,13 +928,12 @@ config LEDS_ACER_A500
          This option enables support for the Power Button LED of
          Acer Iconia Tab A500.
 
+source "drivers/leds/blink/Kconfig"
+
 comment "Flash and Torch LED drivers"
 source "drivers/leds/flash/Kconfig"
 
 comment "LED Triggers"
 source "drivers/leds/trigger/Kconfig"
 
-comment "LED Blink"
-source "drivers/leds/blink/Kconfig"
-
 endif # NEW_LEDS
index 2a698df..7e604d3 100644 (file)
@@ -110,4 +110,4 @@ obj-$(CONFIG_LEDS_CLASS_FLASH)              += flash/
 obj-$(CONFIG_LEDS_TRIGGERS)            += trigger/
 
 # LED Blink
-obj-$(CONFIG_LEDS_BLINK)                += blink/
+obj-y                                  += blink/
index 265b534..59ba81e 100644 (file)
@@ -1,20 +1,17 @@
-menuconfig LEDS_BLINK
-       bool "LED Blink support"
-       depends on LEDS_CLASS
-       help
-         This option enables blink support for the leds class.
-         If unsure, say Y.
+config LEDS_LGM
+       tristate "LED support for LGM SoC series"
+       depends on X86 || COMPILE_TEST
+       depends on GPIOLIB && LEDS_CLASS && MFD_SYSCON && OF
+       help
+         This option enables support for LEDs connected to GPIO lines on
+         Lightning Mountain (LGM) SoC. Lightning Mountain is an AnyWAN
+         gateway-on-a-chip SoC to be shipped on mid and high end home
+         gateways and routers.
 
-if LEDS_BLINK
+         These LEDs are driven by a Serial Shift Output (SSO) controller.
+         The driver supports hardware blinking and the LEDs can be configured
+         to be triggered by software/CPU or by hardware.
 
-config LEDS_BLINK_LGM
-       tristate "LED support for Intel LGM SoC series"
-       depends on LEDS_CLASS
-       depends on MFD_SYSCON
-       depends on OF
-       help
-         Parallel to serial conversion, which is also called SSO controller,
-         can drive external shift register for LED outputs.
-         This enables LED support for Serial Shift Output controller(SSO).
-
-endif # LEDS_BLINK
+         Say 'Y' here if you are working on LGM SoC based platform. Otherwise,
+         say 'N'. To compile this driver as a module, choose M here: the module
+         will be called leds-lgm-sso.
index 2fa6c7b..fa5d04d 100644 (file)
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_LEDS_BLINK_LGM)   += leds-lgm-sso.o
+obj-$(CONFIG_LEDS_LGM) += leds-lgm-sso.o
index 7d5c9ca..6a63846 100644 (file)
@@ -793,7 +793,7 @@ static int intel_sso_led_probe(struct platform_device *pdev)
 
        ret = clk_prepare_enable(priv->gclk);
        if (ret) {
-               dev_err(dev, "Failed to prepate/enable sso gate clock!\n");
+               dev_err(dev, "Failed to prepare/enable sso gate clock!\n");
                return ret;
        }
 
index b580b41..3f49f3e 100644 (file)
@@ -2,6 +2,17 @@
 
 if LEDS_CLASS_FLASH
 
+config LEDS_RT4505
+       tristate "LED support for RT4505 flashlight controller"
+       depends on I2C && OF
+       depends on V4L2_FLASH_LED_CLASS || !V4L2_FLASH_LED_CLASS
+       select REGMAP_I2C
+       help
+         This option enables support for the RT4505 flash LED controller.
+         RT4505 includes torch and flash functions with programmable current.
+         It is commonly used to compensate for insufficient illuminance for
+         the camera in mobile products such as phones or tablets.
+
 config LEDS_RT8515
        tristate "LED support for Richtek RT8515 flash/torch LED"
        depends on GPIOLIB
index e990e25..09aee56 100644 (file)
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
+obj-$(CONFIG_LEDS_RT4505)      += leds-rt4505.o
 obj-$(CONFIG_LEDS_RT8515)      += leds-rt8515.o
diff --git a/drivers/leds/flash/leds-rt4505.c b/drivers/leds/flash/leds-rt4505.c
new file mode 100644 (file)
index 0000000..ee129ab
--- /dev/null
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bitops.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/led-class-flash.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <media/v4l2-flash-led-class.h>
+
+/* Register addresses, used as the 'reg' argument of the regmap accessors. */
+#define RT4505_REG_RESET       0x0
+#define RT4505_REG_CONFIG      0x8
+#define RT4505_REG_ILED                0x9
+#define RT4505_REG_ENABLE      0xA
+#define RT4505_REG_FLAGS       0xB
+
+/*
+ * Bit fields and values:
+ *  - RESET_MASK is written to REG_RESET.
+ *  - FLASHTO_MASK is the flash timeout field of REG_CONFIG.
+ *  - ITORCH_MASK/ITORCH_SHIFT and IFLASH_MASK are the torch and flash
+ *    current fields of REG_ILED.
+ *  - ENABLE_MASK covers the mode bits of REG_ENABLE; the *_SET values are
+ *    the patterns written for each mode and FLASH_GET is the pattern tested
+ *    when reading back the strobe state.
+ *  - OVP/SHORT/OTP/TIMEOUT_MASK are the fault bits read from REG_FLAGS.
+ */
+#define RT4505_RESET_MASK      BIT(7)
+#define RT4505_FLASHTO_MASK    GENMASK(2, 0)
+#define RT4505_ITORCH_MASK     GENMASK(7, 5)
+#define RT4505_ITORCH_SHIFT    5
+#define RT4505_IFLASH_MASK     GENMASK(4, 0)
+#define RT4505_ENABLE_MASK     GENMASK(5, 0)
+#define RT4505_TORCH_SET       (BIT(0) | BIT(4))
+#define RT4505_FLASH_SET       (BIT(0) | BIT(1) | BIT(2) | BIT(4))
+#define RT4505_EXT_FLASH_SET   (BIT(0) | BIT(1) | BIT(4) | BIT(5))
+#define RT4505_FLASH_GET       (BIT(0) | BIT(1) | BIT(4))
+#define RT4505_OVP_MASK                BIT(3)
+#define RT4505_SHORT_MASK      BIT(2)
+#define RT4505_OTP_MASK                BIT(1)
+#define RT4505_TIMEOUT_MASK    BIT(0)
+
+/*
+ * Minimum, maximum and step of the torch/flash currents (microamps) and of
+ * the flash timeout (microseconds); used to clamp the firmware properties
+ * and to convert between physical values and register selectors.
+ */
+#define RT4505_ITORCH_MINUA    46000
+#define RT4505_ITORCH_MAXUA    375000
+#define RT4505_ITORCH_STPUA    47000
+#define RT4505_IFLASH_MINUA    93750
+#define RT4505_IFLASH_MAXUA    1500000
+#define RT4505_IFLASH_STPUA    93750
+#define RT4505_FLASHTO_MINUS   100000
+#define RT4505_FLASHTO_MAXUS   800000
+#define RT4505_FLASHTO_STPUS   100000
+
+/* Per-device driver state, allocated in rt4505_probe(). */
+struct rt4505_priv {
+       struct device *dev;             /* the I2C client's device */
+       struct regmap *regmap;          /* register map over the I2C client */
+       struct mutex lock;              /* taken around register accesses in the LED callbacks */
+       struct led_classdev_flash flash;        /* flash LED class device; callbacks recover priv via container_of() */
+       struct v4l2_flash *v4l2_flash;  /* handle returned by v4l2_flash_init() */
+};
+
+/*
+ * Set the torch brightness (blocking).
+ *
+ * LED_OFF clears the enable bits.  Any other level first programs the torch
+ * current selector (level - 1) and then writes the torch enable pattern.
+ * Returns 0 or the error from the failing register update.
+ */
+static int rt4505_torch_brightness_set(struct led_classdev *lcdev,
+                                      enum led_brightness level)
+{
+       struct rt4505_priv *chip =
+               container_of(lcdev, struct rt4505_priv, flash.led_cdev);
+       u32 enable = RT4505_TORCH_SET;
+       int err = 0;
+
+       mutex_lock(&chip->lock);
+
+       if (level == LED_OFF)
+               enable = 0;
+       else
+               err = regmap_update_bits(chip->regmap, RT4505_REG_ILED,
+                                        RT4505_ITORCH_MASK,
+                                        (level - 1) << RT4505_ITORCH_SHIFT);
+
+       /* Only touch the enable register if the current was programmed. */
+       if (!err)
+               err = regmap_update_bits(chip->regmap, RT4505_REG_ENABLE,
+                                        RT4505_ENABLE_MASK, enable);
+
+       mutex_unlock(&chip->lock);
+       return err;
+}
+
+/*
+ * Read back the torch brightness.
+ *
+ * Returns LED_OFF when a register read fails or when the enable register
+ * does not hold the torch pattern; otherwise the torch current selector
+ * plus one.
+ */
+static enum led_brightness rt4505_torch_brightness_get(
+                                               struct led_classdev *lcdev)
+{
+       struct rt4505_priv *chip =
+               container_of(lcdev, struct rt4505_priv, flash.led_cdev);
+       enum led_brightness level = LED_OFF;
+       u32 reg;
+
+       mutex_lock(&chip->lock);
+
+       if (regmap_read(chip->regmap, RT4505_REG_ENABLE, &reg)) {
+               dev_err(lcdev->dev, "Failed to get LED enable\n");
+               goto out;
+       }
+
+       /* Anything other than torch mode is reported as off. */
+       if ((reg & RT4505_ENABLE_MASK) != RT4505_TORCH_SET)
+               goto out;
+
+       if (regmap_read(chip->regmap, RT4505_REG_ILED, &reg)) {
+               dev_err(lcdev->dev, "Failed to get LED brightness\n");
+               goto out;
+       }
+
+       level = ((reg & RT4505_ITORCH_MASK) >> RT4505_ITORCH_SHIFT) + 1;
+
+out:
+       mutex_unlock(&chip->lock);
+       return level;
+}
+
+/*
+ * Program the flash current.  The requested brightness is converted to a
+ * register selector using the min/step of the flash brightness setting.
+ * Returns the result of the register update.
+ */
+static int rt4505_flash_brightness_set(struct led_classdev_flash *fled_cdev,
+                                      u32 brightness)
+{
+       struct rt4505_priv *chip =
+               container_of(fled_cdev, struct rt4505_priv, flash);
+       struct led_flash_setting *setting = &fled_cdev->brightness;
+       u32 sel;
+       int err;
+
+       sel = (brightness - setting->min) / setting->step;
+
+       mutex_lock(&chip->lock);
+       err = regmap_update_bits(chip->regmap, RT4505_REG_ILED,
+                                RT4505_IFLASH_MASK, sel);
+       mutex_unlock(&chip->lock);
+
+       return err;
+}
+
+/*
+ * Start or stop a software-triggered flash strobe by writing either the
+ * flash pattern or zero into the enable bits.
+ */
+static int rt4505_flash_strobe_set(struct led_classdev_flash *fled_cdev,
+                                  bool state)
+{
+       struct rt4505_priv *chip =
+               container_of(fled_cdev, struct rt4505_priv, flash);
+       u32 enable = 0;
+       int err;
+
+       if (state)
+               enable = RT4505_FLASH_SET;
+
+       mutex_lock(&chip->lock);
+       err = regmap_update_bits(chip->regmap, RT4505_REG_ENABLE,
+                                RT4505_ENABLE_MASK, enable);
+       mutex_unlock(&chip->lock);
+
+       return err;
+}
+
+/*
+ * Report whether a flash strobe is active: *state is true when all bits of
+ * RT4505_FLASH_GET are set in the enable register.  *state is left untouched
+ * if the register read fails, in which case the error is returned.
+ */
+static int rt4505_flash_strobe_get(struct led_classdev_flash *fled_cdev,
+                                  bool *state)
+{
+       struct rt4505_priv *chip =
+               container_of(fled_cdev, struct rt4505_priv, flash);
+       u32 reg;
+       int err;
+
+       mutex_lock(&chip->lock);
+
+       err = regmap_read(chip->regmap, RT4505_REG_ENABLE, &reg);
+       if (!err)
+               *state = (reg & RT4505_FLASH_GET) == RT4505_FLASH_GET;
+
+       mutex_unlock(&chip->lock);
+       return err;
+}
+
+/*
+ * Program the flash timeout.  The requested timeout is converted to a
+ * register selector using the min/step of the flash timeout setting.
+ * Returns the result of the register update.
+ */
+static int rt4505_flash_timeout_set(struct led_classdev_flash *fled_cdev,
+                                   u32 timeout)
+{
+       struct rt4505_priv *chip =
+               container_of(fled_cdev, struct rt4505_priv, flash);
+       struct led_flash_setting *setting = &fled_cdev->timeout;
+       u32 sel;
+       int err;
+
+       sel = (timeout - setting->min) / setting->step;
+
+       mutex_lock(&chip->lock);
+       err = regmap_update_bits(chip->regmap, RT4505_REG_CONFIG,
+                                RT4505_FLASHTO_MASK, sel);
+       mutex_unlock(&chip->lock);
+
+       return err;
+}
+
+/*
+ * Read the flags register and translate its fault bits into LED_FAULT_*
+ * flags.  *fault is only written when the register read succeeds.
+ */
+static int rt4505_fault_get(struct led_classdev_flash *fled_cdev, u32 *fault)
+{
+       /* Hardware flag bit -> LED flash class fault flag. */
+       static const struct {
+               u32 flag;
+               u32 led_fault;
+       } fault_map[] = {
+               { RT4505_OVP_MASK,      LED_FAULT_OVER_VOLTAGE },
+               { RT4505_SHORT_MASK,    LED_FAULT_SHORT_CIRCUIT },
+               { RT4505_OTP_MASK,      LED_FAULT_OVER_TEMPERATURE },
+               { RT4505_TIMEOUT_MASK,  LED_FAULT_TIMEOUT },
+       };
+       struct rt4505_priv *chip =
+               container_of(fled_cdev, struct rt4505_priv, flash);
+       u32 flags, result = 0;
+       unsigned int i;
+       int err;
+
+       err = regmap_read(chip->regmap, RT4505_REG_FLAGS, &flags);
+       if (err)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(fault_map); i++) {
+               if (flags & fault_map[i].flag)
+                       result |= fault_map[i].led_fault;
+       }
+
+       *fault = result;
+       return 0;
+}
+
+/* Flash class callbacks; installed on the flash device in rt4505_init_flash_properties(). */
+static const struct led_flash_ops rt4505_flash_ops = {
+       .flash_brightness_set = rt4505_flash_brightness_set,
+       .strobe_set = rt4505_flash_strobe_set,
+       .strobe_get = rt4505_flash_strobe_get,
+       .timeout_set = rt4505_flash_timeout_set,
+       .fault_get = rt4505_fault_get,
+};
+
+/*
+ * regmap access filter: only the reset register and the contiguous range
+ * from the config register to the flags register may be read or written.
+ */
+static bool rt4505_is_accessible_reg(struct device *dev, unsigned int reg)
+{
+       bool in_ctrl_range = reg >= RT4505_REG_CONFIG &&
+                            reg <= RT4505_REG_FLAGS;
+
+       return reg == RT4505_REG_RESET || in_ctrl_range;
+}
+
+/*
+ * regmap layout: 8-bit register addresses and 8-bit values, with
+ * RT4505_REG_FLAGS as the highest register.  The same predicate decides
+ * both readability and writeability.
+ */
+static const struct regmap_config rt4505_regmap_config = {
+       .reg_bits = 8,
+       .val_bits = 8,
+       .max_register = RT4505_REG_FLAGS,
+
+       .readable_reg = rt4505_is_accessible_reg,
+       .writeable_reg = rt4505_is_accessible_reg,
+};
+
+#if IS_ENABLED(CONFIG_V4L2_FLASH_LED_CLASS)
+/*
+ * V4L2 flash callback: switch the external strobe mode on or off by writing
+ * either the external-flash pattern or zero into the enable bits.
+ */
+static int rt4505_flash_external_strobe_set(struct v4l2_flash *v4l2_flash,
+                                           bool enable)
+{
+       struct rt4505_priv *chip = container_of(v4l2_flash->fled_cdev,
+                                               struct rt4505_priv, flash);
+       u32 pattern = 0;
+       int err;
+
+       if (enable)
+               pattern = RT4505_EXT_FLASH_SET;
+
+       mutex_lock(&chip->lock);
+       err = regmap_update_bits(chip->regmap, RT4505_REG_ENABLE,
+                                RT4505_ENABLE_MASK, pattern);
+       mutex_unlock(&chip->lock);
+
+       return err;
+}
+
+/* V4L2 flash callbacks passed to v4l2_flash_init() in rt4505_probe(). */
+static const struct v4l2_flash_ops v4l2_flash_ops = {
+       .external_strobe_set = rt4505_flash_external_strobe_set,
+};
+
+/*
+ * Fill the V4L2 flash configuration from the registered LED class device.
+ *
+ * @priv:   driver state; priv->flash must already be registered because the
+ *          device name is taken from its class device kobject.
+ * @config: configuration to fill in.
+ *
+ * The torch intensity range is derived from the class device's
+ * max_brightness; the advertised fault mask must list exactly the faults
+ * that rt4505_fault_get() can report.
+ */
+static void rt4505_init_v4l2_config(struct rt4505_priv *priv,
+                                   struct v4l2_flash_config *config)
+{
+       struct led_classdev_flash *flash = &priv->flash;
+       struct led_classdev *lcdev = &flash->led_cdev;
+       struct led_flash_setting *s;
+
+       strscpy(config->dev_name, lcdev->dev->kobj.name,
+               sizeof(config->dev_name));
+
+       /* Torch intensity in microamps: one step per brightness level. */
+       s = &config->intensity;
+       s->min = RT4505_ITORCH_MINUA;
+       s->step = RT4505_ITORCH_STPUA;
+       s->max = s->val = s->min + (lcdev->max_brightness - 1) * s->step;
+
+       /*
+        * Use LED_FAULT_OVER_TEMPERATURE (not LED_FAULT_LED_OVER_TEMPERATURE)
+        * so the mask matches what rt4505_fault_get() returns for the OTP bit.
+        */
+       config->flash_faults = LED_FAULT_OVER_VOLTAGE |
+                              LED_FAULT_SHORT_CIRCUIT |
+                              LED_FAULT_OVER_TEMPERATURE |
+                              LED_FAULT_TIMEOUT;
+       config->has_external_strobe = 1;
+}
+#else
+/*
+ * Placeholders for builds where CONFIG_V4L2_FLASH_LED_CLASS is not enabled:
+ * an empty ops table and a no-op config initialiser keep rt4505_probe()
+ * free of #ifdefs.
+ */
+static const struct v4l2_flash_ops v4l2_flash_ops;
+static void rt4505_init_v4l2_config(struct rt4505_priv *priv,
+                                   struct v4l2_flash_config *config)
+{
+}
+#endif
+
+/*
+ * Read the current and timeout limits from the child firmware node and set
+ * up the LED class device and flash settings accordingly.
+ *
+ * Each of "led-max-microamp", "flash-max-microamp" and
+ * "flash-max-timeout-us" is clamped to the supported range; a missing
+ * property triggers a warning and falls back to the minimum value.
+ */
+static void rt4505_init_flash_properties(struct rt4505_priv *priv,
+                                        struct fwnode_handle *child)
+{
+       struct led_classdev_flash *fled = &priv->flash;
+       struct led_classdev *cdev = &fled->led_cdev;
+       struct led_flash_setting *setting;
+       u32 limit;
+
+       /* Torch: the current limit determines the number of brightness levels. */
+       if (!fwnode_property_read_u32(child, "led-max-microamp", &limit)) {
+               limit = clamp_val(limit, RT4505_ITORCH_MINUA,
+                                 RT4505_ITORCH_MAXUA);
+       } else {
+               dev_warn(priv->dev, "led-max-microamp DT property missing\n");
+               limit = RT4505_ITORCH_MINUA;
+       }
+
+       cdev->max_brightness =
+               (limit - RT4505_ITORCH_MINUA) / RT4505_ITORCH_STPUA + 1;
+       cdev->brightness_set_blocking = rt4505_torch_brightness_set;
+       cdev->brightness_get = rt4505_torch_brightness_get;
+       cdev->flags |= LED_DEV_CAP_FLASH;
+
+       /* Flash current range. */
+       if (!fwnode_property_read_u32(child, "flash-max-microamp", &limit)) {
+               limit = clamp_val(limit, RT4505_IFLASH_MINUA,
+                                 RT4505_IFLASH_MAXUA);
+       } else {
+               dev_warn(priv->dev, "flash-max-microamp DT property missing\n");
+               limit = RT4505_IFLASH_MINUA;
+       }
+
+       setting = &fled->brightness;
+       setting->min = RT4505_IFLASH_MINUA;
+       setting->step = RT4505_IFLASH_STPUA;
+       setting->max = setting->val = limit;
+
+       /* Flash timeout range. */
+       if (!fwnode_property_read_u32(child, "flash-max-timeout-us", &limit)) {
+               limit = clamp_val(limit, RT4505_FLASHTO_MINUS,
+                                 RT4505_FLASHTO_MAXUS);
+       } else {
+               dev_warn(priv->dev,
+                        "flash-max-timeout-us DT property missing\n");
+               limit = RT4505_FLASHTO_MINUS;
+       }
+
+       setting = &fled->timeout;
+       setting->min = RT4505_FLASHTO_MINUS;
+       setting->step = RT4505_FLASHTO_STPUS;
+       setting->max = setting->val = limit;
+
+       fled->ops = &rt4505_flash_ops;
+}
+
+/*
+ * I2C probe.
+ *
+ * Allocates the driver state, creates the regmap, resets the chip, reads the
+ * limits from the first available child firmware node, registers the flash
+ * LED class device (device-managed) and finally initialises the V4L2 flash
+ * handle.  The driver data is only set once everything has succeeded.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int rt4505_probe(struct i2c_client *client)
+{
+       struct rt4505_priv *priv;
+       struct fwnode_handle *child;
+       struct led_init_data init_data = {};
+       struct v4l2_flash_config v4l2_config = {};
+       int ret;
+
+       priv = devm_kzalloc(&client->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->dev = &client->dev;
+       mutex_init(&priv->lock);
+
+       priv->regmap = devm_regmap_init_i2c(client, &rt4505_regmap_config);
+       if (IS_ERR(priv->regmap)) {
+               dev_err(priv->dev, "Failed to allocate register map\n");
+               return PTR_ERR(priv->regmap);
+       }
+
+       /* Start from a known state before anything is registered. */
+       ret = regmap_write(priv->regmap, RT4505_REG_RESET, RT4505_RESET_MASK);
+       if (ret) {
+               dev_err(priv->dev, "Failed to reset registers\n");
+               return ret;
+       }
+
+       /* The returned child node carries a reference owned by the caller. */
+       child = fwnode_get_next_available_child_node(client->dev.fwnode, NULL);
+       if (!child) {
+               dev_err(priv->dev, "Failed to get child node\n");
+               return -EINVAL;
+       }
+       init_data.fwnode = child;
+
+       rt4505_init_flash_properties(priv, child);
+       ret = devm_led_classdev_flash_register_ext(priv->dev, &priv->flash,
+                                                  &init_data);
+       if (ret) {
+               dev_err(priv->dev, "Failed to register flash\n");
+               /* Nothing was registered, so drop the child node reference. */
+               fwnode_handle_put(child);
+               return ret;
+       }
+
+       rt4505_init_v4l2_config(priv, &v4l2_config);
+       priv->v4l2_flash = v4l2_flash_init(priv->dev, init_data.fwnode,
+                                          &priv->flash, &v4l2_flash_ops,
+                                          &v4l2_config);
+       if (IS_ERR(priv->v4l2_flash)) {
+               dev_err(priv->dev, "Failed to register v4l2 flash\n");
+               /*
+                * NOTE(review): the child node reference is not dropped on
+                * this path; the device-managed LED class device is still
+                * registered here -- confirm when it is safe to put it.
+                */
+               return PTR_ERR(priv->v4l2_flash);
+       }
+
+       i2c_set_clientdata(client, priv);
+       return 0;
+}
+
+/*
+ * I2C remove: release the V4L2 flash handle created in probe.  Always
+ * returns 0.
+ */
+static int rt4505_remove(struct i2c_client *client)
+{
+       struct rt4505_priv *chip = i2c_get_clientdata(client);
+       struct v4l2_flash *v4l2 = chip->v4l2_flash;
+
+       v4l2_flash_release(v4l2);
+
+       return 0;
+}
+
+/* I2C shutdown: write the reset bit so the chip is back in its reset state. */
+static void rt4505_shutdown(struct i2c_client *client)
+{
+       struct rt4505_priv *chip = i2c_get_clientdata(client);
+       struct regmap *map = chip->regmap;
+
+       /* Reset registers to make sure all off before shutdown */
+       regmap_write(map, RT4505_REG_RESET, RT4505_RESET_MASK);
+}
+
+/*
+ * Device tree match table.  Marked __maybe_unused because the driver only
+ * references it through of_match_ptr().
+ */
+static const struct of_device_id __maybe_unused rt4505_leds_match[] = {
+       { .compatible = "richtek,rt4505", },
+       {}
+};
+MODULE_DEVICE_TABLE(of, rt4505_leds_match);
+
+/* I2C driver description wiring up the probe, remove and shutdown handlers. */
+static struct i2c_driver rt4505_driver = {
+       .driver = {
+               .name = "rt4505",
+               .of_match_table = of_match_ptr(rt4505_leds_match),
+       },
+       .probe_new = rt4505_probe,
+       .remove = rt4505_remove,
+       .shutdown = rt4505_shutdown,
+};
+module_i2c_driver(rt4505_driver);
+
+MODULE_AUTHOR("ChiYuan Huang <cy_huang@richtek.com>");
+MODULE_LICENSE("GPL v2");
index 8007b82..4353091 100644 (file)
@@ -339,7 +339,7 @@ static int lm3642_probe(struct i2c_client *client,
        chip->cdev_flash.max_brightness = 16;
        chip->cdev_flash.brightness_set_blocking = lm3642_strobe_brightness_set;
        chip->cdev_flash.default_trigger = "flash";
-       chip->cdev_flash.groups = lm3642_flash_groups,
+       chip->cdev_flash.groups = lm3642_flash_groups;
        err = led_classdev_register(&client->dev, &chip->cdev_flash);
        if (err < 0) {
                dev_err(chip->dev, "failed to register flash\n");
@@ -351,7 +351,7 @@ static int lm3642_probe(struct i2c_client *client,
        chip->cdev_torch.max_brightness = 8;
        chip->cdev_torch.brightness_set_blocking = lm3642_torch_brightness_set;
        chip->cdev_torch.default_trigger = "torch";
-       chip->cdev_torch.groups = lm3642_torch_groups,
+       chip->cdev_torch.groups = lm3642_torch_groups;
        err = led_classdev_register(&client->dev, &chip->cdev_torch);
        if (err < 0) {
                dev_err(chip->dev, "failed to register torch\n");
index 27d0271..017794b 100644 (file)
@@ -480,6 +480,8 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
        if (!pdata)
                return ERR_PTR(-ENOMEM);
 
+       pdata->gpio_base = -1;
+
        of_property_read_u8_array(np, "nxp,pwm", &pdata->pwm[0],
                                  ARRAY_SIZE(pdata->pwm));
        of_property_read_u8_array(np, "nxp,psc", &pdata->psc[0],
index 4d138d5..43a265d 100644 (file)
@@ -333,7 +333,7 @@ static DEVICE_ATTR_RW(hw_pattern);
 static umode_t pattern_trig_attrs_mode(struct kobject *kobj,
                                       struct attribute *attr, int index)
 {
-       struct device *dev = container_of(kobj, struct device, kobj);
+       struct device *dev = kobj_to_dev(kobj);
        struct led_classdev *led_cdev = dev_get_drvdata(dev);
 
        if (attr == &dev_attr_repeat.attr || attr == &dev_attr_pattern.attr)
index 2b6d6e9..bea8c44 100644 (file)
@@ -16,6 +16,7 @@
 #include "features.h"
 
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
index 541c450..6ab01ff 100644 (file)
@@ -3387,7 +3387,7 @@ static bool origin_dev_supports_discard(struct block_device *origin_bdev)
 {
        struct request_queue *q = bdev_get_queue(origin_bdev);
 
-       return q && blk_queue_discard(q);
+       return blk_queue_discard(q);
 }
 
 /*
index 1771245..c43d556 100644 (file)
@@ -276,12 +276,6 @@ static inline int superblock_read_lock(struct dm_clone_metadata *cmd,
        return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
 }
 
-static inline int superblock_write_lock(struct dm_clone_metadata *cmd,
-                                       struct dm_block **sblock)
-{
-       return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
-}
-
 static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd,
                                             struct dm_block **sblock)
 {
index 55bcfb7..71475a2 100644 (file)
@@ -28,7 +28,7 @@ struct ebs_c {
        spinlock_t lock;                /* Guard bios input list above. */
        sector_t start;                 /* <start> table line argument, see ebs_ctr below. */
        unsigned int e_bs;              /* Emulated block size in sectors exposed to upper layer. */
-       unsigned int u_bs;              /* Underlying block size in sectors retrievd from/set on lower layer device. */
+       unsigned int u_bs;              /* Underlying block size in sectors retrieved from/set on lower layer device. */
        unsigned char block_shift;      /* bitshift sectors -> blocks used in dm-bufio API. */
        bool u_bs_set:1;                /* Flag to indicate underlying block size is set on table line. */
 };
@@ -43,7 +43,7 @@ static inline sector_t __block_mod(sector_t sector, unsigned int bs)
        return sector & (bs - 1);
 }
 
-/* Return number of blocks for a bio, accounting for misalignement of start and end sectors. */
+/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
 static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
 {
        sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);
@@ -171,7 +171,7 @@ static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
        dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
 }
 
-/* Worker funtion to process incoming bios. */
+/* Worker function to process incoming bios. */
 static void __ebs_process_bios(struct work_struct *ws)
 {
        int r;
index 46b5d54..781942a 100644 (file)
@@ -35,7 +35,7 @@
 #define MIN_LOG2_INTERLEAVE_SECTORS    3
 #define MAX_LOG2_INTERLEAVE_SECTORS    31
 #define METADATA_WORKQUEUE_MAX_ACTIVE  16
-#define RECALC_SECTORS                 8192
+#define RECALC_SECTORS                 32768
 #define RECALC_WRITE_SUPER             16
 #define BITMAP_BLOCK_SIZE              4096    /* don't change it */
 #define BITMAP_FLUSH_INTERVAL          (10 * HZ)
@@ -262,6 +262,7 @@ struct dm_integrity_c {
        bool journal_uptodate;
        bool just_formatted;
        bool recalculate_flag;
+       bool reset_recalculate_flag;
        bool discard;
        bool fix_padding;
        bool fix_hmac;
@@ -1428,8 +1429,10 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
                if (op == TAG_READ) {
                        memcpy(tag, dp, to_copy);
                } else if (op == TAG_WRITE) {
-                       memcpy(dp, tag, to_copy);
-                       dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
+                       if (memcmp(dp, tag, to_copy)) {
+                               memcpy(dp, tag, to_copy);
+                               dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
+                       }
                } else {
                        /* e.g.: op == TAG_CMP */
 
@@ -2686,26 +2689,30 @@ next_chunk:
        if (unlikely(dm_integrity_failed(ic)))
                goto err;
 
-       io_req.bi_op = REQ_OP_READ;
-       io_req.bi_op_flags = 0;
-       io_req.mem.type = DM_IO_VMA;
-       io_req.mem.ptr.addr = ic->recalc_buffer;
-       io_req.notify.fn = NULL;
-       io_req.client = ic->io;
-       io_loc.bdev = ic->dev->bdev;
-       io_loc.sector = get_data_sector(ic, area, offset);
-       io_loc.count = n_sectors;
+       if (!ic->discard) {
+               io_req.bi_op = REQ_OP_READ;
+               io_req.bi_op_flags = 0;
+               io_req.mem.type = DM_IO_VMA;
+               io_req.mem.ptr.addr = ic->recalc_buffer;
+               io_req.notify.fn = NULL;
+               io_req.client = ic->io;
+               io_loc.bdev = ic->dev->bdev;
+               io_loc.sector = get_data_sector(ic, area, offset);
+               io_loc.count = n_sectors;
 
-       r = dm_io(&io_req, 1, &io_loc, NULL);
-       if (unlikely(r)) {
-               dm_integrity_io_error(ic, "reading data", r);
-               goto err;
-       }
+               r = dm_io(&io_req, 1, &io_loc, NULL);
+               if (unlikely(r)) {
+                       dm_integrity_io_error(ic, "reading data", r);
+                       goto err;
+               }
 
-       t = ic->recalc_tags;
-       for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
-               integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
-               t += ic->tag_size;
+               t = ic->recalc_tags;
+               for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
+                       integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+                       t += ic->tag_size;
+               }
+       } else {
+               t = ic->recalc_tags + (n_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
        }
 
        metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
@@ -3134,7 +3141,8 @@ static void dm_integrity_resume(struct dm_target *ti)
                rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
                                   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
                if (ic->mode == 'B') {
-                       if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+                       if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
+                           !ic->reset_recalculate_flag) {
                                block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
                                block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
                                if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
@@ -3156,7 +3164,8 @@ static void dm_integrity_resume(struct dm_target *ti)
                        }
                } else {
                        if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
-                             block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
+                             block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) ||
+                           ic->reset_recalculate_flag) {
                                ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
                                ic->sb->recalc_sector = cpu_to_le64(0);
                        }
@@ -3169,6 +3178,10 @@ static void dm_integrity_resume(struct dm_target *ti)
                        dm_integrity_io_error(ic, "writing superblock", r);
        } else {
                replay_journal(ic);
+               if (ic->reset_recalculate_flag) {
+                       ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+                       ic->sb->recalc_sector = cpu_to_le64(0);
+               }
                if (ic->mode == 'B') {
                        ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
                        ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
@@ -3242,6 +3255,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
                arg_count += !!ic->meta_dev;
                arg_count += ic->sectors_per_block != 1;
                arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+               arg_count += ic->reset_recalculate_flag;
                arg_count += ic->discard;
                arg_count += ic->mode == 'J';
                arg_count += ic->mode == 'J';
@@ -3261,6 +3275,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
                        DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
                if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
                        DMEMIT(" recalculate");
+               if (ic->reset_recalculate_flag)
+                       DMEMIT(" reset_recalculate");
                if (ic->discard)
                        DMEMIT(" allow_discards");
                DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
@@ -3914,7 +3930,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        unsigned extra_args;
        struct dm_arg_set as;
        static const struct dm_arg _args[] = {
-               {0, 17, "Invalid number of feature args"},
+               {0, 18, "Invalid number of feature args"},
        };
        unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
        bool should_write_sb;
@@ -4039,6 +4055,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                        if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
                                r = -EINVAL;
                                ti->error = "Invalid bitmap_flush_interval argument";
+                               goto bad;
                        }
                        ic->bitmap_flush_interval = msecs_to_jiffies(val);
                } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
@@ -4058,6 +4075,9 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                                goto bad;
                } else if (!strcmp(opt_string, "recalculate")) {
                        ic->recalculate_flag = true;
+               } else if (!strcmp(opt_string, "reset_recalculate")) {
+                       ic->recalculate_flag = true;
+                       ic->reset_recalculate_flag = true;
                } else if (!strcmp(opt_string, "allow_discards")) {
                        ic->discard = true;
                } else if (!strcmp(opt_string, "fix_padding")) {
@@ -4348,11 +4368,13 @@ try_smaller_buffer:
                        goto bad;
                }
                INIT_WORK(&ic->recalc_work, integrity_recalc);
-               ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
-               if (!ic->recalc_buffer) {
-                       ti->error = "Cannot allocate buffer for recalculating";
-                       r = -ENOMEM;
-                       goto bad;
+               if (!ic->discard) {
+                       ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
+                       if (!ic->recalc_buffer) {
+                               ti->error = "Cannot allocate buffer for recalculating";
+                               r = -ENOMEM;
+                               goto bad;
+                       }
                }
                ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block,
                                                 ic->tag_size, GFP_KERNEL);
@@ -4361,6 +4383,9 @@ try_smaller_buffer:
                        r = -ENOMEM;
                        goto bad;
                }
+               if (ic->discard)
+                       memset(ic->recalc_tags, DISCARD_FILLER,
+                              (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size);
        } else {
                if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
                        ti->error = "Recalculate can only be specified with internal_hash";
@@ -4554,7 +4579,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
        .name                   = "integrity",
-       .version                = {1, 7, 0},
+       .version                = {1, 9, 0},
        .module                 = THIS_MODULE,
        .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
        .ctr                    = dm_integrity_ctr,
index 1ca65b4..2209cbc 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 #include <linux/dm-ioctl.h>
 #include <linux/hdreg.h>
 #include <linux/compat.h>
@@ -36,8 +37,10 @@ struct dm_file {
  * name or uuid.
  *---------------------------------------------------------------*/
 struct hash_cell {
-       struct list_head name_list;
-       struct list_head uuid_list;
+       struct rb_node name_node;
+       struct rb_node uuid_node;
+       bool name_set;
+       bool uuid_set;
 
        char *name;
        char *uuid;
@@ -53,10 +56,8 @@ struct vers_iter {
 };
 
 
-#define NUM_BUCKETS 64
-#define MASK_BUCKETS (NUM_BUCKETS - 1)
-static struct list_head _name_buckets[NUM_BUCKETS];
-static struct list_head _uuid_buckets[NUM_BUCKETS];
+static struct rb_root name_rb_tree = RB_ROOT;
+static struct rb_root uuid_rb_tree = RB_ROOT;
 
 static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
 
@@ -70,73 +71,110 @@ static DECLARE_RWSEM(_hash_lock);
  */
 static DEFINE_MUTEX(dm_hash_cells_mutex);
 
-static void init_buckets(struct list_head *buckets)
-{
-       unsigned int i;
-
-       for (i = 0; i < NUM_BUCKETS; i++)
-               INIT_LIST_HEAD(buckets + i);
-}
-
-static int dm_hash_init(void)
-{
-       init_buckets(_name_buckets);
-       init_buckets(_uuid_buckets);
-       return 0;
-}
-
 static void dm_hash_exit(void)
 {
        dm_hash_remove_all(false, false, false);
 }
 
-/*-----------------------------------------------------------------
- * Hash function:
- * We're not really concerned with the str hash function being
- * fast since it's only used by the ioctl interface.
- *---------------------------------------------------------------*/
-static unsigned int hash_str(const char *str)
-{
-       const unsigned int hash_mult = 2654435387U;
-       unsigned int h = 0;
-
-       while (*str)
-               h = (h + (unsigned int) *str++) * hash_mult;
-
-       return h & MASK_BUCKETS;
-}
-
 /*-----------------------------------------------------------------
  * Code for looking up a device by name
  *---------------------------------------------------------------*/
 static struct hash_cell *__get_name_cell(const char *str)
 {
-       struct hash_cell *hc;
-       unsigned int h = hash_str(str);
+       struct rb_node *n = name_rb_tree.rb_node;
 
-       list_for_each_entry (hc, _name_buckets + h, name_list)
-               if (!strcmp(hc->name, str)) {
+       while (n) {
+               struct hash_cell *hc = container_of(n, struct hash_cell, name_node);
+               int c = strcmp(hc->name, str);
+               if (!c) {
                        dm_get(hc->md);
                        return hc;
                }
+               n = c >= 0 ? n->rb_left : n->rb_right;
+       }
 
        return NULL;
 }
 
 static struct hash_cell *__get_uuid_cell(const char *str)
 {
-       struct hash_cell *hc;
-       unsigned int h = hash_str(str);
+       struct rb_node *n = uuid_rb_tree.rb_node;
 
-       list_for_each_entry (hc, _uuid_buckets + h, uuid_list)
-               if (!strcmp(hc->uuid, str)) {
+       while (n) {
+               struct hash_cell *hc = container_of(n, struct hash_cell, uuid_node);
+               int c = strcmp(hc->uuid, str);
+               if (!c) {
                        dm_get(hc->md);
                        return hc;
                }
+               n = c >= 0 ? n->rb_left : n->rb_right;
+       }
 
        return NULL;
 }
 
+static void __unlink_name(struct hash_cell *hc)
+{
+       if (hc->name_set) {
+               hc->name_set = false;
+               rb_erase(&hc->name_node, &name_rb_tree);
+       }
+}
+
+static void __unlink_uuid(struct hash_cell *hc)
+{
+       if (hc->uuid_set) {
+               hc->uuid_set = false;
+               rb_erase(&hc->uuid_node, &uuid_rb_tree);
+       }
+}
+
+static void __link_name(struct hash_cell *new_hc)
+{
+       struct rb_node **n, *parent;
+
+       __unlink_name(new_hc);
+
+       new_hc->name_set = true;
+
+       n = &name_rb_tree.rb_node;
+       parent = NULL;
+
+       while (*n) {
+               struct hash_cell *hc = container_of(*n, struct hash_cell, name_node);
+               int c = strcmp(hc->name, new_hc->name);
+               BUG_ON(!c);
+               parent = *n;
+               n = c >= 0 ? &hc->name_node.rb_left : &hc->name_node.rb_right;
+       }
+
+       rb_link_node(&new_hc->name_node, parent, n);
+       rb_insert_color(&new_hc->name_node, &name_rb_tree);
+}
+
+static void __link_uuid(struct hash_cell *new_hc)
+{
+       struct rb_node **n, *parent;
+
+       __unlink_uuid(new_hc);
+
+       new_hc->uuid_set = true;
+
+       n = &uuid_rb_tree.rb_node;
+       parent = NULL;
+
+       while (*n) {
+               struct hash_cell *hc = container_of(*n, struct hash_cell, uuid_node);
+               int c = strcmp(hc->uuid, new_hc->uuid);
+               BUG_ON(!c);
+               parent = *n;
+               n = c > 0 ? &hc->uuid_node.rb_left : &hc->uuid_node.rb_right;
+       }
+
+       rb_link_node(&new_hc->uuid_node, parent, n);
+       rb_insert_color(&new_hc->uuid_node, &uuid_rb_tree);
+}
+
 static struct hash_cell *__get_dev_cell(uint64_t dev)
 {
        struct mapped_device *md;
@@ -185,8 +223,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
                }
        }
 
-       INIT_LIST_HEAD(&hc->name_list);
-       INIT_LIST_HEAD(&hc->uuid_list);
+       hc->name_set = hc->uuid_set = false;
        hc->md = md;
        hc->new_map = NULL;
        return hc;
@@ -226,16 +263,16 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
                goto bad;
        }
 
-       list_add(&cell->name_list, _name_buckets + hash_str(name));
+       __link_name(cell);
 
        if (uuid) {
                hc = __get_uuid_cell(uuid);
                if (hc) {
-                       list_del(&cell->name_list);
+                       __unlink_name(cell);
                        dm_put(hc->md);
                        goto bad;
                }
-               list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
+               __link_uuid(cell);
        }
        dm_get(md);
        mutex_lock(&dm_hash_cells_mutex);
@@ -256,9 +293,9 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
        struct dm_table *table;
        int srcu_idx;
 
-       /* remove from the dev hash */
-       list_del(&hc->uuid_list);
-       list_del(&hc->name_list);
+       /* remove from the dev trees */
+       __unlink_name(hc);
+       __unlink_uuid(hc);
        mutex_lock(&dm_hash_cells_mutex);
        dm_set_mdptr(hc->md, NULL);
        mutex_unlock(&dm_hash_cells_mutex);
@@ -279,7 +316,8 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
 
 static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
 {
-       int i, dev_skipped;
+       int dev_skipped;
+       struct rb_node *n;
        struct hash_cell *hc;
        struct mapped_device *md;
        struct dm_table *t;
@@ -289,40 +327,39 @@ retry:
 
        down_write(&_hash_lock);
 
-       for (i = 0; i < NUM_BUCKETS; i++) {
-               list_for_each_entry(hc, _name_buckets + i, name_list) {
-                       md = hc->md;
-                       dm_get(md);
+       for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) {
+               hc = container_of(n, struct hash_cell, name_node);
+               md = hc->md;
+               dm_get(md);
 
-                       if (keep_open_devices &&
-                           dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
-                               dm_put(md);
-                               dev_skipped++;
-                               continue;
-                       }
+               if (keep_open_devices &&
+                   dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
+                       dm_put(md);
+                       dev_skipped++;
+                       continue;
+               }
 
-                       t = __hash_remove(hc);
+               t = __hash_remove(hc);
 
-                       up_write(&_hash_lock);
+               up_write(&_hash_lock);
 
-                       if (t) {
-                               dm_sync_table(md);
-                               dm_table_destroy(t);
-                       }
-                       dm_put(md);
-                       if (likely(keep_open_devices))
-                               dm_destroy(md);
-                       else
-                               dm_destroy_immediate(md);
-
-                       /*
-                        * Some mapped devices may be using other mapped
-                        * devices, so repeat until we make no further
-                        * progress.  If a new mapped device is created
-                        * here it will also get removed.
-                        */
-                       goto retry;
+               if (t) {
+                       dm_sync_table(md);
+                       dm_table_destroy(t);
                }
+               dm_put(md);
+               if (likely(keep_open_devices))
+                       dm_destroy(md);
+               else
+                       dm_destroy_immediate(md);
+
+               /*
+                * Some mapped devices may be using other mapped
+                * devices, so repeat until we make no further
+                * progress.  If a new mapped device is created
+                * here it will also get removed.
+                */
+               goto retry;
        }
 
        up_write(&_hash_lock);
@@ -340,7 +377,7 @@ static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
        hc->uuid = new_uuid;
        mutex_unlock(&dm_hash_cells_mutex);
 
-       list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
+       __link_uuid(hc);
 }
 
 /*
@@ -354,14 +391,14 @@ static char *__change_cell_name(struct hash_cell *hc, char *new_name)
        /*
         * Rename and move the name cell.
         */
-       list_del(&hc->name_list);
+       __unlink_name(hc);
        old_name = hc->name;
 
        mutex_lock(&dm_hash_cells_mutex);
        hc->name = new_name;
        mutex_unlock(&dm_hash_cells_mutex);
 
-       list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+       __link_name(hc);
 
        return old_name;
 }
@@ -503,9 +540,33 @@ static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
        return ((void *) param) + param->data_start;
 }
 
+static bool filter_device(struct hash_cell *hc, const char *pfx_name, const char *pfx_uuid)
+{
+       const char *val;
+       size_t val_len, pfx_len;
+
+       val = hc->name;
+       val_len = strlen(val);
+       pfx_len = strnlen(pfx_name, DM_NAME_LEN);
+       if (pfx_len > val_len)
+               return false;
+       if (memcmp(val, pfx_name, pfx_len))
+               return false;
+
+       val = hc->uuid ? hc->uuid : "";
+       val_len = strlen(val);
+       pfx_len = strnlen(pfx_uuid, DM_UUID_LEN);
+       if (pfx_len > val_len)
+               return false;
+       if (memcmp(val, pfx_uuid, pfx_len))
+               return false;
+
+       return true;
+}
+
 static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size)
 {
-       unsigned int i;
+       struct rb_node *n;
        struct hash_cell *hc;
        size_t len, needed = 0;
        struct gendisk *disk;
@@ -518,11 +579,14 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
         * Loop through all the devices working out how much
         * space we need.
         */
-       for (i = 0; i < NUM_BUCKETS; i++) {
-               list_for_each_entry (hc, _name_buckets + i, name_list) {
-                       needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1);
-                       needed += align_val(sizeof(uint32_t));
-               }
+       for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) {
+               hc = container_of(n, struct hash_cell, name_node);
+               if (!filter_device(hc, param->name, param->uuid))
+                       continue;
+               needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1);
+               needed += align_val(sizeof(uint32_t) * 2);
+               if (param->flags & DM_UUID_FLAG && hc->uuid)
+                       needed += align_val(strlen(hc->uuid) + 1);
        }
 
        /*
@@ -540,21 +604,34 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
        /*
         * Now loop through filling out the names.
         */
-       for (i = 0; i < NUM_BUCKETS; i++) {
-               list_for_each_entry (hc, _name_buckets + i, name_list) {
-                       if (old_nl)
-                               old_nl->next = (uint32_t) ((void *) nl -
-                                                          (void *) old_nl);
-                       disk = dm_disk(hc->md);
-                       nl->dev = huge_encode_dev(disk_devt(disk));
-                       nl->next = 0;
-                       strcpy(nl->name, hc->name);
-
-                       old_nl = nl;
-                       event_nr = align_ptr(nl->name + strlen(hc->name) + 1);
-                       *event_nr = dm_get_event_nr(hc->md);
-                       nl = align_ptr(event_nr + 1);
+       for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) {
+               void *uuid_ptr;
+               hc = container_of(n, struct hash_cell, name_node);
+               if (!filter_device(hc, param->name, param->uuid))
+                       continue;
+               if (old_nl)
+                       old_nl->next = (uint32_t) ((void *) nl -
+                                                  (void *) old_nl);
+               disk = dm_disk(hc->md);
+               nl->dev = huge_encode_dev(disk_devt(disk));
+               nl->next = 0;
+               strcpy(nl->name, hc->name);
+
+               old_nl = nl;
+               event_nr = align_ptr(nl->name + strlen(hc->name) + 1);
+               event_nr[0] = dm_get_event_nr(hc->md);
+               event_nr[1] = 0;
+               uuid_ptr = align_ptr(event_nr + 2);
+               if (param->flags & DM_UUID_FLAG) {
+                       if (hc->uuid) {
+                               event_nr[1] |= DM_NAME_LIST_FLAG_HAS_UUID;
+                               strcpy(uuid_ptr, hc->uuid);
+                               uuid_ptr = align_ptr(uuid_ptr + strlen(hc->uuid) + 1);
+                       } else {
+                               event_nr[1] |= DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID;
+                       }
                }
+               nl = uuid_ptr;
        }
        /*
         * If mismatch happens, security may be compromised due to buffer
@@ -1991,14 +2068,9 @@ int __init dm_interface_init(void)
 {
        int r;
 
-       r = dm_hash_init();
-       if (r)
-               return r;
-
        r = misc_register(&_dm_misc);
        if (r) {
                DMERR("misc_register failed for control device");
-               dm_hash_exit();
                return r;
        }
 
index cab12b2..bf4a467 100644 (file)
@@ -1853,6 +1853,7 @@ static int rs_check_takeover(struct raid_set *rs)
                    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
                     __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
                        return 0;
+               break;
 
        default:
                break;
@@ -1868,6 +1869,14 @@ static bool rs_takeover_requested(struct raid_set *rs)
        return rs->md.new_level != rs->md.level;
 }
 
+/* True if layout is set to reshape. */
+static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev)
+{
+       return (use_mddev ? rs->md.delta_disks : rs->delta_disks) ||
+              rs->md.new_layout != rs->md.layout ||
+              rs->md.new_chunk_sectors != rs->md.chunk_sectors;
+}
+
 /* True if @rs is requested to reshape by ctr */
 static bool rs_reshape_requested(struct raid_set *rs)
 {
@@ -1880,9 +1889,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
        if (rs_is_raid0(rs))
                return false;
 
-       change = mddev->new_layout != mddev->layout ||
-                mddev->new_chunk_sectors != mddev->chunk_sectors ||
-                rs->delta_disks;
+       change = rs_is_layout_change(rs, false);
 
        /* Historical case to support raid1 reshape without delta disks */
        if (rs_is_raid1(rs)) {
@@ -2817,7 +2824,7 @@ static sector_t _get_reshape_sectors(struct raid_set *rs)
 }
 
 /*
- *
+ * Reshape:
  * - change raid layout
  * - change chunk size
  * - add disks
@@ -2926,6 +2933,20 @@ static int rs_setup_reshape(struct raid_set *rs)
        return r;
 }
 
+/*
+ * If the md resync thread has updated superblock with max reshape position
+ * at the end of a reshape but not (yet) reset the layout configuration
+ * changes -> reset the latter.
+ */
+static void rs_reset_inconclusive_reshape(struct raid_set *rs)
+{
+       if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
+               rs_set_cur(rs);
+               rs->md.delta_disks = 0;
+               rs->md.reshape_backwards = 0;
+       }
+}
+
 /*
  * Enable/disable discard support on RAID set depending on
  * RAID level and discard properties of underlying RAID members.
@@ -3212,11 +3233,14 @@ size_check:
        if (r)
                goto bad;
 
+       /* Catch any inconclusive reshape superblock content. */
+       rs_reset_inconclusive_reshape(rs);
+
        /* Start raid set read-only and assumed clean to change in raid_resume() */
        rs->md.ro = 1;
        rs->md.in_sync = 1;
 
-       /* Keep array frozen */
+       /* Keep array frozen until resume. */
        set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 
        /* Has to be held on running the array */
@@ -3230,7 +3254,6 @@ size_check:
        }
 
        r = md_start(&rs->md);
-
        if (r) {
                ti->error = "Failed to start raid array";
                mddev_unlock(&rs->md);
@@ -3727,15 +3750,6 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
        blk_limits_io_min(limits, chunk_size_bytes);
        blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
-
-       /*
-        * RAID0 and RAID10 personalities require bio splitting,
-        * RAID1/4/5/6 don't and process large discard bios properly.
-        */
-       if (rs_is_raid0(rs) || rs_is_raid10(rs)) {
-               limits->discard_granularity = chunk_size_bytes;
-               limits->max_discard_sectors = rs->md.chunk_sectors;
-       }
 }
 
 static void raid_postsuspend(struct dm_target *ti)
index 13b4385..9c3bc37 100644 (file)
@@ -569,6 +569,7 @@ out_tag_set:
        blk_mq_free_tag_set(md->tag_set);
 out_kfree_tag_set:
        kfree(md->tag_set);
+       md->tag_set = NULL;
 
        return err;
 }
@@ -578,6 +579,7 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md)
        if (md->tag_set) {
                blk_mq_free_tag_set(md->tag_set);
                kfree(md->tag_set);
+               md->tag_set = NULL;
        }
 }
 
index 8e329c3..9ab4bf6 100644 (file)
@@ -596,7 +596,7 @@ static void persistent_dtr(struct dm_exception_store *store)
        free_area(ps);
 
        /* Allocated in persistent_read_metadata */
-       vfree(ps->callbacks);
+       kvfree(ps->callbacks);
 
        kfree(ps);
 }
@@ -621,8 +621,8 @@ static int persistent_read_metadata(struct dm_exception_store *store,
         */
        ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
                                  sizeof(struct disk_exception);
-       ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-                                  sizeof(*ps->callbacks));
+       ps->callbacks = kvcalloc(ps->exceptions_per_area,
+                                sizeof(*ps->callbacks), GFP_KERNEL);
        if (!ps->callbacks)
                return -ENOMEM;
 
index 11890db..a2acb01 100644 (file)
@@ -663,7 +663,8 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 
        et->hash_shift = hash_shift;
        et->hash_mask = size - 1;
-       et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
+       et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head),
+                                  GFP_KERNEL);
        if (!et->table)
                return -ENOMEM;
 
@@ -689,7 +690,7 @@ static void dm_exception_table_exit(struct dm_exception_table *et,
                        kmem_cache_free(mem, ex);
        }
 
-       vfree(et->table);
+       kvfree(et->table);
 }
 
 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
index e5f0f17..ee47a33 100644 (file)
@@ -94,24 +94,6 @@ static int setup_btree_index(unsigned int l, struct dm_table *t)
        return 0;
 }
 
-void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
-{
-       unsigned long size;
-       void *addr;
-
-       /*
-        * Check that we're not going to overflow.
-        */
-       if (nmemb > (ULONG_MAX / elem_size))
-               return NULL;
-
-       size = nmemb * elem_size;
-       addr = vzalloc(size);
-
-       return addr;
-}
-EXPORT_SYMBOL(dm_vcalloc);
-
 /*
  * highs, and targets are managed as dynamic arrays during a
  * table load.
@@ -124,15 +106,15 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
        /*
         * Allocate both the target array and offset array at once.
         */
-       n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
-                                         sizeof(sector_t));
+       n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t),
+                          GFP_KERNEL);
        if (!n_highs)
                return -ENOMEM;
 
        n_targets = (struct dm_target *) (n_highs + num);
 
        memset(n_highs, -1, sizeof(*n_highs) * num);
-       vfree(t->highs);
+       kvfree(t->highs);
 
        t->num_allocated = num;
        t->highs = n_highs;
@@ -198,7 +180,7 @@ void dm_table_destroy(struct dm_table *t)
 
        /* free the indexes */
        if (t->depth >= 2)
-               vfree(t->index[t->depth - 2]);
+               kvfree(t->index[t->depth - 2]);
 
        /* free the targets */
        for (i = 0; i < t->num_targets; i++) {
@@ -210,7 +192,7 @@ void dm_table_destroy(struct dm_table *t)
                dm_put_target_type(tgt->type);
        }
 
-       vfree(t->highs);
+       kvfree(t->highs);
 
        /* free the device list */
        free_devices(&t->devices, t->md);
@@ -1077,7 +1059,7 @@ static int setup_indexes(struct dm_table *t)
                total += t->counts[i];
        }
 
-       indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
+       indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL);
        if (!indexes)
                return -ENOMEM;
 
index fff4c50..985baee 100644 (file)
@@ -2816,7 +2816,7 @@ static bool data_dev_supports_discard(struct pool_c *pt)
 {
        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
 
-       return q && blk_queue_discard(q);
+       return blk_queue_discard(q);
 }
 
 static bool is_factor(sector_t block_size, uint32_t n)
index 808a98e..d3e76ae 100644 (file)
@@ -893,6 +893,28 @@ out:
        return r;
 }
 
+static inline bool verity_is_verity_mode(const char *arg_name)
+{
+       return (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING) ||
+               !strcasecmp(arg_name, DM_VERITY_OPT_RESTART) ||
+               !strcasecmp(arg_name, DM_VERITY_OPT_PANIC));
+}
+
+static int verity_parse_verity_mode(struct dm_verity *v, const char *arg_name)
+{
+       if (v->mode)
+               return -EINVAL;
+
+       if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING))
+               v->mode = DM_VERITY_MODE_LOGGING;
+       else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART))
+               v->mode = DM_VERITY_MODE_RESTART;
+       else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC))
+               v->mode = DM_VERITY_MODE_PANIC;
+
+       return 0;
+}
+
 static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
                                 struct dm_verity_sig_opts *verify_args)
 {
@@ -916,16 +938,12 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
                arg_name = dm_shift_arg(as);
                argc--;
 
-               if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) {
-                       v->mode = DM_VERITY_MODE_LOGGING;
-                       continue;
-
-               } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) {
-                       v->mode = DM_VERITY_MODE_RESTART;
-                       continue;
-
-               } else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) {
-                       v->mode = DM_VERITY_MODE_PANIC;
+               if (verity_is_verity_mode(arg_name)) {
+                       r = verity_parse_verity_mode(v, arg_name);
+                       if (r) {
+                               ti->error = "Conflicting error handling parameters";
+                               return r;
+                       }
                        continue;
 
                } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) {
@@ -1242,7 +1260,7 @@ bad:
 
 static struct target_type verity_target = {
        .name           = "verity",
-       .version        = {1, 7, 0},
+       .version        = {1, 8, 0},
        .module         = THIS_MODULE,
        .ctr            = verity_ctr,
        .dtr            = verity_dtr,
index 4f72b6f..aecc246 100644 (file)
@@ -73,7 +73,7 @@ struct wc_memory_superblock {
                };
                __le64 padding[8];
        };
-       struct wc_memory_entry entries[0];
+       struct wc_memory_entry entries[];
 };
 
 struct wc_entry {
index 3f3be94..ca2aedd 100644 (file)
@@ -840,7 +840,6 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
        *result = &td->dm_dev;
        return 0;
 }
-EXPORT_SYMBOL_GPL(dm_get_table_device);
 
 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 {
@@ -854,7 +853,6 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
        }
        mutex_unlock(&md->table_devices_lock);
 }
-EXPORT_SYMBOL(dm_put_table_device);
 
 static void free_table_devices(struct list_head *devices)
 {
@@ -1641,38 +1639,35 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
        } else {
                ci.bio = bio;
                ci.sector_count = bio_sectors(bio);
-               while (ci.sector_count && !error) {
-                       error = __split_and_process_non_flush(&ci);
-                       if (ci.sector_count && !error) {
-                               /*
-                                * Remainder must be passed to submit_bio_noacct()
-                                * so that it gets handled *after* bios already submitted
-                                * have been completely processed.
-                                * We take a clone of the original to store in
-                                * ci.io->orig_bio to be used by end_io_acct() and
-                                * for dec_pending to use for completion handling.
-                                */
-                               struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
-                                                         GFP_NOIO, &md->queue->bio_split);
-                               ci.io->orig_bio = b;
-
-                               /*
-                                * Adjust IO stats for each split, otherwise upon queue
-                                * reentry there will be redundant IO accounting.
-                                * NOTE: this is a stop-gap fix, a proper fix involves
-                                * significant refactoring of DM core's bio splitting
-                                * (by eliminating DM's splitting and just using bio_split)
-                                */
-                               part_stat_lock();
-                               __dm_part_stat_sub(dm_disk(md)->part0,
-                                                  sectors[op_stat_group(bio_op(bio))], ci.sector_count);
-                               part_stat_unlock();
-
-                               bio_chain(b, bio);
-                               trace_block_split(b, bio->bi_iter.bi_sector);
-                               ret = submit_bio_noacct(bio);
-                               break;
-                       }
+               error = __split_and_process_non_flush(&ci);
+               if (ci.sector_count && !error) {
+                       /*
+                        * Remainder must be passed to submit_bio_noacct()
+                        * so that it gets handled *after* bios already submitted
+                        * have been completely processed.
+                        * We take a clone of the original to store in
+                        * ci.io->orig_bio to be used by end_io_acct() and
+                        * for dec_pending to use for completion handling.
+                        */
+                       struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
+                                                 GFP_NOIO, &md->queue->bio_split);
+                       ci.io->orig_bio = b;
+
+                       /*
+                        * Adjust IO stats for each split, otherwise upon queue
+                        * reentry there will be redundant IO accounting.
+                        * NOTE: this is a stop-gap fix, a proper fix involves
+                        * significant refactoring of DM core's bio splitting
+                        * (by eliminating DM's splitting and just using bio_split)
+                        */
+                       part_stat_lock();
+                       __dm_part_stat_sub(dm_disk(md)->part0,
+                                          sectors[op_stat_group(bio_op(bio))], ci.sector_count);
+                       part_stat_unlock();
+
+                       bio_chain(b, bio);
+                       trace_block_split(b, bio->bi_iter.bi_sector);
+                       ret = submit_bio_noacct(bio);
                }
        }
 
index fe073d9..b178885 100644 (file)
@@ -34,12 +34,12 @@ struct node_header {
        __le32 max_entries;
        __le32 value_size;
        __le32 padding;
-} __packed;
+} __attribute__((packed, aligned(8)));
 
 struct btree_node {
        struct node_header header;
        __le64 keys[];
-} __packed;
+} __attribute__((packed, aligned(8)));
 
 
 /*
@@ -83,7 +83,7 @@ struct shadow_spine {
 };
 
 void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
-int exit_shadow_spine(struct shadow_spine *s);
+void exit_shadow_spine(struct shadow_spine *s);
 
 int shadow_step(struct shadow_spine *s, dm_block_t b,
                struct dm_btree_value_type *vt);
index 8a2bfbf..2061ab8 100644 (file)
@@ -30,8 +30,6 @@ static void node_prepare_for_write(struct dm_block_validator *v,
        h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
                                             block_size - sizeof(__le32),
                                             BTREE_CSUM_XOR));
-
-       BUG_ON(node_check(v, b, 4096));
 }
 
 static int node_check(struct dm_block_validator *v,
@@ -183,15 +181,13 @@ void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info)
        s->count = 0;
 }
 
-int exit_shadow_spine(struct shadow_spine *s)
+void exit_shadow_spine(struct shadow_spine *s)
 {
-       int r = 0, i;
+       int i;
 
        for (i = 0; i < s->count; i++) {
                unlock_block(s->info, s->nodes[i]);
        }
-
-       return r;
 }
 
 int shadow_step(struct shadow_spine *s, dm_block_t b,
index d8b4125..a213bf1 100644 (file)
@@ -339,6 +339,8 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
         */
        begin = do_div(index_begin, ll->entries_per_block);
        end = do_div(end, ll->entries_per_block);
+       if (end == 0)
+               end = ll->entries_per_block;
 
        for (i = index_begin; i < index_end; i++, begin = 0) {
                struct dm_block *blk;
index 8de63ce..87e1790 100644 (file)
@@ -33,7 +33,7 @@ struct disk_index_entry {
        __le64 blocknr;
        __le32 nr_free;
        __le32 none_free_before;
-} __packed;
+} __attribute__ ((packed, aligned(8)));
 
 
 #define MAX_METADATA_BITMAPS 255
@@ -43,7 +43,7 @@ struct disk_metadata_index {
        __le64 blocknr;
 
        struct disk_index_entry index[MAX_METADATA_BITMAPS];
-} __packed;
+} __attribute__ ((packed, aligned(8)));
 
 struct ll_disk;
 
@@ -86,7 +86,7 @@ struct disk_sm_root {
        __le64 nr_allocated;
        __le64 bitmap_root;
        __le64 ref_count_root;
-} __packed;
+} __attribute__ ((packed, aligned(8)));
 
 #define ENTRIES_PER_BYTE 4
 
@@ -94,7 +94,7 @@ struct disk_bitmap_header {
        __le32 csum;
        __le32 not_used;
        __le64 blocknr;
-} __packed;
+} __attribute__ ((packed, aligned(8)));
 
 enum allocation_event {
        SM_NONE,
index bf4c5e2..61f5690 100644 (file)
@@ -187,13 +187,8 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
 static int sm_disk_commit(struct dm_space_map *sm)
 {
        int r;
-       dm_block_t nr_free;
        struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-       r = sm_disk_get_nr_free(sm, &nr_free);
-       if (r)
-               return r;
-
        r = sm_ll_commit(&smd->ll);
        if (r)
                return r;
@@ -202,10 +197,6 @@ static int sm_disk_commit(struct dm_space_map *sm)
        smd->begin = 0;
        smd->nr_allocated_this_transaction = 0;
 
-       r = sm_disk_get_nr_free(sm, &nr_free);
-       if (r)
-               return r;
-
        return 0;
 }
 
index abfc883..68bc382 100644 (file)
@@ -9,9 +9,6 @@
    Please send bug reports and support requests to <luc@saillard.org>.
    The decompression routines have been implemented by reverse-engineering the
    Nemosoft binary pwcx module. Caveat emptor.
-
-
-   vim: set ts=8:
 */
 
 #include <asm/current.h>
index f2f5652..a777b38 100644 (file)
@@ -6,11 +6,14 @@
  *          Laurent Pinchart (laurent.pinchart@ideasonboard.com)
  */
 
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/usb.h>
+#include <linux/usb/hcd.h>
 #include <linux/videodev2.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
@@ -1096,6 +1099,29 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
        return data[0];
 }
 
+static inline enum dma_data_direction uvc_stream_dir(
+                               struct uvc_streaming *stream)
+{
+       if (stream->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
+               return DMA_FROM_DEVICE;
+       else
+               return DMA_TO_DEVICE;
+}
+
+static inline struct device *uvc_stream_to_dmadev(struct uvc_streaming *stream)
+{
+       return bus_to_hcd(stream->dev->udev->bus)->self.sysdev;
+}
+
+static int uvc_submit_urb(struct uvc_urb *uvc_urb, gfp_t mem_flags)
+{
+       /* Sync DMA. */
+       dma_sync_sgtable_for_device(uvc_stream_to_dmadev(uvc_urb->stream),
+                                   uvc_urb->sgt,
+                                   uvc_stream_dir(uvc_urb->stream));
+       return usb_submit_urb(uvc_urb->urb, mem_flags);
+}
+
 /*
  * uvc_video_decode_data_work: Asynchronous memcpy processing
  *
@@ -1117,7 +1143,7 @@ static void uvc_video_copy_data_work(struct work_struct *work)
                uvc_queue_buffer_release(op->buf);
        }
 
-       ret = usb_submit_urb(uvc_urb->urb, GFP_KERNEL);
+       ret = uvc_submit_urb(uvc_urb, GFP_KERNEL);
        if (ret < 0)
                dev_err(&uvc_urb->stream->intf->dev,
                        "Failed to resubmit video URB (%d).\n", ret);
@@ -1537,6 +1563,12 @@ static void uvc_video_complete(struct urb *urb)
        /* Re-initialise the URB async work. */
        uvc_urb->async_operations = 0;
 
+       /* Sync DMA and invalidate vmap range. */
+       dma_sync_sgtable_for_cpu(uvc_stream_to_dmadev(uvc_urb->stream),
+                                uvc_urb->sgt, uvc_stream_dir(stream));
+       invalidate_kernel_vmap_range(uvc_urb->buffer,
+                                    uvc_urb->stream->urb_size);
+
        /*
         * Process the URB headers, and optionally queue expensive memcpy tasks
         * to be deferred to a work queue.
@@ -1545,7 +1577,7 @@ static void uvc_video_complete(struct urb *urb)
 
        /* If no async work is needed, resubmit the URB immediately. */
        if (!uvc_urb->async_operations) {
-               ret = usb_submit_urb(uvc_urb->urb, GFP_ATOMIC);
+               ret = uvc_submit_urb(uvc_urb, GFP_ATOMIC);
                if (ret < 0)
                        dev_err(&stream->intf->dev,
                                "Failed to resubmit video URB (%d).\n", ret);
@@ -1560,24 +1592,49 @@ static void uvc_video_complete(struct urb *urb)
  */
 static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 {
+       struct device *dma_dev = uvc_stream_to_dmadev(stream);
        struct uvc_urb *uvc_urb;
 
        for_each_uvc_urb(uvc_urb, stream) {
                if (!uvc_urb->buffer)
                        continue;
 
-#ifndef CONFIG_DMA_NONCOHERENT
-               usb_free_coherent(stream->dev->udev, stream->urb_size,
-                                 uvc_urb->buffer, uvc_urb->dma);
-#else
-               kfree(uvc_urb->buffer);
-#endif
+               dma_vunmap_noncontiguous(dma_dev, uvc_urb->buffer);
+               dma_free_noncontiguous(dma_dev, stream->urb_size, uvc_urb->sgt,
+                                      uvc_stream_dir(stream));
+
                uvc_urb->buffer = NULL;
+               uvc_urb->sgt = NULL;
        }
 
        stream->urb_size = 0;
 }
 
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
+                                struct uvc_urb *uvc_urb, gfp_t gfp_flags)
+{
+       struct device *dma_dev = uvc_stream_to_dmadev(stream);
+
+       uvc_urb->sgt = dma_alloc_noncontiguous(dma_dev, stream->urb_size,
+                                              uvc_stream_dir(stream),
+                                              gfp_flags, 0);
+       if (!uvc_urb->sgt)
+               return false;
+       uvc_urb->dma = uvc_urb->sgt->sgl->dma_address;
+
+       uvc_urb->buffer = dma_vmap_noncontiguous(dma_dev, stream->urb_size,
+                                                uvc_urb->sgt);
+       if (!uvc_urb->buffer) {
+               dma_free_noncontiguous(dma_dev, stream->urb_size,
+                                      uvc_urb->sgt,
+                                      uvc_stream_dir(stream));
+               uvc_urb->sgt = NULL;
+               return false;
+       }
+
+       return true;
+}
+
 /*
  * Allocate transfer buffers. This function can be called with buffers
  * already allocated when resuming from suspend, in which case it will
@@ -1608,19 +1665,12 @@ static int uvc_alloc_urb_buffers(struct uvc_streaming *stream,
 
        /* Retry allocations until one succeed. */
        for (; npackets > 1; npackets /= 2) {
+               stream->urb_size = psize * npackets;
+
                for (i = 0; i < UVC_URBS; ++i) {
                        struct uvc_urb *uvc_urb = &stream->uvc_urb[i];
 
-                       stream->urb_size = psize * npackets;
-#ifndef CONFIG_DMA_NONCOHERENT
-                       uvc_urb->buffer = usb_alloc_coherent(
-                               stream->dev->udev, stream->urb_size,
-                               gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
-#else
-                       uvc_urb->buffer =
-                           kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
-#endif
-                       if (!uvc_urb->buffer) {
+                       if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) {
                                uvc_free_urb_buffers(stream);
                                break;
                        }
@@ -1730,12 +1780,8 @@ static int uvc_init_video_isoc(struct uvc_streaming *stream,
                urb->context = uvc_urb;
                urb->pipe = usb_rcvisocpipe(stream->dev->udev,
                                ep->desc.bEndpointAddress);
-#ifndef CONFIG_DMA_NONCOHERENT
                urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP;
                urb->transfer_dma = uvc_urb->dma;
-#else
-               urb->transfer_flags = URB_ISO_ASAP;
-#endif
                urb->interval = ep->desc.bInterval;
                urb->transfer_buffer = uvc_urb->buffer;
                urb->complete = uvc_video_complete;
@@ -1795,10 +1841,8 @@ static int uvc_init_video_bulk(struct uvc_streaming *stream,
 
                usb_fill_bulk_urb(urb, stream->dev->udev, pipe, uvc_urb->buffer,
                                  size, uvc_video_complete, uvc_urb);
-#ifndef CONFIG_DMA_NONCOHERENT
                urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP;
                urb->transfer_dma = uvc_urb->dma;
-#endif
 
                uvc_urb->urb = urb;
        }
@@ -1895,7 +1939,7 @@ static int uvc_video_start_transfer(struct uvc_streaming *stream,
 
        /* Submit the URBs. */
        for_each_uvc_urb(uvc_urb, stream) {
-               ret = usb_submit_urb(uvc_urb->urb, gfp_flags);
+               ret = uvc_submit_urb(uvc_urb, gfp_flags);
                if (ret < 0) {
                        dev_err(&stream->intf->dev,
                                "Failed to submit URB %u (%d).\n",
index 97df5ec..cce5e38 100644 (file)
  */
 
 struct gpio_desc;
+struct sg_table;
 struct uvc_device;
 
 /* TODO: Put the most frequently accessed fields at the beginning of
@@ -545,7 +546,8 @@ struct uvc_copy_op {
  * @urb: the URB described by this context structure
  * @stream: UVC streaming context
  * @buffer: memory storage for the URB
- * @dma: DMA coherent addressing for the urb_buffer
+ * @dma: Allocated DMA handle
+ * @sgt: sg_table with the urb locations in memory
  * @async_operations: counter to indicate the number of copy operations
  * @copy_operations: work descriptors for asynchronous copy operations
  * @work: work queue entry for asynchronous decode
@@ -556,6 +558,7 @@ struct uvc_urb {
 
        char *buffer;
        dma_addr_t dma;
+       struct sg_table *sgt;
 
        unsigned int async_operations;
        struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];
index 94843e0..bae18ef 100644 (file)
@@ -385,6 +385,33 @@ static void uacce_release(struct device *dev)
        kfree(uacce);
 }
 
+static unsigned int uacce_enable_sva(struct device *parent, unsigned int flags)
+{
+       if (!(flags & UACCE_DEV_SVA))
+               return flags;
+
+       flags &= ~UACCE_DEV_SVA;
+
+       if (iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_IOPF))
+               return flags;
+
+       if (iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_SVA)) {
+               iommu_dev_disable_feature(parent, IOMMU_DEV_FEAT_IOPF);
+               return flags;
+       }
+
+       return flags | UACCE_DEV_SVA;
+}
+
+static void uacce_disable_sva(struct uacce_device *uacce)
+{
+       if (!(uacce->flags & UACCE_DEV_SVA))
+               return;
+
+       iommu_dev_disable_feature(uacce->parent, IOMMU_DEV_FEAT_SVA);
+       iommu_dev_disable_feature(uacce->parent, IOMMU_DEV_FEAT_IOPF);
+}
+
 /**
  * uacce_alloc() - alloc an accelerator
  * @parent: pointer of uacce parent device
@@ -404,11 +431,7 @@ struct uacce_device *uacce_alloc(struct device *parent,
        if (!uacce)
                return ERR_PTR(-ENOMEM);
 
-       if (flags & UACCE_DEV_SVA) {
-               ret = iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_SVA);
-               if (ret)
-                       flags &= ~UACCE_DEV_SVA;
-       }
+       flags = uacce_enable_sva(parent, flags);
 
        uacce->parent = parent;
        uacce->flags = flags;
@@ -432,8 +455,7 @@ struct uacce_device *uacce_alloc(struct device *parent,
        return uacce;
 
 err_with_uacce:
-       if (flags & UACCE_DEV_SVA)
-               iommu_dev_disable_feature(uacce->parent, IOMMU_DEV_FEAT_SVA);
+       uacce_disable_sva(uacce);
        kfree(uacce);
        return ERR_PTR(ret);
 }
@@ -487,8 +509,7 @@ void uacce_remove(struct uacce_device *uacce)
        mutex_unlock(&uacce->queues_lock);
 
        /* disable sva now since no opened queues */
-       if (uacce->flags & UACCE_DEV_SVA)
-               iommu_dev_disable_feature(uacce->parent, IOMMU_DEV_FEAT_SVA);
+       uacce_disable_sva(uacce);
 
        if (uacce->cdev)
                cdev_device_del(uacce->cdev, &uacce->dev);
index f399edc..a7e3eb9 100644 (file)
@@ -1350,6 +1350,7 @@ static int bytes_str_to_int(const char *str)
                fallthrough;
        case 'K':
                result *= 1024;
+               break;
        case '\0':
                break;
        default:
index c2da771..7c083ad 100644 (file)
@@ -388,8 +388,6 @@ struct ubi_volume_desc {
        int mode;
 };
 
-struct ubi_wl_entry;
-
 /**
  * struct ubi_debug_info - debugging information for an UBI device.
  *
index 34073cd..3cf6de2 100644 (file)
@@ -1562,6 +1562,8 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
        int i;
        int putidx;
 
+       cdev->tx_skb = NULL;
+
        /* Generate ID field for TX buffer Element */
        /* Common to all supported M_CAN versions */
        if (cf->can_id & CAN_EFF_FLAG) {
@@ -1678,7 +1680,6 @@ static void m_can_tx_work_queue(struct work_struct *ws)
                                                   tx_work);
 
        m_can_tx_handler(cdev);
-       cdev->tx_skb = NULL;
 }
 
 static netdev_tx_t m_can_start_xmit(struct sk_buff *skb,
index 492f1bc..173c661 100644 (file)
@@ -956,8 +956,6 @@ static int mcp251x_stop(struct net_device *net)
 
        priv->force_quit = 1;
        free_irq(spi->irq, priv);
-       destroy_workqueue(priv->wq);
-       priv->wq = NULL;
 
        mutex_lock(&priv->mcp_lock);
 
@@ -1224,24 +1222,15 @@ static int mcp251x_open(struct net_device *net)
                goto out_close;
        }
 
-       priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
-                                  0);
-       if (!priv->wq) {
-               ret = -ENOMEM;
-               goto out_clean;
-       }
-       INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
-       INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler);
-
        ret = mcp251x_hw_wake(spi);
        if (ret)
-               goto out_free_wq;
+               goto out_free_irq;
        ret = mcp251x_setup(net, spi);
        if (ret)
-               goto out_free_wq;
+               goto out_free_irq;
        ret = mcp251x_set_normal_mode(spi);
        if (ret)
-               goto out_free_wq;
+               goto out_free_irq;
 
        can_led_event(net, CAN_LED_EVENT_OPEN);
 
@@ -1250,9 +1239,7 @@ static int mcp251x_open(struct net_device *net)
 
        return 0;
 
-out_free_wq:
-       destroy_workqueue(priv->wq);
-out_clean:
+out_free_irq:
        free_irq(spi->irq, priv);
        mcp251x_hw_sleep(spi);
 out_close:
@@ -1373,6 +1360,15 @@ static int mcp251x_can_probe(struct spi_device *spi)
        if (ret)
                goto out_clk;
 
+       priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
+                                  0);
+       if (!priv->wq) {
+               ret = -ENOMEM;
+               goto out_clk;
+       }
+       INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
+       INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler);
+
        priv->spi = spi;
        mutex_init(&priv->mcp_lock);
 
@@ -1417,6 +1413,8 @@ static int mcp251x_can_probe(struct spi_device *spi)
        return 0;
 
 error_probe:
+       destroy_workqueue(priv->wq);
+       priv->wq = NULL;
        mcp251x_power_enable(priv->power, 0);
 
 out_clk:
@@ -1438,6 +1436,9 @@ static int mcp251x_can_remove(struct spi_device *spi)
 
        mcp251x_power_enable(priv->power, 0);
 
+       destroy_workqueue(priv->wq);
+       priv->wq = NULL;
+
        clk_disable_unprepare(priv->clk);
 
        free_candev(net);
index 970dc57..e0ae00e 100644 (file)
@@ -2885,8 +2885,8 @@ static int mcp251xfd_probe(struct spi_device *spi)
 
        clk = devm_clk_get(&spi->dev, NULL);
        if (IS_ERR(clk))
-               dev_err_probe(&spi->dev, PTR_ERR(clk),
-                             "Failed to get Oscillator (clock)!\n");
+               return dev_err_probe(&spi->dev, PTR_ERR(clk),
+                                    "Failed to get Oscillator (clock)!\n");
        freq = clk_get_rate(clk);
 
        /* Sanity check */
@@ -2986,10 +2986,12 @@ static int mcp251xfd_probe(struct spi_device *spi)
 
        err = mcp251xfd_register(priv);
        if (err)
-               goto out_free_candev;
+               goto out_can_rx_offload_del;
 
        return 0;
 
+ out_can_rx_offload_del:
+       can_rx_offload_del(&priv->offload);
  out_free_candev:
        spi->max_speed_hz = priv->spi_max_speed_hz_orig;
 
index 85ba12a..ea7550d 100644 (file)
@@ -41,6 +41,9 @@ static int ksz8795_spi_probe(struct spi_device *spi)
        int i, ret = 0;
 
        ksz8 = devm_kzalloc(&spi->dev, sizeof(struct ksz8), GFP_KERNEL);
+       if (!ksz8)
+               return -ENOMEM;
+
        ksz8->priv = spi;
 
        dev = ksz_switch_alloc(&spi->dev, ksz8);
index 30d97ea..1129348 100644 (file)
@@ -147,11 +147,14 @@ static int ksz8863_smi_probe(struct mdio_device *mdiodev)
        int i;
 
        ksz8 = devm_kzalloc(&mdiodev->dev, sizeof(struct ksz8), GFP_KERNEL);
+       if (!ksz8)
+               return -ENOMEM;
+
        ksz8->priv = mdiodev;
 
        dev = ksz_switch_alloc(&mdiodev->dev, ksz8);
        if (!dev)
-               return -EINVAL;
+               return -ENOMEM;
 
        for (i = 0; i < ARRAY_SIZE(ksz8863_regmap_config); i++) {
                rc = ksz8863_regmap_config[i];
index 5552997..7965e5e 100644 (file)
@@ -2070,11 +2070,3 @@ static void __exit starfire_cleanup (void)
 
 module_init(starfire_init);
 module_exit(starfire_cleanup);
-
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 961796a..c1eab91 100644 (file)
@@ -1156,11 +1156,3 @@ static void __exit atarilance_module_exit(void)
 module_init(atarilance_module_init);
 module_exit(atarilance_module_exit);
 #endif /* MODULE */
-
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 4
- * End:
- */
index aa41250..4100ab0 100644 (file)
@@ -3029,10 +3029,3 @@ static void __exit pcnet32_cleanup_module(void)
 
 module_init(pcnet32_init_module);
 module_exit(pcnet32_cleanup_module);
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
index 9e02f88..b3d7433 100644 (file)
@@ -2016,7 +2016,7 @@ static struct pci_driver alx_driver = {
 module_pci_driver(alx_driver);
 MODULE_DEVICE_TABLE(pci, alx_pci_tbl);
 MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
-MODULE_AUTHOR("Qualcomm Corporation, <nic-devel@qualcomm.com>");
+MODULE_AUTHOR("Qualcomm Corporation");
 MODULE_DESCRIPTION(
        "Qualcomm Atheros(R) AR816x/AR817x PCI-E Ethernet Network Driver");
 MODULE_LICENSE("GPL");
index 1d17c24..c6263cf 100644 (file)
@@ -32,7 +32,7 @@ static const struct pci_device_id atl1c_pci_tbl[] = {
 MODULE_DEVICE_TABLE(pci, atl1c_pci_tbl);
 
 MODULE_AUTHOR("Jie Yang");
-MODULE_AUTHOR("Qualcomm Atheros Inc., <nic-devel@qualcomm.com>");
+MODULE_AUTHOR("Qualcomm Atheros Inc.");
 MODULE_DESCRIPTION("Qualcomm Atheros 100/1000M Ethernet Network Driver");
 MODULE_LICENSE("GPL");
 
index 3e8a179..c098609 100644 (file)
@@ -8057,7 +8057,7 @@ bnx2_read_vpd_fw_ver(struct bnx2 *bp)
                data[i + 3] = data[i + BNX2_VPD_LEN];
        }
 
-       i = pci_vpd_find_tag(data, 0, BNX2_VPD_LEN, PCI_VPD_LRDT_RO_DATA);
+       i = pci_vpd_find_tag(data, BNX2_VPD_LEN, PCI_VPD_LRDT_RO_DATA);
        if (i < 0)
                goto vpd_done;
 
index 5680138..281b1c2 100644 (file)
@@ -12206,8 +12206,7 @@ static void bnx2x_read_fwinfo(struct bnx2x *bp)
        /* VPD RO tag should be first tag after identifier string, hence
         * we should be able to find it in first BNX2X_VPD_LEN chars
         */
-       i = pci_vpd_find_tag(vpd_start, 0, BNX2X_VPD_LEN,
-                            PCI_VPD_LRDT_RO_DATA);
+       i = pci_vpd_find_tag(vpd_start, BNX2X_VPD_LEN, PCI_VPD_LRDT_RO_DATA);
        if (i < 0)
                goto out_not_found;
 
index 9c2f51f..d21f085 100644 (file)
@@ -1192,7 +1192,6 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param,
                return 0;
        }
 
-       err = -EIO;
        /* verify ari is enabled */
        if (!pci_ari_enabled(bp->pdev->bus)) {
                BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n");
index 39ac9e2..2985844 100644 (file)
@@ -12794,7 +12794,7 @@ static void bnxt_vpd_read_info(struct bnxt *bp)
                goto exit;
        }
 
-       i = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA);
+       i = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
        if (i < 0) {
                netdev_err(bp->dev, "VPD READ-Only not found\n");
                goto exit;
@@ -12985,8 +12985,6 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (!BNXT_CHIP_P4_PLUS(bp))
                bp->flags |= BNXT_FLAG_DOUBLE_DB;
 
-       bp->ulp_probe = bnxt_ulp_probe;
-
        rc = bnxt_init_mac_addr(bp);
        if (rc) {
                dev_err(&pdev->dev, "Unable to initialize mac address.\n");
index 24d2ad6..98e0cef 100644 (file)
@@ -1751,7 +1751,6 @@ struct bnxt {
        (BNXT_CHIP_P4(bp) || BNXT_CHIP_P5(bp))
 
        struct bnxt_en_dev      *edev;
-       struct bnxt_en_dev *    (*ulp_probe)(struct net_device *);
 
        struct bnxt_napi        **bnapi;
 
index 64dbbb0..a918e37 100644 (file)
@@ -491,3 +491,4 @@ struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev)
        }
        return bp->edev;
 }
+EXPORT_SYMBOL(bnxt_ulp_probe);
index d238192..b0e4964 100644 (file)
@@ -13016,7 +13016,7 @@ static int tg3_test_nvram(struct tg3 *tp)
        if (!buf)
                return -ENOMEM;
 
-       i = pci_vpd_find_tag((u8 *)buf, 0, len, PCI_VPD_LRDT_RO_DATA);
+       i = pci_vpd_find_tag((u8 *)buf, len, PCI_VPD_LRDT_RO_DATA);
        if (i > 0) {
                j = pci_vpd_lrdt_size(&((u8 *)buf)[i]);
                if (j < 0)
@@ -15629,7 +15629,7 @@ static void tg3_read_vpd(struct tg3 *tp)
        if (!vpd_data)
                goto out_no_vpd;
 
-       i = pci_vpd_find_tag(vpd_data, 0, vpdlen, PCI_VPD_LRDT_RO_DATA);
+       i = pci_vpd_find_tag(vpd_data, vpdlen, PCI_VPD_LRDT_RO_DATA);
        if (i < 0)
                goto out_not_found;
 
index 7e4e831..ba47777 100644 (file)
@@ -1764,7 +1764,7 @@ bnad_dim_timeout(struct timer_list *t)
                }
        }
 
-       /* Check for BNAD_CF_DIM_ENABLED, does not eleminate a race */
+       /* Check for BNAD_CF_DIM_ENABLED, does not eliminate a race */
        if (test_bit(BNAD_RF_DIM_TIMER_RUNNING, &bnad->run_flags))
                mod_timer(&bnad->dim_timer,
                          jiffies + msecs_to_jiffies(BNAD_DIM_TIMER_FREQ));
index 0e94db9..6bc7d41 100644 (file)
@@ -4852,7 +4852,7 @@ static int __maybe_unused macb_suspend(struct device *dev)
 {
        struct net_device *netdev = dev_get_drvdata(dev);
        struct macb *bp = netdev_priv(netdev);
-       struct macb_queue *queue = bp->queues;
+       struct macb_queue *queue;
        unsigned long flags;
        unsigned int q;
        int err;
@@ -4939,7 +4939,7 @@ static int __maybe_unused macb_resume(struct device *dev)
 {
        struct net_device *netdev = dev_get_drvdata(dev);
        struct macb *bp = netdev_priv(netdev);
-       struct macb_queue *queue = bp->queues;
+       struct macb_queue *queue;
        unsigned long flags;
        unsigned int q;
        int err;
index 256fae1..1e5f2ed 100644 (file)
@@ -2563,12 +2563,12 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
        spin_lock_bh(&eosw_txq->lock);
        if (tc != FW_SCHED_CLS_NONE) {
                if (eosw_txq->state != CXGB4_EO_STATE_CLOSED)
-                       goto out_unlock;
+                       goto out_free_skb;
 
                next_state = CXGB4_EO_STATE_FLOWC_OPEN_SEND;
        } else {
                if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE)
-                       goto out_unlock;
+                       goto out_free_skb;
 
                next_state = CXGB4_EO_STATE_FLOWC_CLOSE_SEND;
        }
@@ -2604,17 +2604,19 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc)
                eosw_txq_flush_pending_skbs(eosw_txq);
 
        ret = eosw_txq_enqueue(eosw_txq, skb);
-       if (ret) {
-               dev_consume_skb_any(skb);
-               goto out_unlock;
-       }
+       if (ret)
+               goto out_free_skb;
 
        eosw_txq->state = next_state;
        eosw_txq->flowc_idx = eosw_txq->pidx;
        eosw_txq_advance(eosw_txq, 1);
        ethofld_xmit(dev, eosw_txq);
 
-out_unlock:
+       spin_unlock_bh(&eosw_txq->lock);
+       return 0;
+
+out_free_skb:
+       dev_consume_skb_any(skb);
        spin_unlock_bh(&eosw_txq->lock);
        return ret;
 }
index 80882cf..9428ef1 100644 (file)
@@ -2775,7 +2775,7 @@ int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p)
        if (id_len > ID_LEN)
                id_len = ID_LEN;
 
-       i = pci_vpd_find_tag(vpd, 0, VPD_LEN, PCI_VPD_LRDT_RO_DATA);
+       i = pci_vpd_find_tag(vpd, VPD_LEN, PCI_VPD_LRDT_RO_DATA);
        if (i < 0) {
                dev_err(adapter->pdev_dev, "missing VPD-R section\n");
                ret = -EINVAL;
index f48957a..d0a8f71 100644 (file)
@@ -768,7 +768,7 @@ static inline int enic_queue_wq_skb_encap(struct enic *enic, struct vnic_wq *wq,
        return err;
 }
 
-static inline void enic_queue_wq_skb(struct enic *enic,
+static inline int enic_queue_wq_skb(struct enic *enic,
        struct vnic_wq *wq, struct sk_buff *skb)
 {
        unsigned int mss = skb_shinfo(skb)->gso_size;
@@ -814,6 +814,7 @@ static inline void enic_queue_wq_skb(struct enic *enic,
                wq->to_use = buf->next;
                dev_kfree_skb(skb);
        }
+       return err;
 }
 
 /* netif_tx_lock held, process context with BHs disabled, or BH */
@@ -857,7 +858,8 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb,
                return NETDEV_TX_BUSY;
        }
 
-       enic_queue_wq_skb(enic, wq, skb);
+       if (enic_queue_wq_skb(enic, wq, skb))
+               goto error;
 
        if (vnic_wq_desc_avail(wq) < MAX_SKB_FRAGS + ENIC_DESC_MAX_SPLITS)
                netif_tx_stop_queue(txq);
@@ -865,6 +867,7 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb,
        if (!netdev_xmit_more() || netif_xmit_stopped(txq))
                vnic_wq_doorbell(wq);
 
+error:
        spin_unlock(&enic->wq_lock[txq_map]);
 
        return NETDEV_TX_OK;
index c21dd11..783fdaf 100644 (file)
@@ -575,8 +575,8 @@ static int hns3_nic_net_stop(struct net_device *netdev)
        if (h->ae_algo->ops->set_timer_task)
                h->ae_algo->ops->set_timer_task(priv->ae_handle, false);
 
-       netif_tx_stop_all_queues(netdev);
        netif_carrier_off(netdev);
+       netif_tx_disable(netdev);
 
        hns3_nic_net_down(netdev);
 
@@ -824,7 +824,7 @@ static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto,
  * and it is udp packet, which has a dest port as the IANA assigned.
  * the hardware is expected to do the checksum offload, but the
  * hardware will not do the checksum offload when udp dest port is
- * 4789 or 6081.
+ * 4789, 4790 or 6081.
  */
 static bool hns3_tunnel_csum_bug(struct sk_buff *skb)
 {
@@ -842,7 +842,8 @@ static bool hns3_tunnel_csum_bug(struct sk_buff *skb)
 
        if (!(!skb->encapsulation &&
              (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) ||
-             l4.udp->dest == htons(GENEVE_UDP_PORT))))
+             l4.udp->dest == htons(GENEVE_UDP_PORT) ||
+             l4.udp->dest == htons(4790))))
                return false;
 
        skb_checksum_help(skb);
@@ -4616,6 +4617,11 @@ static int hns3_reset_notify_up_enet(struct hnae3_handle *handle)
        struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev);
        int ret = 0;
 
+       if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) {
+               netdev_err(kinfo->netdev, "device is not initialized yet\n");
+               return -EFAULT;
+       }
+
        clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state);
 
        if (netif_running(kinfo->netdev)) {
index d252919..8223d69 100644 (file)
@@ -753,8 +753,9 @@ static int hclge_config_igu_egu_hw_err_int(struct hclge_dev *hdev, bool en)
 
        /* configure IGU,EGU error interrupts */
        hclge_cmd_setup_basic_desc(&desc, HCLGE_IGU_COMMON_INT_EN, false);
+       desc.data[0] = cpu_to_le32(HCLGE_IGU_ERR_INT_TYPE);
        if (en)
-               desc.data[0] = cpu_to_le32(HCLGE_IGU_ERR_INT_EN);
+               desc.data[0] |= cpu_to_le32(HCLGE_IGU_ERR_INT_EN);
 
        desc.data[1] = cpu_to_le32(HCLGE_IGU_ERR_INT_EN_MASK);
 
index 608fe26..d647f3c 100644 (file)
@@ -32,7 +32,8 @@
 #define HCLGE_TQP_ECC_ERR_INT_EN_MASK  0x0FFF
 #define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN_MASK    0x0F000000
 #define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN 0x0F000000
-#define HCLGE_IGU_ERR_INT_EN   0x0000066F
+#define HCLGE_IGU_ERR_INT_EN   0x0000000F
+#define HCLGE_IGU_ERR_INT_TYPE 0x00000660
 #define HCLGE_IGU_ERR_INT_EN_MASK      0x000F
 #define HCLGE_IGU_TNL_ERR_INT_EN    0x0002AABF
 #define HCLGE_IGU_TNL_ERR_INT_EN_MASK  0x003F
index c296ab6..6304aed 100644 (file)
@@ -3978,6 +3978,12 @@ static void hclge_update_reset_level(struct hclge_dev *hdev)
        struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
        enum hnae3_reset_type reset_level;
 
+       /* reset request will not be set during reset, so clear
+        * pending reset request to avoid unnecessary reset
+        * caused by the same reason.
+        */
+       hclge_get_reset_level(ae_dev, &hdev->reset_request);
+
        /* if default_reset_request has a higher level reset request,
         * it should be handled as soon as possible. since some errors
         * need this kind of reset to fix.
index 5512ffe..8e5f9dc 100644 (file)
@@ -533,7 +533,7 @@ static void hclge_get_link_mode(struct hclge_vport *vport,
        unsigned long advertising;
        unsigned long supported;
        unsigned long send_data;
-       u8 msg_data[10];
+       u8 msg_data[10] = {};
        u8 dest_vfid;
 
        advertising = hdev->hw.mac.advertising[0];
index 08e88d9..1231c34 100644 (file)
@@ -255,6 +255,8 @@ void hclge_mac_start_phy(struct hclge_dev *hdev)
        if (!phydev)
                return;
 
+       phy_loopback(phydev, false);
+
        phy_start(phydev);
 }
 
index 9067cd3..85d3dd3 100644 (file)
@@ -1144,7 +1144,6 @@ static inline bool i40e_is_sw_dcb(struct i40e_pf *pf)
        return !!(pf->flags & I40E_FLAG_DISABLE_FW_LLDP);
 }
 
-void i40e_set_lldp_forwarding(struct i40e_pf *pf, bool enable);
 #ifdef CONFIG_I40E_DCB
 void i40e_dcbnl_flush_apps(struct i40e_pf *pf,
                           struct i40e_dcbx_config *old_cfg,
index ce626ea..140b677 100644 (file)
@@ -1566,8 +1566,10 @@ enum i40e_aq_phy_type {
        I40E_PHY_TYPE_25GBASE_LR                = 0x22,
        I40E_PHY_TYPE_25GBASE_AOC               = 0x23,
        I40E_PHY_TYPE_25GBASE_ACC               = 0x24,
-       I40E_PHY_TYPE_2_5GBASE_T                = 0x30,
-       I40E_PHY_TYPE_5GBASE_T                  = 0x31,
+       I40E_PHY_TYPE_2_5GBASE_T                = 0x26,
+       I40E_PHY_TYPE_5GBASE_T                  = 0x27,
+       I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS    = 0x30,
+       I40E_PHY_TYPE_5GBASE_T_LINK_STATUS      = 0x31,
        I40E_PHY_TYPE_MAX,
        I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP   = 0xFD,
        I40E_PHY_TYPE_EMPTY                     = 0xFE,
index a2dba32..32f3fac 100644 (file)
@@ -375,6 +375,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
                                clear_bit(__I40E_CLIENT_INSTANCE_OPENED,
                                          &cdev->state);
                                i40e_client_del_instance(pf);
+                               return;
                        }
                }
        }
index 41b813f..67cb0b4 100644 (file)
@@ -1154,8 +1154,8 @@ static enum i40e_media_type i40e_get_media_type(struct i40e_hw *hw)
                break;
        case I40E_PHY_TYPE_100BASE_TX:
        case I40E_PHY_TYPE_1000BASE_T:
-       case I40E_PHY_TYPE_2_5GBASE_T:
-       case I40E_PHY_TYPE_5GBASE_T:
+       case I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS:
+       case I40E_PHY_TYPE_5GBASE_T_LINK_STATUS:
        case I40E_PHY_TYPE_10GBASE_T:
                media = I40E_MEDIA_TYPE_BASET;
                break;
index 040a014..ccd5b94 100644 (file)
@@ -841,8 +841,8 @@ static void i40e_get_settings_link_up(struct i40e_hw *hw,
                                                             10000baseT_Full);
                break;
        case I40E_PHY_TYPE_10GBASE_T:
-       case I40E_PHY_TYPE_5GBASE_T:
-       case I40E_PHY_TYPE_2_5GBASE_T:
+       case I40E_PHY_TYPE_5GBASE_T_LINK_STATUS:
+       case I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS:
        case I40E_PHY_TYPE_1000BASE_T:
        case I40E_PHY_TYPE_100BASE_TX:
                ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
@@ -1409,7 +1409,8 @@ static int i40e_set_fec_cfg(struct net_device *netdev, u8 fec_cfg)
 
                memset(&config, 0, sizeof(config));
                config.phy_type = abilities.phy_type;
-               config.abilities = abilities.abilities;
+               config.abilities = abilities.abilities |
+                                  I40E_AQ_PHY_ENABLE_ATOMIC_LINK;
                config.phy_type_ext = abilities.phy_type_ext;
                config.link_speed = abilities.link_speed;
                config.eee_capability = abilities.eee_capability;
@@ -5281,7 +5282,6 @@ flags_complete:
                        i40e_aq_cfg_lldp_mib_change_event(&pf->hw, false, NULL);
                        i40e_aq_stop_lldp(&pf->hw, true, false, NULL);
                } else {
-                       i40e_set_lldp_forwarding(pf, false);
                        status = i40e_aq_start_lldp(&pf->hw, false, NULL);
                        if (status) {
                                adq_err = pf->hw.aq.asq_last_status;
index c2d145a..704e474 100644 (file)
@@ -6879,40 +6879,6 @@ out:
 }
 #endif /* CONFIG_I40E_DCB */
 
-/**
- * i40e_set_lldp_forwarding - set forwarding of lldp frames
- * @pf: PF being configured
- * @enable: if forwarding to OS shall be enabled
- *
- * Toggle forwarding of lldp frames behavior,
- * When passing DCB control from firmware to software
- * lldp frames must be forwarded to the software based
- * lldp agent.
- */
-void i40e_set_lldp_forwarding(struct i40e_pf *pf, bool enable)
-{
-       if (pf->lan_vsi == I40E_NO_VSI)
-               return;
-
-       if (!pf->vsi[pf->lan_vsi])
-               return;
-
-       /* No need to check the outcome, commands may fail
-        * if desired value is already set
-        */
-       i40e_aq_add_rem_control_packet_filter(&pf->hw, NULL, ETH_P_LLDP,
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX |
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC,
-                                             pf->vsi[pf->lan_vsi]->seid, 0,
-                                             enable, NULL, NULL);
-
-       i40e_aq_add_rem_control_packet_filter(&pf->hw, NULL, ETH_P_LLDP,
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_RX |
-                                             I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC,
-                                             pf->vsi[pf->lan_vsi]->seid, 0,
-                                             enable, NULL, NULL);
-}
-
 /**
  * i40e_print_link_message - print link up or down
  * @vsi: the VSI for which link needs a message
@@ -10736,10 +10702,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired)
         */
        i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw,
                                                       pf->main_vsi_seid);
-#ifdef CONFIG_I40E_DCB
-       if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP)
-               i40e_set_lldp_forwarding(pf, true);
-#endif /* CONFIG_I40E_DCB */
 
        /* restart the VSIs that were rebuilt and running before the reset */
        i40e_pf_unquiesce_all_vsi(pf);
@@ -15772,10 +15734,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
         */
        i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw,
                                                       pf->main_vsi_seid);
-#ifdef CONFIG_I40E_DCB
-       if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP)
-               i40e_set_lldp_forwarding(pf, true);
-#endif /* CONFIG_I40E_DCB */
 
        if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) ||
                (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
index 121cd99..de70c16 100644 (file)
@@ -1961,10 +1961,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
                                 union i40e_rx_desc *rx_desc)
 
 {
-       /* XDP packets use error pointer so abort at this point */
-       if (IS_ERR(skb))
-               return true;
-
        /* ERR_MASK will only have valid bits if EOP set, and
         * what we are doing here is actually checking
         * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
@@ -2534,7 +2530,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
                }
 
                /* exit if we failed to retrieve a buffer */
-               if (!skb) {
+               if (!xdp_res && !skb) {
                        rx_ring->rx_stats.alloc_buff_failed++;
                        rx_buffer->pagecnt_bias++;
                        break;
@@ -2547,7 +2543,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
                if (i40e_is_non_eop(rx_ring, rx_desc))
                        continue;
 
-               if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
+               if (xdp_res || i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
                        skb = NULL;
                        continue;
                }
index 5c10faa..c81109a 100644 (file)
@@ -239,11 +239,8 @@ struct i40e_phy_info {
 #define I40E_CAP_PHY_TYPE_25GBASE_ACC BIT_ULL(I40E_PHY_TYPE_25GBASE_ACC + \
                                             I40E_PHY_TYPE_OFFSET)
 /* Offset for 2.5G/5G PHY Types value to bit number conversion */
-#define I40E_PHY_TYPE_OFFSET2 (-10)
-#define I40E_CAP_PHY_TYPE_2_5GBASE_T BIT_ULL(I40E_PHY_TYPE_2_5GBASE_T + \
-                                            I40E_PHY_TYPE_OFFSET2)
-#define I40E_CAP_PHY_TYPE_5GBASE_T BIT_ULL(I40E_PHY_TYPE_5GBASE_T + \
-                                            I40E_PHY_TYPE_OFFSET2)
+#define I40E_CAP_PHY_TYPE_2_5GBASE_T BIT_ULL(I40E_PHY_TYPE_2_5GBASE_T)
+#define I40E_CAP_PHY_TYPE_5GBASE_T BIT_ULL(I40E_PHY_TYPE_5GBASE_T)
 #define I40E_HW_CAP_MAX_GPIO                   30
 /* Capabilities of a PF or a VF or the whole device */
 struct i40e_hw_capabilities {
index 612a7f6..7d7ed02 100644 (file)
@@ -716,7 +716,7 @@ static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
        return &mlx5i_nic_profile;
 }
 
-static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u8 port_num,
+static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num,
                              struct net_device *netdev, void *param)
 {
        struct mlx5_core_dev *mdev = (struct mlx5_core_dev *)param;
index 3f9869c..96ffc0a 100644 (file)
@@ -137,10 +137,10 @@ int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index,
                }
 
                ether_addr_copy(addr_mac, mac);
-               MLX5_SET_RA(in_addr, roce_version, roce_version);
-               MLX5_SET_RA(in_addr, roce_l3_type, roce_l3_type);
                memcpy(addr_l3_addr, gid, gidsz);
        }
+       MLX5_SET_RA(in_addr, roce_version, roce_version);
+       MLX5_SET_RA(in_addr, roce_l3_type, roce_l3_type);
 
        if (MLX5_CAP_GEN(dev, num_vhca_ports) > 0)
                MLX5_SET(set_roce_address_in, in, vhca_port_num, port_num);
index 37fb2e1..dfea143 100644 (file)
@@ -132,7 +132,7 @@ static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal,
        /* Allow mlxsw thermal zone binding to an external cooling device */
        for (i = 0; i < ARRAY_SIZE(mlxsw_thermal_external_allowed_cdev); i++) {
                if (strnstr(cdev->type, mlxsw_thermal_external_allowed_cdev[i],
-                           sizeof(cdev->type)))
+                           strlen(cdev->type)))
                        return 0;
        }
 
index 7846a21..1f6bc0c 100644 (file)
@@ -535,6 +535,16 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
        u16 erif_index = 0;
        int err;
 
+       /* Add the eRIF */
+       if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+               erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+               err = mr->mr_ops->route_erif_add(mlxsw_sp,
+                                                rve->mr_route->route_priv,
+                                                erif_index);
+               if (err)
+                       return err;
+       }
+
        /* Update the route action, as the new eVIF can be a tunnel or a pimreg
         * device which will require updating the action.
         */
@@ -544,17 +554,7 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
                                                      rve->mr_route->route_priv,
                                                      route_action);
                if (err)
-                       return err;
-       }
-
-       /* Add the eRIF */
-       if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
-               erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
-               err = mr->mr_ops->route_erif_add(mlxsw_sp,
-                                                rve->mr_route->route_priv,
-                                                erif_index);
-               if (err)
-                       goto err_route_erif_add;
+                       goto err_route_action_update;
        }
 
        /* Update the minimum MTU */
@@ -572,14 +572,14 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
        return 0;
 
 err_route_min_mtu_update:
-       if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
-               mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
-                                          erif_index);
-err_route_erif_add:
        if (route_action != rve->mr_route->route_action)
                mr->mr_ops->route_action_update(mlxsw_sp,
                                                rve->mr_route->route_priv,
                                                rve->mr_route->route_action);
+err_route_action_update:
+       if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+               mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
+                                          erif_index);
        return err;
 }
 
index 3e86fbe..2c89cde 100644 (file)
@@ -4398,20 +4398,6 @@ static void rtl8169_pcierr_interrupt(struct net_device *dev)
        if (net_ratelimit())
                netdev_err(dev, "PCI error (cmd = 0x%04x, status_errs = 0x%04x)\n",
                           pci_cmd, pci_status_errs);
-       /*
-        * The recovery sequence below admits a very elaborated explanation:
-        * - it seems to work;
-        * - I did not see what else could be done;
-        * - it makes iop3xx happy.
-        *
-        * Feel free to adjust to your needs.
-        */
-       if (pdev->broken_parity_status)
-               pci_cmd &= ~PCI_COMMAND_PARITY;
-       else
-               pci_cmd |= PCI_COMMAND_SERR | PCI_COMMAND_PARITY;
-
-       pci_write_config_word(pdev, PCI_COMMAND, pci_cmd);
 
        rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
 }
index 36c8625..c746ca7 100644 (file)
@@ -920,7 +920,7 @@ static void efx_probe_vpd_strings(struct efx_nic *efx)
        }
 
        /* Get the Read only section */
-       ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA);
+       ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
        if (ro_start < 0) {
                netif_err(efx, drv, efx->net_dev, "VPD Read-only not found\n");
                return;
index f897999..5e7a57b 100644 (file)
@@ -2800,7 +2800,7 @@ static void ef4_probe_vpd_strings(struct ef4_nic *efx)
        }
 
        /* Get the Read only section */
-       ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA);
+       ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
        if (ro_start < 0) {
                netif_err(efx, drv, efx->net_dev, "VPD Read-only not found\n");
                return;
index 95864f0..f35c03c 100644 (file)
@@ -642,6 +642,7 @@ static void dwmac4_set_filter(struct mac_device_info *hw,
        value &= ~GMAC_PACKET_FILTER_PCF;
        value &= ~GMAC_PACKET_FILTER_PM;
        value &= ~GMAC_PACKET_FILTER_PR;
+       value &= ~GMAC_PACKET_FILTER_RA;
        if (dev->flags & IFF_PROMISC) {
                /* VLAN Tag Filter Fail Packets Queuing */
                if (hw->vlan_fail_q_en) {
index a602d16..5be8e6a 100644 (file)
@@ -232,7 +232,7 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode,
                                       u32 channel, int fifosz, u8 qmode)
 {
        unsigned int rqs = fifosz / 256 - 1;
-       u32 mtl_rx_op, mtl_rx_int;
+       u32 mtl_rx_op;
 
        mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(channel));
 
@@ -293,11 +293,6 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode,
        }
 
        writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(channel));
-
-       /* Enable MTL RX overflow */
-       mtl_rx_int = readl(ioaddr + MTL_CHAN_INT_CTRL(channel));
-       writel(mtl_rx_int | MTL_RX_OVERFLOW_INT_EN,
-              ioaddr + MTL_CHAN_INT_CTRL(channel));
 }
 
 static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode,
index 2cc9175..6d5e0f2 100644 (file)
@@ -564,7 +564,6 @@ struct stmmac_mode_ops {
 #define stmmac_clean_desc3(__priv, __args...) \
        stmmac_do_void_callback(__priv, mode, clean_desc3, __args)
 
-struct stmmac_priv;
 struct tc_cls_u32_offload;
 struct tc_cbs_qopt_offload;
 struct flow_cls_offload;
index a9a984c..345b4c6 100644 (file)
@@ -3180,6 +3180,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv)
        char *name;
 
        clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state);
+       clear_bit(__FPE_REMOVING,  &priv->fpe_task_state);
 
        name = priv->wq_name;
        sprintf(name, "%s-fpe", priv->dev->name);
@@ -5586,7 +5587,6 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
        /* To handle GMAC own interrupts */
        if ((priv->plat->has_gmac) || xmac) {
                int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats);
-               int mtl_status;
 
                if (unlikely(status)) {
                        /* For LPI we need to save the tx status */
@@ -5597,17 +5597,8 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
                }
 
                for (queue = 0; queue < queues_count; queue++) {
-                       struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
-
-                       mtl_status = stmmac_host_mtl_irq_status(priv, priv->hw,
-                                                               queue);
-                       if (mtl_status != -EINVAL)
-                               status |= mtl_status;
-
-                       if (status & CORE_IRQ_MTL_RX_OVERFLOW)
-                               stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
-                                                      rx_q->rx_tail_addr,
-                                                      queue);
+                       status = stmmac_host_mtl_irq_status(priv, priv->hw,
+                                                           queue);
                }
 
                /* PCS link status */
index 9f06663..e374079 100644 (file)
@@ -211,8 +211,8 @@ static void gsi_irq_setup(struct gsi *gsi)
        iowrite32(0, gsi->virt + GSI_CNTXT_SRC_IEOB_IRQ_MSK_OFFSET);
 
        /* The inter-EE registers are in the non-adjusted address range */
-       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_CH_IRQ_OFFSET);
-       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_EV_CH_IRQ_OFFSET);
+       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_CH_IRQ_MSK_OFFSET);
+       iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_EV_CH_IRQ_MSK_OFFSET);
 
        iowrite32(0, gsi->virt + GSI_CNTXT_GSI_IRQ_EN_OFFSET);
 }
index b4ac025..cb42c5a 100644 (file)
 #define GSI_EE_REG_ADJUST                      0x0000d000      /* IPA v4.5+ */
 
 /* The two inter-EE IRQ register offsets are relative to gsi->virt_raw */
-#define GSI_INTER_EE_SRC_CH_IRQ_OFFSET \
-                       GSI_INTER_EE_N_SRC_CH_IRQ_OFFSET(GSI_EE_AP)
-#define GSI_INTER_EE_N_SRC_CH_IRQ_OFFSET(ee) \
-                       (0x0000c018 + 0x1000 * (ee))
-
-#define GSI_INTER_EE_SRC_EV_CH_IRQ_OFFSET \
-                       GSI_INTER_EE_N_SRC_EV_CH_IRQ_OFFSET(GSI_EE_AP)
-#define GSI_INTER_EE_N_SRC_EV_CH_IRQ_OFFSET(ee) \
-                       (0x0000c01c + 0x1000 * (ee))
+#define GSI_INTER_EE_SRC_CH_IRQ_MSK_OFFSET \
+                       GSI_INTER_EE_N_SRC_CH_IRQ_MSK_OFFSET(GSI_EE_AP)
+#define GSI_INTER_EE_N_SRC_CH_IRQ_MSK_OFFSET(ee) \
+                       (0x0000c020 + 0x1000 * (ee))
+
+#define GSI_INTER_EE_SRC_EV_CH_IRQ_MSK_OFFSET \
+                       GSI_INTER_EE_N_SRC_EV_CH_IRQ_MSK_OFFSET(GSI_EE_AP)
+#define GSI_INTER_EE_N_SRC_EV_CH_IRQ_MSK_OFFSET(ee) \
+                       (0x0000c024 + 0x1000 * (ee))
 
 /* All other register offsets are relative to gsi->virt */
 
index 0b2cccb..e6721c1 100644 (file)
@@ -1088,6 +1088,38 @@ static int m88e1011_set_tunable(struct phy_device *phydev,
        }
 }
 
+static int m88e1112_config_init(struct phy_device *phydev)
+{
+       int err;
+
+       err = m88e1011_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
+
+       return m88e1111_config_init(phydev);
+}
+
+static int m88e1111gbe_config_init(struct phy_device *phydev)
+{
+       int err;
+
+       err = m88e1111_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
+
+       return m88e1111_config_init(phydev);
+}
+
+static int marvell_1011gbe_config_init(struct phy_device *phydev)
+{
+       int err;
+
+       err = m88e1011_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
+
+       return marvell_config_init(phydev);
+}
 static int m88e1116r_config_init(struct phy_device *phydev)
 {
        int err;
@@ -1168,6 +1200,9 @@ static int m88e1510_config_init(struct phy_device *phydev)
                if (err < 0)
                        return err;
        }
+       err = m88e1011_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
 
        return m88e1318_config_init(phydev);
 }
@@ -1320,6 +1355,9 @@ static int m88e1145_config_init(struct phy_device *phydev)
                if (err < 0)
                        return err;
        }
+       err = m88e1111_set_downshift(phydev, 3);
+       if (err < 0)
+               return err;
 
        err = marvell_of_reg_init(phydev);
        if (err < 0)
@@ -2698,7 +2736,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1112",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1112_config_init,
                .config_aneg = marvell_config_aneg,
                .config_intr = marvell_config_intr,
                .handle_interrupt = marvell_handle_interrupt,
@@ -2718,7 +2756,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1111",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1111gbe_config_init,
                .config_aneg = m88e1111_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2739,7 +2777,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1111 (Finisar)",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1111gbe_config_init,
                .config_aneg = m88e1111_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2779,7 +2817,7 @@ static struct phy_driver marvell_drivers[] = {
                .driver_data = DEF_MARVELL_HWMON_OPS(m88e1121_hwmon_ops),
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1121_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2859,7 +2897,7 @@ static struct phy_driver marvell_drivers[] = {
                .name = "Marvell 88E1240",
                /* PHY_GBIT_FEATURES */
                .probe = marvell_probe,
-               .config_init = m88e1111_config_init,
+               .config_init = m88e1112_config_init,
                .config_aneg = marvell_config_aneg,
                .config_intr = marvell_config_intr,
                .handle_interrupt = marvell_handle_interrupt,
@@ -2929,7 +2967,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -2955,7 +2993,7 @@ static struct phy_driver marvell_drivers[] = {
                .probe = marvell_probe,
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3000,7 +3038,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e6390_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3026,7 +3064,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e6390_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3052,7 +3090,7 @@ static struct phy_driver marvell_drivers[] = {
                /* PHY_GBIT_FEATURES */
                .flags = PHY_POLL_CABLE_TEST,
                .probe = marvell_probe,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3077,7 +3115,7 @@ static struct phy_driver marvell_drivers[] = {
                .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
                .probe = marvell_probe,
                /* PHY_GBIT_FEATURES */
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
@@ -3099,7 +3137,7 @@ static struct phy_driver marvell_drivers[] = {
                .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
                .probe = marvell_probe,
                .features = PHY_GBIT_FIBRE_FEATURES,
-               .config_init = marvell_config_init,
+               .config_init = marvell_1011gbe_config_init,
                .config_aneg = m88e1510_config_aneg,
                .read_status = marvell_read_status,
                .config_intr = marvell_config_intr,
index 7fda2ae..9b6a4a8 100644 (file)
@@ -2870,9 +2870,13 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 {
        int i;
 
-       vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
-       if (!vi->ctrl)
-               goto err_ctrl;
+       if (vi->has_cvq) {
+               vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
+               if (!vi->ctrl)
+                       goto err_ctrl;
+       } else {
+               vi->ctrl = NULL;
+       }
        vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
        if (!vi->sq)
                goto err_sq;
index 4d9dc7d..0720f5f 100644 (file)
@@ -415,7 +415,7 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
 
                if (pad > 0) { /* Pad the frame with zeros */
                        if (__skb_pad(skb, pad, false))
-                               goto out;
+                               goto drop;
                        skb_put(skb, pad);
                }
        }
@@ -448,9 +448,8 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev)
        return NETDEV_TX_OK;
 
 drop:
-       kfree_skb(skb);
-out:
        dev->stats.tx_dropped++;
+       kfree_skb(skb);
        return NETDEV_TX_OK;
 }
 
index 2a7339b..398390c 100644 (file)
@@ -146,8 +146,8 @@ void iwl_mvm_temp_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
        if (mvm->tz_device.tzone) {
                struct iwl_mvm_thermal_device *tz_dev = &mvm->tz_device;
 
-               thermal_notify_framework(tz_dev->tzone,
-                                        tz_dev->fw_trips_index[ths_crossed]);
+               thermal_zone_device_update(tz_dev->tzone,
+                                          THERMAL_TRIP_VIOLATED);
        }
 #endif /* CONFIG_THERMAL */
 }
index 97c2708..51c847d 100644 (file)
@@ -227,6 +227,7 @@ static ssize_t prism2_aux_dump_proc_no_read(struct file *file, char __user *buf,
 
 static const struct proc_ops prism2_aux_dump_proc_ops = {
        .proc_read      = prism2_aux_dump_proc_no_read,
+       .proc_lseek     = default_llseek,
 };
 
 
index 96a03d1..18bd0d9 100644 (file)
@@ -312,11 +312,3 @@ static void __exit orinoco_nortel_exit(void)
 
 module_init(orinoco_nortel_init);
 module_exit(orinoco_nortel_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index f3c86b0..7e3a6dd 100644 (file)
@@ -255,11 +255,3 @@ static void __exit orinoco_pci_exit(void)
 
 module_init(orinoco_pci_init);
 module_exit(orinoco_pci_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 16dada9..73e6ae1 100644 (file)
@@ -360,11 +360,3 @@ static void __exit orinoco_plx_exit(void)
 
 module_init(orinoco_plx_init);
 module_exit(orinoco_plx_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 9a9d335..939d5a1 100644 (file)
@@ -235,11 +235,3 @@ static void __exit orinoco_tmd_exit(void)
 
 module_init(orinoco_tmd_init);
 module_exit(orinoco_tmd_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index 41aa1f0..18a267d 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
index 7daac79..ed10a8b 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
index b6f7815..522c9b2 100644 (file)
@@ -576,6 +576,11 @@ static void nvme_free_ns(struct kref *kref)
        kfree(ns);
 }
 
+static inline bool nvme_get_ns(struct nvme_ns *ns)
+{
+       return kref_get_unless_zero(&ns->kref);
+}
+
 void nvme_put_ns(struct nvme_ns *ns)
 {
        kref_put(&ns->kref, nvme_free_ns);
@@ -584,9 +589,6 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
 
 static inline void nvme_clear_nvme_request(struct request *req)
 {
-       struct nvme_command *cmd = nvme_req(req)->cmd;
-
-       memset(cmd, 0, sizeof(*cmd));
        nvme_req(req)->retries = 0;
        nvme_req(req)->flags = 0;
        req->rq_flags |= RQF_DONTPREP;
@@ -637,6 +639,66 @@ static struct request *nvme_alloc_request_qid(struct request_queue *q,
        return req;
 }
 
+/*
+ * For something we're not in a state to send to the device the default action
+ * is to busy it and retry it after the controller state is recovered.  However,
+ * if the controller is deleting or if anything is marked for failfast or
+ * nvme multipath it is immediately failed.
+ *
+ * Note: commands used to initialize the controller will be marked for failfast.
+ * Note: nvme cli/ioctl commands are marked for failfast.
+ */
+blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
+               struct request *rq)
+{
+       if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
+           ctrl->state != NVME_CTRL_DEAD &&
+           !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
+           !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+               return BLK_STS_RESOURCE;
+       return nvme_host_path_error(rq);
+}
+EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
+
+bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+               bool queue_live)
+{
+       struct nvme_request *req = nvme_req(rq);
+
+       /*
+        * currently we have a problem sending passthru commands
+        * on the admin_q if the controller is not LIVE because we can't
+        * make sure that they are going out after the admin connect,
+        * controller enable and/or other commands in the initialization
+        * sequence. until the controller will be LIVE, fail with
+        * BLK_STS_RESOURCE so that they will be rescheduled.
+        */
+       if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
+               return false;
+
+       if (ctrl->ops->flags & NVME_F_FABRICS) {
+               /*
+                * Only allow commands on a live queue, except for the connect
+                * command, which is required to set the queue live in the
+                * appropriate states.
+                */
+               switch (ctrl->state) {
+               case NVME_CTRL_CONNECTING:
+                       if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
+                           req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+                               return true;
+                       break;
+               default:
+                       break;
+               case NVME_CTRL_DEAD:
+                       return false;
+               }
+       }
+
+       return queue_live;
+}
+EXPORT_SYMBOL_GPL(__nvme_check_ready);
+
 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
 {
        struct nvme_command c;
@@ -898,8 +960,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
        struct nvme_command *cmd = nvme_req(req)->cmd;
        blk_status_t ret = BLK_STS_OK;
 
-       if (!(req->rq_flags & RQF_DONTPREP))
+       if (!(req->rq_flags & RQF_DONTPREP)) {
                nvme_clear_nvme_request(req);
+               memset(cmd, 0, sizeof(*cmd));
+       }
 
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
@@ -1494,7 +1558,7 @@ static int nvme_ns_open(struct nvme_ns *ns)
        /* should never be called due to GENHD_FL_HIDDEN */
        if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
                goto fail;
-       if (!kref_get_unless_zero(&ns->kref))
+       if (!nvme_get_ns(ns))
                goto fail;
        if (!try_module_get(ns->ctrl->ops->module))
                goto fail_put_ns;
@@ -1999,28 +2063,6 @@ static const struct block_device_operations nvme_bdev_ops = {
        .pr_ops         = &nvme_pr_ops,
 };
 
-#ifdef CONFIG_NVME_MULTIPATH
-struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys)
-{
-       struct nvme_ctrl *ctrl;
-       int ret;
-
-       ret = mutex_lock_killable(&nvme_subsystems_lock);
-       if (ret)
-               return ERR_PTR(ret);
-       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
-               if (ctrl->state == NVME_CTRL_LIVE)
-                       goto found;
-       }
-       mutex_unlock(&nvme_subsystems_lock);
-       return ERR_PTR(-EWOULDBLOCK);
-found:
-       nvme_get_ctrl(ctrl);
-       mutex_unlock(&nvme_subsystems_lock);
-       return ctrl;
-}
-#endif /* CONFIG_NVME_MULTIPATH */
-
 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
 {
        unsigned long timeout =
@@ -3604,7 +3646,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                if (ns->head->ns_id == nsid) {
-                       if (!kref_get_unless_zero(&ns->kref))
+                       if (!nvme_get_ns(ns))
                                continue;
                        ret = ns;
                        break;
index 13c2747..a2bb7fc 100644 (file)
@@ -533,63 +533,6 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
        return NULL;
 }
 
-/*
- * For something we're not in a state to send to the device the default action
- * is to busy it and retry it after the controller state is recovered.  However,
- * if the controller is deleting or if anything is marked for failfast or
- * nvme multipath it is immediately failed.
- *
- * Note: commands used to initialize the controller will be marked for failfast.
- * Note: nvme cli/ioctl commands are marked for failfast.
- */
-blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
-               struct request *rq)
-{
-       if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
-           ctrl->state != NVME_CTRL_DEAD &&
-           !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
-           !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
-               return BLK_STS_RESOURCE;
-       return nvme_host_path_error(rq);
-}
-EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command);
-
-bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
-               bool queue_live)
-{
-       struct nvme_request *req = nvme_req(rq);
-
-       /*
-        * currently we have a problem sending passthru commands
-        * on the admin_q if the controller is not LIVE because we can't
-        * make sure that they are going out after the admin connect,
-        * controller enable and/or other commands in the initialization
-        * sequence. until the controller will be LIVE, fail with
-        * BLK_STS_RESOURCE so that they will be rescheduled.
-        */
-       if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
-               return false;
-
-       /*
-        * Only allow commands on a live queue, except for the connect command,
-        * which is require to set the queue live in the appropinquate states.
-        */
-       switch (ctrl->state) {
-       case NVME_CTRL_CONNECTING:
-               if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-                   req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
-                       return true;
-               break;
-       default:
-               break;
-       case NVME_CTRL_DEAD:
-               return false;
-       }
-
-       return queue_live;
-}
-EXPORT_SYMBOL_GPL(__nvmf_check_ready);
-
 static const match_table_t opt_tokens = {
        { NVMF_OPT_TRANSPORT,           "transport=%s"          },
        { NVMF_OPT_TRADDR,              "traddr=%s"             },
index 888b108..d7f7974 100644 (file)
@@ -184,20 +184,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
-blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
-               struct request *rq);
-bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
-               bool queue_live);
 bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
                struct nvmf_ctrl_options *opts);
 
-static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
-               bool queue_live)
-{
-       if (likely(ctrl->state == NVME_CTRL_LIVE ||
-                  ctrl->state == NVME_CTRL_DELETING))
-               return true;
-       return __nvmf_check_ready(ctrl, rq, queue_live);
-}
-
 #endif /* _NVME_FABRICS_H */
index 9b9b7be..d9ab9e7 100644 (file)
@@ -2766,8 +2766,8 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
        blk_status_t ret;
 
        if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
-           !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+           !nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        ret = nvme_setup_cmd(ns, rq);
        if (ret)
index 502f8e4..9557ead 100644 (file)
@@ -370,41 +370,45 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 }
 
 #ifdef CONFIG_NVME_MULTIPATH
-static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head,
-               unsigned int cmd, void __user *argp)
+static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
+               void __user *argp, struct nvme_ns_head *head, int srcu_idx)
 {
-       struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys);
+       struct nvme_ctrl *ctrl = ns->ctrl;
        int ret;
 
-       if (IS_ERR(ctrl))
-               return PTR_ERR(ctrl);
-       ret = nvme_ctrl_ioctl(ctrl, cmd, argp);
-       nvme_put_ctrl(ctrl);
-       return ret;
-}
+       nvme_get_ctrl(ns->ctrl);
+       nvme_put_ns_from_disk(head, srcu_idx);
+       ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
 
-static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head,
-               unsigned int cmd, void __user *argp)
-{
-       int srcu_idx = srcu_read_lock(&head->srcu);
-       struct nvme_ns *ns = nvme_find_path(head);
-       int ret = -EWOULDBLOCK;
-
-       if (ns)
-               ret = nvme_ns_ioctl(ns, cmd, argp);
-       srcu_read_unlock(&head->srcu, srcu_idx);
+       nvme_put_ctrl(ctrl);
        return ret;
 }
 
 int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
                unsigned int cmd, unsigned long arg)
 {
-       struct nvme_ns_head *head = bdev->bd_disk->private_data;
+       struct nvme_ns_head *head = NULL;
        void __user *argp = (void __user *)arg;
+       struct nvme_ns *ns;
+       int srcu_idx, ret;
+
+       ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+       if (unlikely(!ns))
+               return -EWOULDBLOCK;
 
+       /*
+        * Handle ioctls that apply to the controller instead of the namespace
+        * separately and drop the ns SRCU reference early.  This avoids a
+        * deadlock when deleting namespaces using the passthrough interface.
+        */
        if (is_ctrl_ioctl(cmd))
-               return nvme_ns_head_ctrl_ioctl(head, cmd, argp);
-       return nvme_ns_head_ns_ioctl(head, cmd, argp);
+               ret = nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+       else {
+               ret = nvme_ns_ioctl(ns, cmd, argp);
+               nvme_put_ns_from_disk(head, srcu_idx);
+       }
+
+       return ret;
 }
 
 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
@@ -414,10 +418,23 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
        struct nvme_ns_head *head =
                container_of(cdev, struct nvme_ns_head, cdev);
        void __user *argp = (void __user *)arg;
+       struct nvme_ns *ns;
+       int srcu_idx, ret;
+
+       srcu_idx = srcu_read_lock(&head->srcu);
+       ns = nvme_find_path(head);
+       if (!ns) {
+               srcu_read_unlock(&head->srcu, srcu_idx);
+               return -EWOULDBLOCK;
+       }
 
        if (is_ctrl_ioctl(cmd))
-               return nvme_ns_head_ctrl_ioctl(head, cmd, argp);
-       return nvme_ns_head_ns_ioctl(head, cmd, argp);
+               return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+
+       ret = nvme_ns_ioctl(ns, cmd, argp);
+       nvme_put_ns_from_disk(head, srcu_idx);
+
+       return ret;
 }
 #endif /* CONFIG_NVME_MULTIPATH */
 
index 0d0de34..0551796 100644 (file)
@@ -70,6 +70,7 @@ void nvme_failover_req(struct request *req)
        struct nvme_ns *ns = req->q->queuedata;
        u16 status = nvme_req(req)->status & 0x7ff;
        unsigned long flags;
+       struct bio *bio;
 
        nvme_mpath_clear_current_path(ns);
 
@@ -84,6 +85,8 @@ void nvme_failover_req(struct request *req)
        }
 
        spin_lock_irqsave(&ns->head->requeue_lock, flags);
+       for (bio = req->bio; bio; bio = bio->bi_next)
+               bio_set_dev(bio, ns->head->disk->part0);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
index 773dde5..05f31a2 100644 (file)
@@ -638,6 +638,21 @@ struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, blk_mq_req_flags_t flags);
 void nvme_cleanup_cmd(struct request *req);
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req);
+blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
+               struct request *req);
+bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+               bool queue_live);
+
+static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+               bool queue_live)
+{
+       if (likely(ctrl->state == NVME_CTRL_LIVE))
+               return true;
+       if (ctrl->ops->flags & NVME_F_FABRICS &&
+           ctrl->state == NVME_CTRL_DELETING)
+               return true;
+       return __nvme_check_ready(ctrl, rq, queue_live);
+}
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                void *buf, unsigned bufflen);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
@@ -664,7 +679,6 @@ struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
 bool nvme_tryget_ns_head(struct nvme_ns_head *head);
 void nvme_put_ns_head(struct nvme_ns_head *head);
-struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys);
 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
                const struct file_operations *fops, struct module *owner);
 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device);
index 09d4c5f..a29b170 100644 (file)
@@ -933,6 +933,9 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
                return BLK_STS_IOERR;
 
+       if (!nvme_check_ready(&dev->ctrl, req, true))
+               return nvme_fail_nonready_command(&dev->ctrl, req);
+
        ret = nvme_setup_cmd(ns, req);
        if (ret)
                return ret;
index 660c774..37943dc 100644 (file)
@@ -2050,8 +2050,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
        WARN_ON_ONCE(rq->tag < 0);
 
-       if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+       if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        dev = queue->device->dev;
 
index 75435cd..0222e23 100644 (file)
@@ -2338,8 +2338,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
        blk_status_t ret;
 
-       if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+       if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
        if (unlikely(ret))
index d2a26ff..e7a367c 100644 (file)
@@ -307,7 +307,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
        case NVME_LOG_ANA:
                return nvmet_execute_get_log_page_ana(req);
        }
-       pr_err("unhandled lid %d on qid %d\n",
+       pr_debug("unhandled lid %d on qid %d\n",
               req->cmd->get_log_page.lid, req->sq->qid);
        req->error_loc = offsetof(struct nvme_get_log_page_command, lid);
        nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR);
@@ -659,7 +659,7 @@ static void nvmet_execute_identify(struct nvmet_req *req)
                return nvmet_execute_identify_desclist(req);
        }
 
-       pr_err("unhandled identify cns %d on qid %d\n",
+       pr_debug("unhandled identify cns %d on qid %d\n",
               req->cmd->identify.cns, req->sq->qid);
        req->error_loc = offsetof(struct nvme_identify, cns);
        nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR);
@@ -977,7 +977,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
                return 0;
        }
 
-       pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+       pr_debug("unhandled cmd %d on qid %d\n", cmd->common.opcode,
               req->sq->qid);
        req->error_loc = offsetof(struct nvme_common_command, opcode);
        return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
index 6665da3..74b3b15 100644 (file)
@@ -138,8 +138,8 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
        bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags);
        blk_status_t ret;
 
-       if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready))
-               return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req);
+       if (!nvme_check_ready(&queue->ctrl->ctrl, req, queue_ready))
+               return nvme_fail_nonready_command(&queue->ctrl->ctrl, req);
 
        ret = nvme_setup_cmd(ns, req);
        if (ret)
index 180c6fb..d80160c 100644 (file)
@@ -1024,7 +1024,6 @@ int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
        struct device_node *overlay_root = NULL;
 
        *ovcs_id = 0;
-       ret = 0;
 
        if (overlay_fdt_size < sizeof(struct fdt_header) ||
            fdt_check_header(overlay_fdt)) {
@@ -1195,8 +1194,6 @@ int of_overlay_remove(int *ovcs_id)
        struct overlay_changeset *ovcs;
        int ret, ret_apply, ret_tmp;
 
-       ret = 0;
-
        if (devicetree_corrupt()) {
                pr_err("suspect devicetree state, refuse to remove overlay\n");
                ret = -EBUSY;
index 48b084e..0919ed9 100644 (file)
@@ -2224,15 +2224,3 @@ MODULE_PARM_DESC(features,
                 ", bit 2: hardware SPP mode"
                 ", bit 3: hardware EPP mode"
                 ", bit 4: hardware ECP mode");
-
-/*--- Inform (X)Emacs about preferred coding style ---------------------*/
-/*
- * Local Variables:
- * mode: c
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * tab-width: 8
- * fill-column: 78
- * ispell-local-dictionary: "american"
- * End:
- */
index 0d37194..6d7d649 100644 (file)
@@ -480,7 +480,7 @@ EXPORT_SYMBOL_GPL(pci_pasid_features);
 #define PASID_NUMBER_SHIFT     8
 #define PASID_NUMBER_MASK      (0x1f << PASID_NUMBER_SHIFT)
 /**
- * pci_max_pasid - Get maximum number of PASIDs supported by device
+ * pci_max_pasids - Get maximum number of PASIDs supported by device
  * @pdev: PCI device structure
  *
  * Returns negative value when PASID capability is not present.
index 5aa8977..2f2c8a1 100644 (file)
@@ -41,7 +41,6 @@ config PCI_TEGRA
        bool "NVIDIA Tegra PCIe controller"
        depends on ARCH_TEGRA || COMPILE_TEST
        depends on PCI_MSI_IRQ_DOMAIN
-       select PCI_MSI_ARCH_FALLBACKS
        help
          Say Y here if you want support for the PCIe host controller found
          on NVIDIA Tegra SoCs.
@@ -59,7 +58,6 @@ config PCIE_RCAR_HOST
        bool "Renesas R-Car PCIe host controller"
        depends on ARCH_RENESAS || COMPILE_TEST
        depends on PCI_MSI_IRQ_DOMAIN
-       select PCI_MSI_ARCH_FALLBACKS
        help
          Say Y here if you want PCIe controller support on R-Car SoCs in host
          mode.
@@ -88,7 +86,7 @@ config PCI_HOST_GENERIC
 config PCIE_XILINX
        bool "Xilinx AXI PCIe host bridge support"
        depends on OF || COMPILE_TEST
-       select PCI_MSI_ARCH_FALLBACKS
+       depends on PCI_MSI_IRQ_DOMAIN
        help
          Say 'Y' here if you want kernel to support the Xilinx AXI PCIe
          Host Bridge driver.
@@ -233,6 +231,19 @@ config PCIE_MEDIATEK
          Say Y here if you want to enable PCIe controller support on
          MediaTek SoCs.
 
+config PCIE_MEDIATEK_GEN3
+       tristate "MediaTek Gen3 PCIe controller"
+       depends on ARCH_MEDIATEK || COMPILE_TEST
+       depends on PCI_MSI_IRQ_DOMAIN
+       help
+         Adds support for PCIe Gen3 MAC controller for MediaTek SoCs.
+         This PCIe controller is compatible with Gen3, Gen2 and Gen1 speed,
+         and supports up to 256 MSI interrupt numbers for
+         multi-function devices.
+
+         Say Y here if you want to enable Gen3 PCIe controller support on
+         MediaTek SoCs.
+
 config VMD
        depends on PCI_MSI && X86_64 && SRCU
        tristate "Intel Volume Management Device Driver"
index e4559f2..63e3880 100644 (file)
@@ -11,10 +11,13 @@ obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o
 obj-$(CONFIG_PCIE_RCAR_EP) += pcie-rcar.o pcie-rcar-ep.o
 obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o
 obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o
+obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o
+obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o
 obj-$(CONFIG_PCIE_XILINX) += pcie-xilinx.o
 obj-$(CONFIG_PCIE_XILINX_NWL) += pcie-xilinx-nwl.o
 obj-$(CONFIG_PCIE_XILINX_CPM) += pcie-xilinx-cpm.o
 obj-$(CONFIG_PCI_V3_SEMI) += pci-v3-semi.o
+obj-$(CONFIG_PCI_XGENE) += pci-xgene.o
 obj-$(CONFIG_PCI_XGENE_MSI) += pci-xgene-msi.o
 obj-$(CONFIG_PCI_VERSATILE) += pci-versatile.o
 obj-$(CONFIG_PCIE_IPROC) += pcie-iproc.o
@@ -27,6 +30,7 @@ obj-$(CONFIG_PCIE_ROCKCHIP) += pcie-rockchip.o
 obj-$(CONFIG_PCIE_ROCKCHIP_EP) += pcie-rockchip-ep.o
 obj-$(CONFIG_PCIE_ROCKCHIP_HOST) += pcie-rockchip-host.o
 obj-$(CONFIG_PCIE_MEDIATEK) += pcie-mediatek.o
+obj-$(CONFIG_PCIE_MEDIATEK_GEN3) += pcie-mediatek-gen3.o
 obj-$(CONFIG_PCIE_MICROCHIP_HOST) += pcie-microchip-host.o
 obj-$(CONFIG_VMD) += vmd.o
 obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o
@@ -47,8 +51,10 @@ obj-y                                += mobiveil/
 # ARM64 and use internal ifdefs to only build the pieces we need
 # depending on whether ACPI, the DT driver, or both are enabled.
 
-ifdef CONFIG_PCI
+ifdef CONFIG_ACPI
+ifdef CONFIG_PCI_QUIRKS
 obj-$(CONFIG_ARM64) += pci-thunder-ecam.o
 obj-$(CONFIG_ARM64) += pci-thunder-pem.o
 obj-$(CONFIG_ARM64) += pci-xgene.o
 endif
+endif
index 849f1e4..35e6104 100644 (file)
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
-/**
+/*
  * pci-j721e - PCIe controller driver for TI's J721E SoCs
  *
  * Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com
  * Author: Kishon Vijay Abraham I <kishon@ti.com>
  */
 
+#include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/gpio/consumer.h>
 #include <linux/io.h>
@@ -50,6 +51,7 @@ enum link_status {
 
 struct j721e_pcie {
        struct device           *dev;
+       struct clk              *refclk;
        u32                     mode;
        u32                     num_lanes;
        struct cdns_pcie        *cdns_pcie;
@@ -312,6 +314,7 @@ static int j721e_pcie_probe(struct platform_device *pdev)
        struct cdns_pcie_ep *ep;
        struct gpio_desc *gpiod;
        void __iomem *base;
+       struct clk *clk;
        u32 num_lanes;
        u32 mode;
        int ret;
@@ -411,6 +414,20 @@ static int j721e_pcie_probe(struct platform_device *pdev)
                        goto err_get_sync;
                }
 
+               clk = devm_clk_get_optional(dev, "pcie_refclk");
+               if (IS_ERR(clk)) {
+                       ret = PTR_ERR(clk);
+                       dev_err(dev, "failed to get pcie_refclk\n");
+                       goto err_pcie_setup;
+               }
+
+               ret = clk_prepare_enable(clk);
+               if (ret) {
+                       dev_err(dev, "failed to enable pcie_refclk\n");
+                       goto err_get_sync;
+               }
+               pcie->refclk = clk;
+
                /*
                 * "Power Sequencing and Reset Signal Timings" table in
                 * PCI EXPRESS CARD ELECTROMECHANICAL SPECIFICATION, REV. 3.0
@@ -425,8 +442,10 @@ static int j721e_pcie_probe(struct platform_device *pdev)
                }
 
                ret = cdns_pcie_host_setup(rc);
-               if (ret < 0)
+               if (ret < 0) {
+                       clk_disable_unprepare(pcie->refclk);
                        goto err_pcie_setup;
+               }
 
                break;
        case PCI_MODE_EP:
@@ -479,6 +498,7 @@ static int j721e_pcie_remove(struct platform_device *pdev)
        struct cdns_pcie *cdns_pcie = pcie->cdns_pcie;
        struct device *dev = &pdev->dev;
 
+       clk_disable_unprepare(pcie->refclk);
        cdns_pcie_disable_phy(cdns_pcie);
        pm_runtime_put(dev);
        pm_runtime_disable(dev);
index 22c5529..423d358 100644 (file)
@@ -280,7 +280,7 @@ config PCIE_TEGRA194_EP
        select PCIE_TEGRA194
        help
          Enables support for the PCIe controller in the NVIDIA Tegra194 SoC to
-         work in host mode. There are two instances of PCIe controllers in
+         work in endpoint mode. There are two instances of PCIe controllers in
          Tegra194. This controller can work either as EP or RC. In order to
          enable host-specific features PCIE_TEGRA194_HOST must be selected and
          in order to enable device-specific features PCIE_TEGRA194_EP must be
@@ -311,6 +311,7 @@ config PCIE_AL
        depends on OF && (ARM64 || COMPILE_TEST)
        depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW_HOST
+       select PCI_ECAM
        help
          Say Y here to enable support of the Amazon's Annapurna Labs PCIe
          controller IP on Amazon SoCs. The PCIe controller uses the DesignWare
@@ -318,4 +319,13 @@ config PCIE_AL
          required only for DT-based platforms. ACPI platforms with the
          Annapurna Labs PCIe controller don't need to enable this.
 
+config PCIE_FU740
+       bool "SiFive FU740 PCIe host controller"
+       depends on PCI_MSI_IRQ_DOMAIN
+       depends on SOC_SIFIVE || COMPILE_TEST
+       select PCIE_DW_HOST
+       help
+         Say Y here if you want PCIe controller support for the SiFive
+         FU740.
+
 endmenu
index a751553..eca805c 100644 (file)
@@ -5,6 +5,7 @@ obj-$(CONFIG_PCIE_DW_EP) += pcie-designware-ep.o
 obj-$(CONFIG_PCIE_DW_PLAT) += pcie-designware-plat.o
 obj-$(CONFIG_PCI_DRA7XX) += pci-dra7xx.o
 obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o
+obj-$(CONFIG_PCIE_FU740) += pcie-fu740.o
 obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
 obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o
 obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o
@@ -17,7 +18,6 @@ obj-$(CONFIG_PCIE_INTEL_GW) += pcie-intel-gw.o
 obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o
 obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o
 obj-$(CONFIG_PCI_MESON) += pci-meson.o
-obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o
 obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
 obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o
 
@@ -31,7 +31,13 @@ obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o
 # ARM64 and use internal ifdefs to only build the pieces we need
 # depending on whether ACPI, the DT driver, or both are enabled.
 
-ifdef CONFIG_PCI
+obj-$(CONFIG_PCIE_AL) += pcie-al.o
+obj-$(CONFIG_PCI_HISI) += pcie-hisi.o
+
+ifdef CONFIG_ACPI
+ifdef CONFIG_PCI_QUIRKS
 obj-$(CONFIG_ARM64) += pcie-al.o
 obj-$(CONFIG_ARM64) += pcie-hisi.o
+obj-$(CONFIG_ARM64) += pcie-tegra194.o
+endif
 endif
index 53aa35c..bde3b28 100644 (file)
@@ -346,8 +346,9 @@ static const struct irq_domain_ops ks_pcie_legacy_irq_domain_ops = {
 };
 
 /**
- * ks_pcie_set_dbi_mode() - Set DBI mode to access overlaid BAR mask
- * registers
+ * ks_pcie_set_dbi_mode() - Set DBI mode to access overlaid BAR mask registers
+ * @ks_pcie: A pointer to the keystone_pcie structure which holds the KeyStone
+ *          PCIe host controller driver information.
  *
  * Since modification of dbi_cs2 involves different clock domain, read the
  * status back to ensure the transition is complete.
@@ -367,6 +368,8 @@ static void ks_pcie_set_dbi_mode(struct keystone_pcie *ks_pcie)
 
 /**
  * ks_pcie_clear_dbi_mode() - Disable DBI mode
+ * @ks_pcie: A pointer to the keystone_pcie structure which holds the KeyStone
+ *          PCIe host controller driver information.
  *
  * Since modification of dbi_cs2 involves different clock domain, read the
  * status back to ensure the transition is complete.
@@ -449,6 +452,7 @@ static struct pci_ops ks_child_pcie_ops = {
 
 /**
  * ks_pcie_v3_65_add_bus() - keystone add_bus post initialization
+ * @bus: A pointer to the PCI bus structure.
  *
  * This sets BAR0 to enable inbound access for MSI_IRQ register
  */
@@ -488,6 +492,8 @@ static struct pci_ops ks_pcie_ops = {
 
 /**
  * ks_pcie_link_up() - Check if link up
+ * @pci: A pointer to the dw_pcie structure which holds the DesignWare PCIe host
+ *      controller driver information.
  */
 static int ks_pcie_link_up(struct dw_pcie *pci)
 {
@@ -605,7 +611,6 @@ static void ks_pcie_msi_irq_handler(struct irq_desc *desc)
 
 /**
  * ks_pcie_legacy_irq_handler() - Handle legacy interrupt
- * @irq: IRQ line for legacy interrupts
  * @desc: Pointer to irq descriptor
  *
  * Traverse through pending legacy interrupts and invoke handler for each. Also
@@ -798,7 +803,8 @@ static int __init ks_pcie_host_init(struct pcie_port *pp)
        int ret;
 
        pp->bridge->ops = &ks_pcie_ops;
-       pp->bridge->child_ops = &ks_child_pcie_ops;
+       if (!ks_pcie->is_am6)
+               pp->bridge->child_ops = &ks_child_pcie_ops;
 
        ret = ks_pcie_config_legacy_irq(ks_pcie);
        if (ret)
index 39fe2ed..39f4664 100644 (file)
@@ -154,7 +154,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev)
        pci->dev = dev;
        pci->ops = pcie->drvdata->dw_pcie_ops;
 
-       ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4),
+       ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4);
 
        pcie->pci = pci;
        pcie->ls_epc = ls_epc;
index 1c25d83..8d028a8 100644 (file)
@@ -705,6 +705,8 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
                }
        }
 
+       dw_pcie_iatu_detect(pci);
+
        res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "addr_space");
        if (!res)
                return -EINVAL;
index 7e55b2b..a608ae1 100644 (file)
@@ -398,9 +398,9 @@ int dw_pcie_host_init(struct pcie_port *pp)
                if (ret)
                        goto err_free_msi;
        }
+       dw_pcie_iatu_detect(pci);
 
        dw_pcie_setup_rc(pp);
-       dw_pcie_msi_init(pp);
 
        if (!dw_pcie_link_up(pci) && pci->ops && pci->ops->start_link) {
                ret = pci->ops->start_link(pci);
@@ -551,6 +551,8 @@ void dw_pcie_setup_rc(struct pcie_port *pp)
                }
        }
 
+       dw_pcie_msi_init(pp);
+
        /* Setup RC BARs */
        dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_0, 0x00000004);
        dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_1, 0x00000000);
index 004cb86..a945f0c 100644 (file)
@@ -660,11 +660,9 @@ static void dw_pcie_iatu_detect_regions(struct dw_pcie *pci)
        pci->num_ob_windows = ob;
 }
 
-void dw_pcie_setup(struct dw_pcie *pci)
+void dw_pcie_iatu_detect(struct dw_pcie *pci)
 {
-       u32 val;
        struct device *dev = pci->dev;
-       struct device_node *np = dev->of_node;
        struct platform_device *pdev = to_platform_device(dev);
 
        if (pci->version >= 0x480A || (!pci->version &&
@@ -693,6 +691,13 @@ void dw_pcie_setup(struct dw_pcie *pci)
 
        dev_info(pci->dev, "Detected iATU regions: %u outbound, %u inbound",
                 pci->num_ob_windows, pci->num_ib_windows);
+}
+
+void dw_pcie_setup(struct dw_pcie *pci)
+{
+       u32 val;
+       struct device *dev = pci->dev;
+       struct device_node *np = dev->of_node;
 
        if (pci->link_gen > 0)
                dw_pcie_link_set_max_speed(pci, pci->link_gen);
index 7247c8b..7d6e9b7 100644 (file)
@@ -306,6 +306,7 @@ int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, u8 func_no, int index,
 void dw_pcie_disable_atu(struct dw_pcie *pci, int index,
                         enum dw_pcie_region_type type);
 void dw_pcie_setup(struct dw_pcie *pci);
+void dw_pcie_iatu_detect(struct dw_pcie *pci);
 
 static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
 {
diff --git a/drivers/pci/controller/dwc/pcie-fu740.c b/drivers/pci/controller/dwc/pcie-fu740.c
new file mode 100644 (file)
index 0000000..00cde9a
--- /dev/null
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FU740 DesignWare PCIe Controller integration
+ * Copyright (C) 2019-2021 SiFive, Inc.
+ * Paul Walmsley
+ * Greentime Hu
+ *
+ * Based in part on the i.MX6 PCIe host controller shim which is:
+ *
+ * Copyright (C) 2013 Kosagi
+ *             https://www.kosagi.com
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/resource.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/iopoll.h>
+#include <linux/reset.h>
+
+#include "pcie-designware.h"
+
+#define to_fu740_pcie(x)       dev_get_drvdata((x)->dev)
+
+/**
+ * struct fu740_pcie - FU740 PCIe controller driver state
+ * @pci: embedded DesignWare PCIe core state
+ * @mgmt_base: mapping of the "mgmt" register region
+ * @reset: optional GPIO driven together with the PERST_N register
+ * @pwren: optional GPIO driven to 1 by fu740_pcie_power_on()
+ * @pcie_aux: the "pcie_aux" clock
+ * @rst: exclusive reset control deasserted during host init
+ */
+struct fu740_pcie {
+       struct dw_pcie pci;
+       void __iomem *mgmt_base;
+       struct gpio_desc *reset;
+       struct gpio_desc *pwren;
+       struct clk *pcie_aux;
+       struct reset_control *rst;
+};
+
+#define SIFIVE_DEVICESRESETREG         0x28
+
+/* PCIEX8MGMT register offsets; the driver adds them to mgmt_base */
+#define PCIEX8MGMT_PERST_N             0x0
+#define PCIEX8MGMT_APP_LTSSM_ENABLE    0x10
+#define PCIEX8MGMT_APP_HOLD_PHY_RST    0x18
+#define PCIEX8MGMT_DEVICE_TYPE         0x708
+#define PCIEX8MGMT_PHY0_CR_PARA_ADDR   0x860
+#define PCIEX8MGMT_PHY0_CR_PARA_RD_EN  0x870
+#define PCIEX8MGMT_PHY0_CR_PARA_RD_DATA        0x878
+#define PCIEX8MGMT_PHY0_CR_PARA_SEL    0x880
+#define PCIEX8MGMT_PHY0_CR_PARA_WR_DATA        0x888
+#define PCIEX8MGMT_PHY0_CR_PARA_WR_EN  0x890
+#define PCIEX8MGMT_PHY0_CR_PARA_ACK    0x898
+#define PCIEX8MGMT_PHY1_CR_PARA_ADDR   0x8a0
+#define PCIEX8MGMT_PHY1_CR_PARA_RD_EN  0x8b0
+#define PCIEX8MGMT_PHY1_CR_PARA_RD_DATA        0x8b8
+#define PCIEX8MGMT_PHY1_CR_PARA_SEL    0x8c0
+#define PCIEX8MGMT_PHY1_CR_PARA_WR_DATA        0x8c8
+#define PCIEX8MGMT_PHY1_CR_PARA_WR_EN  0x8d0
+#define PCIEX8MGMT_PHY1_CR_PARA_ACK    0x8d8
+
+/* Bits OR-ed together into the value written to every PHY lane register */
+#define PCIEX8MGMT_PHY_CDR_TRACK_EN    BIT(0)
+#define PCIEX8MGMT_PHY_LOS_THRSHLD     BIT(5)
+#define PCIEX8MGMT_PHY_TERM_EN         BIT(9)
+#define PCIEX8MGMT_PHY_TERM_ACDC       BIT(10)
+#define PCIEX8MGMT_PHY_EN              BIT(11)
+#define PCIEX8MGMT_PHY_INIT_VAL                (PCIEX8MGMT_PHY_CDR_TRACK_EN|\
+                                        PCIEX8MGMT_PHY_LOS_THRSHLD|\
+                                        PCIEX8MGMT_PHY_TERM_EN|\
+                                        PCIEX8MGMT_PHY_TERM_ACDC|\
+                                        PCIEX8MGMT_PHY_EN)
+
+/*
+ * cr_para addresses of the per-lane register, spaced 0x100 apart. These are
+ * passed as the address argument of fu740_phyregwrite(), not added to
+ * mgmt_base.
+ */
+#define PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3     0x1008
+#define PCIEX8MGMT_PHY_LANE_OFF                0x100
+#define PCIEX8MGMT_PHY_LANE0_BASE      (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 0)
+#define PCIEX8MGMT_PHY_LANE1_BASE      (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 1)
+#define PCIEX8MGMT_PHY_LANE2_BASE      (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 2)
+#define PCIEX8MGMT_PHY_LANE3_BASE      (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 3)
+
+/**
+ * fu740_pcie_assert_reset() - Assert PERST_N on the GPIO and the controller
+ * @afp: FU740 PCIe driver state
+ *
+ * Drives the reset GPIO to 0 first, then writes 0 to the PCIEX8MGMT_PERST_N
+ * management register.
+ */
+static void fu740_pcie_assert_reset(struct fu740_pcie *afp)
+{
+       /* Assert PERST_N GPIO */
+       gpiod_set_value_cansleep(afp->reset, 0);
+       /* Assert controller PERST_N */
+       writel_relaxed(0x0, afp->mgmt_base + PCIEX8MGMT_PERST_N);
+}
+
+/**
+ * fu740_pcie_deassert_reset() - Release PERST_N on the controller and the GPIO
+ * @afp: FU740 PCIe driver state
+ *
+ * Mirror image of fu740_pcie_assert_reset(): writes 1 to the
+ * PCIEX8MGMT_PERST_N management register first, then drives the reset GPIO
+ * to 1.
+ */
+static void fu740_pcie_deassert_reset(struct fu740_pcie *afp)
+{
+       /* Deassert controller PERST_N */
+       writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_PERST_N);
+       /* Deassert PERST_N GPIO */
+       gpiod_set_value_cansleep(afp->reset, 1);
+}
+
+/**
+ * fu740_pcie_power_on() - Raise the power-enable GPIO and wait 100 ms
+ * @afp: FU740 PCIe driver state
+ *
+ * May sleep: it uses the _cansleep GPIO setter and msleep().
+ */
+static void fu740_pcie_power_on(struct fu740_pcie *afp)
+{
+       gpiod_set_value_cansleep(afp->pwren, 1);
+       /*
+        * Ensure that PERST has been asserted for at least 100 ms.
+        * Section 2.2 of PCI Express Card Electromechanical Specification
+        * Revision 3.0
+        */
+       msleep(100);
+}
+
+/**
+ * fu740_pcie_drive_reset() - Run the full power-on reset sequence
+ * @afp: FU740 PCIe driver state
+ *
+ * Asserts PERST_N, powers the slot on (which includes the 100 ms wait in
+ * fu740_pcie_power_on()), then releases PERST_N.
+ */
+static void fu740_pcie_drive_reset(struct fu740_pcie *afp)
+{
+       fu740_pcie_assert_reset(afp);
+       fu740_pcie_power_on(afp);
+       fu740_pcie_deassert_reset(afp);
+}
+
+/**
+ * fu740_phyregwrite() - Write one PHY register through the cr_para interface
+ * @phy: PHY selector; zero uses the PHY0 register set, non-zero the PHY1 set
+ * @addr: value written to the CR_PARA_ADDR register
+ * @wrdata: value written to the CR_PARA_WR_DATA register
+ * @afp: FU740 PCIe driver state
+ *
+ * Raises CR_PARA_WR_EN and polls CR_PARA_ACK until it reads non-zero, then
+ * drops CR_PARA_WR_EN and polls until CR_PARA_ACK reads zero again. Each poll
+ * samples every 10 us and gives up after 5000 us; a timeout only logs a
+ * warning and the sequence carries on.
+ */
+static void fu740_phyregwrite(const uint8_t phy, const uint16_t addr,
+                             const uint16_t wrdata, struct fu740_pcie *afp)
+{
+       void __iomem *mgmt = afp->mgmt_base;
+       void __iomem *reg_addr, *reg_wr_data, *reg_wr_en, *reg_ack;
+       int ack, err;
+
+       /* Pick the register bank that belongs to the requested PHY */
+       reg_addr = mgmt + (phy ? PCIEX8MGMT_PHY1_CR_PARA_ADDR :
+                                PCIEX8MGMT_PHY0_CR_PARA_ADDR);
+       reg_wr_data = mgmt + (phy ? PCIEX8MGMT_PHY1_CR_PARA_WR_DATA :
+                                   PCIEX8MGMT_PHY0_CR_PARA_WR_DATA);
+       reg_wr_en = mgmt + (phy ? PCIEX8MGMT_PHY1_CR_PARA_WR_EN :
+                                 PCIEX8MGMT_PHY0_CR_PARA_WR_EN);
+       reg_ack = mgmt + (phy ? PCIEX8MGMT_PHY1_CR_PARA_ACK :
+                               PCIEX8MGMT_PHY0_CR_PARA_ACK);
+
+       /* Latch address and data, then start the write */
+       writel_relaxed(addr, reg_addr);
+       writel_relaxed(wrdata, reg_wr_data);
+       writel_relaxed(1, reg_wr_en);
+
+       /* The PHY acknowledges by raising ACK */
+       err = readl_poll_timeout(reg_ack, ack, ack, 10, 5000);
+       if (err)
+               dev_warn(afp->pci.dev, "Wait for wait_idle state failed!\n");
+
+       /* End the write strobe */
+       writel_relaxed(0, reg_wr_en);
+
+       /* ...and wait for ACK to drop again */
+       err = readl_poll_timeout(reg_ack, ack, !ack, 10, 5000);
+       if (err)
+               dev_warn(afp->pci.dev, "Wait for !wait_idle state failed!\n");
+}
+
+/**
+ * fu740_pcie_init_phy() - Program the init value into every PHY lane
+ * @afp: FU740 PCIe driver state
+ *
+ * Selects the cr_para interface of both PHYs, then writes
+ * PCIEX8MGMT_PHY_INIT_VAL to lanes 0-3 of PHY0 followed by lanes 0-3 of PHY1.
+ */
+static void fu740_pcie_init_phy(struct fu740_pcie *afp)
+{
+       unsigned int phy, lane;
+
+       /* Route register access for both PHYs through cr_para */
+       writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_PHY0_CR_PARA_SEL);
+       writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_PHY1_CR_PARA_SEL);
+
+       /*
+        * Give the interface 10 cr_para cycles before touching any
+        * register behind it.
+        */
+       ndelay(10);
+
+       /* Set PHY AC termination mode, lane by lane, PHY0 before PHY1 */
+       for (phy = 0; phy < 2; phy++) {
+               for (lane = 0; lane < 4; lane++)
+                       fu740_phyregwrite(phy,
+                                         PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 +
+                                         PCIEX8MGMT_PHY_LANE_OFF * lane,
+                                         PCIEX8MGMT_PHY_INIT_VAL, afp);
+       }
+}
+
+/**
+ * fu740_pcie_start_link() - dw_pcie_ops.start_link callback
+ * @pci: DesignWare PCIe core state of this controller
+ *
+ * Writes 1 to the APP_LTSSM_ENABLE management register.
+ *
+ * Return: always 0.
+ */
+static int fu740_pcie_start_link(struct dw_pcie *pci)
+{
+       struct fu740_pcie *afp = to_fu740_pcie(pci);
+       void __iomem *ltssm_enable = afp->mgmt_base + PCIEX8MGMT_APP_LTSSM_ENABLE;
+
+       /* Enable LTSSM */
+       writel_relaxed(0x1, ltssm_enable);
+
+       return 0;
+}
+
+/**
+ * fu740_pcie_host_init() - dw_pcie_host_ops.host_init callback
+ * @pp: DesignWare root port embedded in this controller's dw_pcie state
+ *
+ * Runs the power-on reset sequence, programs the PHY lanes over cr_para
+ * while hold_phy_rst is asserted, and finally selects RC mode.
+ *
+ * Return: 0 on success, otherwise the negative errno of the clock or reset
+ * operation that failed. pcie_aux is not left enabled on a failure path.
+ */
+static int fu740_pcie_host_init(struct pcie_port *pp)
+{
+       struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+       struct fu740_pcie *afp = to_fu740_pcie(pci);
+       struct device *dev = pci->dev;
+       int ret;
+
+       /* Power on reset */
+       fu740_pcie_drive_reset(afp);
+
+       /* Enable pcieauxclk */
+       ret = clk_prepare_enable(afp->pcie_aux);
+       if (ret) {
+               dev_err(dev, "unable to enable pcie_aux clock\n");
+               return ret;
+       }
+
+       /*
+        * Assert hold_phy_rst (hold the controller LTSSM in reset after
+        * power_up_rst_n for register programming with cr_para)
+        */
+       writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_APP_HOLD_PHY_RST);
+
+       /* Deassert power_up_rst_n */
+       ret = reset_control_deassert(afp->rst);
+       if (ret) {
+               dev_err(dev, "unable to deassert pcie_power_up_rst_n\n");
+               /* Balance the clk_prepare_enable() above before bailing out */
+               clk_disable_unprepare(afp->pcie_aux);
+               return ret;
+       }
+
+       fu740_pcie_init_phy(afp);
+
+       /* Disable pcieauxclk */
+       clk_disable_unprepare(afp->pcie_aux);
+       /* Clear hold_phy_rst */
+       writel_relaxed(0x0, afp->mgmt_base + PCIEX8MGMT_APP_HOLD_PHY_RST);
+       /* Enable pcieauxclk */
+       ret = clk_prepare_enable(afp->pcie_aux);
+       if (ret) {
+               /* The clock is off at this point, so there is nothing to undo */
+               dev_err(dev, "unable to re-enable pcie_aux clock\n");
+               return ret;
+       }
+       /* Set RC mode */
+       writel_relaxed(0x4, afp->mgmt_base + PCIEX8MGMT_DEVICE_TYPE);
+
+       return 0;
+}
+
+/* Host callbacks installed into pci->pp.ops by fu740_pcie_probe() */
+static const struct dw_pcie_host_ops fu740_pcie_host_ops = {
+       .host_init = fu740_pcie_host_init,
+};
+
+/* Core callbacks installed into pci->ops by fu740_pcie_probe() */
+static const struct dw_pcie_ops dw_pcie_ops = {
+       .start_link = fu740_pcie_start_link,
+};
+
+/**
+ * fu740_pcie_probe() - Bind the driver to a "sifive,fu740-pcie" device
+ * @pdev: platform device being probed
+ *
+ * Allocates the driver state, maps the "mgmt" region, looks up the optional
+ * reset and power-enable GPIOs, the "pcie_aux" clock and the reset control,
+ * then hands over to dw_pcie_host_init().
+ *
+ * Return: 0 on success or a negative errno; every acquired resource is
+ * device-managed.
+ */
+static int fu740_pcie_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct dw_pcie *pci;
+       struct fu740_pcie *afp;
+
+       afp = devm_kzalloc(dev, sizeof(*afp), GFP_KERNEL);
+       if (!afp)
+               return -ENOMEM;
+       pci = &afp->pci;
+       pci->dev = dev;
+       pci->ops = &dw_pcie_ops;
+       pci->pp.ops = &fu740_pcie_host_ops;
+
+       /* SiFive specific region: mgmt */
+       afp->mgmt_base = devm_platform_ioremap_resource_byname(pdev, "mgmt");
+       if (IS_ERR(afp->mgmt_base))
+               return PTR_ERR(afp->mgmt_base);
+
+       /*
+        * Fetch GPIOs. The con_id is the bare function name: gpiolib appends
+        * the "-gpios" suffix itself, so passing "reset-gpios" would look for
+        * a "reset-gpios-gpios" property and the optional lookup would
+        * silently return NULL.
+        */
+       afp->reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+       if (IS_ERR(afp->reset))
+               return dev_err_probe(dev, PTR_ERR(afp->reset), "unable to get reset-gpios\n");
+
+       afp->pwren = devm_gpiod_get_optional(dev, "pwren", GPIOD_OUT_LOW);
+       if (IS_ERR(afp->pwren))
+               return dev_err_probe(dev, PTR_ERR(afp->pwren), "unable to get pwren-gpios\n");
+
+       /* Fetch clocks */
+       afp->pcie_aux = devm_clk_get(dev, "pcie_aux");
+       if (IS_ERR(afp->pcie_aux))
+               return dev_err_probe(dev, PTR_ERR(afp->pcie_aux),
+                                    "pcie_aux clock source missing or invalid\n");
+
+       /* Fetch reset */
+       afp->rst = devm_reset_control_get_exclusive(dev, NULL);
+       if (IS_ERR(afp->rst))
+               return dev_err_probe(dev, PTR_ERR(afp->rst), "unable to get reset\n");
+
+       /* host_init reaches the state through dev_get_drvdata() */
+       platform_set_drvdata(pdev, afp);
+
+       return dw_pcie_host_init(&pci->pp);
+}
+
+/**
+ * fu740_pcie_shutdown() - Platform shutdown hook
+ * @pdev: platform device being shut down
+ *
+ * Asserts PERST_N so that the link is down and a bootloader starts from a
+ * clean state after a reboot.
+ */
+static void fu740_pcie_shutdown(struct platform_device *pdev)
+{
+       fu740_pcie_assert_reset(platform_get_drvdata(pdev));
+}
+
+/* Device-tree compatibles handled by this driver; the empty entry ends the table */
+static const struct of_device_id fu740_pcie_of_match[] = {
+       { .compatible = "sifive,fu740-pcie", },
+       {},
+};
+
+/*
+ * Platform driver glue. It is registered with builtin_platform_driver(),
+ * provides no .remove callback and sets suppress_bind_attrs.
+ */
+static struct platform_driver fu740_pcie_driver = {
+       .driver = {
+                  .name = "fu740-pcie",
+                  .of_match_table = fu740_pcie_of_match,
+                  .suppress_bind_attrs = true,
+       },
+       .probe = fu740_pcie_probe,
+       .shutdown = fu740_pcie_shutdown,
+};
+
+builtin_platform_driver(fu740_pcie_driver);
index 0cedd1f..f89a7d2 100644 (file)
@@ -81,11 +81,6 @@ static void pcie_update_bits(void __iomem *base, u32 ofs, u32 mask, u32 val)
                writel(val, base + ofs);
 }
 
-static inline u32 pcie_app_rd(struct intel_pcie_port *lpp, u32 ofs)
-{
-       return readl(lpp->app_base + ofs);
-}
-
 static inline void pcie_app_wr(struct intel_pcie_port *lpp, u32 ofs, u32 val)
 {
        writel(val, lpp->app_base + ofs);
index 6fa216e..bafd2c6 100644 (file)
@@ -22,6 +22,8 @@
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
 #include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include <linux/pci-ecam.h>
 #include <linux/phy/phy.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
@@ -311,6 +313,104 @@ struct tegra_pcie_dw_of_data {
        enum dw_pcie_device_mode mode;
 };
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS)
+/**
+ * struct tegra194_pcie_ecam - Per-window state for the ACPI/ECAM quirk path
+ * @config_base: start of the config window (cfg->win)
+ * @iatu_base: iATU registers, at cfg->win + SZ_256K
+ * @dbi_base: DBI registers, at cfg->win + SZ_512K
+ */
+struct tegra194_pcie_ecam  {
+       void __iomem *config_base;
+       void __iomem *iatu_base;
+       void __iomem *dbi_base;
+};
+
+/**
+ * tegra194_acpi_init() - pci_ecam_ops.init callback
+ * @cfg: config window being set up
+ *
+ * Allocates a device-managed tegra194_pcie_ecam, derives the config, iATU and
+ * DBI bases from cfg->win and stores the result in cfg->priv.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails.
+ */
+static int tegra194_acpi_init(struct pci_config_window *cfg)
+{
+       struct tegra194_pcie_ecam *ecam;
+
+       ecam = devm_kzalloc(cfg->parent, sizeof(*ecam), GFP_KERNEL);
+       if (!ecam)
+               return -ENOMEM;
+
+       /* All three regions live at fixed offsets inside the one window */
+       ecam->dbi_base = cfg->win + SZ_512K;
+       ecam->iatu_base = cfg->win + SZ_256K;
+       ecam->config_base = cfg->win;
+
+       cfg->priv = ecam;
+
+       return 0;
+}
+
+/**
+ * atu_reg_write() - Write one register of an outbound iATU region
+ * @pcie_ecam: ECAM quirk state holding the iATU base
+ * @index: outbound region index
+ * @val: value to write
+ * @reg: register offset inside the region
+ */
+static void atu_reg_write(struct tegra194_pcie_ecam *pcie_ecam, int index,
+                         u32 val, u32 reg)
+{
+       void __iomem *region = pcie_ecam->iatu_base +
+                              PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index);
+
+       writel(val, region + reg);
+}
+
+/**
+ * program_outbound_atu() - Program and enable one outbound iATU region
+ * @pcie_ecam: ECAM quirk state holding the iATU base
+ * @index: outbound region index
+ * @type: value written to PCIE_ATU_CR1
+ * @cpu_addr: CPU-side base address of the region
+ * @pci_addr: PCI-side target address
+ * @size: region size in bytes
+ *
+ * Only the low 32 bits of the last address (cpu_addr + size - 1) are written
+ * to the limit register. PCIE_ATU_ENABLE is written last, after the rest of
+ * the region has been programmed.
+ */
+static void program_outbound_atu(struct tegra194_pcie_ecam *pcie_ecam,
+                                int index, int type, u64 cpu_addr,
+                                u64 pci_addr, u64 size)
+{
+       atu_reg_write(pcie_ecam, index, lower_32_bits(cpu_addr),
+                     PCIE_ATU_LOWER_BASE);
+       atu_reg_write(pcie_ecam, index, upper_32_bits(cpu_addr),
+                     PCIE_ATU_UPPER_BASE);
+       atu_reg_write(pcie_ecam, index, lower_32_bits(pci_addr),
+                     PCIE_ATU_LOWER_TARGET);
+       atu_reg_write(pcie_ecam, index, lower_32_bits(cpu_addr + size - 1),
+                     PCIE_ATU_LIMIT);
+       atu_reg_write(pcie_ecam, index, upper_32_bits(pci_addr),
+                     PCIE_ATU_UPPER_TARGET);
+       atu_reg_write(pcie_ecam, index, type, PCIE_ATU_CR1);
+       atu_reg_write(pcie_ecam, index, PCIE_ATU_ENABLE, PCIE_ATU_CR2);
+}
+
+/**
+ * tegra194_map_bus() - pci_ops.map_bus callback for the ACPI/ECAM quirk
+ * @bus: bus the access targets
+ * @devfn: encoded device and function number
+ * @where: register offset in config space
+ *
+ * Accesses to the first bus of the window go straight to the DBI registers
+ * and are only valid for slot 0. Every other bus is reached by reprogramming
+ * outbound iATU region 0 for a CFG0 access (bus directly below the first
+ * bus, slot 0 only) or a CFG1 access (anything deeper).
+ *
+ * Return: address to access, or NULL when the target is not addressable.
+ */
+static void __iomem *tegra194_map_bus(struct pci_bus *bus,
+                                     unsigned int devfn, int where)
+{
+       struct pci_config_window *cfg = bus->sysdata;
+       struct tegra194_pcie_ecam *ecam = cfg->priv;
+       unsigned int slot = PCI_SLOT(devfn);
+       u32 target;
+       int atu_type;
+
+       /* Outside the bus range served by this window */
+       if (bus->number < cfg->busr.start || bus->number > cfg->busr.end)
+               return NULL;
+
+       /* First bus: served from DBI, slot 0 only */
+       if (bus->number == cfg->busr.start) {
+               if (slot != 0)
+                       return NULL;
+
+               return ecam->dbi_base + where;
+       }
+
+       if (bus->parent->number != cfg->busr.start)
+               atu_type = PCIE_ATU_TYPE_CFG1;
+       else if (slot == 0)
+               atu_type = PCIE_ATU_TYPE_CFG0;
+       else
+               return NULL;
+
+       target = PCIE_ATU_BUS(bus->number) | PCIE_ATU_DEV(slot) |
+                PCIE_ATU_FUNC(PCI_FUNC(devfn));
+
+       program_outbound_atu(ecam, 0, atu_type, cfg->res.start, target,
+                            SZ_256K);
+
+       return ecam->config_base + where;
+}
+
+/*
+ * ECAM ops for the ACPI quirk path. They are non-static and built only when
+ * both CONFIG_ACPI and CONFIG_PCI_QUIRKS are set. Config accesses use the
+ * generic accessors on top of tegra194_map_bus().
+ */
+const struct pci_ecam_ops tegra194_pcie_ops = {
+       .init           = tegra194_acpi_init,
+       .pci_ops        = {
+               .map_bus        = tegra194_map_bus,
+               .read           = pci_generic_config_read,
+               .write          = pci_generic_config_write,
+       }
+};
+#endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */
+
+#ifdef CONFIG_PCIE_TEGRA194
+
 static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci)
 {
        return container_of(pci, struct tegra_pcie_dw, pci);
@@ -1019,7 +1119,7 @@ static const struct dw_pcie_ops tegra_dw_pcie_ops = {
        .stop_link = tegra_pcie_dw_stop_link,
 };
 
-static struct dw_pcie_host_ops tegra_pcie_dw_host_ops = {
+static const struct dw_pcie_host_ops tegra_pcie_dw_host_ops = {
        .host_init = tegra_pcie_dw_host_init,
 };
 
@@ -1645,7 +1745,7 @@ static void pex_ep_event_pex_rst_deassert(struct tegra_pcie_dw *pcie)
        if (pcie->ep_state == EP_STATE_ENABLED)
                return;
 
-       ret = pm_runtime_get_sync(dev);
+       ret = pm_runtime_resume_and_get(dev);
        if (ret < 0) {
                dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n",
                        ret);
@@ -1881,7 +1981,7 @@ tegra_pcie_ep_get_features(struct dw_pcie_ep *ep)
        return &tegra_pcie_epc_features;
 }
 
-static struct dw_pcie_ep_ops pcie_ep_ops = {
+static const struct dw_pcie_ep_ops pcie_ep_ops = {
        .raise_irq = tegra_pcie_ep_raise_irq,
        .get_features = tegra_pcie_ep_get_features,
 };
@@ -2311,3 +2411,5 @@ MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match);
 MODULE_AUTHOR("Vidya Sagar <vidyas@nvidia.com>");
 MODULE_DESCRIPTION("NVIDIA PCIe host controller driver");
 MODULE_LICENSE("GPL v2");
+
+#endif /* CONFIG_PCIE_TEGRA194 */
index a62d247..e4643fb 100644 (file)
@@ -24,8 +24,7 @@ config PCIE_MOBIVEIL_PLAT
 
 config PCIE_LAYERSCAPE_GEN4
        bool "Freescale Layerscape PCIe Gen4 controller"
-       depends on PCI
-       depends on OF && (ARM64 || ARCH_LAYERSCAPE)
+       depends on ARCH_LAYERSCAPE || COMPILE_TEST
        depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_MOBIVEIL_HOST
        help
index 6ab694f..d3924a4 100644 (file)
@@ -79,6 +79,7 @@ int pci_host_common_probe(struct platform_device *pdev)
 
        bridge->sysdata = cfg;
        bridge->ops = (struct pci_ops *)&ops->pci_ops;
+       bridge->msi_domain = true;
 
        return pci_host_probe(bridge);
 }
index 1ff4ce2..6511648 100644 (file)
@@ -473,7 +473,6 @@ struct hv_pcibus_device {
        struct list_head dr_list;
 
        struct msi_domain_info msi_info;
-       struct msi_controller msi_chip;
        struct irq_domain *irq_domain;
 
        spinlock_t retarget_msi_interrupt_lock;
@@ -1866,9 +1865,6 @@ static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
        if (!hbus->pci_bus)
                return -ENODEV;
 
-       hbus->pci_bus->msi = &hbus->msi_chip;
-       hbus->pci_bus->msi->dev = &hbus->hdev->device;
-
        pci_lock_rescan_remove();
        pci_scan_child_bus(hbus->pci_bus);
        hv_pci_assign_numa_node(hbus);
index 8fcabed..8069bd9 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/iopoll.h>
 #include <linux/irq.h>
+#include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #define AFI_MSI_FPCI_BAR_ST    0x64
 #define AFI_MSI_AXI_BAR_ST     0x68
 
-#define AFI_MSI_VEC0           0x6c
-#define AFI_MSI_VEC1           0x70
-#define AFI_MSI_VEC2           0x74
-#define AFI_MSI_VEC3           0x78
-#define AFI_MSI_VEC4           0x7c
-#define AFI_MSI_VEC5           0x80
-#define AFI_MSI_VEC6           0x84
-#define AFI_MSI_VEC7           0x88
-
-#define AFI_MSI_EN_VEC0                0x8c
-#define AFI_MSI_EN_VEC1                0x90
-#define AFI_MSI_EN_VEC2                0x94
-#define AFI_MSI_EN_VEC3                0x98
-#define AFI_MSI_EN_VEC4                0x9c
-#define AFI_MSI_EN_VEC5                0xa0
-#define AFI_MSI_EN_VEC6                0xa4
-#define AFI_MSI_EN_VEC7                0xa8
+#define AFI_MSI_VEC(x)         (0x6c + ((x) * 4))
+#define AFI_MSI_EN_VEC(x)      (0x8c + ((x) * 4))
 
 #define AFI_CONFIGURATION              0xac
 #define  AFI_CONFIGURATION_EN_FPCI             (1 << 0)
 #define LINK_RETRAIN_TIMEOUT 100000 /* in usec */
 
 struct tegra_msi {
-       struct msi_controller chip;
        DECLARE_BITMAP(used, INT_PCI_MSI_NR);
        struct irq_domain *domain;
-       struct mutex lock;
+       struct mutex map_lock;
+       spinlock_t mask_lock;
        void *virt;
        dma_addr_t phys;
        int irq;
@@ -333,11 +319,6 @@ struct tegra_pcie_soc {
        } ectl;
 };
 
-static inline struct tegra_msi *to_tegra_msi(struct msi_controller *chip)
-{
-       return container_of(chip, struct tegra_msi, chip);
-}
-
 struct tegra_pcie {
        struct device *dev;
 
@@ -372,6 +353,11 @@ struct tegra_pcie {
        struct dentry *debugfs;
 };
 
+/* Recover the tegra_pcie that embeds @msi as its "msi" member */
+static inline struct tegra_pcie *msi_to_pcie(struct tegra_msi *msi)
+{
+       return container_of(msi, struct tegra_pcie, msi);
+}
+
 struct tegra_pcie_port {
        struct tegra_pcie *pcie;
        struct device_node *np;
@@ -1432,7 +1418,6 @@ static void tegra_pcie_phys_put(struct tegra_pcie *pcie)
        }
 }
 
-
 static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 {
        struct device *dev = pcie->dev;
@@ -1509,6 +1494,7 @@ static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 phys_put:
        if (soc->program_uphy)
                tegra_pcie_phys_put(pcie);
+
        return err;
 }
 
@@ -1551,161 +1537,227 @@ static void tegra_pcie_pme_turnoff(struct tegra_pcie_port *port)
        afi_writel(pcie, val, AFI_PCIE_PME);
 }
 
-static int tegra_msi_alloc(struct tegra_msi *chip)
-{
-       int msi;
-
-       mutex_lock(&chip->lock);
-
-       msi = find_first_zero_bit(chip->used, INT_PCI_MSI_NR);
-       if (msi < INT_PCI_MSI_NR)
-               set_bit(msi, chip->used);
-       else
-               msi = -ENOSPC;
-
-       mutex_unlock(&chip->lock);
-
-       return msi;
-}
-
-static void tegra_msi_free(struct tegra_msi *chip, unsigned long irq)
+/**
+ * tegra_pcie_msi_irq() - Chained handler demultiplexing the AFI MSI vectors
+ * @desc: descriptor of the upstream MSI interrupt
+ *
+ * Walks the eight 32-bit AFI_MSI_VEC registers and forwards each pending bit
+ * to the interrupt mapped in the parent of msi->domain. A pending bit without
+ * a mapping is acknowledged here, in the register it was read from, because
+ * no irq_chip callback will do it; otherwise the re-read below would see the
+ * bit forever.
+ */
+static void tegra_pcie_msi_irq(struct irq_desc *desc)
 {
-       struct device *dev = chip->chip.dev;
-
-       mutex_lock(&chip->lock);
-
-       if (!test_bit(irq, chip->used))
-               dev_err(dev, "trying to free unused MSI#%lu\n", irq);
-       else
-               clear_bit(irq, chip->used);
-
-       mutex_unlock(&chip->lock);
-}
-
-static irqreturn_t tegra_pcie_msi_irq(int irq, void *data)
-{
-       struct tegra_pcie *pcie = data;
-       struct device *dev = pcie->dev;
+       struct tegra_pcie *pcie = irq_desc_get_handler_data(desc);
+       struct irq_chip *chip = irq_desc_get_chip(desc);
        struct tegra_msi *msi = &pcie->msi;
-       unsigned int i, processed = 0;
+       struct device *dev = pcie->dev;
+       unsigned int i;
+
+       chained_irq_enter(chip, desc);
 
        for (i = 0; i < 8; i++) {
-               unsigned long reg = afi_readl(pcie, AFI_MSI_VEC0 + i * 4);
+               unsigned long reg = afi_readl(pcie, AFI_MSI_VEC(i));
 
                while (reg) {
                        unsigned int offset = find_first_bit(&reg, 32);
                        unsigned int index = i * 32 + offset;
                        unsigned int irq;
 
-                       /* clear the interrupt */
-                       afi_writel(pcie, 1 << offset, AFI_MSI_VEC0 + i * 4);
-
-                       irq = irq_find_mapping(msi->domain, index);
+                       irq = irq_find_mapping(msi->domain->parent, index);
                        if (irq) {
-                               if (test_bit(index, msi->used))
-                                       generic_handle_irq(irq);
-                               else
-                                       dev_info(dev, "unhandled MSI\n");
+                               generic_handle_irq(irq);
                        } else {
                                /*
                                 * that's weird who triggered this?
                                 * just clear it
                                 */
                                dev_info(dev, "unexpected MSI\n");
+                               afi_writel(pcie, BIT(offset), AFI_MSI_VEC(i));
                        }
 
                        /* see if there's any more pending in this vector */
-                       reg = afi_readl(pcie, AFI_MSI_VEC0 + i * 4);
-
-                       processed++;
+                       reg = afi_readl(pcie, AFI_MSI_VEC(i));
                }
        }
 
-       return processed > 0 ? IRQ_HANDLED : IRQ_NONE;
+       chained_irq_exit(chip, desc);
 }
 
-static int tegra_msi_setup_irq(struct msi_controller *chip,
-                              struct pci_dev *pdev, struct msi_desc *desc)
+/* Top-level (PCI MSI) chip: acknowledging is delegated to the parent chip */
+static void tegra_msi_top_irq_ack(struct irq_data *d)
 {
-       struct tegra_msi *msi = to_tegra_msi(chip);
-       struct msi_msg msg;
-       unsigned int irq;
-       int hwirq;
+       irq_chip_ack_parent(d);
+}
 
-       hwirq = tegra_msi_alloc(msi);
-       if (hwirq < 0)
-               return hwirq;
+/* Mask via pci_msi_mask_irq() first, then in the parent chip */
+static void tegra_msi_top_irq_mask(struct irq_data *d)
+{
+       pci_msi_mask_irq(d);
+       irq_chip_mask_parent(d);
+}
 
-       irq = irq_create_mapping(msi->domain, hwirq);
-       if (!irq) {
-               tegra_msi_free(msi, hwirq);
-               return -EINVAL;
-       }
+/* Unmask via pci_msi_unmask_irq() first, then in the parent chip */
+static void tegra_msi_top_irq_unmask(struct irq_data *d)
+{
+       pci_msi_unmask_irq(d);
+       irq_chip_unmask_parent(d);
+}
+
+/* irq_chip whose callbacks forward ack/mask/unmask to the parent chip */
+static struct irq_chip tegra_msi_top_chip = {
+       .name           = "Tegra PCIe MSI",
+       .irq_ack        = tegra_msi_top_irq_ack,
+       .irq_mask       = tegra_msi_top_irq_mask,
+       .irq_unmask     = tegra_msi_top_irq_unmask,
+};
 
-       irq_set_msi_desc(irq, desc);
+/*
+ * Acknowledge one MSI by writing its bit to the AFI_MSI_VEC register that
+ * holds it: hwirq / 32 selects the register, hwirq % 32 the bit.
+ */
+static void tegra_msi_irq_ack(struct irq_data *d)
+{
+       struct tegra_msi *msi = irq_data_get_irq_chip_data(d);
+       struct tegra_pcie *pcie = msi_to_pcie(msi);
+       unsigned int index = d->hwirq / 32;
 
-       msg.address_lo = lower_32_bits(msi->phys);
-       msg.address_hi = upper_32_bits(msi->phys);
-       msg.data = hwirq;
+       /* clear the interrupt */
+       afi_writel(pcie, BIT(d->hwirq % 32), AFI_MSI_VEC(index));
+}
 
-       pci_write_msi_msg(irq, &msg);
+/*
+ * Mask one MSI by clearing its bit in the matching AFI_MSI_EN_VEC register.
+ * The read-modify-write runs under mask_lock with interrupts disabled.
+ */
+static void tegra_msi_irq_mask(struct irq_data *d)
+{
+       struct tegra_msi *msi = irq_data_get_irq_chip_data(d);
+       struct tegra_pcie *pcie = msi_to_pcie(msi);
+       unsigned int index = d->hwirq / 32;
+       unsigned long flags;
+       u32 value;
 
-       return 0;
+       spin_lock_irqsave(&msi->mask_lock, flags);
+       value = afi_readl(pcie, AFI_MSI_EN_VEC(index));
+       value &= ~BIT(d->hwirq % 32);
+       afi_writel(pcie, value, AFI_MSI_EN_VEC(index));
+       spin_unlock_irqrestore(&msi->mask_lock, flags);
 }
 
-static void tegra_msi_teardown_irq(struct msi_controller *chip,
-                                  unsigned int irq)
+/*
+ * Unmask one MSI by setting its bit in the matching AFI_MSI_EN_VEC register.
+ * The read-modify-write runs under mask_lock with interrupts disabled.
+ */
+static void tegra_msi_irq_unmask(struct irq_data *d)
 {
-       struct tegra_msi *msi = to_tegra_msi(chip);
-       struct irq_data *d = irq_get_irq_data(irq);
-       irq_hw_number_t hwirq = irqd_to_hwirq(d);
+       struct tegra_msi *msi = irq_data_get_irq_chip_data(d);
+       struct tegra_pcie *pcie = msi_to_pcie(msi);
+       unsigned int index = d->hwirq / 32;
+       unsigned long flags;
+       u32 value;
 
-       irq_dispose_mapping(irq);
-       tegra_msi_free(msi, hwirq);
+       spin_lock_irqsave(&msi->mask_lock, flags);
+       value = afi_readl(pcie, AFI_MSI_EN_VEC(index));
+       value |= BIT(d->hwirq % 32);
+       afi_writel(pcie, value, AFI_MSI_EN_VEC(index));
+       spin_unlock_irqrestore(&msi->mask_lock, flags);
 }
 
-static struct irq_chip tegra_msi_irq_chip = {
-       .name = "Tegra PCIe MSI",
-       .irq_enable = pci_msi_unmask_irq,
-       .irq_disable = pci_msi_mask_irq,
-       .irq_mask = pci_msi_mask_irq,
-       .irq_unmask = pci_msi_unmask_irq,
+/* Affinity changes are not supported: every request fails with -EINVAL */
+static int tegra_msi_set_affinity(struct irq_data *d, const struct cpumask *mask, bool force)
+{
+       return -EINVAL;
+}
+
+/*
+ * Fill in the MSI message for one interrupt: the target address is msi->phys
+ * and the payload is the hardware interrupt number.
+ */
+static void tegra_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       const struct tegra_msi *ctrl = irq_data_get_irq_chip_data(data);
+
+       msg->data = data->hwirq;
+       msg->address_hi = upper_32_bits(ctrl->phys);
+       msg->address_lo = lower_32_bits(ctrl->phys);
+}
+
+/*
+ * irq_chip installed on each interrupt by tegra_msi_domain_alloc(); its
+ * callbacks program the AFI MSI registers directly.
+ */
+static struct irq_chip tegra_msi_bottom_chip = {
+       .name                   = "Tegra MSI",
+       .irq_ack                = tegra_msi_irq_ack,
+       .irq_mask               = tegra_msi_irq_mask,
+       .irq_unmask             = tegra_msi_irq_unmask,
+       .irq_set_affinity       = tegra_msi_set_affinity,
+       .irq_compose_msi_msg    = tegra_compose_msi_msg,
 };
 
-static int tegra_msi_map(struct irq_domain *domain, unsigned int irq,
-                        irq_hw_number_t hwirq)
+static int tegra_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs, void *args)
 {
-       irq_set_chip_and_handler(irq, &tegra_msi_irq_chip, handle_simple_irq);
-       irq_set_chip_data(irq, domain->host_data);
+       struct tegra_msi *msi = domain->host_data;
+       unsigned int i;
+       int hwirq;
+
+       mutex_lock(&msi->map_lock);
+
+       hwirq = bitmap_find_free_region(msi->used, INT_PCI_MSI_NR, order_base_2(nr_irqs));
+
+       mutex_unlock(&msi->map_lock);
+
+       if (hwirq < 0)
+               return -ENOSPC;
+
+       for (i = 0; i < nr_irqs; i++)
+               irq_domain_set_info(domain, virq + i, hwirq + i,
+                                   &tegra_msi_bottom_chip, domain->host_data,
+                                   handle_edge_irq, NULL, NULL);
 
        tegra_cpuidle_pcie_irqs_in_use();
 
        return 0;
 }
 
-static const struct irq_domain_ops msi_domain_ops = {
-       .map = tegra_msi_map,
+/*
+ * Give the hwirq region that starts at @virq's hardware IRQ back to the
+ * msi->used bitmap. The bitmap is only touched with map_lock held.
+ */
+static void tegra_msi_domain_free(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs)
+{
+       struct tegra_msi *msi = domain->host_data;
+       struct irq_data *first = irq_domain_get_irq_data(domain, virq);
+       int order = order_base_2(nr_irqs);
+
+       mutex_lock(&msi->map_lock);
+       bitmap_release_region(msi->used, first->hwirq, order);
+       mutex_unlock(&msi->map_lock);
+}
+
+/* Callbacks of the linear domain created in tegra_allocate_domains() */
+static const struct irq_domain_ops tegra_msi_domain_ops = {
+       .free   = tegra_msi_domain_free,
+       .alloc  = tegra_msi_domain_alloc,
+};
+
+/* Description of the PCI/MSI domain created by tegra_allocate_domains() */
+static struct msi_domain_info tegra_msi_info = {
+       .chip   = &tegra_msi_top_chip,
+       .flags  = MSI_FLAG_USE_DEF_DOM_OPS |
+                 MSI_FLAG_USE_DEF_CHIP_OPS |
+                 MSI_FLAG_PCI_MSIX,
+};
 
+/*
+ * Create the two stacked IRQ domains used for MSI: a linear domain with
+ * INT_PCI_MSI_NR entries, and a PCI/MSI domain on top of it that is stored
+ * in msi->domain.
+ *
+ * Returns 0 on success or -ENOMEM if either domain cannot be created; the
+ * linear domain is removed again if creating the PCI/MSI domain fails.
+ */
+static int tegra_allocate_domains(struct tegra_msi *msi)
+{
+       struct tegra_pcie *pcie = msi_to_pcie(msi);
+       struct fwnode_handle *fwnode = dev_fwnode(pcie->dev);
+       struct irq_domain *inner;
+
+       inner = irq_domain_create_linear(fwnode, INT_PCI_MSI_NR,
+                                        &tegra_msi_domain_ops, msi);
+       if (!inner) {
+               dev_err(pcie->dev, "failed to create IRQ domain\n");
+               return -ENOMEM;
+       }
+       irq_domain_update_bus_token(inner, DOMAIN_BUS_NEXUS);
+
+       msi->domain = pci_msi_create_irq_domain(fwnode, &tegra_msi_info, inner);
+       if (msi->domain)
+               return 0;
+
+       dev_err(pcie->dev, "failed to create MSI domain\n");
+       irq_domain_remove(inner);
+
+       return -ENOMEM;
+}
+
+/*
+ * Remove msi->domain and then its parent domain. The parent pointer is read
+ * before the first removal.
+ */
+static void tegra_free_domains(struct tegra_msi *msi)
+{
+       struct irq_domain *top = msi->domain;
+       struct irq_domain *bottom = top->parent;
+
+       irq_domain_remove(top);
+       irq_domain_remove(bottom);
+}
+
 static int tegra_pcie_msi_setup(struct tegra_pcie *pcie)
 {
-       struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie);
        struct platform_device *pdev = to_platform_device(pcie->dev);
        struct tegra_msi *msi = &pcie->msi;
        struct device *dev = pcie->dev;
        int err;
 
-       mutex_init(&msi->lock);
-
-       msi->chip.dev = dev;
-       msi->chip.setup_irq = tegra_msi_setup_irq;
-       msi->chip.teardown_irq = tegra_msi_teardown_irq;
+       mutex_init(&msi->map_lock);
+       spin_lock_init(&msi->mask_lock);
 
-       msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR,
-                                           &msi_domain_ops, &msi->chip);
-       if (!msi->domain) {
-               dev_err(dev, "failed to create IRQ domain\n");
-               return -ENOMEM;
+       if (IS_ENABLED(CONFIG_PCI_MSI)) {
+               err = tegra_allocate_domains(msi);
+               if (err)
+                       return err;
        }
 
        err = platform_get_irq_byname(pdev, "msi");
@@ -1714,12 +1766,7 @@ static int tegra_pcie_msi_setup(struct tegra_pcie *pcie)
 
        msi->irq = err;
 
-       err = request_irq(msi->irq, tegra_pcie_msi_irq, IRQF_NO_THREAD,
-                         tegra_msi_irq_chip.name, pcie);
-       if (err < 0) {
-               dev_err(dev, "failed to request IRQ: %d\n", err);
-               goto free_irq_domain;
-       }
+       irq_set_chained_handler_and_data(msi->irq, tegra_pcie_msi_irq, pcie);
 
        /* Though the PCIe controller can address >32-bit address space, to
         * facilitate endpoints that support only 32-bit MSI target address,
@@ -1740,14 +1787,14 @@ static int tegra_pcie_msi_setup(struct tegra_pcie *pcie)
                goto free_irq;
        }
 
-       host->msi = &msi->chip;
-
        return 0;
 
 free_irq:
-       free_irq(msi->irq, pcie);
+       irq_set_chained_handler_and_data(msi->irq, NULL, NULL);
 free_irq_domain:
-       irq_domain_remove(msi->domain);
+       if (IS_ENABLED(CONFIG_PCI_MSI))
+               tegra_free_domains(msi);
+
        return err;
 }
 
@@ -1755,22 +1802,18 @@ static void tegra_pcie_enable_msi(struct tegra_pcie *pcie)
 {
        const struct tegra_pcie_soc *soc = pcie->soc;
        struct tegra_msi *msi = &pcie->msi;
-       u32 reg;
+       u32 reg, msi_state[INT_PCI_MSI_NR / 32];
+       int i;
 
        afi_writel(pcie, msi->phys >> soc->msi_base_shift, AFI_MSI_FPCI_BAR_ST);
        afi_writel(pcie, msi->phys, AFI_MSI_AXI_BAR_ST);
        /* this register is in 4K increments */
        afi_writel(pcie, 1, AFI_MSI_BAR_SZ);
 
-       /* enable all MSI vectors */
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC0);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC1);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC2);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC3);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC4);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC5);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC6);
-       afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC7);
+       /* Restore the MSI allocation state */
+       bitmap_to_arr32(msi_state, msi->used, INT_PCI_MSI_NR);
+       for (i = 0; i < ARRAY_SIZE(msi_state); i++)
+               afi_writel(pcie, msi_state[i], AFI_MSI_EN_VEC(i));
 
        /* and unmask the MSI interrupt */
        reg = afi_readl(pcie, AFI_INTR_MASK);
@@ -1786,16 +1829,16 @@ static void tegra_pcie_msi_teardown(struct tegra_pcie *pcie)
        dma_free_attrs(pcie->dev, PAGE_SIZE, msi->virt, msi->phys,
                       DMA_ATTR_NO_KERNEL_MAPPING);
 
-       if (msi->irq > 0)
-               free_irq(msi->irq, pcie);
-
        for (i = 0; i < INT_PCI_MSI_NR; i++) {
                irq = irq_find_mapping(msi->domain, i);
                if (irq > 0)
-                       irq_dispose_mapping(irq);
+                       irq_domain_free_irqs(irq, 1);
        }
 
-       irq_domain_remove(msi->domain);
+       irq_set_chained_handler_and_data(msi->irq, NULL, NULL);
+
+       if (IS_ENABLED(CONFIG_PCI_MSI))
+               tegra_free_domains(msi);
 }
 
 static int tegra_pcie_disable_msi(struct tegra_pcie *pcie)
@@ -1807,16 +1850,6 @@ static int tegra_pcie_disable_msi(struct tegra_pcie *pcie)
        value &= ~AFI_INTR_MASK_MSI_MASK;
        afi_writel(pcie, value, AFI_INTR_MASK);
 
-       /* disable all MSI vectors */
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC0);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC1);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC2);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC3);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC4);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC5);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC6);
-       afi_writel(pcie, 0, AFI_MSI_EN_VEC7);
-
        return 0;
 }
 
index f964fd2..ffd8465 100644 (file)
@@ -116,7 +116,7 @@ static int thunder_ecam_p2_config_read(struct pci_bus *bus, unsigned int devfn,
         * the config space access window.  Since we are working with
         * the high-order 32 bits, shift everything down by 32 bits.
         */
-       node_bits = (cfg->res.start >> 32) & (1 << 12);
+       node_bits = upper_32_bits(cfg->res.start) & (1 << 12);
 
        v |= node_bits;
        set_val(v, where, size, val);
index 1a3f70a..0660b9d 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/pci-acpi.h>
 #include <linux/pci-ecam.h>
 #include <linux/platform_device.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include "../pci.h"
 
 #if defined(CONFIG_PCI_HOST_THUNDER_PEM) || (defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS))
@@ -324,9 +325,9 @@ static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg,
         * structure here for the BAR.
         */
        bar4_start = res_pem->start + 0xf00000;
-       pem_pci->ea_entry[0] = (u32)bar4_start | 2;
-       pem_pci->ea_entry[1] = (u32)(res_pem->end - bar4_start) & ~3u;
-       pem_pci->ea_entry[2] = (u32)(bar4_start >> 32);
+       pem_pci->ea_entry[0] = lower_32_bits(bar4_start) | 2;
+       pem_pci->ea_entry[1] = lower_32_bits(res_pem->end - bar4_start) & ~3u;
+       pem_pci->ea_entry[2] = upper_32_bits(bar4_start);
 
        cfg->priv = pem_pci;
        return 0;
@@ -334,9 +335,9 @@ static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg,
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS)
 
-#define PEM_RES_BASE           0x87e0c0000000UL
-#define PEM_NODE_MASK          GENMASK(45, 44)
-#define PEM_INDX_MASK          GENMASK(26, 24)
+#define PEM_RES_BASE           0x87e0c0000000ULL
+#define PEM_NODE_MASK          GENMASK_ULL(45, 44)
+#define PEM_INDX_MASK          GENMASK_ULL(26, 24)
 #define PEM_MIN_DOM_IN_NODE    4
 #define PEM_MAX_DOM_IN_NODE    10
 
index 2afdc86..7f503dd 100644 (file)
@@ -354,7 +354,8 @@ static int xgene_pcie_map_reg(struct xgene_pcie_port *port,
        if (IS_ERR(port->csr_base))
                return PTR_ERR(port->csr_base);
 
-       port->cfg_base = devm_platform_ioremap_resource_byname(pdev, "cfg");
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
+       port->cfg_base = devm_ioremap_resource(dev, res);
        if (IS_ERR(port->cfg_base))
                return PTR_ERR(port->cfg_base);
        port->cfg_addr = res->start;
index 42691dd..98aa1dc 100644 (file)
@@ -236,10 +236,8 @@ static int altera_msi_probe(struct platform_device *pdev)
        res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
                                           "vector_slave");
        msi->vector_base = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(msi->vector_base)) {
-               dev_err(&pdev->dev, "failed to map vector_slave memory\n");
+       if (IS_ERR(msi->vector_base))
                return PTR_ERR(msi->vector_base);
-       }
 
        msi->vector_phy = res->start;
 
index e330e68..08bc788 100644 (file)
@@ -1148,6 +1148,7 @@ static int brcm_pcie_suspend(struct device *dev)
 
        brcm_pcie_turn_off(pcie);
        ret = brcm_phy_stop(pcie);
+       reset_control_rearm(pcie->rescal);
        clk_disable_unprepare(pcie->clk);
 
        return ret;
@@ -1163,9 +1164,13 @@ static int brcm_pcie_resume(struct device *dev)
        base = pcie->base;
        clk_prepare_enable(pcie->clk);
 
+       ret = reset_control_reset(pcie->rescal);
+       if (ret)
+               goto err_disable_clk;
+
        ret = brcm_phy_start(pcie);
        if (ret)
-               goto err;
+               goto err_reset;
 
        /* Take bridge out of reset so we can access the SERDES reg */
        pcie->bridge_sw_init_set(pcie, 0);
@@ -1180,14 +1185,16 @@ static int brcm_pcie_resume(struct device *dev)
 
        ret = brcm_pcie_setup(pcie);
        if (ret)
-               goto err;
+               goto err_reset;
 
        if (pcie->msi)
                brcm_msi_set_regs(pcie->msi);
 
        return 0;
 
-err:
+err_reset:
+       reset_control_rearm(pcie->rescal);
+err_disable_clk:
        clk_disable_unprepare(pcie->clk);
        return ret;
 }
@@ -1197,7 +1204,7 @@ static void __brcm_pcie_remove(struct brcm_pcie *pcie)
        brcm_msi_remove(pcie);
        brcm_pcie_turn_off(pcie);
        brcm_phy_stop(pcie);
-       reset_control_assert(pcie->rescal);
+       reset_control_rearm(pcie->rescal);
        clk_disable_unprepare(pcie->clk);
 }
 
@@ -1278,13 +1285,13 @@ static int brcm_pcie_probe(struct platform_device *pdev)
                return PTR_ERR(pcie->perst_reset);
        }
 
-       ret = reset_control_deassert(pcie->rescal);
+       ret = reset_control_reset(pcie->rescal);
        if (ret)
                dev_err(&pdev->dev, "failed to deassert 'rescal'\n");
 
        ret = brcm_phy_start(pcie);
        if (ret) {
-               reset_control_assert(pcie->rescal);
+               reset_control_rearm(pcie->rescal);
                clk_disable_unprepare(pcie->clk);
                return ret;
        }
@@ -1296,6 +1303,7 @@ static int brcm_pcie_probe(struct platform_device *pdev)
        pcie->hw_rev = readl(pcie->base + PCIE_MISC_REVISION);
        if (pcie->type == BCM4908 && pcie->hw_rev >= BRCM_PCIE_HW_REV_3_20) {
                dev_err(pcie->dev, "hardware revision with unsupported PERST# setup\n");
+               ret = -ENODEV;
                goto fail;
        }
 
index 908475d..eede4e8 100644 (file)
@@ -271,7 +271,7 @@ static int iproc_msi_irq_domain_alloc(struct irq_domain *domain,
                                    NULL, NULL);
        }
 
-       return hwirq;
+       return 0;
 }
 
 static void iproc_msi_irq_domain_free(struct irq_domain *domain,
diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c
new file mode 100644 (file)
index 0000000..3c5b977
--- /dev/null
@@ -0,0 +1,1027 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MediaTek PCIe host controller driver.
+ *
+ * Copyright (c) 2020 MediaTek Inc.
+ * Author: Jianjun Wang <jianjun.wang@mediatek.com>
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/iopoll.h>
+#include <linux/irq.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
+#include <linux/reset.h>
+
+#include "../pci.h"
+
+#define PCIE_SETTING_REG               0x80
+#define PCIE_PCI_IDS_1                 0x9c
+/* Argument is parenthesized so that expression arguments expand safely */
+#define PCI_CLASS(class)               ((class) << 8)
+#define PCIE_RC_MODE                   BIT(0)
+
+#define PCIE_CFGNUM_REG                        0x140
+#define PCIE_CFG_DEVFN(devfn)          ((devfn) & GENMASK(7, 0))
+#define PCIE_CFG_BUS(bus)              (((bus) << 8) & GENMASK(15, 8))
+#define PCIE_CFG_BYTE_EN(bytes)                (((bytes) << 16) & GENMASK(19, 16))
+#define PCIE_CFG_FORCE_BYTE_EN         BIT(20)
+#define PCIE_CFG_OFFSET_ADDR           0x1000
+#define PCIE_CFG_HEADER(bus, devfn) \
+       (PCIE_CFG_BUS(bus) | PCIE_CFG_DEVFN(devfn))
+
+#define PCIE_RST_CTRL_REG              0x148
+#define PCIE_MAC_RSTB                  BIT(0)
+#define PCIE_PHY_RSTB                  BIT(1)
+#define PCIE_BRG_RSTB                  BIT(2)
+#define PCIE_PE_RSTB                   BIT(3)
+
+#define PCIE_LTSSM_STATUS_REG          0x150
+#define PCIE_LTSSM_STATE_MASK          GENMASK(28, 24)
+/* Argument is parenthesized so that expression arguments expand safely */
+#define PCIE_LTSSM_STATE(val)          (((val) & PCIE_LTSSM_STATE_MASK) >> 24)
+#define PCIE_LTSSM_STATE_L2_IDLE       0x14
+
+#define PCIE_LINK_STATUS_REG           0x154
+#define PCIE_PORT_LINKUP               BIT(8)
+
+#define PCIE_MSI_SET_NUM               8
+#define PCIE_MSI_IRQS_PER_SET          32
+#define PCIE_MSI_IRQS_NUM \
+       (PCIE_MSI_IRQS_PER_SET * PCIE_MSI_SET_NUM)
+
+#define PCIE_INT_ENABLE_REG            0x180
+#define PCIE_MSI_ENABLE                        GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8)
+#define PCIE_MSI_SHIFT                 8
+#define PCIE_INTX_SHIFT                        24
+#define PCIE_INTX_ENABLE \
+       GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT)
+
+#define PCIE_INT_STATUS_REG            0x184
+#define PCIE_MSI_SET_ENABLE_REG                0x190
+#define PCIE_MSI_SET_ENABLE            GENMASK(PCIE_MSI_SET_NUM - 1, 0)
+
+#define PCIE_MSI_SET_BASE_REG          0xc00
+#define PCIE_MSI_SET_OFFSET            0x10
+#define PCIE_MSI_SET_STATUS_OFFSET     0x04
+#define PCIE_MSI_SET_ENABLE_OFFSET     0x08
+
+#define PCIE_MSI_SET_ADDR_HI_BASE      0xc80
+#define PCIE_MSI_SET_ADDR_HI_OFFSET    0x04
+
+#define PCIE_ICMD_PM_REG               0x198
+#define PCIE_TURN_OFF_LINK             BIT(4)
+
+#define PCIE_TRANS_TABLE_BASE_REG      0x800
+#define PCIE_ATR_SRC_ADDR_MSB_OFFSET   0x4
+#define PCIE_ATR_TRSL_ADDR_LSB_OFFSET  0x8
+#define PCIE_ATR_TRSL_ADDR_MSB_OFFSET  0xc
+#define PCIE_ATR_TRSL_PARAM_OFFSET     0x10
+#define PCIE_ATR_TLB_SET_OFFSET                0x20
+
+#define PCIE_MAX_TRANS_TABLES          8
+#define PCIE_ATR_EN                    BIT(0)
+#define PCIE_ATR_SIZE(size) \
+       (((((size) - 1) << 1) & GENMASK(6, 1)) | PCIE_ATR_EN)
+#define PCIE_ATR_ID(id)                        ((id) & GENMASK(3, 0))
+#define PCIE_ATR_TYPE_MEM              PCIE_ATR_ID(0)
+#define PCIE_ATR_TYPE_IO               PCIE_ATR_ID(1)
+#define PCIE_ATR_TLP_TYPE(type)                (((type) << 16) & GENMASK(18, 16))
+#define PCIE_ATR_TLP_TYPE_MEM          PCIE_ATR_TLP_TYPE(0)
+#define PCIE_ATR_TLP_TYPE_IO           PCIE_ATR_TLP_TYPE(2)
+
+/**
+ * struct mtk_msi_set - MSI information for each set
+ * @base: IO mapped register base of this set, i.e. the port register base
+ *        plus PCIE_MSI_SET_BASE_REG plus the set index times
+ *        PCIE_MSI_SET_OFFSET
+ * @msg_addr: MSI message address; the physical counterpart of @base, it is
+ *            programmed as the capture address and used as the MSI target
+ * @saved_irq_state: IRQ enable state saved at suspend time
+ */
+struct mtk_msi_set {
+       void __iomem *base;
+       phys_addr_t msg_addr;
+       u32 saved_irq_state;
+};
+
+/**
+ * struct mtk_pcie_port - PCIe port information
+ * @dev: pointer to PCIe device
+ * @base: IO mapped register base
+ * @reg_base: physical register base; the MSI message addresses are derived
+ *            from it
+ * @mac_reset: MAC reset control
+ * @phy_reset: PHY reset control
+ * @phy: PHY controller block
+ * @clks: PCIe clocks
+ * @num_clks: PCIe clocks count for this port
+ * @irq: PCIe controller interrupt number
+ * @saved_irq_state: IRQ enable state saved at suspend time
+ * @irq_lock: raw spinlock serializing the read-modify-write updates of the
+ *            INTx enable bits and of the per-set MSI enable registers
+ * @intx_domain: legacy INTx IRQ domain
+ * @msi_domain: MSI IRQ domain
+ * @msi_bottom_domain: MSI IRQ bottom domain
+ * @msi_sets: MSI sets information, one entry per hardware MSI set
+ * @lock: mutex protecting @msi_irq_in_use
+ * @msi_irq_in_use: bit map for assigned MSI IRQ
+ */
+struct mtk_pcie_port {
+       struct device *dev;
+       void __iomem *base;
+       phys_addr_t reg_base;
+       struct reset_control *mac_reset;
+       struct reset_control *phy_reset;
+       struct phy *phy;
+       struct clk_bulk_data *clks;
+       int num_clks;
+
+       int irq;
+       u32 saved_irq_state;
+       raw_spinlock_t irq_lock;
+       struct irq_domain *intx_domain;
+       struct irq_domain *msi_domain;
+       struct irq_domain *msi_bottom_domain;
+       struct mtk_msi_set msi_sets[PCIE_MSI_SET_NUM];
+       struct mutex lock;
+       DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM);
+};
+
+/**
+ * mtk_pcie_config_tlp_header() - Program the header used for config accesses
+ * @bus: PCI bus being accessed
+ * @devfn: device/function number being accessed
+ * @where: offset in config space
+ * @size: access width in bytes
+ *
+ * Writes the bus number, the device/function number and the byte enables
+ * derived from @where and @size to PCIE_CFGNUM_REG.
+ */
+static void mtk_pcie_config_tlp_header(struct pci_bus *bus, unsigned int devfn,
+                                       int where, int size)
+{
+       struct mtk_pcie_port *port = bus->sysdata;
+       int lanes = (GENMASK(size - 1, 0) & 0xf) << (where & 0x3);
+       u32 header = PCIE_CFG_HEADER(bus->number, devfn);
+
+       header |= PCIE_CFG_BYTE_EN(lanes) | PCIE_CFG_FORCE_BYTE_EN;
+
+       writel_relaxed(header, port->base + PCIE_CFGNUM_REG);
+}
+
+/*
+ * Return the address used for a config access at offset @where. The result
+ * depends only on @where: it is an offset into the window that starts at
+ * PCIE_CFG_OFFSET_ADDR within the port registers.
+ */
+static void __iomem *mtk_pcie_map_bus(struct pci_bus *bus, unsigned int devfn,
+                                     int where)
+{
+       struct mtk_pcie_port *port = bus->sysdata;
+       void __iomem *cfg_window = port->base + PCIE_CFG_OFFSET_ADDR;
+
+       return cfg_window + where;
+}
+
+/*
+ * Config space read: program the TLP header (target bus/devfn and byte
+ * enables) first, then let the generic 32-bit accessor perform the read.
+ */
+static int mtk_pcie_config_read(struct pci_bus *bus, unsigned int devfn,
+                               int where, int size, u32 *val)
+{
+       mtk_pcie_config_tlp_header(bus, devfn, where, size);
+
+       return pci_generic_config_read32(bus, devfn, where, size, val);
+}
+
+/*
+ * Config space write: program the TLP header (target bus/devfn and byte
+ * enables) first, then always issue a 4-byte write through the generic
+ * 32-bit accessor.
+ */
+static int mtk_pcie_config_write(struct pci_bus *bus, unsigned int devfn,
+                                int where, int size, u32 val)
+{
+       mtk_pcie_config_tlp_header(bus, devfn, where, size);
+
+       /* Move 1- and 2-byte values to their byte position within the dword */
+       if (size <= 2)
+               val <<= (where & 0x3) * 8;
+
+       return pci_generic_config_write32(bus, devfn, where, 4, val);
+}
+
+/* Config space accessors of this driver */
+static struct pci_ops mtk_pcie_ops = {
+       .map_bus        = mtk_pcie_map_bus,
+       .write          = mtk_pcie_config_write,
+       .read           = mtk_pcie_config_read,
+};
+
+/**
+ * mtk_pcie_set_trans_table() - Program one address translation table
+ * @port: PCIe port information
+ * @cpu_addr: CPU-side start address of the window
+ * @pci_addr: PCI-side start address of the window
+ * @size: size of the window in bytes
+ * @type: resource type; IORESOURCE_IO selects the IO parameters, any other
+ *        value selects the MEM parameters
+ * @num: index of the translation table to program
+ *
+ * Return: 0 on success, -ENODEV if @num is not below PCIE_MAX_TRANS_TABLES.
+ */
+static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port,
+                                   resource_size_t cpu_addr,
+                                   resource_size_t pci_addr,
+                                   resource_size_t size,
+                                   unsigned long type, int num)
+{
+       void __iomem *table;
+       u32 val;
+
+       if (num >= PCIE_MAX_TRANS_TABLES) {
+               dev_err(port->dev, "not enough translate table for addr: %#llx, limited to [%d]\n",
+                       (unsigned long long)cpu_addr, PCIE_MAX_TRANS_TABLES);
+               return -ENODEV;
+       }
+
+       table = port->base + PCIE_TRANS_TABLE_BASE_REG +
+               num * PCIE_ATR_TLB_SET_OFFSET;
+
+       /*
+        * resource_size_t can be 64 bits wide while fls() only looks at an
+        * unsigned int, which would drop the upper half of a window of 4 GiB
+        * or more. fls64() gives the same result for smaller sizes.
+        */
+       writel_relaxed(lower_32_bits(cpu_addr) | PCIE_ATR_SIZE(fls64(size) - 1),
+                      table);
+       writel_relaxed(upper_32_bits(cpu_addr),
+                      table + PCIE_ATR_SRC_ADDR_MSB_OFFSET);
+       writel_relaxed(lower_32_bits(pci_addr),
+                      table + PCIE_ATR_TRSL_ADDR_LSB_OFFSET);
+       writel_relaxed(upper_32_bits(pci_addr),
+                      table + PCIE_ATR_TRSL_ADDR_MSB_OFFSET);
+
+       if (type == IORESOURCE_IO)
+               val = PCIE_ATR_TYPE_IO | PCIE_ATR_TLP_TYPE_IO;
+       else
+               val = PCIE_ATR_TYPE_MEM | PCIE_ATR_TLP_TYPE_MEM;
+
+       writel_relaxed(val, table + PCIE_ATR_TRSL_PARAM_OFFSET);
+
+       return 0;
+}
+
+/*
+ * Set up all PCIE_MSI_SET_NUM MSI sets and enable MSI delivery: record the
+ * register base and message address of each set, program the message address
+ * as the capture address, then set the set-enable bits and the MSI bits of
+ * the interrupt enable register.
+ */
+static void mtk_pcie_enable_msi(struct mtk_pcie_port *port)
+{
+       int i;
+       u32 val;
+
+       for (i = 0; i < PCIE_MSI_SET_NUM; i++) {
+               struct mtk_msi_set *msi_set = &port->msi_sets[i];
+
+               /* Same register offset, seen through the virtual mapping ... */
+               msi_set->base = port->base + PCIE_MSI_SET_BASE_REG +
+                               i * PCIE_MSI_SET_OFFSET;
+               /* ... and as a physical address */
+               msi_set->msg_addr = port->reg_base + PCIE_MSI_SET_BASE_REG +
+                                   i * PCIE_MSI_SET_OFFSET;
+
+               /* Configure the MSI capture address */
+               writel_relaxed(lower_32_bits(msi_set->msg_addr), msi_set->base);
+               writel_relaxed(upper_32_bits(msi_set->msg_addr),
+                              port->base + PCIE_MSI_SET_ADDR_HI_BASE +
+                              i * PCIE_MSI_SET_ADDR_HI_OFFSET);
+       }
+
+       val = readl_relaxed(port->base + PCIE_MSI_SET_ENABLE_REG);
+       val |= PCIE_MSI_SET_ENABLE;
+       writel_relaxed(val, port->base + PCIE_MSI_SET_ENABLE_REG);
+
+       val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+       val |= PCIE_MSI_ENABLE;
+       writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG);
+}
+
+/*
+ * Bring the port up: select RC mode, set the class code, mask INTx, pulse the
+ * reset bits, wait for the link to come up, enable MSI and program one
+ * translation table per IO/MEM window of the host bridge.
+ *
+ * Returns 0 on success, the error from the link-up poll if the link stays
+ * down, or the error from mtk_pcie_set_trans_table().
+ */
+static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
+{
+       struct resource_entry *entry;
+       struct pci_host_bridge *host = pci_host_bridge_from_priv(port);
+       unsigned int table_index = 0;
+       int err;
+       u32 val;
+
+       /* Set as RC mode */
+       val = readl_relaxed(port->base + PCIE_SETTING_REG);
+       val |= PCIE_RC_MODE;
+       writel_relaxed(val, port->base + PCIE_SETTING_REG);
+
+       /* Set class code */
+       val = readl_relaxed(port->base + PCIE_PCI_IDS_1);
+       val &= ~GENMASK(31, 8);
+       val |= PCI_CLASS(PCI_CLASS_BRIDGE_PCI << 8);
+       writel_relaxed(val, port->base + PCIE_PCI_IDS_1);
+
+       /* Mask all INTx interrupts */
+       val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+       val &= ~PCIE_INTX_ENABLE;
+       writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG);
+
+       /* Assert all reset signals */
+       val = readl_relaxed(port->base + PCIE_RST_CTRL_REG);
+       val |= PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB;
+       writel_relaxed(val, port->base + PCIE_RST_CTRL_REG);
+
+       /*
+        * Described in PCIe CEM specification sections 2.2 (PERST# Signal)
+        * and 2.2.1 (Initial Power-Up (G3 to S0)).
+        * The deassertion of PERST# should be delayed 100ms (TPVPERL)
+        * for the power and clock to become stable.
+        */
+       msleep(100);
+
+       /* De-assert reset signals */
+       val &= ~(PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB);
+       writel_relaxed(val, port->base + PCIE_RST_CTRL_REG);
+
+       /* Check if the link is up or not */
+       err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_REG, val,
+                                !!(val & PCIE_PORT_LINKUP), 20,
+                                PCI_PM_D3COLD_WAIT * USEC_PER_MSEC);
+       if (err) {
+               val = readl_relaxed(port->base + PCIE_LTSSM_STATUS_REG);
+               dev_err(port->dev, "PCIe link down, ltssm reg val: %#x\n", val);
+               return err;
+       }
+
+       mtk_pcie_enable_msi(port);
+
+       /* Set PCIe translation windows */
+       resource_list_for_each_entry(entry, &host->windows) {
+               struct resource *res = entry->res;
+               unsigned long type = resource_type(res);
+               resource_size_t cpu_addr;
+               resource_size_t pci_addr;
+               resource_size_t size;
+               const char *range_type;
+
+               if (type == IORESOURCE_IO) {
+                       cpu_addr = pci_pio_to_address(res->start);
+                       range_type = "IO";
+               } else if (type == IORESOURCE_MEM) {
+                       cpu_addr = res->start;
+                       range_type = "MEM";
+               } else {
+                       /* Windows that are neither IO nor MEM get no table */
+                       continue;
+               }
+
+               pci_addr = res->start - entry->offset;
+               size = resource_size(res);
+               err = mtk_pcie_set_trans_table(port, cpu_addr, pci_addr, size,
+                                              type, table_index);
+               if (err)
+                       return err;
+
+               dev_dbg(port->dev, "set %s trans window[%d]: cpu_addr = %#llx, pci_addr = %#llx, size = %#llx\n",
+                       range_type, table_index, (unsigned long long)cpu_addr,
+                       (unsigned long long)pci_addr, (unsigned long long)size);
+
+               table_index++;
+       }
+
+       return 0;
+}
+
+/* Affinity changes are not implemented: every request is rejected with -EINVAL. */
+static int mtk_pcie_set_affinity(struct irq_data *data,
+                                const struct cpumask *mask, bool force)
+{
+       return -EINVAL;
+}
+
+/* Mask the interrupt with pci_msi_mask_irq(), then in the parent irq chip. */
+static void mtk_pcie_msi_irq_mask(struct irq_data *data)
+{
+       pci_msi_mask_irq(data);
+       irq_chip_mask_parent(data);
+}
+
+/* Unmask the interrupt with pci_msi_unmask_irq(), then in the parent irq chip. */
+static void mtk_pcie_msi_irq_unmask(struct irq_data *data)
+{
+       pci_msi_unmask_irq(data);
+       irq_chip_unmask_parent(data);
+}
+
+/* irq_chip referenced by mtk_msi_domain_info; ack is forwarded to the parent */
+static struct irq_chip mtk_msi_irq_chip = {
+       .name           = "MSI",
+       .irq_unmask     = mtk_pcie_msi_irq_unmask,
+       .irq_mask       = mtk_pcie_msi_irq_mask,
+       .irq_ack        = irq_chip_ack_parent,
+};
+
+/* MSI domain description: default domain/chip ops, MSI-X and multi-MSI */
+static struct msi_domain_info mtk_msi_domain_info = {
+       .chip   = &mtk_msi_irq_chip,
+       .flags  = MSI_FLAG_USE_DEF_DOM_OPS |
+                 MSI_FLAG_USE_DEF_CHIP_OPS |
+                 MSI_FLAG_PCI_MSIX |
+                 MSI_FLAG_MULTI_PCI_MSI,
+};
+
+/*
+ * Build the MSI message for an interrupt: the target is the message address
+ * of the MSI set stored as chip data, and the payload is the interrupt's
+ * position within that set.
+ */
+static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       struct mtk_pcie_port *port = data->domain->host_data;
+       struct mtk_msi_set *set = irq_data_get_irq_chip_data(data);
+       unsigned long vector = data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+       msg->data = vector;
+       msg->address_lo = lower_32_bits(set->msg_addr);
+       msg->address_hi = upper_32_bits(set->msg_addr);
+
+       dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n",
+               vector, msg->address_hi, msg->address_lo, msg->data);
+}
+
+/* Ack an MSI by writing its bit to the status register of its MSI set. */
+static void mtk_msi_bottom_irq_ack(struct irq_data *data)
+{
+       struct mtk_msi_set *set = irq_data_get_irq_chip_data(data);
+       unsigned long bit = data->hwirq % PCIE_MSI_IRQS_PER_SET;
+
+       writel_relaxed(BIT(bit), set->base + PCIE_MSI_SET_STATUS_OFFSET);
+}
+
+/*
+ * Mask an MSI by clearing its bit in the enable register of its MSI set.
+ * The read-modify-write is done under irq_lock.
+ */
+static void mtk_msi_bottom_irq_mask(struct irq_data *data)
+{
+       struct mtk_pcie_port *port = data->domain->host_data;
+       struct mtk_msi_set *set = irq_data_get_irq_chip_data(data);
+       unsigned long bit = data->hwirq % PCIE_MSI_IRQS_PER_SET;
+       unsigned long flags;
+       u32 enabled;
+
+       raw_spin_lock_irqsave(&port->irq_lock, flags);
+
+       enabled = readl_relaxed(set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+       enabled &= ~BIT(bit);
+       writel_relaxed(enabled, set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+
+       raw_spin_unlock_irqrestore(&port->irq_lock, flags);
+}
+
+/*
+ * Unmask an MSI by setting its bit in the enable register of its MSI set.
+ * The read-modify-write is done under irq_lock.
+ */
+static void mtk_msi_bottom_irq_unmask(struct irq_data *data)
+{
+       struct mtk_pcie_port *port = data->domain->host_data;
+       struct mtk_msi_set *set = irq_data_get_irq_chip_data(data);
+       unsigned long bit = data->hwirq % PCIE_MSI_IRQS_PER_SET;
+       unsigned long flags;
+       u32 enabled;
+
+       raw_spin_lock_irqsave(&port->irq_lock, flags);
+
+       enabled = readl_relaxed(set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+       enabled |= BIT(bit);
+       writel_relaxed(enabled, set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+
+       raw_spin_unlock_irqrestore(&port->irq_lock, flags);
+}
+
+/* irq_chip installed on every interrupt set up by mtk_msi_bottom_domain_alloc() */
+static struct irq_chip mtk_msi_bottom_irq_chip = {
+       .name                   = "MSI",
+       .irq_set_affinity       = mtk_pcie_set_affinity,
+       .irq_compose_msi_msg    = mtk_compose_msi_msg,
+       .irq_unmask             = mtk_msi_bottom_irq_unmask,
+       .irq_mask               = mtk_msi_bottom_irq_mask,
+       .irq_ack                = mtk_msi_bottom_irq_ack,
+};
+
+/*
+ * Reserve a free region of hardware IRQs in msi_irq_in_use and attach the
+ * bottom irq chip, the owning MSI set and handle_edge_irq to each of the
+ * @nr_irqs virtual interrupts starting at @virq.
+ *
+ * Returns 0 on success or -ENOSPC if no free region is available.
+ */
+static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain,
+                                      unsigned int virq, unsigned int nr_irqs,
+                                      void *arg)
+{
+       struct mtk_pcie_port *port = domain->host_data;
+       struct mtk_msi_set *set;
+       unsigned int n;
+       int first;
+
+       mutex_lock(&port->lock);
+       first = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM,
+                                       order_base_2(nr_irqs));
+       mutex_unlock(&port->lock);
+
+       if (first < 0)
+               return -ENOSPC;
+
+       /* The set is chosen from the first hardware IRQ of the region */
+       set = &port->msi_sets[first / PCIE_MSI_IRQS_PER_SET];
+
+       for (n = 0; n < nr_irqs; n++)
+               irq_domain_set_info(domain, virq + n, first + n,
+                                   &mtk_msi_bottom_irq_chip, set,
+                                   handle_edge_irq, NULL, NULL);
+
+       return 0;
+}
+
+/*
+ * Give the hardware IRQ region that starts at @virq's hwirq back to
+ * msi_irq_in_use, then let irq_domain_free_irqs_common() free the
+ * interrupts.
+ */
+static void mtk_msi_bottom_domain_free(struct irq_domain *domain,
+                                      unsigned int virq, unsigned int nr_irqs)
+{
+       struct irq_data *first = irq_domain_get_irq_data(domain, virq);
+       struct mtk_pcie_port *port = domain->host_data;
+       int order = order_base_2(nr_irqs);
+
+       mutex_lock(&port->lock);
+       bitmap_release_region(port->msi_irq_in_use, first->hwirq, order);
+       mutex_unlock(&port->lock);
+
+       irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+/* Allocation/free callbacks of the bottom MSI domain */
+static const struct irq_domain_ops mtk_msi_bottom_domain_ops = {
+       .alloc = mtk_msi_bottom_domain_alloc,
+       .free = mtk_msi_bottom_domain_free,
+};
+
+/*
+ * Mask an INTx interrupt by clearing its bit in PCIE_INT_ENABLE_REG.
+ * The read-modify-write is done under port->irq_lock with interrupts
+ * disabled.
+ */
+static void mtk_intx_mask(struct irq_data *data)
+{
+       struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
+       unsigned long irq_flags;
+       u32 enable;
+
+       raw_spin_lock_irqsave(&port->irq_lock, irq_flags);
+
+       enable = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+       enable &= ~BIT(data->hwirq + PCIE_INTX_SHIFT);
+       writel_relaxed(enable, port->base + PCIE_INT_ENABLE_REG);
+
+       raw_spin_unlock_irqrestore(&port->irq_lock, irq_flags);
+}
+
+/*
+ * Unmask an INTx interrupt by setting its bit in PCIE_INT_ENABLE_REG.
+ * The read-modify-write is done under port->irq_lock with interrupts
+ * disabled.
+ */
+static void mtk_intx_unmask(struct irq_data *data)
+{
+       struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
+       unsigned long irq_flags;
+       u32 enable;
+
+       raw_spin_lock_irqsave(&port->irq_lock, irq_flags);
+
+       enable = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+       enable |= BIT(data->hwirq + PCIE_INTX_SHIFT);
+       writel_relaxed(enable, port->base + PCIE_INT_ENABLE_REG);
+
+       raw_spin_unlock_irqrestore(&port->irq_lock, irq_flags);
+}
+
+/**
+ * mtk_intx_eoi() - Clear INTx IRQ status at the end of interrupt
+ * @data: pointer to the irq_data of the INTx interrupt
+ *
+ * INTx is an emulated level IRQ: its interrupt status remains set
+ * until the corresponding de-assert message is received, so the
+ * status can only be cleared once the interrupt has been serviced.
+ */
+static void mtk_intx_eoi(struct irq_data *data)
+{
+       struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data);
+
+       /* Write this interrupt's own bit to the status register */
+       writel_relaxed(BIT(data->hwirq + PCIE_INTX_SHIFT),
+                      port->base + PCIE_INT_STATUS_REG);
+}
+
+/*
+ * irq_chip for the legacy INTx interrupts; installed together with
+ * handle_fasteoi_irq by mtk_pcie_intx_map().
+ */
+static struct irq_chip mtk_intx_irq_chip = {
+       .irq_mask               = mtk_intx_mask,
+       .irq_unmask             = mtk_intx_unmask,
+       .irq_eoi                = mtk_intx_eoi,
+       .irq_set_affinity       = mtk_pcie_set_affinity,
+       .name                   = "INTx",
+};
+
+/*
+ * .map callback of the INTx domain: attach the domain's host_data (the
+ * port, see mtk_pcie_init_irq_domains()) as chip data and install the
+ * INTx irq_chip with the fasteoi flow handler. Always returns 0.
+ */
+static int mtk_pcie_intx_map(struct irq_domain *domain, unsigned int irq,
+                            irq_hw_number_t hwirq)
+{
+       irq_set_chip_data(irq, domain->host_data);
+       irq_set_chip_and_handler_name(irq, &mtk_intx_irq_chip,
+                                     handle_fasteoi_irq, "INTx");
+       return 0;
+}
+
+/* irq_domain callbacks of the INTx domain */
+static const struct irq_domain_ops intx_domain_ops = {
+       .map = mtk_pcie_intx_map,
+};
+
+/*
+ * Create the IRQ domains of a port: a linear INTx domain on the
+ * "interrupt-controller" child node, a linear bottom MSI domain on the
+ * port's own node and the PCI MSI domain stacked on top of it.
+ *
+ * Returns 0 on success or -ENODEV if the child node is missing or any
+ * domain cannot be created; domains created before the failure are
+ * removed again.
+ */
+static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port)
+{
+       struct device *dev = port->dev;
+       struct device_node *intc_node, *node = dev->of_node;
+       int ret;
+
+       raw_spin_lock_init(&port->irq_lock);
+
+       /* Setup INTx */
+       intc_node = of_get_child_by_name(node, "interrupt-controller");
+       if (!intc_node) {
+               dev_err(dev, "missing interrupt-controller node\n");
+               return -ENODEV;
+       }
+
+       port->intx_domain = irq_domain_add_linear(intc_node, PCI_NUM_INTX,
+                                                 &intx_domain_ops, port);
+       if (!port->intx_domain) {
+               dev_err(dev, "failed to create INTx IRQ domain\n");
+               ret = -ENODEV;
+               goto out_put_node;
+       }
+
+       /* Setup MSI */
+       mutex_init(&port->lock);
+
+       port->msi_bottom_domain = irq_domain_add_linear(node, PCIE_MSI_IRQS_NUM,
+                                 &mtk_msi_bottom_domain_ops, port);
+       if (!port->msi_bottom_domain) {
+               dev_err(dev, "failed to create MSI bottom domain\n");
+               ret = -ENODEV;
+               goto err_msi_bottom_domain;
+       }
+
+       port->msi_domain = pci_msi_create_irq_domain(dev->fwnode,
+                                                    &mtk_msi_domain_info,
+                                                    port->msi_bottom_domain);
+       if (!port->msi_domain) {
+               dev_err(dev, "failed to create MSI domain\n");
+               ret = -ENODEV;
+               goto err_msi_domain;
+       }
+
+       /*
+        * of_get_child_by_name() returned the node with its refcount
+        * raised; drop that reference now that it is no longer used here.
+        */
+       of_node_put(intc_node);
+
+       return 0;
+
+err_msi_domain:
+       irq_domain_remove(port->msi_bottom_domain);
+err_msi_bottom_domain:
+       irq_domain_remove(port->intx_domain);
+out_put_node:
+       of_node_put(intc_node);
+
+       return ret;
+}
+
+/*
+ * Undo mtk_pcie_setup_irq(): detach the chained handler from the port
+ * interrupt, remove whichever IRQ domains exist and dispose of the
+ * mapping of the port interrupt.
+ */
+static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port)
+{
+       /* Detach the handler first so it cannot run against removed domains */
+       irq_set_chained_handler_and_data(port->irq, NULL, NULL);
+
+       if (port->intx_domain)
+               irq_domain_remove(port->intx_domain);
+
+       /* The MSI domain is removed before the bottom domain it is stacked on */
+       if (port->msi_domain)
+               irq_domain_remove(port->msi_domain);
+
+       if (port->msi_bottom_domain)
+               irq_domain_remove(port->msi_bottom_domain);
+
+       irq_dispose_mapping(port->irq);
+}
+
+/*
+ * Dispatch all pending, enabled MSIs of MSI set @set_idx.
+ *
+ * The enable mask is read once; the status register is then re-read
+ * until no enabled status bit is left, and every set bit is forwarded
+ * to the interrupt mapped in the bottom MSI domain.
+ */
+static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx)
+{
+       struct mtk_msi_set *msi_set = &port->msi_sets[set_idx];
+       unsigned long msi_enable, msi_status;
+       unsigned int virq;
+       irq_hw_number_t bit, hwirq;
+
+       msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+
+       do {
+               msi_status = readl_relaxed(msi_set->base +
+                                          PCIE_MSI_SET_STATUS_OFFSET);
+               /* Ignore status bits of MSIs that are not enabled */
+               msi_status &= msi_enable;
+               if (!msi_status)
+                       break;
+
+               for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) {
+                       /* hwirq numbers are global across all sets */
+                       hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET;
+                       virq = irq_find_mapping(port->msi_bottom_domain, hwirq);
+                       generic_handle_irq(virq);
+               }
+       } while (true);
+}
+
+/*
+ * Chained handler of the port interrupt.
+ *
+ * PCIE_INT_STATUS_REG is read once. Bits in the INTx range are
+ * forwarded to the INTx domain; for every bit in the MSI-set range the
+ * corresponding set is serviced by mtk_pcie_msi_handler() and the bit
+ * is then written back to the status register.
+ */
+static void mtk_pcie_irq_handler(struct irq_desc *desc)
+{
+       struct mtk_pcie_port *port = irq_desc_get_handler_data(desc);
+       struct irq_chip *irqchip = irq_desc_get_chip(desc);
+       unsigned long status;
+       unsigned int virq;
+       irq_hw_number_t irq_bit = PCIE_INTX_SHIFT;
+
+       chained_irq_enter(irqchip, desc);
+
+       status = readl_relaxed(port->base + PCIE_INT_STATUS_REG);
+       /* INTx: status bits PCIE_INTX_SHIFT .. PCIE_INTX_SHIFT + PCI_NUM_INTX - 1 */
+       for_each_set_bit_from(irq_bit, &status, PCI_NUM_INTX +
+                             PCIE_INTX_SHIFT) {
+               virq = irq_find_mapping(port->intx_domain,
+                                       irq_bit - PCIE_INTX_SHIFT);
+               generic_handle_irq(virq);
+       }
+
+       /* MSI: one status bit per MSI set, starting at PCIE_MSI_SHIFT */
+       irq_bit = PCIE_MSI_SHIFT;
+       for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM +
+                             PCIE_MSI_SHIFT) {
+               mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT);
+
+               /* Write the set's bit back once its MSIs have been handled */
+               writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG);
+       }
+
+       chained_irq_exit(irqchip, desc);
+}
+
+/*
+ * Create the INTx/MSI IRQ domains and install the chained handler on
+ * the port's platform interrupt.
+ *
+ * Returns 0 on success or a negative errno. If the platform interrupt
+ * cannot be obtained, the domains created by
+ * mtk_pcie_init_irq_domains() are removed again so that a failed setup
+ * does not leak them.
+ */
+static int mtk_pcie_setup_irq(struct mtk_pcie_port *port)
+{
+       struct device *dev = port->dev;
+       struct platform_device *pdev = to_platform_device(dev);
+       int err;
+
+       err = mtk_pcie_init_irq_domains(port);
+       if (err)
+               return err;
+
+       port->irq = platform_get_irq(pdev, 0);
+       if (port->irq < 0) {
+               /* Remove the MSI domain before the bottom domain below it */
+               irq_domain_remove(port->msi_domain);
+               irq_domain_remove(port->msi_bottom_domain);
+               irq_domain_remove(port->intx_domain);
+               return port->irq;
+       }
+
+       irq_set_chained_handler_and_data(port->irq, mtk_pcie_irq_handler, port);
+
+       return 0;
+}
+
+/*
+ * Collect the resources of a port: map the "pcie-mac" register range
+ * and look up the optional "phy"/"mac" resets, the optional "pcie-phy"
+ * PHY and all clocks of the device. Everything is device-managed.
+ *
+ * Returns 0 on success or a negative errno; -EPROBE_DEFER from the
+ * reset and PHY lookups is passed on without logging an error.
+ */
+static int mtk_pcie_parse_port(struct mtk_pcie_port *port)
+{
+       struct device *dev = port->dev;
+       struct platform_device *pdev = to_platform_device(dev);
+       struct resource *mac_res;
+       int err;
+
+       mac_res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+                                              "pcie-mac");
+       if (!mac_res)
+               return -EINVAL;
+
+       port->base = devm_ioremap_resource(dev, mac_res);
+       if (IS_ERR(port->base)) {
+               dev_err(dev, "failed to map register base\n");
+               return PTR_ERR(port->base);
+       }
+
+       /* Start address of the register resource, stored alongside the mapping */
+       port->reg_base = mac_res->start;
+
+       port->phy_reset = devm_reset_control_get_optional_exclusive(dev, "phy");
+       if (IS_ERR(port->phy_reset)) {
+               err = PTR_ERR(port->phy_reset);
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "failed to get PHY reset\n");
+
+               return err;
+       }
+
+       port->mac_reset = devm_reset_control_get_optional_exclusive(dev, "mac");
+       if (IS_ERR(port->mac_reset)) {
+               err = PTR_ERR(port->mac_reset);
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "failed to get MAC reset\n");
+
+               return err;
+       }
+
+       port->phy = devm_phy_optional_get(dev, "pcie-phy");
+       if (IS_ERR(port->phy)) {
+               err = PTR_ERR(port->phy);
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "failed to get PHY\n");
+
+               return err;
+       }
+
+       port->num_clks = devm_clk_bulk_get_all(dev, &port->clks);
+       if (port->num_clks < 0) {
+               dev_err(dev, "failed to get clocks\n");
+               return port->num_clks;
+       }
+
+       return 0;
+}
+
+/*
+ * Power up a port: release the PHY reset, initialise and power on the
+ * PHY, release the MAC reset, enable runtime PM and take a reference,
+ * then enable all clocks.
+ *
+ * Returns 0 on success or the error of the failing step; the steps
+ * already completed are undone in reverse order before returning.
+ */
+static int mtk_pcie_power_up(struct mtk_pcie_port *port)
+{
+       struct device *dev = port->dev;
+       int err;
+
+       /* PHY power on and enable pipe clock */
+       reset_control_deassert(port->phy_reset);
+
+       err = phy_init(port->phy);
+       if (err) {
+               dev_err(dev, "failed to initialize PHY\n");
+               goto err_phy_init;
+       }
+
+       err = phy_power_on(port->phy);
+       if (err) {
+               dev_err(dev, "failed to power on PHY\n");
+               goto err_phy_on;
+       }
+
+       /* MAC power on and enable transaction layer clocks */
+       reset_control_deassert(port->mac_reset);
+
+       pm_runtime_enable(dev);
+       pm_runtime_get_sync(dev);
+
+       err = clk_bulk_prepare_enable(port->num_clks, port->clks);
+       if (err) {
+               dev_err(dev, "failed to enable clocks\n");
+               goto err_clk_init;
+       }
+
+       return 0;
+
+       /* Unwind in the reverse order of the steps above */
+err_clk_init:
+       pm_runtime_put_sync(dev);
+       pm_runtime_disable(dev);
+       reset_control_assert(port->mac_reset);
+       phy_power_off(port->phy);
+err_phy_on:
+       phy_exit(port->phy);
+err_phy_init:
+       reset_control_assert(port->phy_reset);
+
+       return err;
+}
+
+/*
+ * Power down a port; mirrors mtk_pcie_power_up() in reverse order:
+ * clocks off, runtime PM reference dropped and disabled, MAC reset
+ * asserted, PHY powered off and exited, PHY reset asserted.
+ */
+static void mtk_pcie_power_down(struct mtk_pcie_port *port)
+{
+       clk_bulk_disable_unprepare(port->num_clks, port->clks);
+
+       pm_runtime_put_sync(port->dev);
+       pm_runtime_disable(port->dev);
+       reset_control_assert(port->mac_reset);
+
+       phy_power_off(port->phy);
+       phy_exit(port->phy);
+       reset_control_assert(port->phy_reset);
+}
+
+/*
+ * Bring a port up: parse its resources, power it up, start the link
+ * and set up interrupt handling.
+ *
+ * Returns 0 on success or a negative errno. The port is powered down
+ * again if anything fails after the power up.
+ */
+static int mtk_pcie_setup(struct mtk_pcie_port *port)
+{
+       int ret;
+
+       ret = mtk_pcie_parse_port(port);
+       if (ret)
+               return ret;
+
+       /* Don't touch the hardware registers before power up */
+       ret = mtk_pcie_power_up(port);
+       if (ret)
+               return ret;
+
+       /* Try link up */
+       ret = mtk_pcie_startup_port(port);
+       if (!ret)
+               ret = mtk_pcie_setup_irq(port);
+
+       /* Either of the two steps above failed: undo the power up */
+       if (ret)
+               mtk_pcie_power_down(port);
+
+       return ret;
+}
+
+/*
+ * Platform driver probe: allocate the host bridge with the port as its
+ * private data, set the port up and register the bridge with the PCI
+ * core.
+ *
+ * Returns 0 on success or a negative errno; if registering the bridge
+ * fails, interrupt handling is torn down and the port is powered down.
+ */
+static int mtk_pcie_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct pci_host_bridge *bridge;
+       struct mtk_pcie_port *port;
+       int ret;
+
+       bridge = devm_pci_alloc_host_bridge(dev, sizeof(*port));
+       if (!bridge)
+               return -ENOMEM;
+
+       /* The port lives in the private area of the bridge */
+       port = pci_host_bridge_priv(bridge);
+       port->dev = dev;
+       platform_set_drvdata(pdev, port);
+
+       ret = mtk_pcie_setup(port);
+       if (ret)
+               return ret;
+
+       bridge->ops = &mtk_pcie_ops;
+       bridge->sysdata = port;
+
+       ret = pci_host_probe(bridge);
+       if (!ret)
+               return 0;
+
+       mtk_pcie_irq_teardown(port);
+       mtk_pcie_power_down(port);
+
+       return ret;
+}
+
+/*
+ * Platform driver remove: stop and remove the root bus under the
+ * rescan/remove lock, then tear down interrupt handling and power the
+ * port down. Always returns 0.
+ */
+static int mtk_pcie_remove(struct platform_device *pdev)
+{
+       struct mtk_pcie_port *port = platform_get_drvdata(pdev);
+       struct pci_host_bridge *bridge = pci_host_bridge_from_priv(port);
+
+       pci_lock_rescan_remove();
+       pci_stop_root_bus(bridge->bus);
+       pci_remove_root_bus(bridge->bus);
+       pci_unlock_rescan_remove();
+
+       /* The bus is gone; now release interrupts and power */
+       mtk_pcie_irq_teardown(port);
+       mtk_pcie_power_down(port);
+
+       return 0;
+}
+
+/*
+ * Save the interrupt enable state of the port and of every MSI set so
+ * that mtk_pcie_irq_restore() can write it back. Called from the noirq
+ * suspend callback.
+ */
+static void __maybe_unused mtk_pcie_irq_save(struct mtk_pcie_port *port)
+{
+       struct mtk_msi_set *set;
+       int idx;
+
+       raw_spin_lock(&port->irq_lock);
+
+       port->saved_irq_state = readl_relaxed(port->base + PCIE_INT_ENABLE_REG);
+
+       for (idx = 0; idx < PCIE_MSI_SET_NUM; idx++) {
+               set = &port->msi_sets[idx];
+               set->saved_irq_state = readl_relaxed(set->base +
+                                                    PCIE_MSI_SET_ENABLE_OFFSET);
+       }
+
+       raw_spin_unlock(&port->irq_lock);
+}
+
+/*
+ * Write back the interrupt enable state that mtk_pcie_irq_save()
+ * recorded for the port and for every MSI set. Called from the noirq
+ * resume callback.
+ */
+static void __maybe_unused mtk_pcie_irq_restore(struct mtk_pcie_port *port)
+{
+       struct mtk_msi_set *set;
+       int idx;
+
+       raw_spin_lock(&port->irq_lock);
+
+       writel_relaxed(port->saved_irq_state, port->base + PCIE_INT_ENABLE_REG);
+
+       for (idx = 0; idx < PCIE_MSI_SET_NUM; idx++) {
+               set = &port->msi_sets[idx];
+               writel_relaxed(set->saved_irq_state,
+                              set->base + PCIE_MSI_SET_ENABLE_OFFSET);
+       }
+
+       raw_spin_unlock(&port->irq_lock);
+}
+
+/*
+ * Request a link turn-off by setting PCIE_TURN_OFF_LINK in
+ * PCIE_ICMD_PM_REG, then poll the LTSSM state every 20us, for at most
+ * 50ms, until it reports PCIE_LTSSM_STATE_L2_IDLE.
+ *
+ * Returns the result of readl_poll_timeout(): 0 once the state is
+ * reached, an error code if the poll times out.
+ */
+static int __maybe_unused mtk_pcie_turn_off_link(struct mtk_pcie_port *port)
+{
+       u32 val;
+
+       val = readl_relaxed(port->base + PCIE_ICMD_PM_REG);
+       val |= PCIE_TURN_OFF_LINK;
+       writel_relaxed(val, port->base + PCIE_ICMD_PM_REG);
+
+       /* Check the link is L2 */
+       return readl_poll_timeout(port->base + PCIE_LTSSM_STATUS_REG, val,
+                                 (PCIE_LTSSM_STATE(val) ==
+                                  PCIE_LTSSM_STATE_L2_IDLE), 20,
+                                  50 * USEC_PER_MSEC);
+}
+
+/*
+ * noirq suspend callback: put the link into L2, set PCIE_PE_RSTB in
+ * the reset control register, save the interrupt enable state and
+ * power the port down.
+ *
+ * Returns 0 on success, or the error from mtk_pcie_turn_off_link() if
+ * the link does not reach L2; in that case nothing else is touched.
+ */
+static int __maybe_unused mtk_pcie_suspend_noirq(struct device *dev)
+{
+       struct mtk_pcie_port *port = dev_get_drvdata(dev);
+       int err;
+       u32 val;
+
+       /* Trigger link to L2 state */
+       err = mtk_pcie_turn_off_link(port);
+       if (err) {
+               dev_err(port->dev, "cannot enter L2 state\n");
+               return err;
+       }
+
+       /* Pull down the PERST# pin */
+       val = readl_relaxed(port->base + PCIE_RST_CTRL_REG);
+       val |= PCIE_PE_RSTB;
+       writel_relaxed(val, port->base + PCIE_RST_CTRL_REG);
+
+       /* Log messages are newline-terminated like the others in this file */
+       dev_dbg(port->dev, "entered L2 states successfully\n");
+
+       /* Save the enable registers before the power goes away */
+       mtk_pcie_irq_save(port);
+       mtk_pcie_power_down(port);
+
+       return 0;
+}
+
+/*
+ * noirq resume callback: power the port up, restart the link and write
+ * back the interrupt enable state saved at suspend.
+ *
+ * Returns 0 on success or a negative errno; the port is powered down
+ * again if the link cannot be started.
+ */
+static int __maybe_unused mtk_pcie_resume_noirq(struct device *dev)
+{
+       struct mtk_pcie_port *port = dev_get_drvdata(dev);
+       int ret;
+
+       ret = mtk_pcie_power_up(port);
+       if (ret)
+               return ret;
+
+       ret = mtk_pcie_startup_port(port);
+       if (!ret) {
+               mtk_pcie_irq_restore(port);
+               return 0;
+       }
+
+       mtk_pcie_power_down(port);
+
+       return ret;
+}
+
+/* System sleep is handled in the noirq phase only */
+static const struct dev_pm_ops mtk_pcie_pm_ops = {
+       SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mtk_pcie_suspend_noirq,
+                                     mtk_pcie_resume_noirq)
+};
+
+/* Devicetree compatibles handled by this driver; the empty entry ends the table */
+static const struct of_device_id mtk_pcie_of_match[] = {
+       { .compatible = "mediatek,mt8192-pcie" },
+       {},
+};
+
+/* Platform driver glue: probe/remove callbacks, OF match table and PM ops */
+static struct platform_driver mtk_pcie_driver = {
+       .probe = mtk_pcie_probe,
+       .remove = mtk_pcie_remove,
+       .driver = {
+               .name = "mtk-pcie",
+               .of_match_table = mtk_pcie_of_match,
+               .pm = &mtk_pcie_pm_ops,
+       },
+};
+
+module_platform_driver(mtk_pcie_driver);
+MODULE_LICENSE("GPL v2");
index 23548b5..62a042e 100644 (file)
@@ -143,6 +143,7 @@ struct mtk_pcie_port;
  * struct mtk_pcie_soc - differentiate between host generations
  * @need_fix_class_id: whether this host's class ID needed to be fixed or not
  * @need_fix_device_id: whether this host's device ID needed to be fixed or not
+ * @no_msi: Bridge has no MSI support, and relies on an external block
  * @device_id: device ID which this host need to be fixed
  * @ops: pointer to configuration access functions
  * @startup: pointer to controller setting functions
@@ -151,6 +152,7 @@ struct mtk_pcie_port;
 struct mtk_pcie_soc {
        bool need_fix_class_id;
        bool need_fix_device_id;
+       bool no_msi;
        unsigned int device_id;
        struct pci_ops *ops;
        int (*startup)(struct mtk_pcie_port *port);
@@ -760,7 +762,7 @@ static struct pci_ops mtk_pcie_ops = {
 static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
 {
        struct mtk_pcie *pcie = port->pcie;
-       u32 func = PCI_FUNC(port->slot << 3);
+       u32 func = PCI_FUNC(port->slot);
        u32 slot = PCI_SLOT(port->slot << 3);
        u32 val;
        int err;
@@ -1087,6 +1089,7 @@ static int mtk_pcie_probe(struct platform_device *pdev)
 
        host->ops = pcie->soc->ops;
        host->sysdata = pcie;
+       host->msi_domain = pcie->soc->no_msi;
 
        err = pci_host_probe(host);
        if (err)
@@ -1176,6 +1179,7 @@ static const struct dev_pm_ops mtk_pcie_pm_ops = {
 };
 
 static const struct mtk_pcie_soc mtk_pcie_soc_v1 = {
+       .no_msi = true,
        .ops = &mtk_pcie_ops,
        .startup = mtk_pcie_startup_port,
 };
@@ -1210,6 +1214,7 @@ static const struct of_device_id mtk_pcie_ids[] = {
        { .compatible = "mediatek,mt7629-pcie", .data = &mtk_pcie_soc_mt7629 },
        {},
 };
+MODULE_DEVICE_TABLE(of, mtk_pcie_ids);
 
 static struct platform_driver mtk_pcie_driver = {
        .probe = mtk_pcie_probe,
index 04c19ff..89c68c5 100644 (file)
@@ -301,27 +301,27 @@ static const struct cause event_cause[NUM_EVENTS] = {
        LOCAL_EVENT_CAUSE(PM_MSI_INT_SYS_ERR, "system error"),
 };
 
-struct event_map pcie_event_to_event[] = {
+static struct event_map pcie_event_to_event[] = {
        PCIE_EVENT_TO_EVENT_MAP(L2_EXIT),
        PCIE_EVENT_TO_EVENT_MAP(HOTRST_EXIT),
        PCIE_EVENT_TO_EVENT_MAP(DLUP_EXIT),
 };
 
-struct event_map sec_error_to_event[] = {
+static struct event_map sec_error_to_event[] = {
        SEC_ERROR_TO_EVENT_MAP(TX_RAM_SEC_ERR),
        SEC_ERROR_TO_EVENT_MAP(RX_RAM_SEC_ERR),
        SEC_ERROR_TO_EVENT_MAP(PCIE2AXI_RAM_SEC_ERR),
        SEC_ERROR_TO_EVENT_MAP(AXI2PCIE_RAM_SEC_ERR),
 };
 
-struct event_map ded_error_to_event[] = {
+static struct event_map ded_error_to_event[] = {
        DED_ERROR_TO_EVENT_MAP(TX_RAM_DED_ERR),
        DED_ERROR_TO_EVENT_MAP(RX_RAM_DED_ERR),
        DED_ERROR_TO_EVENT_MAP(PCIE2AXI_RAM_DED_ERR),
        DED_ERROR_TO_EVENT_MAP(AXI2PCIE_RAM_DED_ERR),
 };
 
-struct event_map local_status_to_event[] = {
+static struct event_map local_status_to_event[] = {
        LOCAL_STATUS_TO_EVENT_MAP(DMA_END_ENGINE_0),
        LOCAL_STATUS_TO_EVENT_MAP(DMA_END_ENGINE_1),
        LOCAL_STATUS_TO_EVENT_MAP(DMA_ERROR_ENGINE_0),
@@ -1023,10 +1023,8 @@ static int mc_platform_init(struct pci_config_window *cfg)
        }
 
        irq = platform_get_irq(pdev, 0);
-       if (irq < 0) {
-               dev_err(dev, "unable to request IRQ%d\n", irq);
+       if (irq < 0)
                return -ENODEV;
-       }
 
        for (i = 0; i < NUM_EVENTS; i++) {
                event_irq = irq_create_mapping(port->event_domain, i);
index a728e8f..765cf2b 100644 (file)
 struct rcar_msi {
        DECLARE_BITMAP(used, INT_PCI_MSI_NR);
        struct irq_domain *domain;
-       struct msi_controller chip;
-       unsigned long pages;
-       struct mutex lock;
+       struct mutex map_lock;
+       spinlock_t mask_lock;
        int irq1;
        int irq2;
 };
 
-static inline struct rcar_msi *to_rcar_msi(struct msi_controller *chip)
-{
-       return container_of(chip, struct rcar_msi, chip);
-}
-
 /* Structure representing the PCIe interface */
 struct rcar_pcie_host {
        struct rcar_pcie        pcie;
@@ -56,6 +50,11 @@ struct rcar_pcie_host {
        int                     (*phy_init_fn)(struct rcar_pcie_host *host);
 };
 
+static struct rcar_pcie_host *msi_to_host(struct rcar_msi *msi)
+{
+       return container_of(msi, struct rcar_pcie_host, msi);
+}
+
 static u32 rcar_read_conf(struct rcar_pcie *pcie, int where)
 {
        unsigned int shift = BITS_PER_BYTE * (where & 3);
@@ -292,8 +291,6 @@ static int rcar_pcie_enable(struct rcar_pcie_host *host)
 
        bridge->sysdata = host;
        bridge->ops = &rcar_pcie_ops;
-       if (IS_ENABLED(CONFIG_PCI_MSI))
-               bridge->msi = &host->msi.chip;
 
        return pci_host_probe(bridge);
 }
@@ -473,42 +470,6 @@ static int rcar_pcie_phy_init_gen3(struct rcar_pcie_host *host)
        return err;
 }
 
-static int rcar_msi_alloc(struct rcar_msi *chip)
-{
-       int msi;
-
-       mutex_lock(&chip->lock);
-
-       msi = find_first_zero_bit(chip->used, INT_PCI_MSI_NR);
-       if (msi < INT_PCI_MSI_NR)
-               set_bit(msi, chip->used);
-       else
-               msi = -ENOSPC;
-
-       mutex_unlock(&chip->lock);
-
-       return msi;
-}
-
-static int rcar_msi_alloc_region(struct rcar_msi *chip, int no_irqs)
-{
-       int msi;
-
-       mutex_lock(&chip->lock);
-       msi = bitmap_find_free_region(chip->used, INT_PCI_MSI_NR,
-                                     order_base_2(no_irqs));
-       mutex_unlock(&chip->lock);
-
-       return msi;
-}
-
-static void rcar_msi_free(struct rcar_msi *chip, unsigned long irq)
-{
-       mutex_lock(&chip->lock);
-       clear_bit(irq, chip->used);
-       mutex_unlock(&chip->lock);
-}
-
 static irqreturn_t rcar_pcie_msi_irq(int irq, void *data)
 {
        struct rcar_pcie_host *host = data;
@@ -527,18 +488,13 @@ static irqreturn_t rcar_pcie_msi_irq(int irq, void *data)
                unsigned int index = find_first_bit(&reg, 32);
                unsigned int msi_irq;
 
-               /* clear the interrupt */
-               rcar_pci_write_reg(pcie, 1 << index, PCIEMSIFR);
-
-               msi_irq = irq_find_mapping(msi->domain, index);
+               msi_irq = irq_find_mapping(msi->domain->parent, index);
                if (msi_irq) {
-                       if (test_bit(index, msi->used))
-                               generic_handle_irq(msi_irq);
-                       else
-                               dev_info(dev, "unhandled MSI\n");
+                       generic_handle_irq(msi_irq);
                } else {
                        /* Unknown MSI, just clear it */
                        dev_dbg(dev, "unexpected MSI\n");
+                       rcar_pci_write_reg(pcie, BIT(index), PCIEMSIFR);
                }
 
                /* see if there's any more pending in this vector */
@@ -548,149 +504,169 @@ static irqreturn_t rcar_pcie_msi_irq(int irq, void *data)
        return IRQ_HANDLED;
 }
 
-static int rcar_msi_setup_irq(struct msi_controller *chip, struct pci_dev *pdev,
-                             struct msi_desc *desc)
+static void rcar_msi_top_irq_ack(struct irq_data *d)
 {
-       struct rcar_msi *msi = to_rcar_msi(chip);
-       struct rcar_pcie_host *host = container_of(chip, struct rcar_pcie_host,
-                                                  msi.chip);
-       struct rcar_pcie *pcie = &host->pcie;
-       struct msi_msg msg;
-       unsigned int irq;
-       int hwirq;
+       irq_chip_ack_parent(d);
+}
 
-       hwirq = rcar_msi_alloc(msi);
-       if (hwirq < 0)
-               return hwirq;
+static void rcar_msi_top_irq_mask(struct irq_data *d)
+{
+       pci_msi_mask_irq(d);
+       irq_chip_mask_parent(d);
+}
 
-       irq = irq_find_mapping(msi->domain, hwirq);
-       if (!irq) {
-               rcar_msi_free(msi, hwirq);
-               return -EINVAL;
-       }
+static void rcar_msi_top_irq_unmask(struct irq_data *d)
+{
+       pci_msi_unmask_irq(d);
+       irq_chip_unmask_parent(d);
+}
 
-       irq_set_msi_desc(irq, desc);
+static struct irq_chip rcar_msi_top_chip = {
+       .name           = "PCIe MSI",
+       .irq_ack        = rcar_msi_top_irq_ack,
+       .irq_mask       = rcar_msi_top_irq_mask,
+       .irq_unmask     = rcar_msi_top_irq_unmask,
+};
 
-       msg.address_lo = rcar_pci_read_reg(pcie, PCIEMSIALR) & ~MSIFE;
-       msg.address_hi = rcar_pci_read_reg(pcie, PCIEMSIAUR);
-       msg.data = hwirq;
+static void rcar_msi_irq_ack(struct irq_data *d)
+{
+       struct rcar_msi *msi = irq_data_get_irq_chip_data(d);
+       struct rcar_pcie *pcie = &msi_to_host(msi)->pcie;
 
-       pci_write_msi_msg(irq, &msg);
+       /* clear the interrupt */
+       rcar_pci_write_reg(pcie, BIT(d->hwirq), PCIEMSIFR);
+}
 
-       return 0;
+static void rcar_msi_irq_mask(struct irq_data *d)
+{
+       struct rcar_msi *msi = irq_data_get_irq_chip_data(d);
+       struct rcar_pcie *pcie = &msi_to_host(msi)->pcie;
+       unsigned long flags;
+       u32 value;
+
+       spin_lock_irqsave(&msi->mask_lock, flags);
+       value = rcar_pci_read_reg(pcie, PCIEMSIIER);
+       value &= ~BIT(d->hwirq);
+       rcar_pci_write_reg(pcie, value, PCIEMSIIER);
+       spin_unlock_irqrestore(&msi->mask_lock, flags);
 }
 
-static int rcar_msi_setup_irqs(struct msi_controller *chip,
-                              struct pci_dev *pdev, int nvec, int type)
+static void rcar_msi_irq_unmask(struct irq_data *d)
 {
-       struct rcar_msi *msi = to_rcar_msi(chip);
-       struct rcar_pcie_host *host = container_of(chip, struct rcar_pcie_host,
-                                                  msi.chip);
-       struct rcar_pcie *pcie = &host->pcie;
-       struct msi_desc *desc;
-       struct msi_msg msg;
-       unsigned int irq;
-       int hwirq;
-       int i;
+       struct rcar_msi *msi = irq_data_get_irq_chip_data(d);
+       struct rcar_pcie *pcie = &msi_to_host(msi)->pcie;
+       unsigned long flags;
+       u32 value;
+
+       spin_lock_irqsave(&msi->mask_lock, flags);
+       value = rcar_pci_read_reg(pcie, PCIEMSIIER);
+       value |= BIT(d->hwirq);
+       rcar_pci_write_reg(pcie, value, PCIEMSIIER);
+       spin_unlock_irqrestore(&msi->mask_lock, flags);
+}
 
-       /* MSI-X interrupts are not supported */
-       if (type == PCI_CAP_ID_MSIX)
-               return -EINVAL;
+static int rcar_msi_set_affinity(struct irq_data *d, const struct cpumask *mask, bool force)
+{
+       return -EINVAL;
+}
 
-       WARN_ON(!list_is_singular(&pdev->dev.msi_list));
-       desc = list_entry(pdev->dev.msi_list.next, struct msi_desc, list);
+static void rcar_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       struct rcar_msi *msi = irq_data_get_irq_chip_data(data);
+       struct rcar_pcie *pcie = &msi_to_host(msi)->pcie;
 
-       hwirq = rcar_msi_alloc_region(msi, nvec);
-       if (hwirq < 0)
-               return -ENOSPC;
+       msg->address_lo = rcar_pci_read_reg(pcie, PCIEMSIALR) & ~MSIFE;
+       msg->address_hi = rcar_pci_read_reg(pcie, PCIEMSIAUR);
+       msg->data = data->hwirq;
+}
 
-       irq = irq_find_mapping(msi->domain, hwirq);
-       if (!irq)
-               return -ENOSPC;
+static struct irq_chip rcar_msi_bottom_chip = {
+       .name                   = "Rcar MSI",
+       .irq_ack                = rcar_msi_irq_ack,
+       .irq_mask               = rcar_msi_irq_mask,
+       .irq_unmask             = rcar_msi_irq_unmask,
+       .irq_set_affinity       = rcar_msi_set_affinity,
+       .irq_compose_msi_msg    = rcar_compose_msi_msg,
+};
 
-       for (i = 0; i < nvec; i++) {
-               /*
-                * irq_create_mapping() called from rcar_pcie_probe() pre-
-                * allocates descs,  so there is no need to allocate descs here.
-                * We can therefore assume that if irq_find_mapping() above
-                * returns non-zero, then the descs are also successfully
-                * allocated.
-                */
-               if (irq_set_msi_desc_off(irq, i, desc)) {
-                       /* TODO: clear */
-                       return -EINVAL;
-               }
-       }
+static int rcar_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs, void *args)
+{
+       struct rcar_msi *msi = domain->host_data;
+       unsigned int i;
+       int hwirq;
 
-       desc->nvec_used = nvec;
-       desc->msi_attrib.multiple = order_base_2(nvec);
+       mutex_lock(&msi->map_lock);
 
-       msg.address_lo = rcar_pci_read_reg(pcie, PCIEMSIALR) & ~MSIFE;
-       msg.address_hi = rcar_pci_read_reg(pcie, PCIEMSIAUR);
-       msg.data = hwirq;
+       hwirq = bitmap_find_free_region(msi->used, INT_PCI_MSI_NR, order_base_2(nr_irqs));
 
-       pci_write_msi_msg(irq, &msg);
+       mutex_unlock(&msi->map_lock);
+
+       if (hwirq < 0)
+               return -ENOSPC;
+
+       for (i = 0; i < nr_irqs; i++)
+               irq_domain_set_info(domain, virq + i, hwirq + i,
+                                   &rcar_msi_bottom_chip, domain->host_data,
+                                   handle_edge_irq, NULL, NULL);
 
        return 0;
 }
 
-static void rcar_msi_teardown_irq(struct msi_controller *chip, unsigned int irq)
+static void rcar_msi_domain_free(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs)
 {
-       struct rcar_msi *msi = to_rcar_msi(chip);
-       struct irq_data *d = irq_get_irq_data(irq);
-
-       rcar_msi_free(msi, d->hwirq);
-}
+       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+       struct rcar_msi *msi = domain->host_data;
 
-static struct irq_chip rcar_msi_irq_chip = {
-       .name = "R-Car PCIe MSI",
-       .irq_enable = pci_msi_unmask_irq,
-       .irq_disable = pci_msi_mask_irq,
-       .irq_mask = pci_msi_mask_irq,
-       .irq_unmask = pci_msi_unmask_irq,
-};
+       mutex_lock(&msi->map_lock);
 
-static int rcar_msi_map(struct irq_domain *domain, unsigned int irq,
-                       irq_hw_number_t hwirq)
-{
-       irq_set_chip_and_handler(irq, &rcar_msi_irq_chip, handle_simple_irq);
-       irq_set_chip_data(irq, domain->host_data);
+       bitmap_release_region(msi->used, d->hwirq, order_base_2(nr_irqs));
 
-       return 0;
+       mutex_unlock(&msi->map_lock);
 }
 
-static const struct irq_domain_ops msi_domain_ops = {
-       .map = rcar_msi_map,
+static const struct irq_domain_ops rcar_msi_domain_ops = {
+       .alloc  = rcar_msi_domain_alloc,
+       .free   = rcar_msi_domain_free,
+};
+
+static struct msi_domain_info rcar_msi_info = {
+       .flags  = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+                  MSI_FLAG_MULTI_PCI_MSI),
+       .chip   = &rcar_msi_top_chip,
 };
 
-static void rcar_pcie_unmap_msi(struct rcar_pcie_host *host)
+static int rcar_allocate_domains(struct rcar_msi *msi)
 {
-       struct rcar_msi *msi = &host->msi;
-       int i, irq;
+       struct rcar_pcie *pcie = &msi_to_host(msi)->pcie;
+       struct fwnode_handle *fwnode = dev_fwnode(pcie->dev);
+       struct irq_domain *parent;
+
+       parent = irq_domain_create_linear(fwnode, INT_PCI_MSI_NR,
+                                         &rcar_msi_domain_ops, msi);
+       if (!parent) {
+               dev_err(pcie->dev, "failed to create IRQ domain\n");
+               return -ENOMEM;
+       }
+       irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS);
 
-       for (i = 0; i < INT_PCI_MSI_NR; i++) {
-               irq = irq_find_mapping(msi->domain, i);
-               if (irq > 0)
-                       irq_dispose_mapping(irq);
+       msi->domain = pci_msi_create_irq_domain(fwnode, &rcar_msi_info, parent);
+       if (!msi->domain) {
+               dev_err(pcie->dev, "failed to create MSI domain\n");
+               irq_domain_remove(parent);
+               return -ENOMEM;
        }
 
-       irq_domain_remove(msi->domain);
+       return 0;
 }
 
-static void rcar_pcie_hw_enable_msi(struct rcar_pcie_host *host)
+static void rcar_free_domains(struct rcar_msi *msi)
 {
-       struct rcar_pcie *pcie = &host->pcie;
-       struct rcar_msi *msi = &host->msi;
-       unsigned long base;
-
-       /* setup MSI data target */
-       base = virt_to_phys((void *)msi->pages);
+       struct irq_domain *parent = msi->domain->parent;
 
-       rcar_pci_write_reg(pcie, lower_32_bits(base) | MSIFE, PCIEMSIALR);
-       rcar_pci_write_reg(pcie, upper_32_bits(base), PCIEMSIAUR);
-
-       /* enable all MSI interrupts */
-       rcar_pci_write_reg(pcie, 0xffffffff, PCIEMSIIER);
+       irq_domain_remove(msi->domain);
+       irq_domain_remove(parent);
 }
 
 static int rcar_pcie_enable_msi(struct rcar_pcie_host *host)
@@ -698,29 +674,24 @@ static int rcar_pcie_enable_msi(struct rcar_pcie_host *host)
        struct rcar_pcie *pcie = &host->pcie;
        struct device *dev = pcie->dev;
        struct rcar_msi *msi = &host->msi;
-       int err, i;
-
-       mutex_init(&msi->lock);
+       struct resource res;
+       int err;
 
-       msi->chip.dev = dev;
-       msi->chip.setup_irq = rcar_msi_setup_irq;
-       msi->chip.setup_irqs = rcar_msi_setup_irqs;
-       msi->chip.teardown_irq = rcar_msi_teardown_irq;
+       mutex_init(&msi->map_lock);
+       spin_lock_init(&msi->mask_lock);
 
-       msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR,
-                                           &msi_domain_ops, &msi->chip);
-       if (!msi->domain) {
-               dev_err(dev, "failed to create IRQ domain\n");
-               return -ENOMEM;
-       }
+       err = of_address_to_resource(dev->of_node, 0, &res);
+       if (err)
+               return err;
 
-       for (i = 0; i < INT_PCI_MSI_NR; i++)
-               irq_create_mapping(msi->domain, i);
+       err = rcar_allocate_domains(msi);
+       if (err)
+               return err;
 
        /* Two irqs are for MSI, but they are also used for non-MSI irqs */
        err = devm_request_irq(dev, msi->irq1, rcar_pcie_msi_irq,
                               IRQF_SHARED | IRQF_NO_THREAD,
-                              rcar_msi_irq_chip.name, host);
+                              rcar_msi_bottom_chip.name, host);
        if (err < 0) {
                dev_err(dev, "failed to request IRQ: %d\n", err);
                goto err;
@@ -728,27 +699,32 @@ static int rcar_pcie_enable_msi(struct rcar_pcie_host *host)
 
        err = devm_request_irq(dev, msi->irq2, rcar_pcie_msi_irq,
                               IRQF_SHARED | IRQF_NO_THREAD,
-                              rcar_msi_irq_chip.name, host);
+                              rcar_msi_bottom_chip.name, host);
        if (err < 0) {
                dev_err(dev, "failed to request IRQ: %d\n", err);
                goto err;
        }
 
-       /* setup MSI data target */
-       msi->pages = __get_free_pages(GFP_KERNEL | GFP_DMA32, 0);
-       rcar_pcie_hw_enable_msi(host);
+       /* disable all MSIs */
+       rcar_pci_write_reg(pcie, 0, PCIEMSIIER);
+
+       /*
+        * Set up the MSI data target using the RC base address, which
+        * is guaranteed to be in the low 32-bit range on any RCar HW.
+        */
+       rcar_pci_write_reg(pcie, lower_32_bits(res.start) | MSIFE, PCIEMSIALR);
+       rcar_pci_write_reg(pcie, upper_32_bits(res.start), PCIEMSIAUR);
 
        return 0;
 
 err:
-       rcar_pcie_unmap_msi(host);
+       rcar_free_domains(msi);
        return err;
 }
 
 static void rcar_pcie_teardown_msi(struct rcar_pcie_host *host)
 {
        struct rcar_pcie *pcie = &host->pcie;
-       struct rcar_msi *msi = &host->msi;
 
        /* Disable all MSI interrupts */
        rcar_pci_write_reg(pcie, 0, PCIEMSIIER);
@@ -756,9 +732,7 @@ static void rcar_pcie_teardown_msi(struct rcar_pcie_host *host)
        /* Disable address decoding of the MSI interrupt, MSIFE */
        rcar_pci_write_reg(pcie, 0, PCIEMSIALR);
 
-       free_pages(msi->pages, 0);
-
-       rcar_pcie_unmap_msi(host);
+       rcar_free_domains(&host->msi);
 }
 
 static int rcar_pcie_get_resources(struct rcar_pcie_host *host)
@@ -1011,8 +985,17 @@ static int __maybe_unused rcar_pcie_resume(struct device *dev)
        dev_info(dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f);
 
        /* Enable MSI */
-       if (IS_ENABLED(CONFIG_PCI_MSI))
-               rcar_pcie_hw_enable_msi(host);
+       if (IS_ENABLED(CONFIG_PCI_MSI)) {
+               struct resource res;
+               u32 val;
+
+               of_address_to_resource(dev->of_node, 0, &res);
+               rcar_pci_write_reg(pcie, upper_32_bits(res.start), PCIEMSIAUR);
+               rcar_pci_write_reg(pcie, lower_32_bits(res.start) | MSIFE, PCIEMSIALR);
+
+               bitmap_to_arr32(&val, host->msi.used, INT_PCI_MSI_NR);
+               rcar_pci_write_reg(pcie, val, PCIEMSIIER);
+       }
 
        rcar_pcie_hw_enable(host);
 
index 07e3666..8689311 100644 (file)
@@ -26,6 +26,7 @@
 
 /* Bridge core config registers */
 #define BRCFG_PCIE_RX0                 0x00000000
+#define BRCFG_PCIE_RX1                 0x00000004
 #define BRCFG_INTERRUPT                        0x00000010
 #define BRCFG_PCIE_RX_MSG_FILTER       0x00000020
 
 #define NWL_ECAM_VALUE_DEFAULT         12
 
 #define CFG_DMA_REG_BAR                        GENMASK(2, 0)
+#define CFG_PCIE_CACHE                 GENMASK(7, 0)
 
 #define INT_PCI_MSI_NR                 (2 * 32)
 
@@ -675,6 +677,11 @@ static int nwl_pcie_bridge_init(struct nwl_pcie *pcie)
        nwl_bridge_writel(pcie, CFG_ENABLE_MSG_FILTER_MASK,
                          BRCFG_PCIE_RX_MSG_FILTER);
 
+       /* Route the PCIe DMA traffic through the CCI path */
+       if (of_dma_is_coherent(dev->of_node))
+               nwl_bridge_writel(pcie, nwl_bridge_readl(pcie, BRCFG_PCIE_RX1) |
+                                 CFG_PCIE_CACHE, BRCFG_PCIE_RX1);
+
        err = nwl_wait_for_link(pcie);
        if (err)
                return err;
index fa5baeb..14001fe 100644 (file)
 /**
  * struct xilinx_pcie_port - PCIe port information
  * @reg_base: IO Mapped Register Base
- * @irq: Interrupt number
- * @msi_pages: MSI pages
  * @dev: Device pointer
+ * @msi_map: Bitmap of allocated MSIs
+ * @map_lock: Mutex protecting the MSI allocation
  * @msi_domain: MSI IRQ domain pointer
  * @leg_domain: Legacy IRQ domain pointer
  * @resources: Bus Resources
  */
 struct xilinx_pcie_port {
        void __iomem *reg_base;
-       u32 irq;
-       unsigned long msi_pages;
        struct device *dev;
+       unsigned long msi_map[BITS_TO_LONGS(XILINX_NUM_MSI_IRQS)];
+       struct mutex map_lock;
        struct irq_domain *msi_domain;
        struct irq_domain *leg_domain;
        struct list_head resources;
 };
 
-static DECLARE_BITMAP(msi_irq_in_use, XILINX_NUM_MSI_IRQS);
-
 static inline u32 pcie_read(struct xilinx_pcie_port *port, u32 reg)
 {
        return readl(port->reg_base + reg);
@@ -196,151 +194,118 @@ static struct pci_ops xilinx_pcie_ops = {
 
 /* MSI functions */
 
-/**
- * xilinx_pcie_destroy_msi - Free MSI number
- * @irq: IRQ to be freed
- */
-static void xilinx_pcie_destroy_msi(unsigned int irq)
+static void xilinx_msi_top_irq_ack(struct irq_data *d)
 {
-       struct msi_desc *msi;
-       struct xilinx_pcie_port *port;
-       struct irq_data *d = irq_get_irq_data(irq);
-       irq_hw_number_t hwirq = irqd_to_hwirq(d);
-
-       if (!test_bit(hwirq, msi_irq_in_use)) {
-               msi = irq_get_msi_desc(irq);
-               port = msi_desc_to_pci_sysdata(msi);
-               dev_err(port->dev, "Trying to free unused MSI#%d\n", irq);
-       } else {
-               clear_bit(hwirq, msi_irq_in_use);
-       }
+       /*
+        * xilinx_pcie_intr_handler() will have performed the Ack.
+        * Eventually, this should be fixed and the Ack moved into
+        * the respective callbacks for INTx and MSI.
+        */
 }
 
-/**
- * xilinx_pcie_assign_msi - Allocate MSI number
- *
- * Return: A valid IRQ on success and error value on failure.
- */
-static int xilinx_pcie_assign_msi(void)
-{
-       int pos;
-
-       pos = find_first_zero_bit(msi_irq_in_use, XILINX_NUM_MSI_IRQS);
-       if (pos < XILINX_NUM_MSI_IRQS)
-               set_bit(pos, msi_irq_in_use);
-       else
-               return -ENOSPC;
+static struct irq_chip xilinx_msi_top_chip = {
+       .name           = "PCIe MSI",
+       .irq_ack        = xilinx_msi_top_irq_ack,
+};
 
-       return pos;
+static int xilinx_msi_set_affinity(struct irq_data *d, const struct cpumask *mask, bool force)
+{
+       return -EINVAL;
 }
 
-/**
- * xilinx_msi_teardown_irq - Destroy the MSI
- * @chip: MSI Chip descriptor
- * @irq: MSI IRQ to destroy
- */
-static void xilinx_msi_teardown_irq(struct msi_controller *chip,
-                                   unsigned int irq)
+static void xilinx_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
-       xilinx_pcie_destroy_msi(irq);
-       irq_dispose_mapping(irq);
+       struct xilinx_pcie_port *pcie = irq_data_get_irq_chip_data(data);
+       phys_addr_t pa = ALIGN_DOWN(virt_to_phys(pcie), SZ_4K);
+
+       msg->address_lo = lower_32_bits(pa);
+       msg->address_hi = upper_32_bits(pa);
+       msg->data = data->hwirq;
 }
 
-/**
- * xilinx_pcie_msi_setup_irq - Setup MSI request
- * @chip: MSI chip pointer
- * @pdev: PCIe device pointer
- * @desc: MSI descriptor pointer
- *
- * Return: '0' on success and error value on failure
- */
-static int xilinx_pcie_msi_setup_irq(struct msi_controller *chip,
-                                    struct pci_dev *pdev,
-                                    struct msi_desc *desc)
-{
-       struct xilinx_pcie_port *port = pdev->bus->sysdata;
-       unsigned int irq;
-       int hwirq;
-       struct msi_msg msg;
-       phys_addr_t msg_addr;
+static struct irq_chip xilinx_msi_bottom_chip = {
+       .name                   = "Xilinx MSI",
+       .irq_set_affinity       = xilinx_msi_set_affinity,
+       .irq_compose_msi_msg    = xilinx_compose_msi_msg,
+};
 
-       hwirq = xilinx_pcie_assign_msi();
-       if (hwirq < 0)
-               return hwirq;
+static int xilinx_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs, void *args)
+{
+       struct xilinx_pcie_port *port = domain->host_data;
+       int hwirq, i;
 
-       irq = irq_create_mapping(port->msi_domain, hwirq);
-       if (!irq)
-               return -EINVAL;
+       mutex_lock(&port->map_lock);
 
-       irq_set_msi_desc(irq, desc);
+       hwirq = bitmap_find_free_region(port->msi_map, XILINX_NUM_MSI_IRQS, order_base_2(nr_irqs));
 
-       msg_addr = virt_to_phys((void *)port->msi_pages);
+       mutex_unlock(&port->map_lock);
 
-       msg.address_hi = 0;
-       msg.address_lo = msg_addr;
-       msg.data = irq;
+       if (hwirq < 0)
+               return -ENOSPC;
 
-       pci_write_msi_msg(irq, &msg);
+       for (i = 0; i < nr_irqs; i++)
+               irq_domain_set_info(domain, virq + i, hwirq + i,
+                                   &xilinx_msi_bottom_chip, domain->host_data,
+                                   handle_edge_irq, NULL, NULL);
 
        return 0;
 }
 
-/* MSI Chip Descriptor */
-static struct msi_controller xilinx_pcie_msi_chip = {
-       .setup_irq = xilinx_pcie_msi_setup_irq,
-       .teardown_irq = xilinx_msi_teardown_irq,
-};
+static void xilinx_msi_domain_free(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs)
+{
+       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+       struct xilinx_pcie_port *port = domain->host_data;
 
-/* HW Interrupt Chip Descriptor */
-static struct irq_chip xilinx_msi_irq_chip = {
-       .name = "Xilinx PCIe MSI",
-       .irq_enable = pci_msi_unmask_irq,
-       .irq_disable = pci_msi_mask_irq,
-       .irq_mask = pci_msi_mask_irq,
-       .irq_unmask = pci_msi_unmask_irq,
-};
+       mutex_lock(&port->map_lock);
 
-/**
- * xilinx_pcie_msi_map - Set the handler for the MSI and mark IRQ as valid
- * @domain: IRQ domain
- * @irq: Virtual IRQ number
- * @hwirq: HW interrupt number
- *
- * Return: Always returns 0.
- */
-static int xilinx_pcie_msi_map(struct irq_domain *domain, unsigned int irq,
-                              irq_hw_number_t hwirq)
-{
-       irq_set_chip_and_handler(irq, &xilinx_msi_irq_chip, handle_simple_irq);
-       irq_set_chip_data(irq, domain->host_data);
+       bitmap_release_region(port->msi_map, d->hwirq, order_base_2(nr_irqs));
 
-       return 0;
+       mutex_unlock(&port->map_lock);
 }
 
-/* IRQ Domain operations */
-static const struct irq_domain_ops msi_domain_ops = {
-       .map = xilinx_pcie_msi_map,
+static const struct irq_domain_ops xilinx_msi_domain_ops = {
+       .alloc  = xilinx_msi_domain_alloc,
+       .free   = xilinx_msi_domain_free,
 };
 
-/**
- * xilinx_pcie_enable_msi - Enable MSI support
- * @port: PCIe port information
- */
-static int xilinx_pcie_enable_msi(struct xilinx_pcie_port *port)
+static struct msi_domain_info xilinx_msi_info = {
+       .flags  = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS),
+       .chip   = &xilinx_msi_top_chip,
+};
+
+static int xilinx_allocate_msi_domains(struct xilinx_pcie_port *pcie)
 {
-       phys_addr_t msg_addr;
+       struct fwnode_handle *fwnode = dev_fwnode(pcie->dev);
+       struct irq_domain *parent;
 
-       port->msi_pages = __get_free_pages(GFP_KERNEL, 0);
-       if (!port->msi_pages)
+       parent = irq_domain_create_linear(fwnode, XILINX_NUM_MSI_IRQS,
+                                         &xilinx_msi_domain_ops, pcie);
+       if (!parent) {
+               dev_err(pcie->dev, "failed to create IRQ domain\n");
                return -ENOMEM;
+       }
+       irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS);
 
-       msg_addr = virt_to_phys((void *)port->msi_pages);
-       pcie_write(port, 0x0, XILINX_PCIE_REG_MSIBASE1);
-       pcie_write(port, msg_addr, XILINX_PCIE_REG_MSIBASE2);
+       pcie->msi_domain = pci_msi_create_irq_domain(fwnode, &xilinx_msi_info, parent);
+       if (!pcie->msi_domain) {
+               dev_err(pcie->dev, "failed to create MSI domain\n");
+               irq_domain_remove(parent);
+               return -ENOMEM;
+       }
 
        return 0;
 }
 
+static void xilinx_free_msi_domains(struct xilinx_pcie_port *pcie)
+{
+       struct irq_domain *parent = pcie->msi_domain->parent;
+
+       irq_domain_remove(pcie->msi_domain);
+       irq_domain_remove(parent);
+}
+
 /* INTx Functions */
 
 /**
@@ -420,6 +385,8 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
        }
 
        if (status & (XILINX_PCIE_INTR_INTX | XILINX_PCIE_INTR_MSI)) {
+               unsigned int irq;
+
                val = pcie_read(port, XILINX_PCIE_REG_RPIFR1);
 
                /* Check whether interrupt valid */
@@ -432,20 +399,19 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
                if (val & XILINX_PCIE_RPIFR1_MSI_INTR) {
                        val = pcie_read(port, XILINX_PCIE_REG_RPIFR2) &
                                XILINX_PCIE_RPIFR2_MSG_DATA;
+                       irq = irq_find_mapping(port->msi_domain->parent, val);
                } else {
                        val = (val & XILINX_PCIE_RPIFR1_INTR_MASK) >>
                                XILINX_PCIE_RPIFR1_INTR_SHIFT;
-                       val = irq_find_mapping(port->leg_domain, val);
+                       irq = irq_find_mapping(port->leg_domain, val);
                }
 
                /* Clear interrupt FIFO register 1 */
                pcie_write(port, XILINX_PCIE_RPIFR1_ALL_MASK,
                           XILINX_PCIE_REG_RPIFR1);
 
-               /* Handle the interrupt */
-               if (IS_ENABLED(CONFIG_PCI_MSI) ||
-                   !(val & XILINX_PCIE_RPIFR1_MSI_INTR))
-                       generic_handle_irq(val);
+               if (irq)
+                       generic_handle_irq(irq);
        }
 
        if (status & XILINX_PCIE_INTR_SLV_UNSUPP)
@@ -491,12 +457,11 @@ error:
 static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
 {
        struct device *dev = port->dev;
-       struct device_node *node = dev->of_node;
        struct device_node *pcie_intc_node;
        int ret;
 
        /* Setup INTx */
-       pcie_intc_node = of_get_next_child(node, NULL);
+       pcie_intc_node = of_get_next_child(dev->of_node, NULL);
        if (!pcie_intc_node) {
                dev_err(dev, "No PCIe Intc node found\n");
                return -ENODEV;
@@ -513,18 +478,14 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
 
        /* Setup MSI */
        if (IS_ENABLED(CONFIG_PCI_MSI)) {
-               port->msi_domain = irq_domain_add_linear(node,
-                                                        XILINX_NUM_MSI_IRQS,
-                                                        &msi_domain_ops,
-                                                        &xilinx_pcie_msi_chip);
-               if (!port->msi_domain) {
-                       dev_err(dev, "Failed to get a MSI IRQ domain\n");
-                       return -ENODEV;
-               }
+               phys_addr_t pa = ALIGN_DOWN(virt_to_phys(port), SZ_4K);
 
-               ret = xilinx_pcie_enable_msi(port);
+               ret = xilinx_allocate_msi_domains(port);
                if (ret)
                        return ret;
+
+               pcie_write(port, upper_32_bits(pa), XILINX_PCIE_REG_MSIBASE1);
+               pcie_write(port, lower_32_bits(pa), XILINX_PCIE_REG_MSIBASE2);
        }
 
        return 0;
@@ -572,6 +533,7 @@ static int xilinx_pcie_parse_dt(struct xilinx_pcie_port *port)
        struct device *dev = port->dev;
        struct device_node *node = dev->of_node;
        struct resource regs;
+       unsigned int irq;
        int err;
 
        err = of_address_to_resource(node, 0, &regs);
@@ -584,12 +546,12 @@ static int xilinx_pcie_parse_dt(struct xilinx_pcie_port *port)
        if (IS_ERR(port->reg_base))
                return PTR_ERR(port->reg_base);
 
-       port->irq = irq_of_parse_and_map(node, 0);
-       err = devm_request_irq(dev, port->irq, xilinx_pcie_intr_handler,
+       irq = irq_of_parse_and_map(node, 0);
+       err = devm_request_irq(dev, irq, xilinx_pcie_intr_handler,
                               IRQF_SHARED | IRQF_NO_THREAD,
                               "xilinx-pcie", port);
        if (err) {
-               dev_err(dev, "unable to request irq %d\n", port->irq);
+               dev_err(dev, "unable to request irq %d\n", irq);
                return err;
        }
 
@@ -617,7 +579,7 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
                return -ENODEV;
 
        port = pci_host_bridge_priv(bridge);
-
+       mutex_init(&port->map_lock);
        port->dev = dev;
 
        err = xilinx_pcie_parse_dt(port);
@@ -637,11 +599,11 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
        bridge->sysdata = port;
        bridge->ops = &xilinx_pcie_ops;
 
-#ifdef CONFIG_PCI_MSI
-       xilinx_pcie_msi_chip.dev = dev;
-       bridge->msi = &xilinx_pcie_msi_chip;
-#endif
-       return pci_host_probe(bridge);
+       err = pci_host_probe(bridge);
+       if (err)
+               xilinx_free_msi_domains(port);
+
+       return err;
 }
 
 static const struct of_device_id xilinx_pcie_of_match[] = {
index 5e80f28..e3fcdfe 100644 (file)
@@ -28,6 +28,7 @@
 #define BUS_RESTRICT_CAP(vmcap)        (vmcap & 0x1)
 #define PCI_REG_VMCONFIG       0x44
 #define BUS_RESTRICT_CFG(vmcfg)        ((vmcfg >> 8) & 0x3)
+#define VMCONFIG_MSI_REMAP     0x2
 #define PCI_REG_VMLOCK         0x70
 #define MB2_SHADOW_EN(vmlock)  (vmlock & 0x2)
 
@@ -59,6 +60,13 @@ enum vmd_features {
         * be used for MSI remapping
         */
        VMD_FEAT_OFFSET_FIRST_VECTOR            = (1 << 3),
+
+       /*
+        * Device can bypass remapping MSI-X transactions into its MSI-X table,
+        * avoiding the requirement of a VMD MSI domain for child device
+        * interrupt handling.
+        */
+       VMD_FEAT_CAN_BYPASS_MSI_REMAP           = (1 << 4),
 };
 
 /*
@@ -306,6 +314,16 @@ static struct msi_domain_info vmd_msi_domain_info = {
        .chip           = &vmd_msi_controller,
 };
 
+static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable)
+{
+       u16 reg;
+
+       pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, &reg);
+       reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) :
+                      (reg | VMCONFIG_MSI_REMAP);
+       pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg);
+}
+
 static int vmd_create_irq_domain(struct vmd_dev *vmd)
 {
        struct fwnode_handle *fn;
@@ -325,6 +343,13 @@ static int vmd_create_irq_domain(struct vmd_dev *vmd)
 
 static void vmd_remove_irq_domain(struct vmd_dev *vmd)
 {
+       /*
+        * Some production BIOS won't enable remapping between soft reboots.
+        * Ensure remapping is restored before unloading the driver.
+        */
+       if (!vmd->msix_count)
+               vmd_set_msi_remapping(vmd, true);
+
        if (vmd->irq_domain) {
                struct fwnode_handle *fn = vmd->irq_domain->fwnode;
 
@@ -679,15 +704,32 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 
        sd->node = pcibus_to_node(vmd->dev->bus);
 
-       ret = vmd_create_irq_domain(vmd);
-       if (ret)
-               return ret;
-
        /*
-        * Override the irq domain bus token so the domain can be distinguished
-        * from a regular PCI/MSI domain.
+        * Currently MSI remapping must be enabled in guest passthrough mode
+        * due to some missing interrupt remapping plumbing. This is probably
+        * acceptable because the guest is usually CPU-limited and MSI
+        * remapping doesn't become a performance bottleneck.
         */
-       irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI);
+       if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) ||
+           offset[0] || offset[1]) {
+               ret = vmd_alloc_irqs(vmd);
+               if (ret)
+                       return ret;
+
+               vmd_set_msi_remapping(vmd, true);
+
+               ret = vmd_create_irq_domain(vmd);
+               if (ret)
+                       return ret;
+
+               /*
+                * Override the IRQ domain bus token so the domain can be
+                * distinguished from a regular PCI/MSI domain.
+                */
+               irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI);
+       } else {
+               vmd_set_msi_remapping(vmd, false);
+       }
 
        pci_add_resource(&resources, &vmd->resources[0]);
        pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
@@ -753,10 +795,6 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
        if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
                vmd->first_vec = 1;
 
-       err = vmd_alloc_irqs(vmd);
-       if (err)
-               return err;
-
        spin_lock_init(&vmd->cfg_lock);
        pci_set_drvdata(dev, vmd);
        err = vmd_enable_domain(vmd, features);
@@ -825,7 +863,8 @@ static const struct pci_device_id vmd_ids[] = {
                .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,},
        {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0),
                .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW |
-                               VMD_FEAT_HAS_BUS_RESTRICTIONS,},
+                               VMD_FEAT_HAS_BUS_RESTRICTIONS |
+                               VMD_FEAT_CAN_BYPASS_MSI_REMAP,},
        {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f),
                .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP |
                                VMD_FEAT_HAS_BUS_RESTRICTIONS |
index 338148c..bce274d 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/**
+/*
  * Endpoint Function Driver to implement Non-Transparent Bridge functionality
  *
  * Copyright (C) 2020 Texas Instruments
@@ -696,7 +696,8 @@ reset_handler:
 
 /**
  * epf_ntb_peer_spad_bar_clear() - Clear Peer Scratchpad BAR
- * @ntb: NTB device that facilitates communication between HOST1 and HOST2
+ * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound
+ *          address.
  *
  *+-----------------+------->+------------------+        +-----------------+
  *|       BAR0      |        |  CONFIG REGION   |        |       BAR0      |
@@ -740,6 +741,7 @@ static void epf_ntb_peer_spad_bar_clear(struct epf_ntb_epc *ntb_epc)
 /**
  * epf_ntb_peer_spad_bar_set() - Set peer scratchpad BAR
  * @ntb: NTB device that facilitates communication between HOST1 and HOST2
+ * @type: PRIMARY interface or SECONDARY interface
  *
  *+-----------------+------->+------------------+        +-----------------+
  *|       BAR0      |        |  CONFIG REGION   |        |       BAR0      |
@@ -808,7 +810,8 @@ static int epf_ntb_peer_spad_bar_set(struct epf_ntb *ntb,
 
 /**
  * epf_ntb_config_sspad_bar_clear() - Clear Config + Self scratchpad BAR
- * @ntb: NTB device that facilitates communication between HOST1 and HOST2
+ * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound
+ *          address.
  *
  * +-----------------+------->+------------------+        +-----------------+
  * |       BAR0      |        |  CONFIG REGION   |        |       BAR0      |
@@ -851,7 +854,8 @@ static void epf_ntb_config_sspad_bar_clear(struct epf_ntb_epc *ntb_epc)
 
 /**
  * epf_ntb_config_sspad_bar_set() - Set Config + Self scratchpad BAR
- * @ntb: NTB device that facilitates communication between HOST1 and HOST2
+ * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound
+ *          address.
  *
  * +-----------------+------->+------------------+        +-----------------+
  * |       BAR0      |        |  CONFIG REGION   |        |       BAR0      |
@@ -1312,6 +1316,7 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb,
 
 /**
  * epf_ntb_alloc_peer_mem() - Allocate memory in peer's outbound address space
+ * @dev: The PCI device.
  * @ntb_epc: EPC associated with one of the HOST whose BAR holds peer's outbound
  *   address
  * @bar: BAR of @ntb_epc in for which memory has to be allocated (could be
@@ -1660,7 +1665,6 @@ static int epf_ntb_init_epc_bar_interface(struct epf_ntb *ntb,
  * epf_ntb_init_epc_bar() - Identify BARs to be used for each of the NTB
  * constructs (scratchpad region, doorbell, memorywindow)
  * @ntb: NTB device that facilitates communication between HOST1 and HOST2
- * @type: PRIMARY interface or SECONDARY interface
  *
  * Wrapper to epf_ntb_init_epc_bar_interface() to identify the free BARs
  * to be used for each of BAR_CONFIG, BAR_PEER_SPAD, BAR_DB_MW1, BAR_MW2,
@@ -2037,6 +2041,8 @@ static const struct config_item_type ntb_group_type = {
 /**
  * epf_ntb_add_cfs() - Add configfs directory specific to NTB
  * @epf: NTB endpoint function device
+ * @group: A pointer to the config_group structure referencing a group of
+ *        config_items of a specific type that belong to a specific sub-system.
  *
  * Add configfs directory specific to NTB. This directory will hold
  * NTB specific properties like db_count, spad_count, num_mws etc.,
index c0ac4e9..d2708ca 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/**
+/*
  * Test driver to test endpoint functionality
  *
  * Copyright (C) 2017 Texas Instruments
@@ -833,15 +833,18 @@ static int pci_epf_test_bind(struct pci_epf *epf)
                return -EINVAL;
 
        epc_features = pci_epc_get_features(epc, epf->func_no);
-       if (epc_features) {
-               linkup_notifier = epc_features->linkup_notifier;
-               core_init_notifier = epc_features->core_init_notifier;
-               test_reg_bar = pci_epc_get_first_free_bar(epc_features);
-               if (test_reg_bar < 0)
-                       return -EINVAL;
-               pci_epf_configure_bar(epf, epc_features);
+       if (!epc_features) {
+               dev_err(&epf->dev, "epc_features not implemented\n");
+               return -EOPNOTSUPP;
        }
 
+       linkup_notifier = epc_features->linkup_notifier;
+       core_init_notifier = epc_features->core_init_notifier;
+       test_reg_bar = pci_epc_get_first_free_bar(epc_features);
+       if (test_reg_bar < 0)
+               return -EINVAL;
+       pci_epf_configure_bar(epf, epc_features);
+
        epf_test->test_reg_bar = test_reg_bar;
        epf_test->epc_features = epc_features;
 
@@ -922,6 +925,7 @@ static int __init pci_epf_test_init(void)
 
        ret = pci_epf_register_driver(&test_driver);
        if (ret) {
+               destroy_workqueue(kpcitest_workqueue);
                pr_err("Failed to register pci epf test driver --> %d\n", ret);
                return ret;
        }
@@ -932,6 +936,8 @@ module_init(pci_epf_test_init);
 
 static void __exit pci_epf_test_exit(void)
 {
+       if (kpcitest_workqueue)
+               destroy_workqueue(kpcitest_workqueue);
        pci_epf_unregister_driver(&test_driver);
 }
 module_exit(pci_epf_test_exit);
index cc8f9eb..adec9be 100644 (file)
@@ -594,6 +594,8 @@ EXPORT_SYMBOL_GPL(pci_epc_add_epf);
  * pci_epc_remove_epf() - remove PCI endpoint function from endpoint controller
  * @epc: the EPC device from which the endpoint function should be removed
  * @epf: the endpoint function to be removed
+ * @type: identifies if the EPC is connected to the primary or secondary
+ *        interface of EPF
  *
  * Invoke to remove PCI endpoint function from the endpoint controller.
  */
index 7646c86..e9289d1 100644 (file)
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(pci_epf_bind);
 void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
                        enum pci_epc_interface_type type)
 {
-       struct device *dev = epf->epc->dev.parent;
+       struct device *dev;
        struct pci_epf_bar *epf_bar;
        struct pci_epc *epc;
 
index 2750a64..4fedebf 100644 (file)
@@ -157,7 +157,7 @@ static int pcihp_is_ejectable(acpi_handle handle)
 }
 
 /**
- * acpi_pcihp_check_ejectable - check if handle is ejectable ACPI PCI slot
+ * acpi_pci_check_ejectable - check if handle is ejectable ACPI PCI slot
  * @pbus: the PCI bus of the PCI slot corresponding to 'handle'
  * @handle: ACPI handle to check
  *
index a74b274..1f8ab43 100644 (file)
@@ -148,8 +148,7 @@ static inline struct acpiphp_root_context *to_acpiphp_root_context(struct acpi_h
  * ACPI has no generic method of setting/getting attention status
  * this allows for device specific driver registration
  */
-struct acpiphp_attention_info
-{
+struct acpiphp_attention_info {
        int (*set_attn)(struct hotplug_slot *slot, u8 status);
        int (*get_attn)(struct hotplug_slot *slot, u8 *status);
        struct module *owner;
index 3365c93..f031302 100644 (file)
@@ -533,6 +533,7 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
                        slot->flags &= ~SLOT_ENABLED;
                        continue;
                }
+               pci_dev_put(dev);
        }
 }
 
index 00cd2b4..7a65d42 100644 (file)
@@ -80,7 +80,7 @@ static u8 evbuffer[1024];
 static void __iomem *compaq_int15_entry_point;
 
 /* lock for ordering int15_bios_call() */
-static spinlock_t int15_lock;
+static DEFINE_SPINLOCK(int15_lock);
 
 
 /* This is a series of function that deals with
@@ -415,9 +415,6 @@ void compaq_nvram_init(void __iomem *rom_start)
                compaq_int15_entry_point = (rom_start + ROM_INT15_PHY_ADDR - ROM_PHY_ADDR);
 
        dbg("int15 entry  = %p\n", compaq_int15_entry_point);
-
-       /* initialize our int15 lock */
-       spin_lock_init(&int15_lock);
 }
 
 
index f8f056b..0148687 100644 (file)
@@ -35,7 +35,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot)
                return rc;
        zdev->state = ZPCI_FN_STATE_CONFIGURED;
 
-       return zpci_configure_device(zdev, zdev->fh);
+       return zpci_scan_configured_device(zdev, zdev->fh);
 }
 
 static int disable_slot(struct hotplug_slot *hotplug_slot)
index db04728..9e3b277 100644 (file)
@@ -174,11 +174,6 @@ static inline u8 shpc_readb(struct controller *ctrl, int reg)
        return readb(ctrl->creg + reg);
 }
 
-static inline void shpc_writeb(struct controller *ctrl, int reg, u8 val)
-{
-       writeb(val, ctrl->creg + reg);
-}
-
 static inline u16 shpc_readw(struct controller *ctrl, int reg)
 {
        return readw(ctrl->creg + reg);
index 3162f88..217dc9f 100644 (file)
@@ -64,39 +64,18 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
 /* Arch hooks */
 int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 {
-       struct msi_controller *chip = dev->bus->msi;
-       int err;
-
-       if (!chip || !chip->setup_irq)
-               return -EINVAL;
-
-       err = chip->setup_irq(chip, dev, desc);
-       if (err < 0)
-               return err;
-
-       irq_set_chip_data(desc->irq, chip);
-
-       return 0;
+       return -EINVAL;
 }
 
 void __weak arch_teardown_msi_irq(unsigned int irq)
 {
-       struct msi_controller *chip = irq_get_chip_data(irq);
-
-       if (!chip || !chip->teardown_irq)
-               return;
-
-       chip->teardown_irq(chip, irq);
 }
 
 int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-       struct msi_controller *chip = dev->bus->msi;
        struct msi_desc *entry;
        int ret;
 
-       if (chip && chip->setup_irqs)
-               return chip->setup_irqs(chip, dev, nvec, type);
        /*
         * If an architecture wants to support multiple MSI, it needs to
         * override arch_setup_msi_irqs()
@@ -115,11 +94,7 @@ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
        return 0;
 }
 
-/*
- * We have a default implementation available as a separate non-weak
- * function, as it is used by the Xen x86 PCI code
- */
-void default_teardown_msi_irqs(struct pci_dev *dev)
+void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
 {
        int i;
        struct msi_desc *entry;
@@ -129,11 +104,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev)
                        for (i = 0; i < entry->nvec_used; i++)
                                arch_teardown_msi_irq(entry->irq + i);
 }
-
-void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
-{
-       return default_teardown_msi_irqs(dev);
-}
 #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
 
 static void default_restore_msi_irq(struct pci_dev *dev, int irq)
@@ -901,8 +871,15 @@ static int pci_msi_supported(struct pci_dev *dev, int nvec)
         * Any bridge which does NOT route MSI transactions from its
         * secondary bus to its primary bus must set NO_MSI flag on
         * the secondary pci_bus.
-        * We expect only arch-specific PCI host bus controller driver
-        * or quirks for specific PCI bridges to be setting NO_MSI.
+        *
+        * The NO_MSI flag can either be set directly by:
+        * - arch-specific PCI host bus controller drivers (deprecated)
+        * - quirks for specific PCI bridges
+        *
+        * or indirectly by platform-specific PCI host bridge drivers by
+        * advertising the 'msi_domain' property, which results in
+        * the NO_MSI flag when no MSI domain is found for this bridge
+        * at probe time.
         */
        for (bus = dev->bus; bus; bus = bus->parent)
                if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
index 5ea472a..da5b414 100644 (file)
@@ -190,10 +190,18 @@ int of_pci_parse_bus_range(struct device_node *node, struct resource *res)
 EXPORT_SYMBOL_GPL(of_pci_parse_bus_range);
 
 /**
- * This function will try to obtain the host bridge domain number by
- * finding a property called "linux,pci-domain" of the given device node.
+ * of_get_pci_domain_nr - Find the host bridge domain number
+ *                       of the given device node.
+ * @node: Device tree node with the domain information.
  *
- * @node: device tree node with the domain information
+ * This function will try to obtain the host bridge domain number by finding
+ * a property called "linux,pci-domain" of the given device node.
+ *
+ * Return:
+ * * >= 0      - On success, an associated domain number.
+ * * -EINVAL   - The property "linux,pci-domain" does not exist.
+ * * -ENODATA  - The "linux,pci-domain" property does not have a value.
+ * * -EOVERFLOW        - Invalid "linux,pci-domain" property value.
  *
  * Returns the associated domain number from DT in the range [0-0xffff], or
  * a negative value if the required property is not found.
@@ -585,10 +593,16 @@ int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge)
 #endif /* CONFIG_PCI */
 
 /**
+ * of_pci_get_max_link_speed - Find the maximum link speed of the given device node.
+ * @node: Device tree node with the maximum link speed information.
+ *
  * This function will try to find the limitation of link speed by finding
  * a property called "max-link-speed" of the given device node.
  *
- * @node: device tree node with the max link speed information
+ * Return:
+ * * > 0       - On success, a maximum link speed.
+ * * -EINVAL   - Invalid "max-link-speed" property value, or failure to access
+ *               the property of the device tree node.
  *
  * Returns the associated max link speed from DT, or a negative value if the
  * required property is not found or is invalid.
index 53502a7..36bc23e 100644 (file)
@@ -1021,7 +1021,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 
        if (!error)
                pci_dbg(dev, "power state changed by ACPI to %s\n",
-                        acpi_power_state_string(state_conv[state]));
+                       acpi_power_state_string(adev->power.state));
 
        return error;
 }
index 781e45c..c32f3b7 100644 (file)
 #include <linux/pci-acpi.h>
 #include "pci.h"
 
+static bool device_has_acpi_name(struct device *dev)
+{
+#ifdef CONFIG_ACPI
+       acpi_handle handle = ACPI_HANDLE(dev);
+
+       if (!handle)
+               return false;
+
+       return acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2,
+                             1 << DSM_PCI_DEVICE_NAME);
+#else
+       return false;
+#endif
+}
+
 #ifdef CONFIG_DMI
 enum smbios_attr_enum {
        SMBIOS_ATTR_NONE = 0,
@@ -45,13 +60,9 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf,
 {
        const struct dmi_device *dmi;
        struct dmi_dev_onboard *donboard;
-       int domain_nr;
-       int bus;
-       int devfn;
-
-       domain_nr = pci_domain_nr(pdev->bus);
-       bus = pdev->bus->number;
-       devfn = pdev->devfn;
+       int domain_nr = pci_domain_nr(pdev->bus);
+       int bus = pdev->bus->number;
+       int devfn = pdev->devfn;
 
        dmi = NULL;
        while ((dmi = dmi_find_device(DMI_DEV_TYPE_DEV_ONBOARD,
@@ -62,13 +73,11 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf,
                                donboard->devfn == devfn) {
                        if (buf) {
                                if (attribute == SMBIOS_ATTR_INSTANCE_SHOW)
-                                       return scnprintf(buf, PAGE_SIZE,
-                                                        "%d\n",
-                                                        donboard->instance);
+                                       return sysfs_emit(buf, "%d\n",
+                                                         donboard->instance);
                                else if (attribute == SMBIOS_ATTR_LABEL_SHOW)
-                                       return scnprintf(buf, PAGE_SIZE,
-                                                        "%s\n",
-                                                        dmi->name);
+                                       return sysfs_emit(buf, "%s\n",
+                                                         dmi->name);
                        }
                        return strlen(dmi->name);
                }
@@ -76,78 +85,52 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf,
        return 0;
 }
 
-static umode_t smbios_instance_string_exist(struct kobject *kobj,
-                                           struct attribute *attr, int n)
+static ssize_t smbios_label_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
 {
-       struct device *dev;
-       struct pci_dev *pdev;
-
-       dev = kobj_to_dev(kobj);
-       pdev = to_pci_dev(dev);
-
-       return find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE) ?
-                                          S_IRUGO : 0;
-}
-
-static ssize_t smbioslabel_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       struct pci_dev *pdev;
-       pdev = to_pci_dev(dev);
+       struct pci_dev *pdev = to_pci_dev(dev);
 
        return find_smbios_instance_string(pdev, buf,
                                           SMBIOS_ATTR_LABEL_SHOW);
 }
+static struct device_attribute dev_attr_smbios_label = __ATTR(label, 0444,
+                                                   smbios_label_show, NULL);
 
-static ssize_t smbiosinstance_show(struct device *dev,
-                                  struct device_attribute *attr, char *buf)
+static ssize_t index_show(struct device *dev, struct device_attribute *attr,
+                         char *buf)
 {
-       struct pci_dev *pdev;
-       pdev = to_pci_dev(dev);
+       struct pci_dev *pdev = to_pci_dev(dev);
 
        return find_smbios_instance_string(pdev, buf,
                                           SMBIOS_ATTR_INSTANCE_SHOW);
 }
+static DEVICE_ATTR_RO(index);
 
-static struct device_attribute smbios_attr_label = {
-       .attr = {.name = "label", .mode = 0444},
-       .show = smbioslabel_show,
-};
-
-static struct device_attribute smbios_attr_instance = {
-       .attr = {.name = "index", .mode = 0444},
-       .show = smbiosinstance_show,
-};
-
-static struct attribute *smbios_attributes[] = {
-       &smbios_attr_label.attr,
-       &smbios_attr_instance.attr,
+static struct attribute *smbios_attrs[] = {
+       &dev_attr_smbios_label.attr,
+       &dev_attr_index.attr,
        NULL,
 };
 
-static const struct attribute_group smbios_attr_group = {
-       .attrs = smbios_attributes,
-       .is_visible = smbios_instance_string_exist,
-};
-
-static int pci_create_smbiosname_file(struct pci_dev *pdev)
+static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a,
+                                     int n)
 {
-       return sysfs_create_group(&pdev->dev.kobj, &smbios_attr_group);
-}
+       struct device *dev = kobj_to_dev(kobj);
+       struct pci_dev *pdev = to_pci_dev(dev);
 
-static void pci_remove_smbiosname_file(struct pci_dev *pdev)
-{
-       sysfs_remove_group(&pdev->dev.kobj, &smbios_attr_group);
-}
-#else
-static inline int pci_create_smbiosname_file(struct pci_dev *pdev)
-{
-       return -1;
-}
+       if (device_has_acpi_name(dev))
+               return 0;
 
-static inline void pci_remove_smbiosname_file(struct pci_dev *pdev)
-{
+       if (!find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE))
+               return 0;
+
+       return a->mode;
 }
+
+const struct attribute_group pci_dev_smbios_attr_group = {
+       .attrs = smbios_attrs,
+       .is_visible = smbios_attr_is_visible,
+};
 #endif
 
 #ifdef CONFIG_ACPI
@@ -169,11 +152,10 @@ static void dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf)
 static int dsm_get_label(struct device *dev, char *buf,
                         enum acpi_attr_enum attr)
 {
-       acpi_handle handle;
+       acpi_handle handle = ACPI_HANDLE(dev);
        union acpi_object *obj, *tmp;
        int len = -1;
 
-       handle = ACPI_HANDLE(dev);
        if (!handle)
                return -1;
 
@@ -209,103 +191,39 @@ static int dsm_get_label(struct device *dev, char *buf,
        return len;
 }
 
-static bool device_has_dsm(struct device *dev)
-{
-       acpi_handle handle;
-
-       handle = ACPI_HANDLE(dev);
-       if (!handle)
-               return false;
-
-       return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2,
-                               1 << DSM_PCI_DEVICE_NAME);
-}
-
-static umode_t acpi_index_string_exist(struct kobject *kobj,
-                                      struct attribute *attr, int n)
-{
-       struct device *dev;
-
-       dev = kobj_to_dev(kobj);
-
-       if (device_has_dsm(dev))
-               return S_IRUGO;
-
-       return 0;
-}
-
-static ssize_t acpilabel_show(struct device *dev,
-                             struct device_attribute *attr, char *buf)
+static ssize_t label_show(struct device *dev, struct device_attribute *attr,
+                         char *buf)
 {
        return dsm_get_label(dev, buf, ACPI_ATTR_LABEL_SHOW);
 }
+static DEVICE_ATTR_RO(label);
 
-static ssize_t acpiindex_show(struct device *dev,
+static ssize_t acpi_index_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
 {
        return dsm_get_label(dev, buf, ACPI_ATTR_INDEX_SHOW);
 }
+static DEVICE_ATTR_RO(acpi_index);
 
-static struct device_attribute acpi_attr_label = {
-       .attr = {.name = "label", .mode = 0444},
-       .show = acpilabel_show,
-};
-
-static struct device_attribute acpi_attr_index = {
-       .attr = {.name = "acpi_index", .mode = 0444},
-       .show = acpiindex_show,
-};
-
-static struct attribute *acpi_attributes[] = {
-       &acpi_attr_label.attr,
-       &acpi_attr_index.attr,
+static struct attribute *acpi_attrs[] = {
+       &dev_attr_label.attr,
+       &dev_attr_acpi_index.attr,
        NULL,
 };
 
-static const struct attribute_group acpi_attr_group = {
-       .attrs = acpi_attributes,
-       .is_visible = acpi_index_string_exist,
-};
-
-static int pci_create_acpi_index_label_files(struct pci_dev *pdev)
+static umode_t acpi_attr_is_visible(struct kobject *kobj, struct attribute *a,
+                                   int n)
 {
-       return sysfs_create_group(&pdev->dev.kobj, &acpi_attr_group);
-}
+       struct device *dev = kobj_to_dev(kobj);
 
-static int pci_remove_acpi_index_label_files(struct pci_dev *pdev)
-{
-       sysfs_remove_group(&pdev->dev.kobj, &acpi_attr_group);
-       return 0;
-}
-#else
-static inline int pci_create_acpi_index_label_files(struct pci_dev *pdev)
-{
-       return -1;
-}
+       if (!device_has_acpi_name(dev))
+               return 0;
 
-static inline int pci_remove_acpi_index_label_files(struct pci_dev *pdev)
-{
-       return -1;
+       return a->mode;
 }
 
-static inline bool device_has_dsm(struct device *dev)
-{
-       return false;
-}
+const struct attribute_group pci_dev_acpi_attr_group = {
+       .attrs = acpi_attrs,
+       .is_visible = acpi_attr_is_visible,
+};
 #endif
-
-void pci_create_firmware_label_files(struct pci_dev *pdev)
-{
-       if (device_has_dsm(&pdev->dev))
-               pci_create_acpi_index_label_files(pdev);
-       else
-               pci_create_smbiosname_file(pdev);
-}
-
-void pci_remove_firmware_label_files(struct pci_dev *pdev)
-{
-       if (device_has_dsm(&pdev->dev))
-               pci_remove_acpi_index_label_files(pdev);
-       else
-               pci_remove_smbiosname_file(pdev);
-}
index a6b8fbb..beb8d1f 100644 (file)
@@ -39,7 +39,7 @@ field##_show(struct device *dev, struct device_attribute *attr, char *buf)                            \
        struct pci_dev *pdev;                                           \
                                                                        \
        pdev = to_pci_dev(dev);                                         \
-       return sprintf(buf, format_string, pdev->field);                \
+       return sysfs_emit(buf, format_string, pdev->field);             \
 }                                                                      \
 static DEVICE_ATTR_RO(field)
 
@@ -56,7 +56,7 @@ static ssize_t broken_parity_status_show(struct device *dev,
                                         char *buf)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
-       return sprintf(buf, "%u\n", pdev->broken_parity_status);
+       return sysfs_emit(buf, "%u\n", pdev->broken_parity_status);
 }
 
 static ssize_t broken_parity_status_store(struct device *dev,
@@ -129,7 +129,7 @@ static ssize_t power_state_show(struct device *dev,
 {
        struct pci_dev *pdev = to_pci_dev(dev);
 
-       return sprintf(buf, "%s\n", pci_power_name(pdev->current_state));
+       return sysfs_emit(buf, "%s\n", pci_power_name(pdev->current_state));
 }
 static DEVICE_ATTR_RO(power_state);
 
@@ -138,10 +138,10 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
                             char *buf)
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
-       char *str = buf;
        int i;
        int max;
        resource_size_t start, end;
+       size_t len = 0;
 
        if (pci_dev->subordinate)
                max = DEVICE_COUNT_RESOURCE;
@@ -151,12 +151,12 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
        for (i = 0; i < max; i++) {
                struct resource *res =  &pci_dev->resource[i];
                pci_resource_to_user(pci_dev, i, res, &start, &end);
-               str += sprintf(str, "0x%016llx 0x%016llx 0x%016llx\n",
-                              (unsigned long long)start,
-                              (unsigned long long)end,
-                              (unsigned long long)res->flags);
+               len += sysfs_emit_at(buf, len, "0x%016llx 0x%016llx 0x%016llx\n",
+                                    (unsigned long long)start,
+                                    (unsigned long long)end,
+                                    (unsigned long long)res->flags);
        }
-       return (str - buf);
+       return len;
 }
 static DEVICE_ATTR_RO(resource);
 
@@ -165,8 +165,8 @@ static ssize_t max_link_speed_show(struct device *dev,
 {
        struct pci_dev *pdev = to_pci_dev(dev);
 
-       return sprintf(buf, "%s\n",
-                      pci_speed_string(pcie_get_speed_cap(pdev)));
+       return sysfs_emit(buf, "%s\n",
+                         pci_speed_string(pcie_get_speed_cap(pdev)));
 }
 static DEVICE_ATTR_RO(max_link_speed);
 
@@ -175,7 +175,7 @@ static ssize_t max_link_width_show(struct device *dev,
 {
        struct pci_dev *pdev = to_pci_dev(dev);
 
-       return sprintf(buf, "%u\n", pcie_get_width_cap(pdev));
+       return sysfs_emit(buf, "%u\n", pcie_get_width_cap(pdev));
 }
 static DEVICE_ATTR_RO(max_link_width);
 
@@ -193,7 +193,7 @@ static ssize_t current_link_speed_show(struct device *dev,
 
        speed = pcie_link_speed[linkstat & PCI_EXP_LNKSTA_CLS];
 
-       return sprintf(buf, "%s\n", pci_speed_string(speed));
+       return sysfs_emit(buf, "%s\n", pci_speed_string(speed));
 }
 static DEVICE_ATTR_RO(current_link_speed);
 
@@ -208,7 +208,7 @@ static ssize_t current_link_width_show(struct device *dev,
        if (err)
                return -EINVAL;
 
-       return sprintf(buf, "%u\n",
+       return sysfs_emit(buf, "%u\n",
                (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT);
 }
 static DEVICE_ATTR_RO(current_link_width);
@@ -225,7 +225,7 @@ static ssize_t secondary_bus_number_show(struct device *dev,
        if (err)
                return -EINVAL;
 
-       return sprintf(buf, "%u\n", sec_bus);
+       return sysfs_emit(buf, "%u\n", sec_bus);
 }
 static DEVICE_ATTR_RO(secondary_bus_number);
 
@@ -241,7 +241,7 @@ static ssize_t subordinate_bus_number_show(struct device *dev,
        if (err)
                return -EINVAL;
 
-       return sprintf(buf, "%u\n", sub_bus);
+       return sysfs_emit(buf, "%u\n", sub_bus);
 }
 static DEVICE_ATTR_RO(subordinate_bus_number);
 
@@ -251,7 +251,7 @@ static ssize_t ari_enabled_show(struct device *dev,
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
 
-       return sprintf(buf, "%u\n", pci_ari_enabled(pci_dev->bus));
+       return sysfs_emit(buf, "%u\n", pci_ari_enabled(pci_dev->bus));
 }
 static DEVICE_ATTR_RO(ari_enabled);
 
@@ -260,11 +260,11 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
 
-       return sprintf(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X\n",
-                      pci_dev->vendor, pci_dev->device,
-                      pci_dev->subsystem_vendor, pci_dev->subsystem_device,
-                      (u8)(pci_dev->class >> 16), (u8)(pci_dev->class >> 8),
-                      (u8)(pci_dev->class));
+       return sysfs_emit(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X\n",
+                         pci_dev->vendor, pci_dev->device,
+                         pci_dev->subsystem_vendor, pci_dev->subsystem_device,
+                         (u8)(pci_dev->class >> 16), (u8)(pci_dev->class >> 8),
+                         (u8)(pci_dev->class));
 }
 static DEVICE_ATTR_RO(modalias);
 
@@ -302,7 +302,7 @@ static ssize_t enable_show(struct device *dev, struct device_attribute *attr,
        struct pci_dev *pdev;
 
        pdev = to_pci_dev(dev);
-       return sprintf(buf, "%u\n", atomic_read(&pdev->enable_cnt));
+       return sysfs_emit(buf, "%u\n", atomic_read(&pdev->enable_cnt));
 }
 static DEVICE_ATTR_RW(enable);
 
@@ -338,7 +338,7 @@ static ssize_t numa_node_store(struct device *dev,
 static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
 {
-       return sprintf(buf, "%d\n", dev->numa_node);
+       return sysfs_emit(buf, "%d\n", dev->numa_node);
 }
 static DEVICE_ATTR_RW(numa_node);
 #endif
@@ -348,7 +348,7 @@ static ssize_t dma_mask_bits_show(struct device *dev,
 {
        struct pci_dev *pdev = to_pci_dev(dev);
 
-       return sprintf(buf, "%d\n", fls64(pdev->dma_mask));
+       return sysfs_emit(buf, "%d\n", fls64(pdev->dma_mask));
 }
 static DEVICE_ATTR_RO(dma_mask_bits);
 
@@ -356,7 +356,7 @@ static ssize_t consistent_dma_mask_bits_show(struct device *dev,
                                             struct device_attribute *attr,
                                             char *buf)
 {
-       return sprintf(buf, "%d\n", fls64(dev->coherent_dma_mask));
+       return sysfs_emit(buf, "%d\n", fls64(dev->coherent_dma_mask));
 }
 static DEVICE_ATTR_RO(consistent_dma_mask_bits);
 
@@ -366,9 +366,9 @@ static ssize_t msi_bus_show(struct device *dev, struct device_attribute *attr,
        struct pci_dev *pdev = to_pci_dev(dev);
        struct pci_bus *subordinate = pdev->subordinate;
 
-       return sprintf(buf, "%u\n", subordinate ?
-                      !(subordinate->bus_flags & PCI_BUS_FLAGS_NO_MSI)
-                          : !pdev->no_msi);
+       return sysfs_emit(buf, "%u\n", subordinate ?
+                         !(subordinate->bus_flags & PCI_BUS_FLAGS_NO_MSI)
+                           : !pdev->no_msi);
 }
 
 static ssize_t msi_bus_store(struct device *dev, struct device_attribute *attr,
@@ -523,7 +523,7 @@ static ssize_t d3cold_allowed_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
-       return sprintf(buf, "%u\n", pdev->d3cold_allowed);
+       return sysfs_emit(buf, "%u\n", pdev->d3cold_allowed);
 }
 static DEVICE_ATTR_RW(d3cold_allowed);
 #endif
@@ -537,7 +537,7 @@ static ssize_t devspec_show(struct device *dev,
 
        if (np == NULL)
                return 0;
-       return sprintf(buf, "%pOF", np);
+       return sysfs_emit(buf, "%pOF", np);
 }
 static DEVICE_ATTR_RO(devspec);
 #endif
@@ -583,7 +583,7 @@ static ssize_t driver_override_show(struct device *dev,
        ssize_t len;
 
        device_lock(dev);
-       len = scnprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override);
+       len = sysfs_emit(buf, "%s\n", pdev->driver_override);
        device_unlock(dev);
        return len;
 }
@@ -658,11 +658,11 @@ static ssize_t boot_vga_show(struct device *dev, struct device_attribute *attr,
        struct pci_dev *vga_dev = vga_default_device();
 
        if (vga_dev)
-               return sprintf(buf, "%u\n", (pdev == vga_dev));
+               return sysfs_emit(buf, "%u\n", (pdev == vga_dev));
 
-       return sprintf(buf, "%u\n",
-               !!(pdev->resource[PCI_ROM_RESOURCE].flags &
-                  IORESOURCE_ROM_SHADOW));
+       return sysfs_emit(buf, "%u\n",
+                         !!(pdev->resource[PCI_ROM_RESOURCE].flags &
+                            IORESOURCE_ROM_SHADOW));
 }
 static DEVICE_ATTR_RO(boot_vga);
 
@@ -808,6 +808,29 @@ static ssize_t pci_write_config(struct file *filp, struct kobject *kobj,
 
        return count;
 }
+static BIN_ATTR(config, 0644, pci_read_config, pci_write_config, 0);
+
+static struct bin_attribute *pci_dev_config_attrs[] = {
+       &bin_attr_config,
+       NULL,
+};
+
+static umode_t pci_dev_config_attr_is_visible(struct kobject *kobj,
+                                             struct bin_attribute *a, int n)
+{
+       struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+
+       a->size = PCI_CFG_SPACE_SIZE;
+       if (pdev->cfg_size > PCI_CFG_SPACE_SIZE)
+               a->size = PCI_CFG_SPACE_EXP_SIZE;
+
+       return a->attr.mode;
+}
+
+static const struct attribute_group pci_dev_config_attr_group = {
+       .bin_attrs = pci_dev_config_attrs,
+       .is_bin_visible = pci_dev_config_attr_is_visible,
+};
 
 #ifdef HAVE_PCI_LEGACY
 /**
@@ -1283,25 +1306,32 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj,
 
        return count;
 }
+static BIN_ATTR(rom, 0600, pci_read_rom, pci_write_rom, 0);
 
-static const struct bin_attribute pci_config_attr = {
-       .attr = {
-               .name = "config",
-               .mode = 0644,
-       },
-       .size = PCI_CFG_SPACE_SIZE,
-       .read = pci_read_config,
-       .write = pci_write_config,
+static struct bin_attribute *pci_dev_rom_attrs[] = {
+       &bin_attr_rom,
+       NULL,
 };
 
-static const struct bin_attribute pcie_config_attr = {
-       .attr = {
-               .name = "config",
-               .mode = 0644,
-       },
-       .size = PCI_CFG_SPACE_EXP_SIZE,
-       .read = pci_read_config,
-       .write = pci_write_config,
+static umode_t pci_dev_rom_attr_is_visible(struct kobject *kobj,
+                                          struct bin_attribute *a, int n)
+{
+       struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+       size_t rom_size;
+
+       /* If the device has a ROM, try to expose it in sysfs. */
+       rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+       if (!rom_size)
+               return 0;
+
+       a->size = rom_size;
+
+       return a->attr.mode;
+}
+
+static const struct attribute_group pci_dev_rom_attr_group = {
+       .bin_attrs = pci_dev_rom_attrs,
+       .is_bin_visible = pci_dev_rom_attr_is_visible,
 };
 
 static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
@@ -1325,102 +1355,35 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
 
        return count;
 }
+static DEVICE_ATTR_WO(reset);
 
-static DEVICE_ATTR(reset, 0200, NULL, reset_store);
+static struct attribute *pci_dev_reset_attrs[] = {
+       &dev_attr_reset.attr,
+       NULL,
+};
 
-static int pci_create_capabilities_sysfs(struct pci_dev *dev)
+static umode_t pci_dev_reset_attr_is_visible(struct kobject *kobj,
+                                            struct attribute *a, int n)
 {
-       int retval;
-
-       pcie_vpd_create_sysfs_dev_files(dev);
+       struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
 
-       if (dev->reset_fn) {
-               retval = device_create_file(&dev->dev, &dev_attr_reset);
-               if (retval)
-                       goto error;
-       }
-       return 0;
+       if (!pdev->reset_fn)
+               return 0;
 
-error:
-       pcie_vpd_remove_sysfs_dev_files(dev);
-       return retval;
+       return a->mode;
 }
 
+static const struct attribute_group pci_dev_reset_attr_group = {
+       .attrs = pci_dev_reset_attrs,
+       .is_visible = pci_dev_reset_attr_is_visible,
+};
+
 int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev)
 {
-       int retval;
-       int rom_size;
-       struct bin_attribute *attr;
-
        if (!sysfs_initialized)
                return -EACCES;
 
-       if (pdev->cfg_size > PCI_CFG_SPACE_SIZE)
-               retval = sysfs_create_bin_file(&pdev->dev.kobj, &pcie_config_attr);
-       else
-               retval = sysfs_create_bin_file(&pdev->dev.kobj, &pci_config_attr);
-       if (retval)
-               goto err;
-
-       retval = pci_create_resource_files(pdev);
-       if (retval)
-               goto err_config_file;
-
-       /* If the device has a ROM, try to expose it in sysfs. */
-       rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
-       if (rom_size) {
-               attr = kzalloc(sizeof(*attr), GFP_ATOMIC);
-               if (!attr) {
-                       retval = -ENOMEM;
-                       goto err_resource_files;
-               }
-               sysfs_bin_attr_init(attr);
-               attr->size = rom_size;
-               attr->attr.name = "rom";
-               attr->attr.mode = 0600;
-               attr->read = pci_read_rom;
-               attr->write = pci_write_rom;
-               retval = sysfs_create_bin_file(&pdev->dev.kobj, attr);
-               if (retval) {
-                       kfree(attr);
-                       goto err_resource_files;
-               }
-               pdev->rom_attr = attr;
-       }
-
-       /* add sysfs entries for various capabilities */
-       retval = pci_create_capabilities_sysfs(pdev);
-       if (retval)
-               goto err_rom_file;
-
-       pci_create_firmware_label_files(pdev);
-
-       return 0;
-
-err_rom_file:
-       if (pdev->rom_attr) {
-               sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr);
-               kfree(pdev->rom_attr);
-               pdev->rom_attr = NULL;
-       }
-err_resource_files:
-       pci_remove_resource_files(pdev);
-err_config_file:
-       if (pdev->cfg_size > PCI_CFG_SPACE_SIZE)
-               sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr);
-       else
-               sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr);
-err:
-       return retval;
-}
-
-static void pci_remove_capabilities_sysfs(struct pci_dev *dev)
-{
-       pcie_vpd_remove_sysfs_dev_files(dev);
-       if (dev->reset_fn) {
-               device_remove_file(&dev->dev, &dev_attr_reset);
-               dev->reset_fn = 0;
-       }
+       return pci_create_resource_files(pdev);
 }
 
 /**
@@ -1434,22 +1397,7 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev)
        if (!sysfs_initialized)
                return;
 
-       pci_remove_capabilities_sysfs(pdev);
-
-       if (pdev->cfg_size > PCI_CFG_SPACE_SIZE)
-               sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr);
-       else
-               sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr);
-
        pci_remove_resource_files(pdev);
-
-       if (pdev->rom_attr) {
-               sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr);
-               kfree(pdev->rom_attr);
-               pdev->rom_attr = NULL;
-       }
-
-       pci_remove_firmware_label_files(pdev);
 }
 
 static int __init pci_sysfs_init(void)
@@ -1540,6 +1488,16 @@ static const struct attribute_group pci_dev_group = {
 
 const struct attribute_group *pci_dev_groups[] = {
        &pci_dev_group,
+       &pci_dev_config_attr_group,
+       &pci_dev_rom_attr_group,
+       &pci_dev_reset_attr_group,
+       &pci_dev_vpd_attr_group,
+#ifdef CONFIG_DMI
+       &pci_dev_smbios_attr_group,
+#endif
+#ifdef CONFIG_ACPI
+       &pci_dev_acpi_attr_group,
+#endif
        NULL,
 };
 
index f4c26e6..b717680 100644 (file)
@@ -692,6 +692,36 @@ u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap)
 }
 EXPORT_SYMBOL_GPL(pci_find_ht_capability);
 
+/**
+ * pci_find_vsec_capability - Find a vendor-specific extended capability
+ * @dev: PCI device to query
+ * @vendor: Vendor ID for which capability is defined
+ * @cap: Vendor-specific capability ID
+ *
+ * If @dev has Vendor ID @vendor, search for a VSEC capability with
+ * VSEC ID @cap. If found, return the capability offset in
+ * config space; otherwise return 0.
+ */
+u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap)
+{
+       u16 vsec = 0;
+       u32 header;
+
+       if (vendor != dev->vendor)
+               return 0;
+
+       while ((vsec = pci_find_next_ext_capability(dev, vsec,
+                                                    PCI_EXT_CAP_ID_VNDR))) {
+               if (pci_read_config_dword(dev, vsec + PCI_VNDR_HEADER,
+                                         &header) == PCIBIOS_SUCCESSFUL &&
+                   PCI_VNDR_HEADER_ID(header) == cap)
+                       return vsec;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
+
 /**
  * pci_find_parent_resource - return resource region of parent bus of given
  *                           region
@@ -4042,6 +4072,7 @@ phys_addr_t pci_pio_to_address(unsigned long pio)
 
        return address;
 }
+EXPORT_SYMBOL_GPL(pci_pio_to_address);
 
 unsigned long __weak pci_address_to_pio(phys_addr_t address)
 {
@@ -4443,6 +4474,23 @@ void pci_clear_mwi(struct pci_dev *dev)
 }
 EXPORT_SYMBOL(pci_clear_mwi);
 
+/**
+ * pci_disable_parity - disable parity checking for device
+ * @dev: the PCI device to operate on
+ *
+ * Disable parity checking for device @dev
+ */
+void pci_disable_parity(struct pci_dev *dev)
+{
+       u16 cmd;
+
+       pci_read_config_word(dev, PCI_COMMAND, &cmd);
+       if (cmd & PCI_COMMAND_PARITY) {
+               cmd &= ~PCI_COMMAND_PARITY;
+               pci_write_config_word(dev, PCI_COMMAND, cmd);
+       }
+}
+
 /**
  * pci_intx - enables/disables PCI INTx for device dev
  * @pdev: the PCI device to operate on
index afb87b9..37c913b 100644 (file)
@@ -21,16 +21,10 @@ bool pcie_cap_has_rtctl(const struct pci_dev *dev);
 
 int pci_create_sysfs_dev_files(struct pci_dev *pdev);
 void pci_remove_sysfs_dev_files(struct pci_dev *pdev);
-#if !defined(CONFIG_DMI) && !defined(CONFIG_ACPI)
-static inline void pci_create_firmware_label_files(struct pci_dev *pdev)
-{ return; }
-static inline void pci_remove_firmware_label_files(struct pci_dev *pdev)
-{ return; }
-#else
-void pci_create_firmware_label_files(struct pci_dev *pdev);
-void pci_remove_firmware_label_files(struct pci_dev *pdev);
-#endif
 void pci_cleanup_rom(struct pci_dev *dev);
+#ifdef CONFIG_DMI
+extern const struct attribute_group pci_dev_smbios_attr_group;
+#endif
 
 enum pci_mmap_api {
        PCI_MMAP_SYSFS, /* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */
@@ -141,10 +135,9 @@ static inline bool pcie_downstream_port(const struct pci_dev *dev)
               type == PCI_EXP_TYPE_PCIE_BRIDGE;
 }
 
-int pci_vpd_init(struct pci_dev *dev);
+void pci_vpd_init(struct pci_dev *dev);
 void pci_vpd_release(struct pci_dev *dev);
-void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev);
-void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev);
+extern const struct attribute_group pci_dev_vpd_attr_group;
 
 /* PCI Virtual Channel */
 int pci_save_vc_state(struct pci_dev *dev);
@@ -625,6 +618,12 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe)
 #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64)
 int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
                          struct resource *res);
+#else
+static inline int acpi_get_rc_resources(struct device *dev, const char *hid,
+                                       u16 segment, struct resource *res)
+{
+       return -ENODEV;
+}
 #endif
 
 int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
@@ -697,6 +696,7 @@ static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL
 
 #ifdef CONFIG_ACPI
 int pci_acpi_program_hp_params(struct pci_dev *dev);
+extern const struct attribute_group pci_dev_acpi_attr_group;
 #else
 static inline int pci_acpi_program_hp_params(struct pci_dev *dev)
 {
index ba22388..ec943ce 100644 (file)
@@ -129,7 +129,7 @@ static const char * const ecrc_policy_str[] = {
 };
 
 /**
- * enable_ercr_checking - enable PCIe ECRC checking for a device
+ * enable_ecrc_checking - enable PCIe ECRC checking for a device
  * @dev: the PCI device
  *
  * Returns 0 on success, or negative on failure.
@@ -153,7 +153,7 @@ static int enable_ecrc_checking(struct pci_dev *dev)
 }
 
 /**
- * disable_ercr_checking - disables PCIe ECRC checking for a device
+ * disable_ecrc_checking - disables PCIe ECRC checking for a device
  * @dev: the PCI device
  *
  * Returns 0 on success, or negative on failure.
@@ -1442,7 +1442,7 @@ static struct pcie_port_service_driver aerdriver = {
 };
 
 /**
- * aer_service_init - register AER root service driver
+ * pcie_aer_init - register AER root service driver
  *
  * Invoked when AER root service driver is loaded.
  */
index 3fc0848..1d0dd77 100644 (file)
@@ -463,7 +463,7 @@ static struct pcie_port_service_driver pcie_pme_driver = {
 };
 
 /**
- * pcie_pme_service_init - Register the PCIe PME service driver.
+ * pcie_pme_init - Register the PCIe PME service driver.
  */
 int __init pcie_pme_init(void)
 {
index 2c5c552..d0bcd14 100644 (file)
@@ -32,7 +32,7 @@ static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep)
 
        /* Same bus, so check bitmap */
        for_each_set_bit(devn, &bitmap, 32)
-               if (devn == rciep->devfn)
+               if (devn == PCI_SLOT(rciep->devfn))
                        return true;
 
        return false;
index 953f15a..3a62d09 100644 (file)
@@ -895,7 +895,6 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
        /* Temporarily move resources off the list */
        list_splice_init(&bridge->windows, &resources);
        bus->sysdata = bridge->sysdata;
-       bus->msi = bridge->msi;
        bus->ops = bridge->ops;
        bus->number = bus->busn_res.start = bridge->busnr;
 #ifdef CONFIG_PCI_DOMAINS_GENERIC
@@ -926,6 +925,8 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
        device_enable_async_suspend(bus->bridge);
        pci_set_bus_of_node(bus);
        pci_set_bus_msi_domain(bus);
+       if (bridge->msi_domain && !dev_get_msi_domain(&bus->dev))
+               bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
 
        if (!parent)
                set_dev_node(bus->bridge, pcibus_to_node(bus));
@@ -1053,7 +1054,6 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
                return NULL;
 
        child->parent = parent;
-       child->msi = parent->msi;
        child->sysdata = parent->sysdata;
        child->bus_flags = parent->bus_flags;
 
@@ -2353,6 +2353,7 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
        pci_set_of_node(dev);
 
        if (pci_setup_device(dev)) {
+               pci_release_of_node(dev);
                pci_bus_put(dev->bus);
                kfree(dev);
                return NULL;
index 653660e..dcb229d 100644 (file)
@@ -206,16 +206,11 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_ANY_ID, PCI_ANY_ID,
                                PCI_CLASS_BRIDGE_HOST, 8, quirk_mmio_always_on);
 
 /*
- * The Mellanox Tavor device gives false positive parity errors.  Mark this
- * device with a broken_parity_status to allow PCI scanning code to "skip"
- * this now blacklisted device.
+ * The Mellanox Tavor device gives false positive parity errors.  Disable
+ * parity error reporting.
  */
-static void quirk_mellanox_tavor(struct pci_dev *dev)
-{
-       dev->broken_parity_status = 1;  /* This device gives false positives */
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR, quirk_mellanox_tavor);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE, quirk_mellanox_tavor);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR, pci_disable_parity);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE, pci_disable_parity);
 
 /*
  * Deal with broken BIOSes that neglect to enable passive release,
@@ -2585,10 +2580,8 @@ static int msi_ht_cap_enabled(struct pci_dev *dev)
 /* Check the HyperTransport MSI mapping to know whether MSI is enabled or not */
 static void quirk_msi_ht_cap(struct pci_dev *dev)
 {
-       if (dev->subordinate && !msi_ht_cap_enabled(dev)) {
-               pci_warn(dev, "MSI quirk detected; subordinate MSI disabled\n");
-               dev->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
-       }
+       if (!msi_ht_cap_enabled(dev))
+               quirk_disable_msi(dev);
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT2000_PCIE,
                        quirk_msi_ht_cap);
@@ -2601,9 +2594,6 @@ static void quirk_nvidia_ck804_msi_ht_cap(struct pci_dev *dev)
 {
        struct pci_dev *pdev;
 
-       if (!dev->subordinate)
-               return;
-
        /*
         * Check HT MSI cap on this chipset and the root one.  A single one
         * having MSI is enough to be sure that MSI is supported.
@@ -2611,10 +2601,8 @@ static void quirk_nvidia_ck804_msi_ht_cap(struct pci_dev *dev)
        pdev = pci_get_slot(dev->bus, 0);
        if (!pdev)
                return;
-       if (!msi_ht_cap_enabled(dev) && !msi_ht_cap_enabled(pdev)) {
-               pci_warn(dev, "MSI quirk detected; subordinate MSI disabled\n");
-               dev->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
-       }
+       if (!msi_ht_cap_enabled(pdev))
+               quirk_msi_ht_cap(dev);
        pci_dev_put(pdev);
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_CK804_PCIE,
@@ -3922,6 +3910,7 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
                reset_ivb_igd },
        { PCI_VENDOR_ID_SAMSUNG, 0xa804, nvme_disable_and_flr },
        { PCI_VENDOR_ID_INTEL, 0x0953, delay_250ms_after_flr },
+       { PCI_VENDOR_ID_INTEL, 0x0a54, delay_250ms_after_flr },
        { PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID,
                reset_chelsio_generic_dev },
        { 0 }
index 95dec03..dd12c2f 100644 (file)
@@ -19,6 +19,8 @@ static void pci_stop_dev(struct pci_dev *dev)
        pci_pme_active(dev, false);
 
        if (pci_dev_is_added(dev)) {
+               dev->reset_fn = 0;
+
                device_release_driver(&dev->dev);
                pci_proc_detach_device(dev);
                pci_remove_sysfs_dev_files(dev);
index 7915d10..26bf7c8 100644 (file)
 struct pci_vpd_ops {
        ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
        ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
-       int (*set_size)(struct pci_dev *dev, size_t len);
 };
 
 struct pci_vpd {
        const struct pci_vpd_ops *ops;
-       struct bin_attribute *attr;     /* Descriptor for sysfs VPD entry */
        struct mutex    lock;
        unsigned int    len;
        u16             flag;
@@ -30,6 +28,11 @@ struct pci_vpd {
        unsigned int    valid:1;
 };
 
+static struct pci_dev *pci_get_func0_dev(struct pci_dev *dev)
+{
+       return pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
+}
+
 /**
  * pci_read_vpd - Read one entry from Vital Product Data
  * @dev:       pci device struct
@@ -60,19 +63,6 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void
 }
 EXPORT_SYMBOL(pci_write_vpd);
 
-/**
- * pci_set_vpd_size - Set size of Vital Product Data space
- * @dev:       pci device struct
- * @len:       size of vpd space
- */
-int pci_set_vpd_size(struct pci_dev *dev, size_t len)
-{
-       if (!dev->vpd || !dev->vpd->ops)
-               return -ENODEV;
-       return dev->vpd->ops->set_size(dev, len);
-}
-EXPORT_SYMBOL(pci_set_vpd_size);
-
 #define PCI_VPD_MAX_SIZE (PCI_VPD_ADDR_MASK + 1)
 
 /**
@@ -85,10 +75,14 @@ static size_t pci_vpd_size(struct pci_dev *dev, size_t old_size)
        size_t off = 0;
        unsigned char header[1+2];      /* 1 byte tag, 2 bytes length */
 
-       while (off < old_size &&
-              pci_read_vpd(dev, off, 1, header) == 1) {
+       while (off < old_size && pci_read_vpd(dev, off, 1, header) == 1) {
                unsigned char tag;
 
+               if (!header[0] && !off) {
+                       pci_info(dev, "Invalid VPD tag 00, assume missing optional VPD EPROM\n");
+                       return 0;
+               }
+
                if (header[0] & PCI_VPD_LRDT) {
                        /* Large Resource Data Type Tag */
                        tag = pci_vpd_lrdt_tag(header);
@@ -297,30 +291,15 @@ out:
        return ret ? ret : count;
 }
 
-static int pci_vpd_set_size(struct pci_dev *dev, size_t len)
-{
-       struct pci_vpd *vpd = dev->vpd;
-
-       if (len == 0 || len > PCI_VPD_MAX_SIZE)
-               return -EIO;
-
-       vpd->valid = 1;
-       vpd->len = len;
-
-       return 0;
-}
-
 static const struct pci_vpd_ops pci_vpd_ops = {
        .read = pci_vpd_read,
        .write = pci_vpd_write,
-       .set_size = pci_vpd_set_size,
 };
 
 static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count,
                               void *arg)
 {
-       struct pci_dev *tdev = pci_get_slot(dev->bus,
-                                           PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
+       struct pci_dev *tdev = pci_get_func0_dev(dev);
        ssize_t ret;
 
        if (!tdev)
@@ -334,8 +313,7 @@ static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count,
 static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count,
                                const void *arg)
 {
-       struct pci_dev *tdev = pci_get_slot(dev->bus,
-                                           PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
+       struct pci_dev *tdev = pci_get_func0_dev(dev);
        ssize_t ret;
 
        if (!tdev)
@@ -346,38 +324,23 @@ static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count,
        return ret;
 }
 
-static int pci_vpd_f0_set_size(struct pci_dev *dev, size_t len)
-{
-       struct pci_dev *tdev = pci_get_slot(dev->bus,
-                                           PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
-       int ret;
-
-       if (!tdev)
-               return -ENODEV;
-
-       ret = pci_set_vpd_size(tdev, len);
-       pci_dev_put(tdev);
-       return ret;
-}
-
 static const struct pci_vpd_ops pci_vpd_f0_ops = {
        .read = pci_vpd_f0_read,
        .write = pci_vpd_f0_write,
-       .set_size = pci_vpd_f0_set_size,
 };
 
-int pci_vpd_init(struct pci_dev *dev)
+void pci_vpd_init(struct pci_dev *dev)
 {
        struct pci_vpd *vpd;
        u8 cap;
 
        cap = pci_find_capability(dev, PCI_CAP_ID_VPD);
        if (!cap)
-               return -ENODEV;
+               return;
 
        vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC);
        if (!vpd)
-               return -ENOMEM;
+               return;
 
        vpd->len = PCI_VPD_MAX_SIZE;
        if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0)
@@ -389,7 +352,6 @@ int pci_vpd_init(struct pci_dev *dev)
        vpd->busy = 0;
        vpd->valid = 0;
        dev->vpd = vpd;
-       return 0;
 }
 
 void pci_vpd_release(struct pci_dev *dev)
@@ -397,102 +359,56 @@ void pci_vpd_release(struct pci_dev *dev)
        kfree(dev->vpd);
 }
 
-static ssize_t read_vpd_attr(struct file *filp, struct kobject *kobj,
-                            struct bin_attribute *bin_attr, char *buf,
-                            loff_t off, size_t count)
+static ssize_t vpd_read(struct file *filp, struct kobject *kobj,
+                       struct bin_attribute *bin_attr, char *buf, loff_t off,
+                       size_t count)
 {
        struct pci_dev *dev = to_pci_dev(kobj_to_dev(kobj));
 
-       if (bin_attr->size > 0) {
-               if (off > bin_attr->size)
-                       count = 0;
-               else if (count > bin_attr->size - off)
-                       count = bin_attr->size - off;
-       }
-
        return pci_read_vpd(dev, off, count, buf);
 }
 
-static ssize_t write_vpd_attr(struct file *filp, struct kobject *kobj,
-                             struct bin_attribute *bin_attr, char *buf,
-                             loff_t off, size_t count)
+static ssize_t vpd_write(struct file *filp, struct kobject *kobj,
+                        struct bin_attribute *bin_attr, char *buf, loff_t off,
+                        size_t count)
 {
        struct pci_dev *dev = to_pci_dev(kobj_to_dev(kobj));
 
-       if (bin_attr->size > 0) {
-               if (off > bin_attr->size)
-                       count = 0;
-               else if (count > bin_attr->size - off)
-                       count = bin_attr->size - off;
-       }
-
        return pci_write_vpd(dev, off, count, buf);
 }
+static BIN_ATTR(vpd, 0600, vpd_read, vpd_write, 0);
 
-void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev)
-{
-       int retval;
-       struct bin_attribute *attr;
-
-       if (!dev->vpd)
-               return;
+static struct bin_attribute *vpd_attrs[] = {
+       &bin_attr_vpd,
+       NULL,
+};
 
-       attr = kzalloc(sizeof(*attr), GFP_ATOMIC);
-       if (!attr)
-               return;
+static umode_t vpd_attr_is_visible(struct kobject *kobj,
+                                  struct bin_attribute *a, int n)
+{
+       struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
 
-       sysfs_bin_attr_init(attr);
-       attr->size = 0;
-       attr->attr.name = "vpd";
-       attr->attr.mode = S_IRUSR | S_IWUSR;
-       attr->read = read_vpd_attr;
-       attr->write = write_vpd_attr;
-       retval = sysfs_create_bin_file(&dev->dev.kobj, attr);
-       if (retval) {
-               kfree(attr);
-               return;
-       }
+       if (!pdev->vpd)
+               return 0;
 
-       dev->vpd->attr = attr;
+       return a->attr.mode;
 }
 
-void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev)
-{
-       if (dev->vpd && dev->vpd->attr) {
-               sysfs_remove_bin_file(&dev->dev.kobj, dev->vpd->attr);
-               kfree(dev->vpd->attr);
-       }
-}
+const struct attribute_group pci_dev_vpd_attr_group = {
+       .bin_attrs = vpd_attrs,
+       .is_bin_visible = vpd_attr_is_visible,
+};
 
-int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt)
+int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt)
 {
-       int i;
+       int i = 0;
 
-       for (i = off; i < len; ) {
-               u8 val = buf[i];
-
-               if (val & PCI_VPD_LRDT) {
-                       /* Don't return success of the tag isn't complete */
-                       if (i + PCI_VPD_LRDT_TAG_SIZE > len)
-                               break;
-
-                       if (val == rdt)
-                               return i;
-
-                       i += PCI_VPD_LRDT_TAG_SIZE +
-                            pci_vpd_lrdt_size(&buf[i]);
-               } else {
-                       u8 tag = val & ~PCI_VPD_SRDT_LEN_MASK;
-
-                       if (tag == rdt)
-                               return i;
-
-                       if (tag == PCI_VPD_SRDT_END)
-                               break;
+       /* look for LRDT tags only, end tag is the only SRDT tag */
+       while (i + PCI_VPD_LRDT_TAG_SIZE <= len && buf[i] & PCI_VPD_LRDT) {
+               if (buf[i] == rdt)
+                       return i;
 
-                       i += PCI_VPD_SRDT_TAG_SIZE +
-                            pci_vpd_srdt_size(&buf[i]);
-               }
+               i += PCI_VPD_LRDT_TAG_SIZE + pci_vpd_lrdt_size(buf + i);
        }
 
        return -ENOENT;
@@ -530,7 +446,7 @@ static void quirk_f0_vpd_link(struct pci_dev *dev)
        if (!PCI_FUNC(dev->devfn))
                return;
 
-       f0 = pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
+       f0 = pci_get_func0_dev(dev);
        if (!f0)
                return;
 
@@ -570,7 +486,6 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005d, quirk_blacklist_vpd);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005f, quirk_blacklist_vpd);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID,
                quirk_blacklist_vpd);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd);
 /*
  * The Amazon Annapurna Labs 0x0031 device id is reused for other non Root Port
  * device types, so the quirk is registered for the PCI_CLASS_BRIDGE_PCI class.
@@ -578,51 +493,16 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd);
 DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031,
                              PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd);
 
-/*
- * For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the
- * VPD end tag will hang the device.  This problem was initially
- * observed when a vpd entry was created in sysfs
- * ('/sys/bus/pci/devices/<id>/vpd').   A read to this sysfs entry
- * will dump 32k of data.  Reading a full 32k will cause an access
- * beyond the VPD end tag causing the device to hang.  Once the device
- * is hung, the bnx2 driver will not be able to reset the device.
- * We believe that it is legal to read beyond the end tag and
- * therefore the solution is to limit the read/write length.
- */
-static void quirk_brcm_570x_limit_vpd(struct pci_dev *dev)
+static void pci_vpd_set_size(struct pci_dev *dev, size_t len)
 {
-       /*
-        * Only disable the VPD capability for 5706, 5706S, 5708,
-        * 5708S and 5709 rev. A
-        */
-       if ((dev->device == PCI_DEVICE_ID_NX2_5706) ||
-           (dev->device == PCI_DEVICE_ID_NX2_5706S) ||
-           (dev->device == PCI_DEVICE_ID_NX2_5708) ||
-           (dev->device == PCI_DEVICE_ID_NX2_5708S) ||
-           ((dev->device == PCI_DEVICE_ID_NX2_5709) &&
-            (dev->revision & 0xf0) == 0x0)) {
-               if (dev->vpd)
-                       dev->vpd->len = 0x80;
-       }
+       struct pci_vpd *vpd = dev->vpd;
+
+       if (!vpd || len == 0 || len > PCI_VPD_MAX_SIZE)
+               return;
+
+       vpd->valid = 1;
+       vpd->len = len;
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
-                       PCI_DEVICE_ID_NX2_5706,
-                       quirk_brcm_570x_limit_vpd);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
-                       PCI_DEVICE_ID_NX2_5706S,
-                       quirk_brcm_570x_limit_vpd);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
-                       PCI_DEVICE_ID_NX2_5708,
-                       quirk_brcm_570x_limit_vpd);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
-                       PCI_DEVICE_ID_NX2_5708S,
-                       quirk_brcm_570x_limit_vpd);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
-                       PCI_DEVICE_ID_NX2_5709,
-                       quirk_brcm_570x_limit_vpd);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM,
-                       PCI_DEVICE_ID_NX2_5709S,
-                       quirk_brcm_570x_limit_vpd);
 
 static void quirk_chelsio_extend_vpd(struct pci_dev *dev)
 {
@@ -642,9 +522,9 @@ static void quirk_chelsio_extend_vpd(struct pci_dev *dev)
         * limits.
         */
        if (chip == 0x0 && prod >= 0x20)
-               pci_set_vpd_size(dev, 8192);
+               pci_vpd_set_size(dev, 8192);
        else if (chip >= 0x4 && func < 0x8)
-               pci_set_vpd_size(dev, 2048);
+               pci_vpd_set_size(dev, 2048);
 }
 
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID,
index 2d75026..b7a8f3a 100644 (file)
@@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev)
 
        spin_unlock(&pcifront_dev_lock);
 
-       if (!err && !swiotlb_nr_tbl()) {
+       if (!err && !is_swiotlb_active()) {
                err = pci_xen_swiotlb_init_late();
                if (err)
                        dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n");
index e693910..948b763 100644 (file)
@@ -75,7 +75,7 @@ void release_cis_mem(struct pcmcia_socket *s)
        mutex_unlock(&s->ops_mutex);
 }
 
-/**
+/*
  * set_cis_map() - map the card memory at "card_offset" into virtual space.
  *
  * If flags & MAP_ATTRIB, map the attribute space, otherwise
@@ -126,7 +126,7 @@ static void __iomem *set_cis_map(struct pcmcia_socket *s,
 #define IS_ATTR                1
 #define IS_INDIRECT    8
 
-/**
+/*
  * pcmcia_read_cis_mem() - low-level function to read CIS memory
  *
  * must be called with ops_mutex held
@@ -206,7 +206,7 @@ int pcmcia_read_cis_mem(struct pcmcia_socket *s, int attr, u_int addr,
 }
 
 
-/**
+/*
  * pcmcia_write_cis_mem() - low-level function to write CIS memory
  *
  * Probably only useful for writing one-byte registers. Must be called
@@ -277,7 +277,7 @@ int pcmcia_write_cis_mem(struct pcmcia_socket *s, int attr, u_int addr,
 }
 
 
-/**
+/*
  * read_cis_cache() - read CIS memory or its associated cache
  *
  * This is a wrapper around read_cis_mem, with the same interface,
@@ -365,7 +365,7 @@ void destroy_cis_cache(struct pcmcia_socket *s)
        }
 }
 
-/**
+/*
  * verify_cis_cache() - does the CIS match what is in the CIS cache?
  */
 int verify_cis_cache(struct pcmcia_socket *s)
@@ -401,7 +401,7 @@ int verify_cis_cache(struct pcmcia_socket *s)
        return 0;
 }
 
-/**
+/*
  * pcmcia_replace_cis() - use a replacement CIS instead of the card's CIS
  *
  * For really bad cards, we provide a facility for uploading a
index 7211490..bd81aa6 100644 (file)
@@ -83,7 +83,7 @@ struct pcmcia_dynid {
 };
 
 /**
- * pcmcia_store_new_id - add a new PCMCIA device ID to this driver and re-probe devices
+ * new_id_store() - add a new PCMCIA device ID to this driver and re-probe devices
  * @driver: target device driver
  * @buf: buffer for scanning device ID data
  * @count: input size
@@ -371,9 +371,6 @@ static int pcmcia_device_remove(struct device *dev)
                pcmcia_card_remove(p_dev->socket, p_dev);
 
        /* detach the "instance" */
-       if (!p_drv)
-               return 0;
-
        if (p_drv->remove)
                p_drv->remove(p_dev);
 
@@ -389,7 +386,7 @@ static int pcmcia_device_remove(struct device *dev)
                                 "pcmcia: driver %s did not release window properly\n",
                                 p_drv->name);
 
-       /* references from pcmcia_probe_device */
+       /* references from pcmcia_device_probe */
        pcmcia_put_dev(p_dev);
        module_put(p_drv->owner);
 
index e4c4daf..d2d0ed4 100644 (file)
@@ -122,7 +122,7 @@ next_entry:
 }
 
 
-/**
+/*
  * pcmcia_io_cfg_data_width() - convert cfgtable to data path width parameter
  */
 static int pcmcia_io_cfg_data_width(unsigned int flags)
@@ -143,7 +143,7 @@ struct pcmcia_cfg_mem {
        cistpl_cftable_entry_t dflt;
 };
 
-/**
+/*
  * pcmcia_do_loop_config() - internal helper for pcmcia_loop_config()
  *
  * pcmcia_do_loop_config() is the internal callback for the call from
@@ -289,7 +289,7 @@ struct pcmcia_loop_mem {
                           void *priv_data);
 };
 
-/**
+/*
  * pcmcia_do_loop_tuple() - internal helper for pcmcia_loop_config()
  *
  * pcmcia_do_loop_tuple() is the internal callback for the call from
@@ -337,7 +337,7 @@ struct pcmcia_loop_get {
        cisdata_t **buf;
 };
 
-/**
+/*
  * pcmcia_do_get_tuple() - internal helper for pcmcia_get_tuple()
  *
  * pcmcia_do_get_tuple() is the internal callback for the call from
@@ -386,7 +386,7 @@ size_t pcmcia_get_tuple(struct pcmcia_device *p_dev, cisdata_t code,
 EXPORT_SYMBOL(pcmcia_get_tuple);
 
 
-/**
+/*
  * pcmcia_do_get_mac() - internal helper for pcmcia_get_mac_from_cis()
  *
  * pcmcia_do_get_mac() is the internal callback for the call from
index e3a6b6c..c1c1972 100644 (file)
@@ -144,7 +144,7 @@ static int alloc_io_space(struct pcmcia_socket *s, struct resource *res,
 }
 
 
-/**
+/*
  * pcmcia_access_config() - read or write card configuration registers
  *
  * pcmcia_access_config() reads and writes configuration registers in
@@ -184,7 +184,7 @@ static int pcmcia_access_config(struct pcmcia_device *p_dev,
 }
 
 
-/**
+/*
  * pcmcia_read_config_byte() - read a byte from a card configuration register
  *
  * pcmcia_read_config_byte() reads a byte from a configuration register in
@@ -197,7 +197,7 @@ int pcmcia_read_config_byte(struct pcmcia_device *p_dev, off_t where, u8 *val)
 EXPORT_SYMBOL(pcmcia_read_config_byte);
 
 
-/**
+/*
  * pcmcia_write_config_byte() - write a byte to a card configuration register
  *
  * pcmcia_write_config_byte() writes a byte to a configuration register in
@@ -720,7 +720,8 @@ static irqreturn_t test_action(int cpl, void *dev_id)
 
 /**
  * pcmcia_setup_isa_irq() - determine whether an ISA IRQ can be used
- * @p_dev - the associated PCMCIA device
+ * @p_dev: the associated PCMCIA device
+ * @type:  IRQ type (flags)
  *
  * locking note: must be called with ops_mutex locked.
  */
@@ -785,7 +786,7 @@ void pcmcia_cleanup_irq(struct pcmcia_socket *s)
 
 /**
  * pcmcia_setup_irq() - determine IRQ to be used for device
- * @p_dev - the associated PCMCIA device
+ * @p_dev: the associated PCMCIA device
  *
  * locking note: must be called with ops_mutex locked.
  */
index 3b05760..bb15a8b 100644 (file)
@@ -257,7 +257,7 @@ static void do_io_probe(struct pcmcia_socket *s, unsigned int base,
 
 /*======================================================================*/
 
-/**
+/*
  * readable() - iomem validation function for cards with a valid CIS
  */
 static int readable(struct pcmcia_socket *s, struct resource *res,
@@ -288,7 +288,7 @@ static int readable(struct pcmcia_socket *s, struct resource *res,
        return 0;
 }
 
-/**
+/*
  * checksum() - iomem validation function for simple memory cards
  */
 static int checksum(struct pcmcia_socket *s, struct resource *res,
@@ -343,9 +343,9 @@ static int checksum(struct pcmcia_socket *s, struct resource *res,
  */
 static int do_validate_mem(struct pcmcia_socket *s,
                           unsigned long base, unsigned long size,
-                          int validate (struct pcmcia_socket *s,
-                                        struct resource *res,
-                                        unsigned int *value))
+                          int (*validate)(struct pcmcia_socket *s,
+                                          struct resource *res,
+                                          unsigned int *value))
 {
        struct socket_data *s_data = s->resource_data;
        struct resource *res1, *res2;
@@ -398,12 +398,12 @@ static int do_validate_mem(struct pcmcia_socket *s,
  * function returns the size of the usable memory area.
  */
 static int do_mem_probe(struct pcmcia_socket *s, u_long base, u_long num,
-                       int validate (struct pcmcia_socket *s,
-                                     struct resource *res,
-                                     unsigned int *value),
-                       int fallback (struct pcmcia_socket *s,
-                                     struct resource *res,
-                                     unsigned int *value))
+                       int (*validate)(struct pcmcia_socket *s,
+                                       struct resource *res,
+                                       unsigned int *value),
+                       int (*fallback)(struct pcmcia_socket *s,
+                                       struct resource *res,
+                                       unsigned int *value))
 {
        struct socket_data *s_data = s->resource_data;
        u_long i, j, bad, fail, step;
index 2d10d84..d4f7f1f 100644 (file)
@@ -581,33 +581,6 @@ static const struct attribute_group armpmu_common_attr_group = {
        .attrs = armpmu_common_attrs,
 };
 
-/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *__oprofile_cpu_pmu;
-
-/*
- * Despite the names, these two functions are CPU-specific and are used
- * by the OProfile/perf code.
- */
-const char *perf_pmu_name(void)
-{
-       if (!__oprofile_cpu_pmu)
-               return NULL;
-
-       return __oprofile_cpu_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       int max_events = 0;
-
-       if (__oprofile_cpu_pmu != NULL)
-               max_events = __oprofile_cpu_pmu->num_events;
-
-       return max_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
 static int armpmu_count_irq_users(const int irq)
 {
        int cpu, count = 0;
@@ -979,9 +952,6 @@ int armpmu_register(struct arm_pmu *pmu)
        if (ret)
                goto out_destroy;
 
-       if (!__oprofile_cpu_pmu)
-               __oprofile_cpu_pmu = pmu;
-
        pr_info("enabled with %s PMU driver, %d counters available%s\n",
                pmu->name, pmu->num_events,
                has_nmi ? ", using NMIs" : "");
index 9035b17..bbc2884 100644 (file)
@@ -14,7 +14,7 @@
  * This mutex must be held while accessing the EMI unit. We can't rely on the
  * EC mutex because memmap data may be accessed without it being held.
  */
-static struct mutex io_mutex;
+static DEFINE_MUTEX(io_mutex);
 static u16 mec_emi_base, mec_emi_end;
 
 /**
@@ -142,7 +142,6 @@ EXPORT_SYMBOL(cros_ec_lpc_io_bytes_mec);
 
 void cros_ec_lpc_mec_init(unsigned int base, unsigned int end)
 {
-       mutex_init(&io_mutex);
        mec_emi_base = base;
        mec_emi_end = end;
 }
index 0811562..27c068c 100644 (file)
@@ -58,6 +58,7 @@ struct cros_typec_port {
        /* Variables keeping track of switch state. */
        struct typec_mux_state state;
        uint8_t mux_flags;
+       uint8_t role;
 
        /* Port alt modes. */
        struct typec_altmode p_altmode[CROS_EC_ALTMODE_MAX];
@@ -220,6 +221,9 @@ static void cros_typec_remove_partner(struct cros_typec_data *typec,
 {
        struct cros_typec_port *port = typec->ports[port_num];
 
+       if (!port->partner)
+               return;
+
        cros_typec_unregister_altmodes(typec, port_num, true);
 
        cros_typec_usb_disconnect_state(port);
@@ -235,6 +239,9 @@ static void cros_typec_remove_cable(struct cros_typec_data *typec,
 {
        struct cros_typec_port *port = typec->ports[port_num];
 
+       if (!port->cable)
+               return;
+
        cros_typec_unregister_altmodes(typec, port_num, false);
 
        typec_unregister_plug(port->plug);
@@ -253,11 +260,8 @@ static void cros_unregister_ports(struct cros_typec_data *typec)
                if (!typec->ports[i])
                        continue;
 
-               if (typec->ports[i]->partner)
-                       cros_typec_remove_partner(typec, i);
-
-               if (typec->ports[i]->cable)
-                       cros_typec_remove_cable(typec, i);
+               cros_typec_remove_partner(typec, i);
+               cros_typec_remove_cable(typec, i);
 
                usb_role_switch_put(typec->ports[i]->role_sw);
                typec_switch_put(typec->ports[i]->ori_sw);
@@ -483,6 +487,11 @@ static int cros_typec_enable_dp(struct cros_typec_data *typec,
                return -ENOTSUPP;
        }
 
+       if (!pd_ctrl->dp_mode) {
+               dev_err(typec->dev, "No valid DP mode provided.\n");
+               return -EINVAL;
+       }
+
        /* Status VDO. */
        dp_data.status = DP_STATUS_ENABLED;
        if (port->mux_flags & USB_PD_MUX_HPD_IRQ)
@@ -647,11 +656,8 @@ static void cros_typec_set_port_params_v1(struct cros_typec_data *typec,
                                 "Failed to register partner on port: %d\n",
                                 port_num);
        } else {
-               if (typec->ports[port_num]->partner)
-                       cros_typec_remove_partner(typec, port_num);
-
-               if (typec->ports[port_num]->cable)
-                       cros_typec_remove_cable(typec, port_num);
+               cros_typec_remove_partner(typec, port_num);
+               cros_typec_remove_cable(typec, port_num);
        }
 }
 
@@ -905,6 +911,19 @@ static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num
                return;
        }
 
+       /* If we got a hard reset, unregister everything and return. */
+       if (resp.events & PD_STATUS_EVENT_HARD_RESET) {
+               cros_typec_remove_partner(typec, port_num);
+               cros_typec_remove_cable(typec, port_num);
+
+               ret = cros_typec_send_clear_event(typec, port_num,
+                                                 PD_STATUS_EVENT_HARD_RESET);
+               if (ret < 0)
+                       dev_warn(typec->dev,
+                                "Failed hard reset event clear, port: %d\n", port_num);
+               return;
+       }
+
        /* Handle any events appropriately. */
        if (resp.events & PD_STATUS_EVENT_SOP_DISC_DONE && !typec->ports[port_num]->sop_disc_done) {
                u16 sop_revision;
@@ -995,10 +1014,12 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num)
        }
 
        /* No change needs to be made, let's exit early. */
-       if (typec->ports[port_num]->mux_flags == mux_resp.flags)
+       if (typec->ports[port_num]->mux_flags == mux_resp.flags &&
+           typec->ports[port_num]->role == resp.role)
                return 0;
 
        typec->ports[port_num]->mux_flags = mux_resp.flags;
+       typec->ports[port_num]->role = resp.role;
        ret = cros_typec_configure_mux(typec, port_num, mux_resp.flags, &resp);
        if (ret)
                dev_warn(typec->dev, "Configure muxes failed, err = %d\n", ret);
@@ -1027,8 +1048,8 @@ static int cros_typec_get_cmd_version(struct cros_typec_data *typec)
        else
                typec->pd_ctrl_ver = 0;
 
-       dev_dbg(typec->dev, "PD Control has version mask 0x%hhx\n",
-               typec->pd_ctrl_ver);
+       dev_dbg(typec->dev, "PD Control has version mask 0x%02x\n",
+               typec->pd_ctrl_ver & 0xff);
 
        return 0;
 }
index 7f36142..48a6617 100644 (file)
@@ -220,7 +220,8 @@ static int cros_usbpd_notify_plat(struct notifier_block *nb,
        if (!host_event)
                return NOTIFY_DONE;
 
-       if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) {
+       if (host_event & (EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU) |
+                         EC_HOST_EVENT_MASK(EC_HOST_EVENT_USB_MUX))) {
                cros_usbpd_get_event_and_notify(pdnotify->dev, ec_dev);
                return NOTIFY_OK;
        }
index e06d96f..60da7a2 100644 (file)
@@ -256,7 +256,7 @@ static int telem_open(struct inode *inode, struct file *filp)
        sess_data->dev_data = dev_data;
        sess_data->has_msg = false;
 
-       nonseekable_open(inode, filp);
+       stream_open(inode, filp);
        filp->private_data = sess_data;
 
        return 0;
index 03c3ff3..085ad0a 100644 (file)
@@ -675,6 +675,3 @@ static __exit void dcdrbu_exit(void)
 
 module_exit(dcdrbu_exit);
 module_init(dcdrbu_init);
-
-/* vim:noet:ts=8:sw=8
-*/
index f2edef0..8c20e52 100644 (file)
@@ -108,7 +108,7 @@ config PTP_1588_CLOCK_PCH
 config PTP_1588_CLOCK_KVM
        tristate "KVM virtual PTP clock"
        depends on PTP_1588_CLOCK
-       depends on KVM_GUEST && X86
+       depends on (KVM_GUEST && X86) || (HAVE_ARM_SMCCC_DISCOVERY && ARM_ARCH_TIMER)
        default y
        help
          This driver adds support for using kvm infrastructure as a PTP
index db5aef3..8673d17 100644 (file)
@@ -4,6 +4,8 @@
 #
 
 ptp-y                                  := ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-$(CONFIG_X86)                  := ptp_kvm_x86.o ptp_kvm_common.o
+ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC)       := ptp_kvm_arm.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)           += ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)       += ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)      += ptp_ines.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
deleted file mode 100644 (file)
index 658d33f..0000000
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Virtual PTP 1588 clock for use with KVM guests
- *
- * Copyright (C) 2017 Red Hat Inc.
- */
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <uapi/linux/kvm_para.h>
-#include <asm/kvm_para.h>
-#include <asm/pvclock.h>
-#include <asm/kvmclock.h>
-#include <uapi/asm/kvm_para.h>
-
-#include <linux/ptp_clock_kernel.h>
-
-struct kvm_ptp_clock {
-       struct ptp_clock *ptp_clock;
-       struct ptp_clock_info caps;
-};
-
-static DEFINE_SPINLOCK(kvm_ptp_lock);
-
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
-static int ptp_kvm_get_time_fn(ktime_t *device_time,
-                              struct system_counterval_t *system_counter,
-                              void *ctx)
-{
-       unsigned long ret;
-       struct timespec64 tspec;
-       unsigned version;
-       int cpu;
-       struct pvclock_vcpu_time_info *src;
-
-       spin_lock(&kvm_ptp_lock);
-
-       preempt_disable_notrace();
-       cpu = smp_processor_id();
-       src = &hv_clock[cpu].pvti;
-
-       do {
-               /*
-                * We are using a TSC value read in the hosts
-                * kvm_hc_clock_pairing handling.
-                * So any changes to tsc_to_system_mul
-                * and tsc_shift or any other pvclock
-                * data invalidate that measurement.
-                */
-               version = pvclock_read_begin(src);
-
-               ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-                                    clock_pair_gpa,
-                                    KVM_CLOCK_PAIRING_WALLCLOCK);
-               if (ret != 0) {
-                       pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
-                       spin_unlock(&kvm_ptp_lock);
-                       preempt_enable_notrace();
-                       return -EOPNOTSUPP;
-               }
-
-               tspec.tv_sec = clock_pair.sec;
-               tspec.tv_nsec = clock_pair.nsec;
-               ret = __pvclock_read_cycles(src, clock_pair.tsc);
-       } while (pvclock_read_retry(src, version));
-
-       preempt_enable_notrace();
-
-       system_counter->cycles = ret;
-       system_counter->cs = &kvm_clock;
-
-       *device_time = timespec64_to_ktime(tspec);
-
-       spin_unlock(&kvm_ptp_lock);
-
-       return 0;
-}
-
-static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
-                                 struct system_device_crosststamp *xtstamp)
-{
-       return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
-                                            NULL, xtstamp);
-}
-
-/*
- * PTP clock operations
- */
-
-static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
-{
-       return -EOPNOTSUPP;
-}
-
-static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
-{
-       return -EOPNOTSUPP;
-}
-
-static int ptp_kvm_settime(struct ptp_clock_info *ptp,
-                          const struct timespec64 *ts)
-{
-       return -EOPNOTSUPP;
-}
-
-static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
-{
-       unsigned long ret;
-       struct timespec64 tspec;
-
-       spin_lock(&kvm_ptp_lock);
-
-       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-                            clock_pair_gpa,
-                            KVM_CLOCK_PAIRING_WALLCLOCK);
-       if (ret != 0) {
-               pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
-               spin_unlock(&kvm_ptp_lock);
-               return -EOPNOTSUPP;
-       }
-
-       tspec.tv_sec = clock_pair.sec;
-       tspec.tv_nsec = clock_pair.nsec;
-       spin_unlock(&kvm_ptp_lock);
-
-       memcpy(ts, &tspec, sizeof(struct timespec64));
-
-       return 0;
-}
-
-static int ptp_kvm_enable(struct ptp_clock_info *ptp,
-                         struct ptp_clock_request *rq, int on)
-{
-       return -EOPNOTSUPP;
-}
-
-static const struct ptp_clock_info ptp_kvm_caps = {
-       .owner          = THIS_MODULE,
-       .name           = "KVM virtual PTP",
-       .max_adj        = 0,
-       .n_ext_ts       = 0,
-       .n_pins         = 0,
-       .pps            = 0,
-       .adjfreq        = ptp_kvm_adjfreq,
-       .adjtime        = ptp_kvm_adjtime,
-       .gettime64      = ptp_kvm_gettime,
-       .settime64      = ptp_kvm_settime,
-       .enable         = ptp_kvm_enable,
-       .getcrosststamp = ptp_kvm_getcrosststamp,
-};
-
-/* module operations */
-
-static struct kvm_ptp_clock kvm_ptp_clock;
-
-static void __exit ptp_kvm_exit(void)
-{
-       ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
-}
-
-static int __init ptp_kvm_init(void)
-{
-       long ret;
-
-       if (!kvm_para_available())
-               return -ENODEV;
-
-       clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-       hv_clock = pvclock_get_pvti_cpu0_va();
-
-       if (!hv_clock)
-               return -ENODEV;
-
-       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
-                       KVM_CLOCK_PAIRING_WALLCLOCK);
-       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
-               return -ENODEV;
-
-       kvm_ptp_clock.caps = ptp_kvm_caps;
-
-       kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
-
-       return PTR_ERR_OR_ZERO(kvm_ptp_clock.ptp_clock);
-}
-
-module_init(ptp_kvm_init);
-module_exit(ptp_kvm_exit);
-
-MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
-MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c
new file mode 100644 (file)
index 0000000..b7d28c8
--- /dev/null
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/ptp_kvm.h>
+
+#include <asm/arch_timer.h>
+#include <asm/hypervisor.h>
+
+int kvm_arch_ptp_init(void)     /* returns 0, or -EOPNOTSUPP when the service is absent */
+{
+       int ret;
+
+       ret = kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP);
+       if (ret <= 0)           /* PTP hypervisor service not reported as available */
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{      /* time-only read: first and third outputs of the cross-timestamp call are passed as NULL */
+       return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL);
+}
diff --git a/drivers/ptp/ptp_kvm_common.c b/drivers/ptp/ptp_kvm_common.c
new file mode 100644 (file)
index 0000000..fcae32f
--- /dev/null
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ptp_kvm.h>
+#include <uapi/linux/kvm_para.h>
+#include <asm/kvm_para.h>
+#include <uapi/asm/kvm_para.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+struct kvm_ptp_clock {
+       struct ptp_clock *ptp_clock;    /* handle returned by ptp_clock_register() */
+       struct ptp_clock_info caps;     /* writable copy of ptp_kvm_caps given to the PTP core */
+};
+
+static DEFINE_SPINLOCK(kvm_ptp_lock);
+
+static int ptp_kvm_get_time_fn(ktime_t *device_time,
+                              struct system_counterval_t *system_counter,
+                              void *ctx)       /* ctx is not used */
+{
+       long ret;
+       u64 cycle;
+       struct timespec64 tspec;
+       struct clocksource *cs;
+
+       spin_lock(&kvm_ptp_lock);       /* one arch-backend sample at a time */
+
+       preempt_disable_notrace();      /* presumably so the backend samples on a single CPU */
+       ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs);
+       if (ret) {
+               spin_unlock(&kvm_ptp_lock);
+               preempt_enable_notrace();
+               return ret;             /* backend error code is passed through unchanged */
+       }
+
+       preempt_enable_notrace();
+
+       system_counter->cycles = cycle; /* system counter value paired with tspec */
+       system_counter->cs = cs;
+
+       *device_time = timespec64_to_ktime(tspec);
+
+       spin_unlock(&kvm_ptp_lock);
+
+       return 0;
+}
+
+static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
+                                 struct system_device_crosststamp *xtstamp)
+{      /* .getcrosststamp op; the sample itself is taken by ptp_kvm_get_time_fn() */
+       return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
+                                            NULL, xtstamp);
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+       return -EOPNOTSUPP;     /* stub: frequency adjustment always fails */
+}
+
+static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+       return -EOPNOTSUPP;     /* stub: offset adjustment always fails */
+}
+
+static int ptp_kvm_settime(struct ptp_clock_info *ptp,
+                          const struct timespec64 *ts)
+{
+       return -EOPNOTSUPP;     /* stub: the clock cannot be set */
+}
+
+static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{      /* .gettime64 op: reads the time through kvm_arch_ptp_get_clock() */
+       long ret;
+       struct timespec64 tspec;        /* local result, filled while the lock is held */
+
+       spin_lock(&kvm_ptp_lock);
+
+       ret = kvm_arch_ptp_get_clock(&tspec);
+       if (ret) {
+               spin_unlock(&kvm_ptp_lock);
+               return ret;             /* *ts is left untouched on failure */
+       }
+
+       spin_unlock(&kvm_ptp_lock);
+
+       memcpy(ts, &tspec, sizeof(struct timespec64)); /* copy out after dropping the lock */
+
+       return 0;
+}
+
+static int ptp_kvm_enable(struct ptp_clock_info *ptp,
+                         struct ptp_clock_request *rq, int on)
+{
+       return -EOPNOTSUPP;     /* stub: every request is rejected */
+}
+
+static const struct ptp_clock_info ptp_kvm_caps = {
+       .owner          = THIS_MODULE,
+       .name           = "KVM virtual PTP",
+       .max_adj        = 0,
+       .n_ext_ts       = 0,
+       .n_pins         = 0,
+       .pps            = 0,
+       .adjfreq        = ptp_kvm_adjfreq,      /* stub, returns -EOPNOTSUPP */
+       .adjtime        = ptp_kvm_adjtime,      /* stub, returns -EOPNOTSUPP */
+       .gettime64      = ptp_kvm_gettime,      /* reads time via the arch backend */
+       .settime64      = ptp_kvm_settime,      /* stub, returns -EOPNOTSUPP */
+       .enable         = ptp_kvm_enable,       /* stub, returns -EOPNOTSUPP */
+       .getcrosststamp = ptp_kvm_getcrosststamp,
+};
+
+/* module operations */
+
+static struct kvm_ptp_clock kvm_ptp_clock;
+
+static void __exit ptp_kvm_exit(void)
+{
+       ptp_clock_unregister(kvm_ptp_clock.ptp_clock); /* clock registered in ptp_kvm_init() */
+}
+
+static int __init ptp_kvm_init(void)
+{      /* module init: probe the arch backend, then register the PTP clock */
+       long ret;
+
+       ret = kvm_arch_ptp_init();
+       if (ret) {
+               if (ret != -EOPNOTSUPP) /* -EOPNOTSUPP is the quiet "no such service" case */
+                       pr_err("fail to initialize ptp_kvm\n");
+               return ret;
+       }
+
+       kvm_ptp_clock.caps = ptp_kvm_caps;      /* registration uses this writable copy */
+
+       kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
+
+       return PTR_ERR_OR_ZERO(kvm_ptp_clock.ptp_clock);
+}
+
+module_init(ptp_kvm_init);
+module_exit(ptp_kvm_exit);
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
new file mode 100644 (file)
index 0000000..3dd519d
--- /dev/null
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <linux/module.h>
+#include <uapi/asm/kvm_para.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/ptp_kvm.h>
+
+static struct pvclock_vsyscall_time_info *hv_clock; /* pvclock array, indexed by CPU number */
+
+static phys_addr_t clock_pair_gpa;              /* physical address of clock_pair, passed to the hypercall */
+static struct kvm_clock_pairing clock_pair;     /* sec/nsec/tsc are read from here after the hypercall */
+
+int kvm_arch_ptp_init(void)     /* returns 0 on success, -ENODEV on every failure path */
+{
+       long ret;
+
+       if (!kvm_para_available())      /* presumably: not running as a KVM guest */
+               return -ENODEV;
+
+       clock_pair_gpa = slow_virt_to_phys(&clock_pair); /* address later handed to the hypercall */
+       hv_clock = pvclock_get_pvti_cpu0_va();
+       if (!hv_clock)
+               return -ENODEV;
+
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, /* probe call */
+                            KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+               return -ENODEV;
+
+       return 0;       /* NOTE(review): any other non-zero ret is treated as success here */
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{      /* fills *ts from one clock-pairing hypercall; returns 0 or -EOPNOTSUPP */
+       long ret;
+
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+                            clock_pair_gpa,    /* physical address of clock_pair */
+                            KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret != 0) {
+               pr_err_ratelimited("clock offset hypercall ret %ld\n", ret); /* ret is signed */
+               return -EOPNOTSUPP;
+       }
+
+       ts->tv_sec = clock_pair.sec;
+       ts->tv_nsec = clock_pair.nsec;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
+                             struct clocksource **cs) /* all three are written unconditionally */
+{
+       struct pvclock_vcpu_time_info *src;
+       unsigned int version;
+       long ret;
+       int cpu;
+
+       cpu = smp_processor_id();       /* selects this CPU's entry in hv_clock */
+       src = &hv_clock[cpu].pvti;
+
+       do {
+               /*
+                * We are using a TSC value read in the hosts
+                * kvm_hc_clock_pairing handling.
+                * So any changes to tsc_to_system_mul
+                * and tsc_shift or any other pvclock
+                * data invalidate that measurement.
+                */
+               version = pvclock_read_begin(src);
+
+               ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+                                    clock_pair_gpa,
+                                    KVM_CLOCK_PAIRING_WALLCLOCK);
+               if (ret != 0) {
+                       pr_err_ratelimited("clock pairing hypercall ret %ld\n", ret); /* ret is signed */
+                       return -EOPNOTSUPP;
+               }
+               tspec->tv_sec = clock_pair.sec;
+               tspec->tv_nsec = clock_pair.nsec;
+               *cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+       } while (pvclock_read_retry(src, version)); /* redo if pvclock data changed meanwhile */
+
+       *cs = &kvm_clock;
+
+       return 0;
+}
index d3371ac..c76aded 100644 (file)
@@ -618,6 +618,15 @@ config PWM_TWL_LED
          To compile this driver as a module, choose M here: the module
          will be called pwm-twl-led.
 
+config PWM_VISCONTI
+       tristate "Toshiba Visconti PWM support"
+       depends on ARCH_VISCONTI || COMPILE_TEST
+       help
+         PWM Subsystem driver support for Toshiba Visconti SoCs.
+
+         To compile this driver as a module, choose M here: the module
+         will be called pwm-visconti.
+
 config PWM_VT8500
        tristate "vt8500 PWM support"
        depends on ARCH_VT8500 || COMPILE_TEST
index d387961..708840b 100644 (file)
@@ -58,4 +58,5 @@ obj-$(CONFIG_PWM_TIECAP)      += pwm-tiecap.o
 obj-$(CONFIG_PWM_TIEHRPWM)     += pwm-tiehrpwm.o
 obj-$(CONFIG_PWM_TWL)          += pwm-twl.o
 obj-$(CONFIG_PWM_TWL_LED)      += pwm-twl-led.o
+obj-$(CONFIG_PWM_VISCONTI)     += pwm-visconti.o
 obj-$(CONFIG_PWM_VT8500)       += pwm-vt8500.o
index a8eff4b..c4d5c06 100644 (file)
@@ -37,23 +37,13 @@ static struct pwm_device *pwm_to_device(unsigned int pwm)
        return radix_tree_lookup(&pwm_tree, pwm);
 }
 
-static int alloc_pwms(int pwm, unsigned int count)
+static int alloc_pwms(unsigned int count)
 {
-       unsigned int from = 0;
        unsigned int start;
 
-       if (pwm >= MAX_PWMS)
-               return -EINVAL;
-
-       if (pwm >= 0)
-               from = pwm;
-
-       start = bitmap_find_next_zero_area(allocated_pwms, MAX_PWMS, from,
+       start = bitmap_find_next_zero_area(allocated_pwms, MAX_PWMS, 0,
                                           count, 0);
 
-       if (pwm >= 0 && start != pwm)
-               return -EEXIST;
-
        if (start + count > MAX_PWMS)
                return -ENOSPC;
 
@@ -260,18 +250,14 @@ static bool pwm_ops_check(const struct pwm_chip *chip)
 }
 
 /**
- * pwmchip_add_with_polarity() - register a new PWM chip
+ * pwmchip_add() - register a new PWM chip
  * @chip: the PWM chip to add
- * @polarity: initial polarity of PWM channels
  *
- * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
- * will be used. The initial polarity for all channels is specified by the
- * @polarity parameter.
+ * Register a new PWM chip.
  *
  * Returns: 0 on success or a negative error code on failure.
  */
-int pwmchip_add_with_polarity(struct pwm_chip *chip,
-                             enum pwm_polarity polarity)
+int pwmchip_add(struct pwm_chip *chip)
 {
        struct pwm_device *pwm;
        unsigned int i;
@@ -285,25 +271,24 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
 
        mutex_lock(&pwm_lock);
 
-       ret = alloc_pwms(chip->base, chip->npwm);
+       ret = alloc_pwms(chip->npwm);
        if (ret < 0)
                goto out;
 
+       chip->base = ret;
+
        chip->pwms = kcalloc(chip->npwm, sizeof(*pwm), GFP_KERNEL);
        if (!chip->pwms) {
                ret = -ENOMEM;
                goto out;
        }
 
-       chip->base = ret;
-
        for (i = 0; i < chip->npwm; i++) {
                pwm = &chip->pwms[i];
 
                pwm->chip = chip;
                pwm->pwm = chip->base + i;
                pwm->hwpwm = i;
-               pwm->state.polarity = polarity;
 
                radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
        }
@@ -326,21 +311,6 @@ out:
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(pwmchip_add_with_polarity);
-
-/**
- * pwmchip_add() - register a new PWM chip
- * @chip: the PWM chip to add
- *
- * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
- * will be used. The initial polarity for all channels is normal.
- *
- * Returns: 0 on success or a negative error code on failure.
- */
-int pwmchip_add(struct pwm_chip *chip)
-{
-       return pwmchip_add_with_polarity(chip, PWM_POLARITY_NORMAL);
-}
 EXPORT_SYMBOL_GPL(pwmchip_add);
 
 /**
@@ -607,7 +577,7 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
                 */
                if (state->polarity != pwm->state.polarity) {
                        if (!chip->ops->set_polarity)
-                               return -ENOTSUPP;
+                               return -EINVAL;
 
                        /*
                         * Changing the polarity of a running PWM is
index 58c6c0f..e2a26d9 100644 (file)
@@ -24,23 +24,37 @@ struct ab8500_pwm_chip {
        struct pwm_chip chip;
 };
 
-static int ab8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-                            int duty_ns, int period_ns)
+static int ab8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+                           const struct pwm_state *state)
 {
-       int ret = 0;
-       unsigned int higher_val, lower_val;
+       int ret;
        u8 reg;
+       unsigned int higher_val, lower_val;
+
+       if (state->polarity != PWM_POLARITY_NORMAL)
+               return -EINVAL;
+
+       if (!state->enabled) {
+               ret = abx500_mask_and_set_register_interruptible(chip->dev,
+                                       AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG,
+                                       1 << (chip->base - 1), 0);
+
+               if (ret < 0)
+                       dev_err(chip->dev, "%s: Failed to disable PWM, Error %d\n",
+                                                               pwm->label, ret);
+               return ret;
+       }
 
        /*
         * get the first 8 bits that are be written to
         * AB8500_PWM_OUT_CTRL1_REG[0:7]
         */
-       lower_val = duty_ns & 0x00FF;
+       lower_val = state->duty_cycle & 0x00FF;
        /*
         * get bits [9:10] that are to be written to
         * AB8500_PWM_OUT_CTRL2_REG[0:1]
         */
-       higher_val = ((duty_ns & 0x0300) >> 8);
+       higher_val = ((state->duty_cycle & 0x0300) >> 8);
 
        reg = AB8500_PWM_OUT_CTRL1_REG + ((chip->base - 1) * 2);
 
@@ -48,15 +62,11 @@ static int ab8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                        reg, (u8)lower_val);
        if (ret < 0)
                return ret;
+
        ret = abx500_set_register_interruptible(chip->dev, AB8500_MISC,
                        (reg + 1), (u8)higher_val);
-
-       return ret;
-}
-
-static int ab8500_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
-{
-       int ret;
+       if (ret < 0)
+               return ret;
 
        ret = abx500_mask_and_set_register_interruptible(chip->dev,
                                AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG,
@@ -64,25 +74,12 @@ static int ab8500_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
        if (ret < 0)
                dev_err(chip->dev, "%s: Failed to enable PWM, Error %d\n",
                                                        pwm->label, ret);
-       return ret;
-}
 
-static void ab8500_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
-{
-       int ret;
-
-       ret = abx500_mask_and_set_register_interruptible(chip->dev,
-                               AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG,
-                               1 << (chip->base - 1), 0);
-       if (ret < 0)
-               dev_err(chip->dev, "%s: Failed to disable PWM, Error %d\n",
-                                                       pwm->label, ret);
+       return ret;
 }
 
 static const struct pwm_ops ab8500_pwm_ops = {
-       .config = ab8500_pwm_config,
-       .enable = ab8500_pwm_enable,
-       .disable = ab8500_pwm_disable,
+       .apply = ab8500_pwm_apply,
        .owner = THIS_MODULE,
 };
 
@@ -101,7 +98,6 @@ static int ab8500_pwm_probe(struct platform_device *pdev)
 
        ab8500->chip.dev = &pdev->dev;
        ab8500->chip.ops = &ab8500_pwm_ops;
-       ab8500->chip.base = -1;
        ab8500->chip.npwm = 1;
 
        err = pwmchip_add(&ab8500->chip);
index dcbc048..6ab597e 100644 (file)
@@ -265,12 +265,11 @@ static int atmel_hlcdc_pwm_probe(struct platform_device *pdev)
        chip->hlcdc = hlcdc;
        chip->chip.ops = &atmel_hlcdc_pwm_ops;
        chip->chip.dev = dev;
-       chip->chip.base = -1;
        chip->chip.npwm = 1;
        chip->chip.of_xlate = of_pwm_xlate_with_flags;
        chip->chip.of_pwm_n_cells = 3;
 
-       ret = pwmchip_add_with_polarity(&chip->chip, PWM_POLARITY_INVERSED);
+       ret = pwmchip_add(&chip->chip);
        if (ret) {
                clk_disable_unprepare(hlcdc->periph_clk);
                return ret;
index 5ccc3e7..8451d3e 100644 (file)
@@ -362,20 +362,37 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        tcbpwm->div = i;
        tcbpwm->duty = duty;
 
-       /* If the PWM is enabled, call enable to apply the new conf */
-       if (pwm_is_enabled(pwm))
-               atmel_tcb_pwm_enable(chip, pwm);
-
        return 0;
 }
 
+static int atmel_tcb_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+                              const struct pwm_state *state)
+{      /* .apply op built from the set_polarity/disable/config/enable helpers */
+       int duty_cycle, period;
+       int ret;
+
+       /* This function only sets a flag in driver data; its return value is not checked */
+       atmel_tcb_pwm_set_polarity(chip, pwm, state->polarity);
+
+       if (!state->enabled) {          /* disabling: period and duty cycle are not applied */
+               atmel_tcb_pwm_disable(chip, pwm);
+               return 0;
+       }
+
+       period = state->period < INT_MAX ? state->period : INT_MAX; /* clamp to fit the int locals */
+       duty_cycle = state->duty_cycle < INT_MAX ? state->duty_cycle : INT_MAX;
+
+       ret = atmel_tcb_pwm_config(chip, pwm, duty_cycle, period);
+       if (ret)
+               return ret;
+
+       return atmel_tcb_pwm_enable(chip, pwm); /* enable is called after every successful config */
+}
+
 static const struct pwm_ops atmel_tcb_pwm_ops = {
        .request = atmel_tcb_pwm_request,
        .free = atmel_tcb_pwm_free,
-       .config = atmel_tcb_pwm_config,
-       .set_polarity = atmel_tcb_pwm_set_polarity,
-       .enable = atmel_tcb_pwm_enable,
-       .disable = atmel_tcb_pwm_disable,
+       .apply = atmel_tcb_pwm_apply,
        .owner = THIS_MODULE,
 };
 
@@ -454,7 +471,6 @@ static int atmel_tcb_pwm_probe(struct platform_device *pdev)
        tcbpwm->chip.ops = &atmel_tcb_pwm_ops;
        tcbpwm->chip.of_xlate = of_pwm_xlate_with_flags;
        tcbpwm->chip.of_pwm_n_cells = 3;
-       tcbpwm->chip.base = -1;
        tcbpwm->chip.npwm = NPWM;
        tcbpwm->channel = channel;
        tcbpwm->regmap = regmap;
@@ -491,14 +507,14 @@ static int atmel_tcb_pwm_remove(struct platform_device *pdev)
        struct atmel_tcb_pwm_chip *tcbpwm = platform_get_drvdata(pdev);
        int err;
 
-       clk_disable_unprepare(tcbpwm->slow_clk);
-       clk_put(tcbpwm->slow_clk);
-       clk_put(tcbpwm->clk);
-
        err = pwmchip_remove(&tcbpwm->chip);
        if (err < 0)
                return err;
 
+       clk_disable_unprepare(tcbpwm->slow_clk);
+       clk_put(tcbpwm->slow_clk);
+       clk_put(tcbpwm->clk);
+
        return 0;
 }
 
index 5813339..29b5ad0 100644 (file)
@@ -124,6 +124,7 @@ static inline void atmel_pwm_ch_writel(struct atmel_pwm_chip *chip,
 }
 
 static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip,
+                                            unsigned long clkrate,
                                             const struct pwm_state *state,
                                             unsigned long *cprd, u32 *pres)
 {
@@ -132,7 +133,7 @@ static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip,
        int shift;
 
        /* Calculate the period cycles and prescale value */
-       cycles *= clk_get_rate(atmel_pwm->clk);
+       cycles *= clkrate;
        do_div(cycles, NSEC_PER_SEC);
 
        /*
@@ -158,12 +159,14 @@ static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip,
 }
 
 static void atmel_pwm_calculate_cdty(const struct pwm_state *state,
-                                    unsigned long cprd, unsigned long *cdty)
+                                    unsigned long clkrate, unsigned long cprd,
+                                    u32 pres, unsigned long *cdty)
 {
        unsigned long long cycles = state->duty_cycle;
 
-       cycles *= cprd;
-       do_div(cycles, state->period);
+       cycles *= clkrate;
+       do_div(cycles, NSEC_PER_SEC);
+       cycles >>= pres;
        *cdty = cprd - cycles;
 }
 
@@ -244,17 +247,23 @@ static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
        pwm_get_state(pwm, &cstate);
 
        if (state->enabled) {
+               unsigned long clkrate = clk_get_rate(atmel_pwm->clk);
+
                if (cstate.enabled &&
                    cstate.polarity == state->polarity &&
                    cstate.period == state->period) {
+                       u32 cmr = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm, PWM_CMR);
+
                        cprd = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm,
                                                  atmel_pwm->data->regs.period);
-                       atmel_pwm_calculate_cdty(state, cprd, &cdty);
+                       pres = cmr & PWM_CMR_CPRE_MSK;
+
+                       atmel_pwm_calculate_cdty(state, clkrate, cprd, pres, &cdty);
                        atmel_pwm_update_cdty(chip, pwm, cdty);
                        return 0;
                }
 
-               ret = atmel_pwm_calculate_cprd_and_pres(chip, state, &cprd,
+               ret = atmel_pwm_calculate_cprd_and_pres(chip, clkrate, state, &cprd,
                                                        &pres);
                if (ret) {
                        dev_err(chip->dev,
@@ -262,7 +271,7 @@ static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                        return ret;
                }
 
-               atmel_pwm_calculate_cdty(state, cprd, &cdty);
+               atmel_pwm_calculate_cdty(state, clkrate, cprd, pres, &cdty);
 
                if (cstate.enabled) {
                        atmel_pwm_disable(chip, pwm, false);
@@ -319,7 +328,7 @@ static void atmel_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 
                cdty = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm,
                                          atmel_pwm->data->regs.duty);
-               tmp = (u64)cdty * NSEC_PER_SEC;
+               tmp = (u64)(cprd - cdty) * NSEC_PER_SEC;
                tmp <<= pres;
                state->duty_cycle = DIV64_U64_ROUND_UP(tmp, rate);
 
@@ -429,7 +438,6 @@ static int atmel_pwm_probe(struct platform_device *pdev)
        atmel_pwm->chip.ops = &atmel_pwm_ops;
        atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags;
        atmel_pwm->chip.of_pwm_n_cells = 3;
-       atmel_pwm->chip.base = -1;
        atmel_pwm->chip.npwm = 4;
 
        ret = pwmchip_add(&atmel_pwm->chip);
@@ -451,10 +459,12 @@ static int atmel_pwm_remove(struct platform_device *pdev)
 {
        struct atmel_pwm_chip *atmel_pwm = platform_get_drvdata(pdev);
 
+       pwmchip_remove(&atmel_pwm->chip);
+
        clk_unprepare(atmel_pwm->clk);
        mutex_destroy(&atmel_pwm->isr_lock);
 
-       return pwmchip_remove(&atmel_pwm->chip);
+       return 0;
 }
 
 static struct platform_driver atmel_pwm_driver = {
index f4853c4..edd2ce1 100644 (file)
@@ -209,7 +209,6 @@ static int iproc_pwmc_probe(struct platform_device *pdev)
 
        ip->chip.dev = &pdev->dev;
        ip->chip.ops = &iproc_pwm_ops;
-       ip->chip.base = -1;
        ip->chip.npwm = 4;
        ip->chip.of_xlate = of_pwm_xlate_with_flags;
        ip->chip.of_pwm_n_cells = 3;
@@ -254,9 +253,11 @@ static int iproc_pwmc_remove(struct platform_device *pdev)
 {
        struct iproc_pwmc *ip = platform_get_drvdata(pdev);
 
+       pwmchip_remove(&ip->chip);
+
        clk_disable_unprepare(ip->clk);
 
-       return pwmchip_remove(&ip->chip);
+       return 0;
 }
 
 static const struct of_device_id bcm_iproc_pwmc_dt[] = {
index 578b362..800b9ed 100644 (file)
@@ -271,7 +271,6 @@ static int kona_pwmc_probe(struct platform_device *pdev)
 
        kp->chip.dev = &pdev->dev;
        kp->chip.ops = &kona_pwm_ops;
-       kp->chip.base = -1;
        kp->chip.npwm = 6;
        kp->chip.of_xlate = of_pwm_xlate_with_flags;
        kp->chip.of_pwm_n_cells = 3;
@@ -301,7 +300,7 @@ static int kona_pwmc_probe(struct platform_device *pdev)
 
        clk_disable_unprepare(kp->clk);
 
-       ret = pwmchip_add_with_polarity(&kp->chip, PWM_POLARITY_INVERSED);
+       ret = pwmchip_add(&kp->chip);
        if (ret < 0)
                dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret);
 
@@ -311,11 +310,6 @@ static int kona_pwmc_probe(struct platform_device *pdev)
 static int kona_pwmc_remove(struct platform_device *pdev)
 {
        struct kona_pwmc *kp = platform_get_drvdata(pdev);
-       unsigned int chan;
-
-       for (chan = 0; chan < kp->chip.npwm; chan++)
-               if (pwm_is_enabled(&kp->chip.pwms[chan]))
-                       clk_disable_unprepare(kp->clk);
 
        return pwmchip_remove(&kp->chip);
 }
index 6ff5f04..fc240d5 100644 (file)
@@ -64,8 +64,9 @@ static int bcm2835_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
        struct bcm2835_pwm *pc = to_bcm2835_pwm(chip);
        unsigned long rate = clk_get_rate(pc->clk);
-       unsigned long long period;
-       unsigned long scaler;
+       unsigned long long period_cycles;
+       u64 max_period;
+
        u32 val;
 
        if (!rate) {
@@ -73,18 +74,36 @@ static int bcm2835_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                return -EINVAL;
        }
 
-       scaler = DIV_ROUND_CLOSEST(NSEC_PER_SEC, rate);
+       /*
+        * period_cycles must be a 32 bit value, so period * rate / NSEC_PER_SEC
+        * must be <= U32_MAX. As U32_MAX * NSEC_PER_SEC < U64_MAX the
+        * multiplication period * rate doesn't overflow.
+        * To calculate the maximal possible period that guarantees the
+        * above inequality:
+        *
+        *     round(period * rate / NSEC_PER_SEC) <= U32_MAX
+        * <=> period * rate / NSEC_PER_SEC < U32_MAX + 0.5
+        * <=> period * rate < (U32_MAX + 0.5) * NSEC_PER_SEC
+        * <=> period < ((U32_MAX + 0.5) * NSEC_PER_SEC) / rate
+        * <=> period < ((U32_MAX * NSEC_PER_SEC + NSEC_PER_SEC/2) / rate
+        * <=> period <= ceil((U32_MAX * NSEC_PER_SEC + NSEC_PER_SEC/2) / rate) - 1
+        */
+       max_period = DIV_ROUND_UP_ULL((u64)U32_MAX * NSEC_PER_SEC + NSEC_PER_SEC / 2, rate) - 1;
+
+       if (state->period > max_period)
+               return -EINVAL;
+
        /* set period */
-       period = DIV_ROUND_CLOSEST_ULL(state->period, scaler);
+       period_cycles = DIV_ROUND_CLOSEST_ULL(state->period * rate, NSEC_PER_SEC);
 
-       /* dont accept a period that is too small or has been truncated */
-       if ((period < PERIOD_MIN) || (period > U32_MAX))
+       /* don't accept a period that is too small */
+       if (period_cycles < PERIOD_MIN)
                return -EINVAL;
 
-       writel(period, pc->base + PERIOD(pwm->hwpwm));
+       writel(period_cycles, pc->base + PERIOD(pwm->hwpwm));
 
        /* set duty cycle */
-       val = DIV_ROUND_CLOSEST_ULL(state->duty_cycle, scaler);
+       val = DIV_ROUND_CLOSEST_ULL(state->duty_cycle * rate, NSEC_PER_SEC);
        writel(val, pc->base + DUTY(pwm->hwpwm));
 
        /* set polarity */
@@ -139,7 +158,6 @@ static int bcm2835_pwm_probe(struct platform_device *pdev)
 
        pc->chip.dev = &pdev->dev;
        pc->chip.ops = &bcm2835_pwm_ops;
-       pc->chip.base = -1;
        pc->chip.npwm = 2;
        pc->chip.of_xlate = of_pwm_xlate_with_flags;
        pc->chip.of_pwm_n_cells = 3;
@@ -161,9 +179,11 @@ static int bcm2835_pwm_remove(struct platform_device *pdev)
 {
        struct bcm2835_pwm *pc = platform_get_drvdata(pdev);
 
+       pwmchip_remove(&pc->chip);
+
        clk_disable_unprepare(pc->clk);
 
-       return pwmchip_remove(&pc->chip);
+       return 0;
 }
 
 static const struct of_device_id bcm2835_pwm_of_match[] = {
index fe40528..acb6fbc 100644 (file)
@@ -206,7 +206,6 @@ static int berlin_pwm_probe(struct platform_device *pdev)
 
        pwm->chip.dev = &pdev->dev;
        pwm->chip.ops = &berlin_pwm_ops;
-       pwm->chip.base = -1;
        pwm->chip.npwm = 4;
        pwm->chip.of_xlate = of_pwm_xlate_with_flags;
        pwm->chip.of_pwm_n_cells = 3;
index 8b66f9d..8b1d1e7 100644 (file)
@@ -258,7 +258,6 @@ static int brcmstb_pwm_probe(struct platform_device *pdev)
 
        p->chip.dev = &pdev->dev;
        p->chip.ops = &brcmstb_pwm_ops;
-       p->chip.base = -1;
        p->chip.npwm = 2;
 
        p->base = devm_platform_ioremap_resource(pdev, 0);
index cb1af86..f3d17a5 100644 (file)
@@ -128,7 +128,6 @@ static int clps711x_pwm_probe(struct platform_device *pdev)
 
        priv->chip.ops = &clps711x_pwm_ops;
        priv->chip.dev = &pdev->dev;
-       priv->chip.base = -1;
        priv->chip.npwm = 2;
        priv->chip.of_xlate = clps711x_pwm_xlate;
        priv->chip.of_pwm_n_cells = 1;
index 1e22768..02522a9 100644 (file)
@@ -168,7 +168,6 @@ static int crystalcove_pwm_probe(struct platform_device *pdev)
 
        pwm->chip.dev = &pdev->dev;
        pwm->chip.ops = &crc_pwm_ops;
-       pwm->chip.base = -1;
        pwm->chip.npwm = 1;
 
        /* get the PMIC regmap */
index c1c3379..9fffb56 100644 (file)
@@ -124,6 +124,9 @@ static int cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
        if (state->period != EC_PWM_MAX_DUTY)
                return -EINVAL;
 
+       if (state->polarity != PWM_POLARITY_NORMAL)
+               return -EINVAL;
+
        /*
         * EC doesn't separate the concept of duty cycle and enabled, but
         * kernel does. Translate.
@@ -253,7 +256,6 @@ static int cros_ec_pwm_probe(struct platform_device *pdev)
        chip->ops = &cros_ec_pwm_ops;
        chip->of_xlate = cros_ec_pwm_xlate;
        chip->of_pwm_n_cells = 1;
-       chip->base = -1;
        ret = cros_ec_num_pwms(ec);
        if (ret < 0) {
                dev_err(dev, "Couldn't find PWMs: %d\n", ret);
index f6c98e0..7568300 100644 (file)
@@ -233,7 +233,6 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id)
        dwc->chip.dev = dev;
        dwc->chip.ops = &dwc_pwm_ops;
        dwc->chip.npwm = DWC_TIMERS_TOTAL;
-       dwc->chip.base = -1;
 
        ret = pwmchip_add(&dwc->chip);
        if (ret)
index c9fc6f2..4ca7079 100644 (file)
@@ -185,7 +185,6 @@ static int ep93xx_pwm_probe(struct platform_device *pdev)
 
        ep93xx_pwm->chip.dev = &pdev->dev;
        ep93xx_pwm->chip.ops = &ep93xx_pwm_ops;
-       ep93xx_pwm->chip.base = -1;
        ep93xx_pwm->chip.npwm = 1;
 
        ret = pwmchip_add(&ep93xx_pwm->chip);
index 2a68012..0e1ae94 100644 (file)
@@ -453,7 +453,6 @@ static int fsl_pwm_probe(struct platform_device *pdev)
        fpc->chip.ops = &fsl_pwm_ops;
        fpc->chip.of_xlate = of_pwm_xlate_with_flags;
        fpc->chip.of_pwm_n_cells = 3;
-       fpc->chip.base = -1;
        fpc->chip.npwm = 8;
 
        ret = pwmchip_add(&fpc->chip);
index a1900d0..82d17fc 100644 (file)
@@ -205,7 +205,6 @@ static int hibvt_pwm_probe(struct platform_device *pdev)
 
        pwm_chip->chip.ops = &hibvt_pwm_ops;
        pwm_chip->chip.dev = &pdev->dev;
-       pwm_chip->chip.base = -1;
        pwm_chip->chip.npwm = soc->num_pwms;
        pwm_chip->chip.of_xlate = of_pwm_xlate_with_flags;
        pwm_chip->chip.of_pwm_n_cells = 3;
index 6faf5b5..cc37054 100644 (file)
@@ -304,7 +304,6 @@ static int img_pwm_probe(struct platform_device *pdev)
 
        pwm->chip.dev = &pdev->dev;
        pwm->chip.ops = &img_pwm_ops;
-       pwm->chip.base = -1;
        pwm->chip.npwm = IMG_PWM_NPWM;
 
        ret = pwmchip_add(&pwm->chip);
index aaf629b..97c9133 100644 (file)
@@ -363,7 +363,6 @@ static int pwm_imx_tpm_probe(struct platform_device *pdev)
 
        tpm->chip.dev = &pdev->dev;
        tpm->chip.ops = &imx_tpm_pwm_ops;
-       tpm->chip.base = -1;
        tpm->chip.of_xlate = of_pwm_xlate_with_flags;
        tpm->chip.of_pwm_n_cells = 3;
 
@@ -411,9 +410,7 @@ static int __maybe_unused pwm_imx_tpm_resume(struct device *dev)
 
        ret = clk_prepare_enable(tpm->clk);
        if (ret)
-               dev_err(dev,
-                       "failed to prepare or enable clock: %d\n",
-                       ret);
+               dev_err(dev, "failed to prepare or enable clock: %d\n", ret);
 
        return ret;
 }
index 727e0d3..c957b36 100644 (file)
@@ -155,7 +155,6 @@ static int pwm_imx1_probe(struct platform_device *pdev)
 
        imx->chip.ops = &pwm_imx1_ops;
        imx->chip.dev = &pdev->dev;
-       imx->chip.base = -1;
        imx->chip.npwm = 1;
 
        imx->mmio_base = devm_platform_ioremap_resource(pdev, 0);
index 1805532..ba69511 100644 (file)
@@ -327,7 +327,6 @@ static int pwm_imx27_probe(struct platform_device *pdev)
 
        imx->chip.ops = &pwm_imx27_ops;
        imx->chip.dev = &pdev->dev;
-       imx->chip.base = -1;
        imx->chip.npwm = 1;
 
        imx->chip.of_xlate = of_pwm_xlate_with_flags;
index e9e54dd..015f5eb 100644 (file)
@@ -207,7 +207,6 @@ static int lgm_pwm_probe(struct platform_device *pdev)
        pc->chip.dev = dev;
        pc->chip.ops = &lgm_pwm_ops;
        pc->chip.npwm = 1;
-       pc->chip.base = -1;
 
        lgm_pwm_init(pc);
 
index 957b972..6c6e26d 100644 (file)
@@ -206,7 +206,6 @@ static int iqs620_pwm_probe(struct platform_device *pdev)
 
        iqs620_pwm->chip.dev = &pdev->dev;
        iqs620_pwm->chip.ops = &iqs620_pwm_ops;
-       iqs620_pwm->chip.base = -1;
        iqs620_pwm->chip.npwm = 1;
 
        mutex_init(&iqs620_pwm->lock);
index 00c642f..5b6bdcd 100644 (file)
@@ -244,7 +244,6 @@ static int jz4740_pwm_probe(struct platform_device *pdev)
        jz4740->chip.dev = dev;
        jz4740->chip.ops = &jz4740_pwm_ops;
        jz4740->chip.npwm = info->num_pwms;
-       jz4740->chip.base = -1;
        jz4740->chip.of_xlate = of_pwm_xlate_with_flags;
        jz4740->chip.of_pwm_n_cells = 3;
 
index cdfdef6..521a825 100644 (file)
@@ -203,7 +203,6 @@ static int keembay_pwm_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       priv->chip.base = -1;
        priv->chip.dev = dev;
        priv->chip.ops = &keembay_pwm_ops;
        priv->chip.npwm = KMB_TOTAL_PWM_CHANNELS;
index bf3f14f..7551253 100644 (file)
@@ -275,7 +275,6 @@ static int lp3943_pwm_probe(struct platform_device *pdev)
        lp3943_pwm->chip.dev = &pdev->dev;
        lp3943_pwm->chip.ops = &lp3943_pwm_ops;
        lp3943_pwm->chip.npwm = LP3943_NUM_PWMS;
-       lp3943_pwm->chip.base = -1;
 
        platform_set_drvdata(pdev, lp3943_pwm);
 
index 7ef4024..b643ac6 100644 (file)
@@ -370,7 +370,6 @@ static int lpc18xx_pwm_probe(struct platform_device *pdev)
 
        lpc18xx_pwm->chip.dev = &pdev->dev;
        lpc18xx_pwm->chip.ops = &lpc18xx_pwm_ops;
-       lpc18xx_pwm->chip.base = -1;
        lpc18xx_pwm->chip.npwm = 16;
        lpc18xx_pwm->chip.of_xlate = of_pwm_xlate_with_flags;
        lpc18xx_pwm->chip.of_pwm_n_cells = 3;
@@ -442,13 +441,15 @@ static int lpc18xx_pwm_remove(struct platform_device *pdev)
        struct lpc18xx_pwm_chip *lpc18xx_pwm = platform_get_drvdata(pdev);
        u32 val;
 
+       pwmchip_remove(&lpc18xx_pwm->chip);
+
        val = lpc18xx_pwm_readl(lpc18xx_pwm, LPC18XX_PWM_CTRL);
        lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_CTRL,
                           val | LPC18XX_PWM_CTRL_HALT);
 
        clk_disable_unprepare(lpc18xx_pwm->pwm_clk);
 
-       return pwmchip_remove(&lpc18xx_pwm->chip);
+       return 0;
 }
 
 static struct platform_driver lpc18xx_pwm_driver = {
index 6b40904..2834a0f 100644 (file)
@@ -116,7 +116,6 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev)
        lpc32xx->chip.dev = &pdev->dev;
        lpc32xx->chip.ops = &lpc32xx_pwm_ops;
        lpc32xx->chip.npwm = 1;
-       lpc32xx->chip.base = -1;
 
        ret = pwmchip_add(&lpc32xx->chip);
        if (ret < 0) {
@@ -137,10 +136,6 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev)
 static int lpc32xx_pwm_remove(struct platform_device *pdev)
 {
        struct lpc32xx_pwm_chip *lpc32xx = platform_get_drvdata(pdev);
-       unsigned int i;
-
-       for (i = 0; i < lpc32xx->chip.npwm; i++)
-               pwm_disable(&lpc32xx->chip.pwms[i]);
 
        return pwmchip_remove(&lpc32xx->chip);
 }
index 939de93..58b4031 100644 (file)
@@ -234,7 +234,6 @@ struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r,
 
        lpwm->chip.dev = dev;
        lpwm->chip.ops = &pwm_lpss_ops;
-       lpwm->chip.base = -1;
        lpwm->chip.npwm = info->npwm;
 
        ret = pwmchip_add(&lpwm->chip);
@@ -255,12 +254,6 @@ EXPORT_SYMBOL_GPL(pwm_lpss_probe);
 
 int pwm_lpss_remove(struct pwm_lpss_chip *lpwm)
 {
-       int i;
-
-       for (i = 0; i < lpwm->info->npwm; i++) {
-               if (pwm_is_enabled(&lpwm->chip.pwms[i]))
-                       pm_runtime_put(lpwm->chip.dev);
-       }
        return pwmchip_remove(&lpwm->chip);
 }
 EXPORT_SYMBOL_GPL(pwm_lpss_remove);
index fcfc3b1..b4a3106 100644 (file)
@@ -107,12 +107,6 @@ static void pwm_mediatek_clk_disable(struct pwm_chip *chip,
        clk_disable_unprepare(pc->clk_top);
 }
 
-static inline u32 pwm_mediatek_readl(struct pwm_mediatek_chip *chip,
-                                    unsigned int num, unsigned int offset)
-{
-       return readl(chip->regs + pwm_mediatek_reg_offset[num] + offset);
-}
-
 static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip,
                                       unsigned int num, unsigned int offset,
                                       u32 value)
@@ -263,7 +257,6 @@ static int pwm_mediatek_probe(struct platform_device *pdev)
 
        pc->chip.dev = &pdev->dev;
        pc->chip.ops = &pwm_mediatek_ops;
-       pc->chip.base = -1;
        pc->chip.npwm = pc->soc->num_pwms;
 
        ret = pwmchip_add(&pc->chip);
index a3ce978..9eb0606 100644 (file)
@@ -550,7 +550,6 @@ static int meson_pwm_probe(struct platform_device *pdev)
        spin_lock_init(&meson->lock);
        meson->chip.dev = &pdev->dev;
        meson->chip.ops = &meson_pwm_ops;
-       meson->chip.base = -1;
        meson->chip.npwm = MESON_NUM_PWMS;
        meson->chip.of_xlate = of_pwm_xlate_with_flags;
        meson->chip.of_pwm_n_cells = 3;
index 87c6b4b..9b3ba40 100644 (file)
@@ -202,7 +202,6 @@ static int mtk_disp_pwm_probe(struct platform_device *pdev)
 
        mdp->chip.dev = &pdev->dev;
        mdp->chip.ops = &mtk_disp_pwm_ops;
-       mdp->chip.base = -1;
        mdp->chip.npwm = 1;
 
        ret = pwmchip_add(&mdp->chip);
index 7ce6169..0266e84 100644 (file)
@@ -140,7 +140,6 @@ static int mxs_pwm_probe(struct platform_device *pdev)
        mxs->chip.ops = &mxs_pwm_ops;
        mxs->chip.of_xlate = of_pwm_xlate_with_flags;
        mxs->chip.of_pwm_n_cells = 3;
-       mxs->chip.base = -1;
 
        ret = of_property_read_u32(np, "fsl,pwm-number", &mxs->chip.npwm);
        if (ret < 0) {
index 358db4f..612b3c8 100644 (file)
@@ -403,7 +403,6 @@ static int pwm_omap_dmtimer_probe(struct platform_device *pdev)
 
        omap->chip.dev = &pdev->dev;
        omap->chip.ops = &pwm_omap_dmtimer_ops;
-       omap->chip.base = -1;
        omap->chip.npwm = 1;
        omap->chip.of_xlate = of_pwm_xlate_with_flags;
        omap->chip.of_pwm_n_cells = 3;
index 4a55dc1..7c9f174 100644 (file)
@@ -51,7 +51,6 @@
 #define PCA9685_PRESCALE_MAX   0xFF    /* => min. frequency of 24 Hz */
 
 #define PCA9685_COUNTER_RANGE  4096
-#define PCA9685_DEFAULT_PERIOD 5000000 /* Default period_ns = 1/200 Hz */
 #define PCA9685_OSC_CLOCK_MHZ  25      /* Internal oscillator with 25 MHz */
 
 #define PCA9685_NUMREGS                0xFF
 #define LED_N_OFF_H(N) (PCA9685_LEDX_OFF_H + (4 * (N)))
 #define LED_N_OFF_L(N) (PCA9685_LEDX_OFF_L + (4 * (N)))
 
+#define REG_ON_H(C)    ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_ON_H : LED_N_ON_H((C)))
+#define REG_ON_L(C)    ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_ON_L : LED_N_ON_L((C)))
+#define REG_OFF_H(C)   ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_OFF_H : LED_N_OFF_H((C)))
+#define REG_OFF_L(C)   ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_OFF_L : LED_N_OFF_L((C)))
+
 struct pca9685 {
        struct pwm_chip chip;
        struct regmap *regmap;
-       int period_ns;
 #if IS_ENABLED(CONFIG_GPIOLIB)
        struct mutex lock;
        struct gpio_chip gpio;
@@ -87,6 +90,53 @@ static inline struct pca9685 *to_pca(struct pwm_chip *chip)
        return container_of(chip, struct pca9685, chip);
 }
 
+/* Helper function to set the duty cycle ratio to duty/4096 (e.g. duty=2048 -> 50%) */
+static void pca9685_pwm_set_duty(struct pca9685 *pca, int channel, unsigned int duty)
+{
+       if (duty == 0) {
+               /* Set the full OFF bit, which has the highest precedence */
+               regmap_write(pca->regmap, REG_OFF_H(channel), LED_FULL);
+       } else if (duty >= PCA9685_COUNTER_RANGE) {
+               /* Set the full ON bit and clear the full OFF bit */
+               regmap_write(pca->regmap, REG_ON_H(channel), LED_FULL);
+               regmap_write(pca->regmap, REG_OFF_H(channel), 0);
+       } else {
+               /* Set OFF time (clears the full OFF bit) */
+               regmap_write(pca->regmap, REG_OFF_L(channel), duty & 0xff);
+               regmap_write(pca->regmap, REG_OFF_H(channel), (duty >> 8) & 0xf);
+               /* Clear the full ON bit */
+               regmap_write(pca->regmap, REG_ON_H(channel), 0);
+       }
+}
+
+static unsigned int pca9685_pwm_get_duty(struct pca9685 *pca, int channel)
+{
+       unsigned int off_h = 0, val = 0;
+
+       if (WARN_ON(channel >= PCA9685_MAXCHAN)) {
+               /* HW does not support reading state of "all LEDs" channel */
+               return 0;
+       }
+
+       regmap_read(pca->regmap, LED_N_OFF_H(channel), &off_h);
+       if (off_h & LED_FULL) {
+               /* Full OFF bit is set */
+               return 0;
+       }
+
+       regmap_read(pca->regmap, LED_N_ON_H(channel), &val);
+       if (val & LED_FULL) {
+               /* Full ON bit is set */
+               return PCA9685_COUNTER_RANGE;
+       }
+
+       if (regmap_read(pca->regmap, LED_N_OFF_L(channel), &val)) {
+               /* Reset val to 0 in case reading LED_N_OFF_L failed */
+               val = 0;
+       }
+       return ((off_h & 0xf) << 8) | (val & 0xff);
+}
+
 #if IS_ENABLED(CONFIG_GPIOLIB)
 static bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, int pwm_idx)
 {
@@ -138,34 +188,23 @@ static int pca9685_pwm_gpio_request(struct gpio_chip *gpio, unsigned int offset)
 static int pca9685_pwm_gpio_get(struct gpio_chip *gpio, unsigned int offset)
 {
        struct pca9685 *pca = gpiochip_get_data(gpio);
-       struct pwm_device *pwm = &pca->chip.pwms[offset];
-       unsigned int value;
-
-       regmap_read(pca->regmap, LED_N_ON_H(pwm->hwpwm), &value);
 
-       return value & LED_FULL;
+       return pca9685_pwm_get_duty(pca, offset) != 0;
 }
 
 static void pca9685_pwm_gpio_set(struct gpio_chip *gpio, unsigned int offset,
                                 int value)
 {
        struct pca9685 *pca = gpiochip_get_data(gpio);
-       struct pwm_device *pwm = &pca->chip.pwms[offset];
-       unsigned int on = value ? LED_FULL : 0;
 
-       /* Clear both OFF registers */
-       regmap_write(pca->regmap, LED_N_OFF_L(pwm->hwpwm), 0);
-       regmap_write(pca->regmap, LED_N_OFF_H(pwm->hwpwm), 0);
-
-       /* Set the full ON bit */
-       regmap_write(pca->regmap, LED_N_ON_H(pwm->hwpwm), on);
+       pca9685_pwm_set_duty(pca, offset, value ? PCA9685_COUNTER_RANGE : 0);
 }
 
 static void pca9685_pwm_gpio_free(struct gpio_chip *gpio, unsigned int offset)
 {
        struct pca9685 *pca = gpiochip_get_data(gpio);
 
-       pca9685_pwm_gpio_set(gpio, offset, 0);
+       pca9685_pwm_set_duty(pca, offset, 0);
        pm_runtime_put(pca->chip.dev);
        pca9685_pwm_clear_inuse(pca, offset);
 }
@@ -246,165 +285,85 @@ static void pca9685_set_sleep_mode(struct pca9685 *pca, bool enable)
        }
 }
 
-static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-                             int duty_ns, int period_ns)
+static int pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+                            const struct pwm_state *state)
 {
        struct pca9685 *pca = to_pca(chip);
-       unsigned long long duty;
-       unsigned int reg;
-       int prescale;
-
-       if (period_ns != pca->period_ns) {
-               prescale = DIV_ROUND_CLOSEST(PCA9685_OSC_CLOCK_MHZ * period_ns,
-                                            PCA9685_COUNTER_RANGE * 1000) - 1;
-
-               if (prescale >= PCA9685_PRESCALE_MIN &&
-                       prescale <= PCA9685_PRESCALE_MAX) {
-                       /*
-                        * Putting the chip briefly into SLEEP mode
-                        * at this point won't interfere with the
-                        * pm_runtime framework, because the pm_runtime
-                        * state is guaranteed active here.
-                        */
-                       /* Put chip into sleep mode */
-                       pca9685_set_sleep_mode(pca, true);
-
-                       /* Change the chip-wide output frequency */
-                       regmap_write(pca->regmap, PCA9685_PRESCALE, prescale);
-
-                       /* Wake the chip up */
-                       pca9685_set_sleep_mode(pca, false);
-
-                       pca->period_ns = period_ns;
-               } else {
-                       dev_err(chip->dev,
-                               "prescaler not set: period out of bounds!\n");
-                       return -EINVAL;
-               }
-       }
+       unsigned long long duty, prescale;
+       unsigned int val = 0;
 
-       if (duty_ns < 1) {
-               if (pwm->hwpwm >= PCA9685_MAXCHAN)
-                       reg = PCA9685_ALL_LED_OFF_H;
-               else
-                       reg = LED_N_OFF_H(pwm->hwpwm);
+       if (state->polarity != PWM_POLARITY_NORMAL)
+               return -EINVAL;
 
-               regmap_write(pca->regmap, reg, LED_FULL);
-
-               return 0;
+       prescale = DIV_ROUND_CLOSEST_ULL(PCA9685_OSC_CLOCK_MHZ * state->period,
+                                        PCA9685_COUNTER_RANGE * 1000) - 1;
+       if (prescale < PCA9685_PRESCALE_MIN || prescale > PCA9685_PRESCALE_MAX) {
+               dev_err(chip->dev, "pwm not changed: period out of bounds!\n");
+               return -EINVAL;
        }
 
-       if (duty_ns == period_ns) {
-               /* Clear both OFF registers */
-               if (pwm->hwpwm >= PCA9685_MAXCHAN)
-                       reg = PCA9685_ALL_LED_OFF_L;
-               else
-                       reg = LED_N_OFF_L(pwm->hwpwm);
-
-               regmap_write(pca->regmap, reg, 0x0);
-
-               if (pwm->hwpwm >= PCA9685_MAXCHAN)
-                       reg = PCA9685_ALL_LED_OFF_H;
-               else
-                       reg = LED_N_OFF_H(pwm->hwpwm);
-
-               regmap_write(pca->regmap, reg, 0x0);
-
-               /* Set the full ON bit */
-               if (pwm->hwpwm >= PCA9685_MAXCHAN)
-                       reg = PCA9685_ALL_LED_ON_H;
-               else
-                       reg = LED_N_ON_H(pwm->hwpwm);
-
-               regmap_write(pca->regmap, reg, LED_FULL);
-
+       if (!state->enabled) {
+               pca9685_pwm_set_duty(pca, pwm->hwpwm, 0);
                return 0;
        }
 
-       duty = PCA9685_COUNTER_RANGE * (unsigned long long)duty_ns;
-       duty = DIV_ROUND_UP_ULL(duty, period_ns);
-
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_OFF_L;
-       else
-               reg = LED_N_OFF_L(pwm->hwpwm);
-
-       regmap_write(pca->regmap, reg, (int)duty & 0xff);
-
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_OFF_H;
-       else
-               reg = LED_N_OFF_H(pwm->hwpwm);
-
-       regmap_write(pca->regmap, reg, ((int)duty >> 8) & 0xf);
+       regmap_read(pca->regmap, PCA9685_PRESCALE, &val);
+       if (prescale != val) {
+               /*
+                * Putting the chip briefly into SLEEP mode
+                * at this point won't interfere with the
+                * pm_runtime framework, because the pm_runtime
+                * state is guaranteed active here.
+                */
+               /* Put chip into sleep mode */
+               pca9685_set_sleep_mode(pca, true);
 
-       /* Clear the full ON bit, otherwise the set OFF time has no effect */
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_ON_H;
-       else
-               reg = LED_N_ON_H(pwm->hwpwm);
+               /* Change the chip-wide output frequency */
+               regmap_write(pca->regmap, PCA9685_PRESCALE, prescale);
 
-       regmap_write(pca->regmap, reg, 0);
+               /* Wake the chip up */
+               pca9685_set_sleep_mode(pca, false);
+       }
 
+       duty = PCA9685_COUNTER_RANGE * state->duty_cycle;
+       duty = DIV_ROUND_UP_ULL(duty, state->period);
+       pca9685_pwm_set_duty(pca, pwm->hwpwm, duty);
        return 0;
 }
 
-static int pca9685_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
+static void pca9685_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
+                                 struct pwm_state *state)
 {
        struct pca9685 *pca = to_pca(chip);
-       unsigned int reg;
-
-       /*
-        * The PWM subsystem does not support a pre-delay.
-        * So, set the ON-timeout to 0
-        */
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_ON_L;
-       else
-               reg = LED_N_ON_L(pwm->hwpwm);
-
-       regmap_write(pca->regmap, reg, 0);
-
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_ON_H;
-       else
-               reg = LED_N_ON_H(pwm->hwpwm);
-
-       regmap_write(pca->regmap, reg, 0);
+       unsigned long long duty;
+       unsigned int val = 0;
 
+       /* Calculate (chip-wide) period from prescale value */
+       regmap_read(pca->regmap, PCA9685_PRESCALE, &val);
        /*
-        * Clear the full-off bit.
-        * It has precedence over the others and must be off.
+        * PCA9685_OSC_CLOCK_MHZ is 25, i.e. an integer divider of 1000.
+        * The following calculation is therefore only a multiplication
+        * and we are not losing precision.
         */
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_OFF_H;
-       else
-               reg = LED_N_OFF_H(pwm->hwpwm);
-
-       regmap_update_bits(pca->regmap, reg, LED_FULL, 0x0);
+       state->period = (PCA9685_COUNTER_RANGE * 1000 / PCA9685_OSC_CLOCK_MHZ) *
+                       (val + 1);
 
-       return 0;
-}
+       /* The (per-channel) polarity is fixed */
+       state->polarity = PWM_POLARITY_NORMAL;
 
-static void pca9685_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
-{
-       struct pca9685 *pca = to_pca(chip);
-       unsigned int reg;
-
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_OFF_H;
-       else
-               reg = LED_N_OFF_H(pwm->hwpwm);
-
-       regmap_write(pca->regmap, reg, LED_FULL);
-
-       /* Clear the LED_OFF counter. */
-       if (pwm->hwpwm >= PCA9685_MAXCHAN)
-               reg = PCA9685_ALL_LED_OFF_L;
-       else
-               reg = LED_N_OFF_L(pwm->hwpwm);
+       if (pwm->hwpwm >= PCA9685_MAXCHAN) {
+               /*
+                * The "all LEDs" channel does not support HW readout
+                * Return 0 and disabled for backwards compatibility
+                */
+               state->duty_cycle = 0;
+               state->enabled = false;
+               return;
+       }
 
-       regmap_write(pca->regmap, reg, 0x0);
+       state->enabled = true;
+       duty = pca9685_pwm_get_duty(pca, pwm->hwpwm);
+       state->duty_cycle = DIV_ROUND_DOWN_ULL(duty * state->period, PCA9685_COUNTER_RANGE);
 }
 
 static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
@@ -422,15 +381,14 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 {
        struct pca9685 *pca = to_pca(chip);
 
-       pca9685_pwm_disable(chip, pwm);
+       pca9685_pwm_set_duty(pca, pwm->hwpwm, 0);
        pm_runtime_put(chip->dev);
        pca9685_pwm_clear_inuse(pca, pwm->hwpwm);
 }
 
 static const struct pwm_ops pca9685_pwm_ops = {
-       .enable = pca9685_pwm_enable,
-       .disable = pca9685_pwm_disable,
-       .config = pca9685_pwm_config,
+       .apply = pca9685_pwm_apply,
+       .get_state = pca9685_pwm_get_state,
        .request = pca9685_pwm_request,
        .free = pca9685_pwm_free,
        .owner = THIS_MODULE,
@@ -461,7 +419,6 @@ static int pca9685_pwm_probe(struct i2c_client *client,
                        ret);
                return ret;
        }
-       pca->period_ns = PCA9685_DEFAULT_PERIOD;
 
        i2c_set_clientdata(client, pca);
 
@@ -484,16 +441,15 @@ static int pca9685_pwm_probe(struct i2c_client *client,
        reg &= ~(MODE1_ALLCALL | MODE1_SUB1 | MODE1_SUB2 | MODE1_SUB3);
        regmap_write(pca->regmap, PCA9685_MODE1, reg);
 
-       /* Clear all "full off" bits */
-       regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_L, 0);
-       regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_H, 0);
+       /* Reset OFF registers to POR default */
+       regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_L, LED_FULL);
+       regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_H, LED_FULL);
 
        pca->chip.ops = &pca9685_pwm_ops;
        /* Add an extra channel for ALL_LED */
        pca->chip.npwm = PCA9685_MAXCHAN + 1;
 
        pca->chip.dev = &client->dev;
-       pca->chip.base = -1;
 
        ret = pwmchip_add(&pca->chip);
        if (ret < 0)
@@ -505,14 +461,20 @@ static int pca9685_pwm_probe(struct i2c_client *client,
                return ret;
        }
 
-       /* The chip comes out of power-up in the active state */
-       pm_runtime_set_active(&client->dev);
-       /*
-        * Enable will put the chip into suspend, which is what we
-        * want as all outputs are disabled at this point
-        */
        pm_runtime_enable(&client->dev);
 
+       if (pm_runtime_enabled(&client->dev)) {
+               /*
+                * Although the chip comes out of power-up in the sleep state,
+                * we force it to sleep in case it was woken up before
+                */
+               pca9685_set_sleep_mode(pca, true);
+               pm_runtime_set_suspended(&client->dev);
+       } else {
+               /* Wake the chip up if runtime PM is disabled */
+               pca9685_set_sleep_mode(pca, false);
+       }
+
        return 0;
 }
 
@@ -524,7 +486,14 @@ static int pca9685_pwm_remove(struct i2c_client *client)
        ret = pwmchip_remove(&pca->chip);
        if (ret)
                return ret;
+
+       if (!pm_runtime_enabled(&client->dev)) {
+               /* Put chip in sleep state if runtime PM is disabled */
+               pca9685_set_sleep_mode(pca, true);
+       }
+
        pm_runtime_disable(&client->dev);
+
        return 0;
 }
 
index d06cf60..cfb6838 100644 (file)
@@ -184,7 +184,6 @@ static int pwm_probe(struct platform_device *pdev)
 
        pwm->chip.dev = &pdev->dev;
        pwm->chip.ops = &pxa_pwm_ops;
-       pwm->chip.base = -1;
        pwm->chip.npwm = (id->driver_data & HAS_SECONDARY_PWM) ? 2 : 1;
 
        if (IS_ENABLED(CONFIG_OF)) {
index 002ab79..9daca0c 100644 (file)
@@ -224,7 +224,6 @@ static int rcar_pwm_probe(struct platform_device *pdev)
 
        rcar_pwm->chip.dev = &pdev->dev;
        rcar_pwm->chip.ops = &rcar_pwm_ops;
-       rcar_pwm->chip.base = -1;
        rcar_pwm->chip.npwm = 1;
 
        pm_runtime_enable(&pdev->dev);
index d02b24b..e2959fa 100644 (file)
@@ -410,7 +410,6 @@ static int tpu_probe(struct platform_device *pdev)
        tpu->chip.ops = &tpu_pwm_ops;
        tpu->chip.of_xlate = of_pwm_xlate_with_flags;
        tpu->chip.of_pwm_n_cells = 3;
-       tpu->chip.base = -1;
        tpu->chip.npwm = TPU_CHANNEL_MAX;
 
        pm_runtime_enable(&pdev->dev);
index 6ad7d0a..301785f 100644 (file)
@@ -352,7 +352,6 @@ static int rockchip_pwm_probe(struct platform_device *pdev)
        pc->data = id->data;
        pc->chip.dev = &pdev->dev;
        pc->chip.ops = &rockchip_pwm_ops;
-       pc->chip.base = -1;
        pc->chip.npwm = 1;
 
        if (pc->data->supports_polarity) {
index 645d006..515489f 100644 (file)
@@ -519,7 +519,6 @@ static int pwm_samsung_probe(struct platform_device *pdev)
 
        chip->chip.dev = &pdev->dev;
        chip->chip.ops = &pwm_samsung_ops;
-       chip->chip.base = -1;
        chip->chip.npwm = SAMSUNG_PWM_NUM;
        chip->inverter_mask = BIT(SAMSUNG_PWM_NUM) - 1;
 
index 2a7cd2d..688737f 100644 (file)
@@ -244,7 +244,6 @@ static int pwm_sifive_probe(struct platform_device *pdev)
        chip->ops = &pwm_sifive_ops;
        chip->of_xlate = of_pwm_xlate_with_flags;
        chip->of_pwm_n_cells = 3;
-       chip->base = -1;
        chip->npwm = 4;
 
        ddata->regs = devm_platform_ioremap_resource(pdev, 0);
index 0b01ec2..7a69c1a 100644 (file)
@@ -229,7 +229,6 @@ static int sl28cpld_pwm_probe(struct platform_device *pdev)
        chip = &priv->pwm_chip;
        chip->dev = &pdev->dev;
        chip->ops = &sl28cpld_pwm_ops;
-       chip->base = -1;
        chip->npwm = 1;
 
        platform_set_drvdata(pdev, priv);
index f63b54a..1a1cedf 100644 (file)
@@ -193,7 +193,6 @@ static int spear_pwm_probe(struct platform_device *pdev)
 
        pc->chip.dev = &pdev->dev;
        pc->chip.ops = &spear_pwm_ops;
-       pc->chip.base = -1;
        pc->chip.npwm = NUM_PWM;
 
        ret = clk_prepare(pc->clk);
index 5123d94..98c479d 100644 (file)
@@ -164,6 +164,9 @@ static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
        struct pwm_state *cstate = &pwm->state;
        int ret;
 
+       if (state->polarity != PWM_POLARITY_NORMAL)
+               return -EINVAL;
+
        if (state->enabled) {
                if (!cstate->enabled) {
                        /*
@@ -268,7 +271,6 @@ static int sprd_pwm_probe(struct platform_device *pdev)
 
        spc->chip.dev = &pdev->dev;
        spc->chip.ops = &sprd_pwm_ops;
-       spc->chip.base = -1;
        spc->chip.npwm = spc->num_pwms;
 
        ret = pwmchip_add(&spc->chip);
index 99c70e0..f491d56 100644 (file)
@@ -619,7 +619,6 @@ static int sti_pwm_probe(struct platform_device *pdev)
 
        pc->chip.dev = dev;
        pc->chip.ops = &sti_pwm_ops;
-       pc->chip.base = -1;
        pc->chip.npwm = pc->cdata->pwm_num_devs;
 
        ret = pwmchip_add(&pc->chip);
@@ -650,15 +649,13 @@ static int sti_pwm_probe(struct platform_device *pdev)
 static int sti_pwm_remove(struct platform_device *pdev)
 {
        struct sti_pwm_chip *pc = platform_get_drvdata(pdev);
-       unsigned int i;
 
-       for (i = 0; i < pc->cdata->pwm_num_devs; i++)
-               pwm_disable(&pc->chip.pwms[i]);
+       pwmchip_remove(&pc->chip);
 
        clk_unprepare(pc->pwm_clk);
        clk_unprepare(pc->cpt_clk);
 
-       return pwmchip_remove(&pc->chip);
+       return 0;
 }
 
 static const struct of_device_id sti_pwm_of_match[] = {
index 134c146..af08f56 100644 (file)
@@ -205,7 +205,6 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev)
 
        priv->regmap = ddata->regmap;
        priv->clk = ddata->clk;
-       priv->chip.base = -1;
        priv->chip.dev = &pdev->dev;
        priv->chip.ops = &stm32_pwm_lp_ops;
        priv->chip.npwm = 1;
index d3be944..c46fb90 100644 (file)
@@ -633,7 +633,6 @@ static int stm32_pwm_probe(struct platform_device *pdev)
 
        stm32_pwm_detect_complementary(priv);
 
-       priv->chip.base = -1;
        priv->chip.dev = dev;
        priv->chip.ops = &stm32pwm_ops;
        priv->chip.npwm = stm32_pwm_detect_channels(priv);
index be5f6d7..9dc983a 100644 (file)
@@ -278,7 +278,6 @@ static int __init stmpe_pwm_probe(struct platform_device *pdev)
 
        pwm->stmpe = stmpe;
        pwm->chip.dev = &pdev->dev;
-       pwm->chip.base = -1;
 
        if (stmpe->partnum == STMPE2401 || stmpe->partnum == STMPE2403) {
                pwm->chip.ops = &stmpe_24xx_pwm_ops;
index ce5c4fc..e01becd 100644 (file)
@@ -459,7 +459,6 @@ static int sun4i_pwm_probe(struct platform_device *pdev)
 
        pwm->chip.dev = &pdev->dev;
        pwm->chip.ops = &sun4i_pwm_ops;
-       pwm->chip.base = -1;
        pwm->chip.npwm = pwm->data->npwm;
        pwm->chip.of_xlate = of_pwm_xlate_with_flags;
        pwm->chip.of_pwm_n_cells = 3;
index 55bc63d..c529a17 100644 (file)
@@ -285,7 +285,6 @@ static int tegra_pwm_probe(struct platform_device *pdev)
 
        pwm->chip.dev = &pdev->dev;
        pwm->chip.ops = &tegra_pwm_ops;
-       pwm->chip.base = -1;
        pwm->chip.npwm = pwm->soc->num_channels;
 
        ret = pwmchip_add(&pwm->chip);
index 2a89490..b9a17ab 100644 (file)
@@ -226,7 +226,6 @@ static int ecap_pwm_probe(struct platform_device *pdev)
        pc->chip.ops = &ecap_pwm_ops;
        pc->chip.of_xlate = of_pwm_xlate_with_flags;
        pc->chip.of_pwm_n_cells = 3;
-       pc->chip.base = -1;
        pc->chip.npwm = 1;
 
        pc->mmio_base = devm_platform_ioremap_resource(pdev, 0);
index a7fb224..90095a1 100644 (file)
@@ -449,7 +449,6 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev)
        pc->chip.ops = &ehrpwm_pwm_ops;
        pc->chip.of_xlate = of_pwm_xlate_with_flags;
        pc->chip.of_pwm_n_cells = 3;
-       pc->chip.base = -1;
        pc->chip.npwm = NUM_PWM_CHANNEL;
 
        pc->mmio_base = devm_platform_ioremap_resource(pdev, 0);
index 630b9a5..6c8df5f 100644 (file)
@@ -291,7 +291,6 @@ static int twl_pwmled_probe(struct platform_device *pdev)
        }
 
        twl->chip.dev = &pdev->dev;
-       twl->chip.base = -1;
 
        mutex_init(&twl->mutex);
 
index aee6797..e83a826 100644 (file)
@@ -310,7 +310,6 @@ static int twl_pwm_probe(struct platform_device *pdev)
                twl->chip.ops = &twl6030_pwm_ops;
 
        twl->chip.dev = &pdev->dev;
-       twl->chip.base = -1;
        twl->chip.npwm = 2;
 
        mutex_init(&twl->mutex);
diff --git a/drivers/pwm/pwm-visconti.c b/drivers/pwm/pwm-visconti.c
new file mode 100644 (file)
index 0000000..46d9037
--- /dev/null
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Toshiba Visconti pulse-width-modulation controller driver
+ *
+ * Copyright (c) 2020 - 2021 TOSHIBA CORPORATION
+ * Copyright (c) 2020 - 2021 Toshiba Electronic Devices & Storage Corporation
+ *
+ * Authors: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
+ *
+ * Limitations:
+ * - The fixed input clock is running at 1 MHz and is divided by either 1,
+ *   2, 4 or 8.
+ * - When the settings of the PWM are modified, the new values are shadowed
+ *   in hardware until the PIPGM_PCSR register is written and the currently
+ *   running period is completed. This way the hardware switches atomically
+ *   from the old setting to the new.
+ * - Disabling the hardware completes the currently running period and keeps
+ *   the output at low level at all times.
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+
+#define PIPGM_PCSR(ch) (0x400 + 4 * (ch))
+#define PIPGM_PDUT(ch) (0x420 + 4 * (ch))
+#define PIPGM_PWMC(ch) (0x440 + 4 * (ch))
+
+#define PIPGM_PWMC_PWMACT              BIT(5)
+#define PIPGM_PWMC_CLK_MASK            GENMASK(1, 0)
+#define PIPGM_PWMC_POLARITY_MASK       GENMASK(5, 5)
+
+/*
+ * Per-device driver state: the pwm_chip handed to the PWM core and the
+ * mapped register base that the PIPGM_* offsets are added to.
+ */
+struct visconti_pwm_chip {
+       struct pwm_chip chip;
+       void __iomem *base;
+};
+
+/* Get the visconti_pwm_chip that embeds the given pwm_chip. */
+static inline struct visconti_pwm_chip *visconti_pwm_from_chip(struct pwm_chip *chip)
+{
+       return container_of(chip, struct visconti_pwm_chip, chip);
+}
+
+/*
+ * Program one channel from @state.
+ *
+ * A disabled state is applied by writing 0 to PIPGM_PCSR. Otherwise period
+ * and duty cycle are capped to what the hardware can represent, converted
+ * to microseconds, scaled by the selected clock divider and written to the
+ * PWMC, PDUT and PCSR registers (in that order).
+ *
+ * Returns 0 on success, -ERANGE if the requested period is shorter than
+ * one microsecond, or -EINVAL if no valid divider could be determined.
+ */
+static int visconti_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+                             const struct pwm_state *state)
+{
+       struct visconti_pwm_chip *priv = visconti_pwm_from_chip(chip);
+       u32 period, duty_cycle, pwmc0;
+
+       if (!state->enabled) {
+               writel(0, priv->base + PIPGM_PCSR(pwm->hwpwm));
+               return 0;
+       }
+
+       /*
+        * The biggest period the hardware can provide is
+        *      (0xffff << 3) * 1000 ns
+        * This value fits easily in an u32, so simplify the maths by
+        * capping the values to 32 bit integers.
+        */
+       if (state->period > (0xffff << 3) * 1000)
+               period = (0xffff << 3) * 1000;
+       else
+               period = state->period;
+
+       if (state->duty_cycle > period)
+               duty_cycle = period;
+       else
+               duty_cycle = state->duty_cycle;
+
+       /*
+        * The input clock runs fixed at 1 MHz, so we have only
+        * microsecond resolution and so can divide by
+        * NSEC_PER_SEC / CLKFREQ = 1000 without losing precision.
+        */
+       period /= 1000;
+       duty_cycle /= 1000;
+
+       if (!period)
+               return -ERANGE;
+
+       /*
+        * PWMC controls a divider that divides the input clk by a
+        * power of two between 1 and 8. As a smaller divider yields
+        * higher precision, pick the smallest one for which the scaled
+        * period still fits into 0xffff. That is the number of
+        * significant bits of period above bit 15, i.e.
+        * ilog2(period >> 16) + 1 (without the "+ 1" a period of
+        * 65536 us would be written unscaled and overflow 0xffff).
+        * As period is at most 0xffff << 3 here, the result is at most 3.
+        */
+       if (period > 0xffff) {
+               pwmc0 = ilog2(period >> 16) + 1;
+               if (WARN_ON(pwmc0 > 3))
+                       return -EINVAL;
+       } else {
+               pwmc0 = 0;
+       }
+
+       period >>= pwmc0;
+       duty_cycle >>= pwmc0;
+
+       if (state->polarity == PWM_POLARITY_INVERSED)
+               pwmc0 |= PIPGM_PWMC_PWMACT;
+       writel(pwmc0, priv->base + PIPGM_PWMC(pwm->hwpwm));
+       writel(duty_cycle, priv->base + PIPGM_PDUT(pwm->hwpwm));
+       /* PCSR is written last (see "Limitations" in the file header) */
+       writel(period, priv->base + PIPGM_PCSR(pwm->hwpwm));
+
+       return 0;
+}
+
+/*
+ * Read the channel configuration back from the PCSR, PDUT and PWMC
+ * registers and report it in nanoseconds. The channel is always reported
+ * as enabled.
+ */
+static void visconti_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
+                                  struct pwm_state *state)
+{
+       struct visconti_pwm_chip *priv = visconti_pwm_from_chip(chip);
+       u32 period, duty, pwmc0, pwmc0_clk;
+
+       period = readl(priv->base + PIPGM_PCSR(pwm->hwpwm));
+       duty = readl(priv->base + PIPGM_PDUT(pwm->hwpwm));
+       pwmc0 = readl(priv->base + PIPGM_PWMC(pwm->hwpwm));
+       pwmc0_clk = pwmc0 & PIPGM_PWMC_CLK_MASK;
+
+       /* Undo the divider scaling applied on write, then convert us to ns */
+       state->period = (period << pwmc0_clk) * NSEC_PER_USEC;
+       state->duty_cycle = (duty << pwmc0_clk) * NSEC_PER_USEC;
+       if (pwmc0 & PIPGM_PWMC_POLARITY_MASK)
+               state->polarity = PWM_POLARITY_INVERSED;
+       else
+               state->polarity = PWM_POLARITY_NORMAL;
+
+       state->enabled = true;
+}
+
+/* PWM core callbacks; only .apply and .get_state are implemented. */
+static const struct pwm_ops visconti_pwm_ops = {
+       .apply = visconti_pwm_apply,
+       .get_state = visconti_pwm_get_state,
+       .owner = THIS_MODULE,
+};
+
+/*
+ * Allocate the (device-managed) driver state, map memory resource 0 and
+ * register a PWM chip with four channels.
+ *
+ * Returns 0 on success or a negative errno from the failing step.
+ */
+static int visconti_pwm_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct visconti_pwm_chip *priv;
+       int ret;
+
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(priv->base))
+               return PTR_ERR(priv->base);
+
+       /* Stored so that visconti_pwm_remove() can find the chip again */
+       platform_set_drvdata(pdev, priv);
+
+       priv->chip.dev = dev;
+       priv->chip.ops = &visconti_pwm_ops;
+       priv->chip.npwm = 4;
+
+       ret = pwmchip_add(&priv->chip);
+       if (ret < 0)
+               return dev_err_probe(&pdev->dev, ret, "Cannot register visconti PWM\n");
+
+       return 0;
+}
+
+/*
+ * Unregister the PWM chip. The memory and register mapping obtained in
+ * probe are device-managed, so nothing else is released here.
+ */
+static int visconti_pwm_remove(struct platform_device *pdev)
+{
+       struct visconti_pwm_chip *priv = platform_get_drvdata(pdev);
+
+       pwmchip_remove(&priv->chip);
+
+       return 0;
+}
+
+/* Device tree compatibles handled by this driver (empty entry terminates). */
+static const struct of_device_id visconti_pwm_of_match[] = {
+       { .compatible = "toshiba,visconti-pwm", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, visconti_pwm_of_match);
+
+/* Platform driver glue: binds via visconti_pwm_of_match to probe/remove. */
+static struct platform_driver visconti_pwm_driver = {
+       .driver = {
+               .name = "pwm-visconti",
+               .of_match_table = visconti_pwm_of_match,
+       },
+       .probe = visconti_pwm_probe,
+       .remove = visconti_pwm_remove,
+};
+module_platform_driver(visconti_pwm_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>");
+MODULE_ALIAS("platform:pwm-visconti");
index 6e36851..52fe5d1 100644 (file)
@@ -209,7 +209,6 @@ static int vt8500_pwm_probe(struct platform_device *pdev)
        chip->chip.ops = &vt8500_pwm_ops;
        chip->chip.of_xlate = of_pwm_xlate_with_flags;
        chip->chip.of_pwm_n_cells = 3;
-       chip->chip.base = -1;
        chip->chip.npwm = VT8500_NR_PWMS;
 
        chip->clk = devm_clk_get(&pdev->dev, NULL);
index 15d1574..e68fced 100644 (file)
@@ -24,11 +24,12 @@ config REMOTEPROC_CDEV
          It's safe to say N if you don't want to use this interface.
 
 config IMX_REMOTEPROC
-       tristate "IMX6/7 remoteproc support"
+       tristate "i.MX remoteproc support"
        depends on ARCH_MXC
+       select MAILBOX
        help
-         Say y here to support iMX's remote processors (Cortex M4
-         on iMX7D) via the remote processor framework.
+         Say y here to support iMX's remote processors via the remote
+         processor framework.
 
          It's safe to say N here.
 
index 8957ed2..d633887 100644 (file)
@@ -7,13 +7,18 @@
 #include <linux/err.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
+#include <linux/mailbox_client.h>
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/remoteproc.h>
+#include <linux/workqueue.h>
+
+#include "remoteproc_internal.h"
 
 #define IMX7D_SRC_SCR                  0x0C
 #define IMX7D_ENABLE_M4                        BIT(3)
@@ -43,7 +48,7 @@
                                         | IMX6SX_SW_M4C_NON_SCLR_RST \
                                         | IMX6SX_SW_M4C_RST)
 
-#define IMX7D_RPROC_MEM_MAX            8
+#define IMX_RPROC_MEM_MAX              32
 
 /**
  * struct imx_rproc_mem - slim internal memory structure
@@ -83,8 +88,42 @@ struct imx_rproc {
        struct regmap                   *regmap;
        struct rproc                    *rproc;
        const struct imx_rproc_dcfg     *dcfg;
-       struct imx_rproc_mem            mem[IMX7D_RPROC_MEM_MAX];
+       struct imx_rproc_mem            mem[IMX_RPROC_MEM_MAX];
        struct clk                      *clk;
+       struct mbox_client              cl;
+       struct mbox_chan                *tx_ch;
+       struct mbox_chan                *rx_ch;
+       struct work_struct              rproc_work;
+       struct workqueue_struct         *workqueue;
+       void __iomem                    *rsc_table;
+};
+
+/*
+ * Address translation table referenced by imx_rproc_cfg_imx8mq: each entry
+ * maps a device (remote core) address range to a system address range.
+ * Entries flagged ATT_OWN are the ones imx_rproc_addr_init() ioremaps.
+ */
+static const struct imx_rproc_att imx_rproc_att_imx8mq[] = {
+       /* dev addr , sys addr  , size      , flags */
+       /* TCML - alias */
+       { 0x00000000, 0x007e0000, 0x00020000, 0 },
+       /* OCRAM_S */
+       { 0x00180000, 0x00180000, 0x00008000, 0 },
+       /* OCRAM */
+       { 0x00900000, 0x00900000, 0x00020000, 0 },
+       /* OCRAM */
+       { 0x00920000, 0x00920000, 0x00020000, 0 },
+       /* QSPI Code - alias */
+       { 0x08000000, 0x08000000, 0x08000000, 0 },
+       /* DDR (Code) - alias */
+       { 0x10000000, 0x80000000, 0x0FFE0000, 0 },
+       /* TCML */
+       { 0x1FFE0000, 0x007E0000, 0x00020000, ATT_OWN },
+       /* TCMU */
+       { 0x20000000, 0x00800000, 0x00020000, ATT_OWN },
+       /* OCRAM_S */
+       { 0x20180000, 0x00180000, 0x00008000, ATT_OWN },
+       /* OCRAM */
+       { 0x20200000, 0x00900000, 0x00020000, ATT_OWN },
+       /* OCRAM */
+       { 0x20220000, 0x00920000, 0x00020000, ATT_OWN },
+       /* DDR (Data) */
+       { 0x40000000, 0x40000000, 0x80000000, 0 },
 };
 
 static const struct imx_rproc_att imx_rproc_att_imx7d[] = {
@@ -137,6 +176,15 @@ static const struct imx_rproc_att imx_rproc_att_imx6sx[] = {
        { 0x80000000, 0x80000000, 0x60000000, 0 },
 };
 
+/*
+ * Device configuration used for the "fsl,imx8mq-cm4" and "fsl,imx8mm-cm4"
+ * compatibles. It reuses the IMX7D_* SRC register/mask definitions and
+ * only differs from the i.MX7D configuration in the address table.
+ */
+static const struct imx_rproc_dcfg imx_rproc_cfg_imx8mq = {
+       .src_reg        = IMX7D_SRC_SCR,
+       .src_mask       = IMX7D_M4_RST_MASK,
+       .src_start      = IMX7D_M4_START,
+       .src_stop       = IMX7D_M4_STOP,
+       .att            = imx_rproc_att_imx8mq,
+       .att_size       = ARRAY_SIZE(imx_rproc_att_imx8mq),
+};
+
 static const struct imx_rproc_dcfg imx_rproc_cfg_imx7d = {
        .src_reg        = IMX7D_SRC_SCR,
        .src_mask       = IMX7D_M4_RST_MASK,
@@ -208,7 +256,7 @@ static int imx_rproc_da_to_sys(struct imx_rproc *priv, u64 da,
        return -ENOENT;
 }
 
-static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct imx_rproc *priv = rproc->priv;
        void *va = NULL;
@@ -225,7 +273,7 @@ static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
        if (imx_rproc_da_to_sys(priv, da, len, &sys))
                return NULL;
 
-       for (i = 0; i < IMX7D_RPROC_MEM_MAX; i++) {
+       for (i = 0; i < IMX_RPROC_MEM_MAX; i++) {
                if (sys >= priv->mem[i].sys_addr && sys + len <
                    priv->mem[i].sys_addr +  priv->mem[i].size) {
                        unsigned int offset = sys - priv->mem[i].sys_addr;
@@ -241,10 +289,143 @@ static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
        return va;
 }
 
+/*
+ * Carveout allocation callback: map the region described by @mem
+ * write-combined and store the mapping in mem->va.
+ *
+ * Returns 0 on success or -ENOMEM if the region cannot be mapped.
+ */
+static int imx_rproc_mem_alloc(struct rproc *rproc,
+                              struct rproc_mem_entry *mem)
+{
+       struct device *dev = rproc->dev.parent;
+       void *va;
+
+       /*
+        * mem->dma is a dma_addr_t, so print it by reference with %pad;
+        * plain %p would print a hashed pointer to the field instead of
+        * the address being mapped.
+        */
+       dev_dbg(dev, "map memory: %pad+%zx\n", &mem->dma, mem->len);
+       va = ioremap_wc(mem->dma, mem->len);
+       if (IS_ERR_OR_NULL(va)) {
+               dev_err(dev, "Unable to map memory region: %pad+%zx\n",
+                       &mem->dma, mem->len);
+               return -ENOMEM;
+       }
+
+       /* Update memory entry va */
+       mem->va = va;
+
+       return 0;
+}
+
+/*
+ * Carveout release callback: undo the mapping stored in mem->va.
+ * Always returns 0.
+ */
+static int imx_rproc_mem_release(struct rproc *rproc,
+                                struct rproc_mem_entry *mem)
+{
+       /* mem->dma is a dma_addr_t, hence %pad (passed by reference) */
+       dev_dbg(rproc->dev.parent, "unmap memory: %pad\n", &mem->dma);
+       iounmap(mem->va);
+
+       return 0;
+}
+
+/*
+ * Register every "memory-region" phandle of the device node, except nodes
+ * named "vdev0buffer", as a carveout and as a coredump segment.
+ *
+ * Returns 0 on success, -EINVAL if a region has no reserved-memory entry,
+ * or -ENOMEM if the carveout entry cannot be allocated.
+ */
+static int imx_rproc_prepare(struct rproc *rproc)
+{
+       struct imx_rproc *priv = rproc->priv;
+       struct device_node *np = priv->dev->of_node;
+       struct of_phandle_iterator it;
+       struct rproc_mem_entry *mem;
+       struct reserved_mem *rmem;
+       u32 da;
+
+       /* Register associated reserved memory regions */
+       of_phandle_iterator_init(&it, np, "memory-region", NULL, 0);
+       while (of_phandle_iterator_next(&it) == 0) {
+               /*
+                * Skip the region that is used as the vdev buffer.
+                * No extra handling is needed, rproc_add_virtio_dev will
+                * take care of it.
+                */
+               if (!strcmp(it.node->name, "vdev0buffer"))
+                       continue;
+
+               rmem = of_reserved_mem_lookup(it.node);
+               if (!rmem) {
+                       /*
+                        * The iterator holds a reference on it.node that
+                        * is only dropped by the next iteration step, so
+                        * drop it by hand when leaving the loop early.
+                        */
+                       of_node_put(it.node);
+                       dev_err(priv->dev, "unable to acquire memory-region\n");
+                       return -EINVAL;
+               }
+
+               /* No need to translate pa to da, i.MX use same map */
+               da = rmem->base;
+
+               /* Register memory region */
+               mem = rproc_mem_entry_init(priv->dev, NULL, (dma_addr_t)rmem->base, rmem->size, da,
+                                          imx_rproc_mem_alloc, imx_rproc_mem_release,
+                                          it.node->name);
+               if (!mem) {
+                       /* Same early exit as above: release the node */
+                       of_node_put(it.node);
+                       return -ENOMEM;
+               }
+
+               rproc_coredump_add_segment(rproc, da, rmem->size);
+               rproc_add_carveout(rproc, mem);
+       }
+
+       return 0;
+}
+
+/*
+ * Try to load the resource table from the ELF firmware image. A failure
+ * is only logged at info level and not treated as an error: the function
+ * always returns 0.
+ */
+static int imx_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw)
+{
+       int ret;
+
+       ret = rproc_elf_load_rsc_table(rproc, fw);
+       if (ret)
+               dev_info(&rproc->dev, "No resource table in elf\n");
+
+       return 0;
+}
+
+/*
+ * Notify the remote processor about activity on virtqueue @vqid by sending
+ * a message on the "tx" mailbox channel. Errors are logged only; there is
+ * no return value.
+ */
+static void imx_rproc_kick(struct rproc *rproc, int vqid)
+{
+       struct imx_rproc *priv = rproc->priv;
+       int err;
+       __u32 mmsg;
+
+       /* tx_ch stays NULL when the device node has no "mbox-names" */
+       if (!priv->tx_ch) {
+               dev_err(priv->dev, "No initialized mbox tx channel\n");
+               return;
+       }
+
+       /*
+        * Send the index of the triggered virtqueue as the mu payload.
+        * Let remote processor know which virtqueue is used.
+        */
+       mmsg = vqid << 16;
+
+       err = mbox_send_message(priv->tx_ch, (void *)&mmsg);
+       if (err < 0)
+               dev_err(priv->dev, "%s: failed (%d, err:%d)\n",
+                       __func__, vqid, err);
+}
+
+/* Attach callback: nothing needs to be done here, always returns 0. */
+static int imx_rproc_attach(struct rproc *rproc)
+{
+       return 0;
+}
+
+/*
+ * Return the resource table mapping recorded in priv->rsc_table, or NULL
+ * if no such mapping exists. On success *table_sz is set to SZ_1K; it is
+ * left untouched when NULL is returned.
+ */
+static struct resource_table *imx_rproc_get_loaded_rsc_table(struct rproc *rproc, size_t *table_sz)
+{
+       struct imx_rproc *priv = rproc->priv;
+
+       /* The resource table has already been mapped in imx_rproc_addr_init */
+       if (!priv->rsc_table)
+               return NULL;
+
+       *table_sz = SZ_1K;
+       return (struct resource_table *)priv->rsc_table;
+}
+
 static const struct rproc_ops imx_rproc_ops = {
+       .prepare        = imx_rproc_prepare,
+       .attach         = imx_rproc_attach,
        .start          = imx_rproc_start,
        .stop           = imx_rproc_stop,
+       .kick           = imx_rproc_kick,
        .da_to_va       = imx_rproc_da_to_va,
+       .load           = rproc_elf_load_segments,
+       .parse_fw       = imx_rproc_parse_fw,
+       .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table,
+       .get_loaded_rsc_table = imx_rproc_get_loaded_rsc_table,
+       .sanity_check   = rproc_elf_sanity_check,
+       .get_boot_addr  = rproc_elf_get_boot_addr,
 };
 
 static int imx_rproc_addr_init(struct imx_rproc *priv,
@@ -262,13 +443,13 @@ static int imx_rproc_addr_init(struct imx_rproc *priv,
                if (!(att->flags & ATT_OWN))
                        continue;
 
-               if (b >= IMX7D_RPROC_MEM_MAX)
+               if (b >= IMX_RPROC_MEM_MAX)
                        break;
 
                priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev,
                                                     att->sa, att->size);
                if (!priv->mem[b].cpu_addr) {
-                       dev_err(dev, "devm_ioremap_resource failed\n");
+                       dev_err(dev, "failed to remap %#x bytes from %#x\n", att->size, att->sa);
                        return -ENOMEM;
                }
                priv->mem[b].sys_addr = att->sa;
@@ -287,29 +468,115 @@ static int imx_rproc_addr_init(struct imx_rproc *priv,
                struct resource res;
 
                node = of_parse_phandle(np, "memory-region", a);
+               /* Not map vdev region */
+               if (!strcmp(node->name, "vdev"))
+                       continue;
                err = of_address_to_resource(node, 0, &res);
                if (err) {
                        dev_err(dev, "unable to resolve memory region\n");
                        return err;
                }
 
-               if (b >= IMX7D_RPROC_MEM_MAX)
+               of_node_put(node);
+
+               if (b >= IMX_RPROC_MEM_MAX)
                        break;
 
-               priv->mem[b].cpu_addr = devm_ioremap_resource(&pdev->dev, &res);
-               if (IS_ERR(priv->mem[b].cpu_addr)) {
-                       dev_err(dev, "devm_ioremap_resource failed\n");
-                       err = PTR_ERR(priv->mem[b].cpu_addr);
-                       return err;
+               /* Not use resource version, because we might share region */
+               priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, res.start, resource_size(&res));
+               if (!priv->mem[b].cpu_addr) {
+                       dev_err(dev, "failed to remap %pr\n", &res);
+                       return -ENOMEM;
                }
                priv->mem[b].sys_addr = res.start;
                priv->mem[b].size = resource_size(&res);
+               if (!strcmp(node->name, "rsc_table"))
+                       priv->rsc_table = priv->mem[b].cpu_addr;
                b++;
        }
 
        return 0;
 }
 
+/*
+ * Work handler queued from the mailbox rx callback: signals an interrupt
+ * for virtqueue ids 0 and 1 of the remote processor.
+ */
+static void imx_rproc_vq_work(struct work_struct *work)
+{
+       struct imx_rproc *priv = container_of(work, struct imx_rproc,
+                                             rproc_work);
+
+       rproc_vq_interrupt(priv->rproc, 0);
+       rproc_vq_interrupt(priv->rproc, 1);
+}
+
+/*
+ * Mailbox rx callback: the message content is not inspected, the actual
+ * virtqueue handling is deferred to rproc_work on the driver's workqueue.
+ */
+static void imx_rproc_rx_callback(struct mbox_client *cl, void *msg)
+{
+       struct rproc *rproc = dev_get_drvdata(cl->dev);
+       struct imx_rproc *priv = rproc->priv;
+
+       queue_work(priv->workqueue, &priv->rproc_work);
+}
+
+/*
+ * Set up the mailbox client and request the "tx" and "rx" channels.
+ *
+ * Returns 0 without requesting anything if the device node has no
+ * "mbox-names" property, 0 on success, or the error reported for the
+ * failing channel request. If the rx request fails, the tx channel
+ * requested before is freed again.
+ */
+static int imx_rproc_xtr_mbox_init(struct rproc *rproc)
+{
+       struct imx_rproc *priv = rproc->priv;
+       struct device *dev = priv->dev;
+       struct mbox_client *cl;
+       int ret;
+
+       if (!of_get_property(dev->of_node, "mbox-names", NULL))
+               return 0;
+
+       cl = &priv->cl;
+       cl->dev = dev;
+       cl->tx_block = true;
+       cl->tx_tout = 100;
+       cl->knows_txdone = false;
+       cl->rx_callback = imx_rproc_rx_callback;
+
+       priv->tx_ch = mbox_request_channel_byname(cl, "tx");
+       if (IS_ERR(priv->tx_ch)) {
+               ret = PTR_ERR(priv->tx_ch);
+               return dev_err_probe(cl->dev, ret,
+                                    "failed to request tx mailbox channel: %d\n", ret);
+       }
+
+       priv->rx_ch = mbox_request_channel_byname(cl, "rx");
+       if (IS_ERR(priv->rx_ch)) {
+               mbox_free_channel(priv->tx_ch);
+               ret = PTR_ERR(priv->rx_ch);
+               return dev_err_probe(cl->dev, ret,
+                                    "failed to request rx mailbox channel: %d\n", ret);
+       }
+
+       return 0;
+}
+
+/* Free the tx and rx mailbox channels stored in the driver state. */
+static void imx_rproc_free_mbox(struct rproc *rproc)
+{
+       struct imx_rproc *priv = rproc->priv;
+
+       mbox_free_channel(priv->tx_ch);
+       mbox_free_channel(priv->rx_ch);
+}
+
+/*
+ * Read the SRC register given by dcfg->src_reg and mark the remote
+ * processor as RPROC_DETACHED when none of the dcfg->src_stop bits is set.
+ * NOTE(review): presumably this means the core was already started before
+ * this driver probed - confirm against the SoC reference manual.
+ *
+ * Returns 0 on success or the regmap_read() error.
+ */
+static int imx_rproc_detect_mode(struct imx_rproc *priv)
+{
+       const struct imx_rproc_dcfg *dcfg = priv->dcfg;
+       struct device *dev = priv->dev;
+       int ret;
+       u32 val;
+
+       ret = regmap_read(priv->regmap, dcfg->src_reg, &val);
+       if (ret) {
+               dev_err(dev, "Failed to read src\n");
+               return ret;
+       }
+
+       if (!(val & dcfg->src_stop))
+               priv->rproc->state = RPROC_DETACHED;
+
+       return 0;
+}
+
 static int imx_rproc_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
@@ -347,18 +614,32 @@ static int imx_rproc_probe(struct platform_device *pdev)
        priv->dev = dev;
 
        dev_set_drvdata(dev, rproc);
+       priv->workqueue = create_workqueue(dev_name(dev));
+       if (!priv->workqueue) {
+               dev_err(dev, "cannot create workqueue\n");
+               ret = -ENOMEM;
+               goto err_put_rproc;
+       }
+
+       ret = imx_rproc_xtr_mbox_init(rproc);
+       if (ret)
+               goto err_put_wkq;
 
        ret = imx_rproc_addr_init(priv, pdev);
        if (ret) {
                dev_err(dev, "failed on imx_rproc_addr_init\n");
-               goto err_put_rproc;
+               goto err_put_mbox;
        }
 
+       ret = imx_rproc_detect_mode(priv);
+       if (ret)
+               goto err_put_mbox;
+
        priv->clk = devm_clk_get(dev, NULL);
        if (IS_ERR(priv->clk)) {
                dev_err(dev, "Failed to get clock\n");
                ret = PTR_ERR(priv->clk);
-               goto err_put_rproc;
+               goto err_put_mbox;
        }
 
        /*
@@ -368,9 +649,11 @@ static int imx_rproc_probe(struct platform_device *pdev)
        ret = clk_prepare_enable(priv->clk);
        if (ret) {
                dev_err(&rproc->dev, "Failed to enable clock\n");
-               goto err_put_rproc;
+               goto err_put_mbox;
        }
 
+       INIT_WORK(&priv->rproc_work, imx_rproc_vq_work);
+
        ret = rproc_add(rproc);
        if (ret) {
                dev_err(dev, "rproc_add failed\n");
@@ -381,6 +664,10 @@ static int imx_rproc_probe(struct platform_device *pdev)
 
 err_put_clk:
        clk_disable_unprepare(priv->clk);
+err_put_mbox:
+       imx_rproc_free_mbox(rproc);
+err_put_wkq:
+       destroy_workqueue(priv->workqueue);
 err_put_rproc:
        rproc_free(rproc);
 
@@ -394,6 +681,7 @@ static int imx_rproc_remove(struct platform_device *pdev)
 
        clk_disable_unprepare(priv->clk);
        rproc_del(rproc);
+       imx_rproc_free_mbox(rproc);
        rproc_free(rproc);
 
        return 0;
@@ -402,6 +690,8 @@ static int imx_rproc_remove(struct platform_device *pdev)
 static const struct of_device_id imx_rproc_of_match[] = {
        { .compatible = "fsl,imx7d-cm4", .data = &imx_rproc_cfg_imx7d },
        { .compatible = "fsl,imx6sx-cm4", .data = &imx_rproc_cfg_imx6sx },
+       { .compatible = "fsl,imx8mq-cm4", .data = &imx_rproc_cfg_imx8mq },
+       { .compatible = "fsl,imx8mm-cm4", .data = &imx_rproc_cfg_imx8mq },
        {},
 };
 MODULE_DEVICE_TABLE(of, imx_rproc_of_match);
@@ -418,5 +708,5 @@ static struct platform_driver imx_rproc_driver = {
 module_platform_driver(imx_rproc_driver);
 
 MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("IMX6SX/7D remote processor control driver");
+MODULE_DESCRIPTION("i.MX remote processor control driver");
 MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>");
index e2618c3..a356738 100644 (file)
@@ -121,7 +121,7 @@ static void ingenic_rproc_kick(struct rproc *rproc, int vqid)
        writel(vqid, vpu->aux_base + REG_CORE_MSG);
 }
 
-static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct vpu *vpu = rproc->priv;
        void __iomem *va = NULL;
index cd26616..54781f5 100644 (file)
@@ -246,7 +246,7 @@ static void keystone_rproc_kick(struct rproc *rproc, int vqid)
  * can be used either by the remoteproc core for loading (when using kernel
  * remoteproc loader), or by any rpmsg bus drivers.
  */
-static void *keystone_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *keystone_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct keystone_rproc *ksproc = rproc->priv;
        void __iomem *va = NULL;
index ce72759..9679cc2 100644 (file)
@@ -272,7 +272,7 @@ static int scp_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
                }
 
                /* grab the kernel address for this device address */
-               ptr = (void __iomem *)rproc_da_to_va(rproc, da, memsz);
+               ptr = (void __iomem *)rproc_da_to_va(rproc, da, memsz, NULL);
                if (!ptr) {
                        dev_err(dev, "bad phdr da 0x%x mem 0x%x\n", da, memsz);
                        ret = -EINVAL;
@@ -509,7 +509,7 @@ static void *mt8192_scp_da_to_va(struct mtk_scp *scp, u64 da, size_t len)
        return NULL;
 }
 
-static void *scp_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *scp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct mtk_scp *scp = (struct mtk_scp *)rproc->priv;
 
@@ -627,7 +627,7 @@ void *scp_mapping_dm_addr(struct mtk_scp *scp, u32 mem_addr)
 {
        void *ptr;
 
-       ptr = scp_da_to_va(scp->rproc, mem_addr, 0);
+       ptr = scp_da_to_va(scp->rproc, mem_addr, 0, NULL);
        if (!ptr)
                return ERR_PTR(-EINVAL);
 
index d94b739..43531ca 100644 (file)
@@ -728,7 +728,7 @@ out:
  * Return: translated virtual address in kernel memory space on success,
  *         or NULL on failure.
  */
-static void *omap_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *omap_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct omap_rproc *oproc = rproc->priv;
        int i;
index dcb380e..e5778e4 100644 (file)
@@ -244,8 +244,8 @@ static int pru_rproc_debug_ss_get(void *data, u64 *val)
 
        return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(pru_rproc_debug_ss_fops, pru_rproc_debug_ss_get,
-                       pru_rproc_debug_ss_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(pru_rproc_debug_ss_fops, pru_rproc_debug_ss_get,
+                        pru_rproc_debug_ss_set, "%llu\n");
 
 /*
  * Create PRU-specific debugfs entries
@@ -266,12 +266,17 @@ static void pru_rproc_create_debug_entries(struct rproc *rproc)
 
 static void pru_dispose_irq_mapping(struct pru_rproc *pru)
 {
-       while (pru->evt_count--) {
+       if (!pru->mapped_irq)
+               return;
+
+       while (pru->evt_count) {
+               pru->evt_count--;
                if (pru->mapped_irq[pru->evt_count] > 0)
                        irq_dispose_mapping(pru->mapped_irq[pru->evt_count]);
        }
 
        kfree(pru->mapped_irq);
+       pru->mapped_irq = NULL;
 }
 
 /*
@@ -284,7 +289,7 @@ static int pru_handle_intrmap(struct rproc *rproc)
        struct pru_rproc *pru = rproc->priv;
        struct pru_irq_rsc *rsc = pru->pru_interrupt_map;
        struct irq_fwspec fwspec;
-       struct device_node *irq_parent;
+       struct device_node *parent, *irq_parent;
        int i, ret = 0;
 
        /* not having pru_interrupt_map is not an error */
@@ -307,16 +312,31 @@ static int pru_handle_intrmap(struct rproc *rproc)
        pru->evt_count = rsc->num_evts;
        pru->mapped_irq = kcalloc(pru->evt_count, sizeof(unsigned int),
                                  GFP_KERNEL);
-       if (!pru->mapped_irq)
+       if (!pru->mapped_irq) {
+               pru->evt_count = 0;
                return -ENOMEM;
+       }
 
        /*
         * parse and fill in system event to interrupt channel and
-        * channel-to-host mapping
+        * channel-to-host mapping. The interrupt controller to be used
+        * for these mappings for a given PRU remoteproc is always its
+        * corresponding sibling PRUSS INTC node.
         */
-       irq_parent = of_irq_find_parent(pru->dev->of_node);
+       parent = of_get_parent(dev_of_node(pru->dev));
+       if (!parent) {
+               kfree(pru->mapped_irq);
+               pru->mapped_irq = NULL;
+               pru->evt_count = 0;
+               return -ENODEV;
+       }
+
+       irq_parent = of_get_child_by_name(parent, "interrupt-controller");
+       of_node_put(parent);
        if (!irq_parent) {
                kfree(pru->mapped_irq);
+               pru->mapped_irq = NULL;
+               pru->evt_count = 0;
                return -ENODEV;
        }
 
@@ -332,16 +352,20 @@ static int pru_handle_intrmap(struct rproc *rproc)
 
                pru->mapped_irq[i] = irq_create_fwspec_mapping(&fwspec);
                if (!pru->mapped_irq[i]) {
-                       dev_err(dev, "failed to get virq\n");
-                       ret = pru->mapped_irq[i];
+                       dev_err(dev, "failed to get virq for fw mapping %d: event %d chnl %d host %d\n",
+                               i, fwspec.param[0], fwspec.param[1],
+                               fwspec.param[2]);
+                       ret = -EINVAL;
                        goto map_fail;
                }
        }
+       of_node_put(irq_parent);
 
        return ret;
 
 map_fail:
        pru_dispose_irq_mapping(pru);
+       of_node_put(irq_parent);
 
        return ret;
 }
@@ -387,8 +411,7 @@ static int pru_rproc_stop(struct rproc *rproc)
        pru_control_write_reg(pru, PRU_CTRL_CTRL, val);
 
        /* dispose irq mapping - new firmware can provide new mapping */
-       if (pru->mapped_irq)
-               pru_dispose_irq_mapping(pru);
+       pru_dispose_irq_mapping(pru);
 
        return 0;
 }
@@ -483,7 +506,7 @@ static void *pru_i_da_to_va(struct pru_rproc *pru, u32 da, size_t len)
  * core for any PRU client drivers. The PRU Instruction RAM access is restricted
  * only to the PRU loader code.
  */
-static void *pru_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *pru_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct pru_rproc *pru = rproc->priv;
 
index e024502..8b0d8bb 100644 (file)
@@ -281,7 +281,7 @@ static int adsp_stop(struct rproc *rproc)
        return ret;
 }
 
-static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv;
        int offset;
index 66106ba..423b31d 100644 (file)
@@ -1210,6 +1210,14 @@ static int q6v5_mpss_load(struct q6v5 *qproc)
                        goto release_firmware;
                }
 
+               if (phdr->p_filesz > phdr->p_memsz) {
+                       dev_err(qproc->dev,
+                               "refusing to load segment %d with p_filesz > p_memsz\n",
+                               i);
+                       ret = -EINVAL;
+                       goto release_firmware;
+               }
+
                ptr = memremap(qproc->mpss_phys + offset, phdr->p_memsz, MEMREMAP_WC);
                if (!ptr) {
                        dev_err(qproc->dev,
@@ -1241,6 +1249,16 @@ static int q6v5_mpss_load(struct q6v5 *qproc)
                                goto release_firmware;
                        }
 
+                       if (seg_fw->size != phdr->p_filesz) {
+                               dev_err(qproc->dev,
+                                       "failed to load segment %d from truncated file %s\n",
+                                       i, fw_name);
+                               ret = -EINVAL;
+                               release_firmware(seg_fw);
+                               memunmap(ptr);
+                               goto release_firmware;
+                       }
+
                        release_firmware(seg_fw);
                }
 
@@ -1661,8 +1679,10 @@ static int q6v5_probe(struct platform_device *pdev)
        mba_image = desc->hexagon_mba_image;
        ret = of_property_read_string_index(pdev->dev.of_node, "firmware-name",
                                            0, &mba_image);
-       if (ret < 0 && ret != -EINVAL)
+       if (ret < 0 && ret != -EINVAL) {
+               dev_err(&pdev->dev, "unable to read mba firmware-name\n");
                return ret;
+       }
 
        rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_ops,
                            mba_image, sizeof(*qproc));
@@ -1680,8 +1700,10 @@ static int q6v5_probe(struct platform_device *pdev)
        qproc->hexagon_mdt_image = "modem.mdt";
        ret = of_property_read_string_index(pdev->dev.of_node, "firmware-name",
                                            1, &qproc->hexagon_mdt_image);
-       if (ret < 0 && ret != -EINVAL)
+       if (ret < 0 && ret != -EINVAL) {
+               dev_err(&pdev->dev, "unable to read mpss firmware-name\n");
                goto free_rproc;
+       }
 
        platform_set_drvdata(pdev, qproc);
 
index e635454..b921fc2 100644 (file)
@@ -242,7 +242,7 @@ static int adsp_stop(struct rproc *rproc)
        return ret;
 }
 
-static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv;
        int offset;
@@ -785,6 +785,22 @@ static const struct adsp_data wcss_resource_init = {
        .ssctl_id = 0x12,
 };
 
+static const struct adsp_data sdx55_mpss_resource = {
+       .crash_reason_smem = 421,
+       .firmware_name = "modem.mdt",
+       .pas_id = 4,
+       .has_aggre2_clk = false,
+       .auto_boot = true,
+       .proxy_pd_names = (char*[]){
+               "cx",
+               "mss",
+               NULL
+       },
+       .ssr_name = "mpss",
+       .sysmon_name = "modem",
+       .ssctl_id = 0x22,
+};
+
 static const struct of_device_id adsp_of_match[] = {
        { .compatible = "qcom,msm8974-adsp-pil", .data = &adsp_resource_init},
        { .compatible = "qcom,msm8996-adsp-pil", .data = &adsp_resource_init},
@@ -797,6 +813,7 @@ static const struct of_device_id adsp_of_match[] = {
        { .compatible = "qcom,sc7180-mpss-pas", .data = &mpss_resource_init},
        { .compatible = "qcom,sdm845-adsp-pas", .data = &adsp_resource_init},
        { .compatible = "qcom,sdm845-cdsp-pas", .data = &cdsp_resource_init},
+       { .compatible = "qcom,sdx55-mpss-pas", .data = &sdx55_mpss_resource},
        { .compatible = "qcom,sm8150-adsp-pas", .data = &sm8150_adsp_resource},
        { .compatible = "qcom,sm8150-cdsp-pas", .data = &sm8150_cdsp_resource},
        { .compatible = "qcom,sm8150-mpss-pas", .data = &mpss_resource_init},
index 78ebe11..20d50ec 100644 (file)
@@ -4,13 +4,18 @@
  * Copyright (C) 2014 Sony Mobile Communications AB
  * Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
  */
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/io.h>
 #include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
+#include <linux/of_address.h>
 #include <linux/of_reserved_mem.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
+#include <linux/regulator/consumer.h>
 #include <linux/reset.h>
 #include <linux/soc/qcom/mdt_loader.h>
 #include "qcom_common.h"
@@ -24,6 +29,9 @@
 #define Q6SS_GFMUX_CTL_REG             0x020
 #define Q6SS_PWR_CTL_REG               0x030
 #define Q6SS_MEM_PWR_CTL               0x0B0
+#define Q6SS_STRAP_ACC                 0x110
+#define Q6SS_CGC_OVERRIDE              0x034
+#define Q6SS_BCR_REG                   0x6000
 
 /* AXI Halt Register Offsets */
 #define AXI_HALTREQ_REG                        0x0
 #define Q6SS_CORE_ARES                 BIT(1)
 #define Q6SS_BUS_ARES_ENABLE           BIT(2)
 
+/* Q6SS_BRC_RESET */
+#define Q6SS_BRC_BLK_ARES              BIT(0)
+
 /* Q6SS_GFMUX_CTL */
 #define Q6SS_CLK_ENABLE                        BIT(1)
+#define Q6SS_SWITCH_CLK_SRC            BIT(8)
 
 /* Q6SS_PWR_CTL */
 #define Q6SS_L2DATA_STBY_N             BIT(18)
 #define Q6SS_SLP_RET_N                 BIT(19)
 #define Q6SS_CLAMP_IO                  BIT(20)
 #define QDSS_BHS_ON                    BIT(21)
+#define QDSS_Q6_MEMORIES               GENMASK(15, 0)
 
 /* Q6SS parameters */
 #define Q6SS_LDO_BYP           BIT(25)
@@ -53,6 +66,7 @@
 #define Q6SS_CLAMP_QMC_MEM             BIT(22)
 #define HALT_CHECK_MAX_LOOPS           200
 #define Q6SS_XO_CBCR           GENMASK(5, 3)
+#define Q6SS_SLEEP_CBCR                GENMASK(5, 2)
 
 /* Q6SS config/status registers */
 #define TCSR_GLOBAL_CFG0       0x0
 #define TCSR_WCSS_CLK_MASK     0x1F
 #define TCSR_WCSS_CLK_ENABLE   0x14
 
+#define MAX_HALT_REG           3
+enum {
+       WCSS_IPQ8074,
+       WCSS_QCS404,
+};
+
+struct wcss_data {
+       const char *firmware_name;
+       unsigned int crash_reason_smem;
+       u32 version;
+       bool aon_reset_required;
+       bool wcss_q6_reset_required;
+       const char *ssr_name;
+       const char *sysmon_name;
+       int ssctl_id;
+       const struct rproc_ops *ops;
+       bool requires_force_stop;
+};
+
 struct q6v5_wcss {
        struct device *dev;
 
@@ -82,9 +115,26 @@ struct q6v5_wcss {
        u32 halt_wcss;
        u32 halt_nc;
 
+       struct clk *xo;
+       struct clk *ahbfabric_cbcr_clk;
+       struct clk *gcc_abhs_cbcr;
+       struct clk *gcc_axim_cbcr;
+       struct clk *lcc_csr_cbcr;
+       struct clk *ahbs_cbcr;
+       struct clk *tcm_slave_cbcr;
+       struct clk *qdsp6ss_abhm_cbcr;
+       struct clk *qdsp6ss_sleep_cbcr;
+       struct clk *qdsp6ss_axim_cbcr;
+       struct clk *qdsp6ss_xo_cbcr;
+       struct clk *qdsp6ss_core_gfmux;
+       struct clk *lcc_bcr_sleep;
+       struct regulator *cx_supply;
+       struct qcom_sysmon *sysmon;
+
        struct reset_control *wcss_aon_reset;
        struct reset_control *wcss_reset;
        struct reset_control *wcss_q6_reset;
+       struct reset_control *wcss_q6_bcr_reset;
 
        struct qcom_q6v5 q6v5;
 
@@ -93,6 +143,10 @@ struct q6v5_wcss {
        void *mem_region;
        size_t mem_size;
 
+       unsigned int crash_reason_smem;
+       u32 version;
+       bool requires_force_stop;
+
        struct qcom_rproc_glink glink_subdev;
        struct qcom_rproc_ssr ssr_subdev;
 };
@@ -237,6 +291,207 @@ wcss_reset:
        return ret;
 }
 
+static int q6v5_wcss_qcs404_power_on(struct q6v5_wcss *wcss)
+{
+       unsigned long val;
+       int ret, idx;
+
+       /* Toggle the restart */
+       reset_control_assert(wcss->wcss_reset);
+       usleep_range(200, 300);
+       reset_control_deassert(wcss->wcss_reset);
+       usleep_range(200, 300);
+
+       /* Enable GCC_WDSP_Q6SS_AHBS_CBCR clock */
+       ret = clk_prepare_enable(wcss->gcc_abhs_cbcr);
+       if (ret)
+               return ret;
+
+       /* Remove reset to the WCNSS QDSP6SS */
+       reset_control_deassert(wcss->wcss_q6_bcr_reset);
+
+       /* Enable Q6SSTOP_AHBFABRIC_CBCR clock */
+       ret = clk_prepare_enable(wcss->ahbfabric_cbcr_clk);
+       if (ret)
+               goto disable_gcc_abhs_cbcr_clk;
+
+       /* Enable the LCCCSR CBC clock, Q6SSTOP_Q6SSTOP_LCC_CSR_CBCR clock */
+       ret = clk_prepare_enable(wcss->lcc_csr_cbcr);
+       if (ret)
+               goto disable_ahbfabric_cbcr_clk;
+
+       /* Enable the Q6AHBS CBC, Q6SSTOP_Q6SS_AHBS_CBCR clock */
+       ret = clk_prepare_enable(wcss->ahbs_cbcr);
+       if (ret)
+               goto disable_csr_cbcr_clk;
+
+       /* Enable the TCM slave CBC, Q6SSTOP_Q6SS_TCM_SLAVE_CBCR clock */
+       ret = clk_prepare_enable(wcss->tcm_slave_cbcr);
+       if (ret)
+               goto disable_ahbs_cbcr_clk;
+
+       /* Enable the Q6SS AHB master CBC, Q6SSTOP_Q6SS_AHBM_CBCR clock */
+       ret = clk_prepare_enable(wcss->qdsp6ss_abhm_cbcr);
+       if (ret)
+               goto disable_tcm_slave_cbcr_clk;
+
+       /* Enable the Q6SS AXI master CBC, Q6SSTOP_Q6SS_AXIM_CBCR clock */
+       ret = clk_prepare_enable(wcss->qdsp6ss_axim_cbcr);
+       if (ret)
+               goto disable_abhm_cbcr_clk;
+
+       /* Enable the Q6SS XO CBC */
+       val = readl(wcss->reg_base + Q6SS_XO_CBCR);
+       val |= BIT(0);
+       writel(val, wcss->reg_base + Q6SS_XO_CBCR);
+       /* Poll until CLKOFF goes low, indicating the clock is enabled */
+       ret = readl_poll_timeout(wcss->reg_base + Q6SS_XO_CBCR,
+                                val, !(val & BIT(31)), 1,
+                                HALT_CHECK_MAX_LOOPS);
+       if (ret) {
+               dev_err(wcss->dev,
+                       "xo cbcr enabling timed out (rc:%d)\n", ret);
+               return ret;
+       }
+
+       writel(0, wcss->reg_base + Q6SS_CGC_OVERRIDE);
+
+       /* Enable QDSP6 sleep clock */
+       val = readl(wcss->reg_base + Q6SS_SLEEP_CBCR);
+       val |= BIT(0);
+       writel(val, wcss->reg_base + Q6SS_SLEEP_CBCR);
+
+       /* Enable the Q6 AXI clock, GCC_WDSP_Q6SS_AXIM_CBCR */
+       ret = clk_prepare_enable(wcss->gcc_axim_cbcr);
+       if (ret)
+               goto disable_sleep_cbcr_clk;
+
+       /* Assert resets, stop core */
+       val = readl(wcss->reg_base + Q6SS_RESET_REG);
+       val |= Q6SS_CORE_ARES | Q6SS_BUS_ARES_ENABLE | Q6SS_STOP_CORE;
+       writel(val, wcss->reg_base + Q6SS_RESET_REG);
+
+       /* Program the QDSP6SS PWR_CTL register */
+       writel(0x01700000, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       writel(0x03700000, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       writel(0x03300000, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       writel(0x033C0000, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       /*
+        * Enable memories by turning on the QDSP6 memory foot/head switch, one
+        * bank at a time to avoid in-rush current
+        */
+       for (idx = 28; idx >= 0; idx--) {
+               writel((readl(wcss->reg_base + Q6SS_MEM_PWR_CTL) |
+                       (1 << idx)), wcss->reg_base + Q6SS_MEM_PWR_CTL);
+       }
+
+       writel(0x031C0000, wcss->reg_base + Q6SS_PWR_CTL_REG);
+       writel(0x030C0000, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       val = readl(wcss->reg_base + Q6SS_RESET_REG);
+       val &= ~Q6SS_CORE_ARES;
+       writel(val, wcss->reg_base + Q6SS_RESET_REG);
+
+       /* Enable the Q6 core clock at the GFM, Q6SSTOP_QDSP6SS_GFMUX_CTL */
+       val = readl(wcss->reg_base + Q6SS_GFMUX_CTL_REG);
+       val |= Q6SS_CLK_ENABLE | Q6SS_SWITCH_CLK_SRC;
+       writel(val, wcss->reg_base + Q6SS_GFMUX_CTL_REG);
+
+       /* Enable sleep clock branch needed for BCR circuit */
+       ret = clk_prepare_enable(wcss->lcc_bcr_sleep);
+       if (ret)
+               goto disable_core_gfmux_clk;
+
+       return 0;
+
+disable_core_gfmux_clk:
+       val = readl(wcss->reg_base + Q6SS_GFMUX_CTL_REG);
+       val &= ~(Q6SS_CLK_ENABLE | Q6SS_SWITCH_CLK_SRC);
+       writel(val, wcss->reg_base + Q6SS_GFMUX_CTL_REG);
+       clk_disable_unprepare(wcss->gcc_axim_cbcr);
+disable_sleep_cbcr_clk:
+       val = readl(wcss->reg_base + Q6SS_SLEEP_CBCR);
+       val &= ~Q6SS_CLK_ENABLE;
+       writel(val, wcss->reg_base + Q6SS_SLEEP_CBCR);
+       val = readl(wcss->reg_base + Q6SS_XO_CBCR);
+       val &= ~Q6SS_CLK_ENABLE;
+       writel(val, wcss->reg_base + Q6SS_XO_CBCR);
+       clk_disable_unprepare(wcss->qdsp6ss_axim_cbcr);
+disable_abhm_cbcr_clk:
+       clk_disable_unprepare(wcss->qdsp6ss_abhm_cbcr);
+disable_tcm_slave_cbcr_clk:
+       clk_disable_unprepare(wcss->tcm_slave_cbcr);
+disable_ahbs_cbcr_clk:
+       clk_disable_unprepare(wcss->ahbs_cbcr);
+disable_csr_cbcr_clk:
+       clk_disable_unprepare(wcss->lcc_csr_cbcr);
+disable_ahbfabric_cbcr_clk:
+       clk_disable_unprepare(wcss->ahbfabric_cbcr_clk);
+disable_gcc_abhs_cbcr_clk:
+       clk_disable_unprepare(wcss->gcc_abhs_cbcr);
+
+       return ret;
+}
+
+static inline int q6v5_wcss_qcs404_reset(struct q6v5_wcss *wcss)
+{
+       unsigned long val;
+
+       writel(0x80800000, wcss->reg_base + Q6SS_STRAP_ACC);
+
+       /* Start core execution */
+       val = readl(wcss->reg_base + Q6SS_RESET_REG);
+       val &= ~Q6SS_STOP_CORE;
+       writel(val, wcss->reg_base + Q6SS_RESET_REG);
+
+       return 0;
+}
+
+static int q6v5_qcs404_wcss_start(struct rproc *rproc)
+{
+       struct q6v5_wcss *wcss = rproc->priv;
+       int ret;
+
+       ret = clk_prepare_enable(wcss->xo);
+       if (ret)
+               return ret;
+
+       ret = regulator_enable(wcss->cx_supply);
+       if (ret)
+               goto disable_xo_clk;
+
+       qcom_q6v5_prepare(&wcss->q6v5);
+
+       ret = q6v5_wcss_qcs404_power_on(wcss);
+       if (ret) {
+               dev_err(wcss->dev, "wcss clk_enable failed\n");
+               goto disable_cx_supply;
+       }
+
+       writel(rproc->bootaddr >> 4, wcss->reg_base + Q6SS_RST_EVB);
+
+       q6v5_wcss_qcs404_reset(wcss);
+
+       ret = qcom_q6v5_wait_for_start(&wcss->q6v5, 5 * HZ);
+       if (ret == -ETIMEDOUT) {
+               dev_err(wcss->dev, "start timed out\n");
+               goto disable_cx_supply;
+       }
+
+       return 0;
+
+disable_cx_supply:
+       regulator_disable(wcss->cx_supply);
+disable_xo_clk:
+       clk_disable_unprepare(wcss->xo);
+
+       return ret;
+}
+
 static void q6v5_wcss_halt_axi_port(struct q6v5_wcss *wcss,
                                    struct regmap *halt_map,
                                    u32 offset)
@@ -271,6 +526,70 @@ static void q6v5_wcss_halt_axi_port(struct q6v5_wcss *wcss,
        regmap_write(halt_map, offset + AXI_HALTREQ_REG, 0);
 }
 
+static int q6v5_qcs404_wcss_shutdown(struct q6v5_wcss *wcss)
+{
+       unsigned long val;
+       int ret;
+
+       q6v5_wcss_halt_axi_port(wcss, wcss->halt_map, wcss->halt_wcss);
+
+       /* assert clamps to avoid MX current inrush */
+       val = readl(wcss->reg_base + Q6SS_PWR_CTL_REG);
+       val |= (Q6SS_CLAMP_IO | Q6SS_CLAMP_WL | Q6SS_CLAMP_QMC_MEM);
+       writel(val, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       /* Disable memories by turning off memory foot/headswitch */
+       writel((readl(wcss->reg_base + Q6SS_MEM_PWR_CTL) &
+               ~QDSS_Q6_MEMORIES),
+               wcss->reg_base + Q6SS_MEM_PWR_CTL);
+
+       /* Clear the BHS_ON bit */
+       val = readl(wcss->reg_base + Q6SS_PWR_CTL_REG);
+       val &= ~Q6SS_BHS_ON;
+       writel(val, wcss->reg_base + Q6SS_PWR_CTL_REG);
+
+       clk_disable_unprepare(wcss->ahbfabric_cbcr_clk);
+       clk_disable_unprepare(wcss->lcc_csr_cbcr);
+       clk_disable_unprepare(wcss->tcm_slave_cbcr);
+       clk_disable_unprepare(wcss->qdsp6ss_abhm_cbcr);
+       clk_disable_unprepare(wcss->qdsp6ss_axim_cbcr);
+
+       val = readl(wcss->reg_base + Q6SS_SLEEP_CBCR);
+       val &= ~BIT(0);
+       writel(val, wcss->reg_base + Q6SS_SLEEP_CBCR);
+
+       val = readl(wcss->reg_base + Q6SS_XO_CBCR);
+       val &= ~BIT(0);
+       writel(val, wcss->reg_base + Q6SS_XO_CBCR);
+
+       clk_disable_unprepare(wcss->ahbs_cbcr);
+       clk_disable_unprepare(wcss->lcc_bcr_sleep);
+
+       val = readl(wcss->reg_base + Q6SS_GFMUX_CTL_REG);
+       val &= ~(Q6SS_CLK_ENABLE | Q6SS_SWITCH_CLK_SRC);
+       writel(val, wcss->reg_base + Q6SS_GFMUX_CTL_REG);
+
+       clk_disable_unprepare(wcss->gcc_abhs_cbcr);
+
+       ret = reset_control_assert(wcss->wcss_reset);
+       if (ret) {
+               dev_err(wcss->dev, "wcss_reset failed\n");
+               return ret;
+       }
+       usleep_range(200, 300);
+
+       ret = reset_control_deassert(wcss->wcss_reset);
+       if (ret) {
+               dev_err(wcss->dev, "wcss_reset failed\n");
+               return ret;
+       }
+       usleep_range(200, 300);
+
+       clk_disable_unprepare(wcss->gcc_axim_cbcr);
+
+       return 0;
+}
+
 static int q6v5_wcss_powerdown(struct q6v5_wcss *wcss)
 {
        int ret;
@@ -390,27 +709,35 @@ static int q6v5_wcss_stop(struct rproc *rproc)
        int ret;
 
        /* WCSS powerdown */
-       ret = qcom_q6v5_request_stop(&wcss->q6v5, NULL);
-       if (ret == -ETIMEDOUT) {
-               dev_err(wcss->dev, "timed out on wait\n");
-               return ret;
+       if (wcss->requires_force_stop) {
+               ret = qcom_q6v5_request_stop(&wcss->q6v5, NULL);
+               if (ret == -ETIMEDOUT) {
+                       dev_err(wcss->dev, "timed out on wait\n");
+                       return ret;
+               }
        }
 
-       ret = q6v5_wcss_powerdown(wcss);
-       if (ret)
-               return ret;
-
-       /* Q6 Power down */
-       ret = q6v5_q6_powerdown(wcss);
-       if (ret)
-               return ret;
+       if (wcss->version == WCSS_QCS404) {
+               ret = q6v5_qcs404_wcss_shutdown(wcss);
+               if (ret)
+                       return ret;
+       } else {
+               ret = q6v5_wcss_powerdown(wcss);
+               if (ret)
+                       return ret;
+
+               /* Q6 Power down */
+               ret = q6v5_q6_powerdown(wcss);
+               if (ret)
+                       return ret;
+       }
 
        qcom_q6v5_unprepare(&wcss->q6v5);
 
        return 0;
 }
 
-static void *q6v5_wcss_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *q6v5_wcss_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct q6v5_wcss *wcss = rproc->priv;
        int offset;
@@ -438,7 +765,7 @@ static int q6v5_wcss_load(struct rproc *rproc, const struct firmware *fw)
        return ret;
 }
 
-static const struct rproc_ops q6v5_wcss_ops = {
+static const struct rproc_ops q6v5_wcss_ipq8074_ops = {
        .start = q6v5_wcss_start,
        .stop = q6v5_wcss_stop,
        .da_to_va = q6v5_wcss_da_to_va,
@@ -446,26 +773,46 @@ static const struct rproc_ops q6v5_wcss_ops = {
        .get_boot_addr = rproc_elf_get_boot_addr,
 };
 
-static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss)
+static const struct rproc_ops q6v5_wcss_qcs404_ops = {
+       .start = q6v5_qcs404_wcss_start,
+       .stop = q6v5_wcss_stop,
+       .da_to_va = q6v5_wcss_da_to_va,
+       .load = q6v5_wcss_load,
+       .get_boot_addr = rproc_elf_get_boot_addr,
+       .parse_fw = qcom_register_dump_segments,
+};
+
+static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss,
+                               const struct wcss_data *desc)
 {
        struct device *dev = wcss->dev;
 
-       wcss->wcss_aon_reset = devm_reset_control_get(dev, "wcss_aon_reset");
-       if (IS_ERR(wcss->wcss_aon_reset)) {
-               dev_err(wcss->dev, "unable to acquire wcss_aon_reset\n");
-               return PTR_ERR(wcss->wcss_aon_reset);
+       if (desc->aon_reset_required) {
+               wcss->wcss_aon_reset = devm_reset_control_get_exclusive(dev, "wcss_aon_reset");
+               if (IS_ERR(wcss->wcss_aon_reset)) {
+                       dev_err(wcss->dev, "fail to acquire wcss_aon_reset\n");
+                       return PTR_ERR(wcss->wcss_aon_reset);
+               }
        }
 
-       wcss->wcss_reset = devm_reset_control_get(dev, "wcss_reset");
+       wcss->wcss_reset = devm_reset_control_get_exclusive(dev, "wcss_reset");
        if (IS_ERR(wcss->wcss_reset)) {
                dev_err(wcss->dev, "unable to acquire wcss_reset\n");
                return PTR_ERR(wcss->wcss_reset);
        }
 
-       wcss->wcss_q6_reset = devm_reset_control_get(dev, "wcss_q6_reset");
-       if (IS_ERR(wcss->wcss_q6_reset)) {
-               dev_err(wcss->dev, "unable to acquire wcss_q6_reset\n");
-               return PTR_ERR(wcss->wcss_q6_reset);
+       if (desc->wcss_q6_reset_required) {
+               wcss->wcss_q6_reset = devm_reset_control_get_exclusive(dev, "wcss_q6_reset");
+               if (IS_ERR(wcss->wcss_q6_reset)) {
+                       dev_err(wcss->dev, "unable to acquire wcss_q6_reset\n");
+                       return PTR_ERR(wcss->wcss_q6_reset);
+               }
+       }
+
+       wcss->wcss_q6_bcr_reset = devm_reset_control_get_exclusive(dev, "wcss_q6_bcr_reset");
+       if (IS_ERR(wcss->wcss_q6_bcr_reset)) {
+               dev_err(wcss->dev, "unable to acquire wcss_q6_bcr_reset\n");
+               return PTR_ERR(wcss->wcss_q6_bcr_reset);
        }
 
        return 0;
@@ -474,35 +821,48 @@ static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss)
 static int q6v5_wcss_init_mmio(struct q6v5_wcss *wcss,
                               struct platform_device *pdev)
 {
-       struct of_phandle_args args;
+       unsigned int halt_reg[MAX_HALT_REG] = {0};
+       struct device_node *syscon;
        struct resource *res;
        int ret;
 
        res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qdsp6");
-       wcss->reg_base = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(wcss->reg_base))
-               return PTR_ERR(wcss->reg_base);
+       wcss->reg_base = devm_ioremap(&pdev->dev, res->start,
+                                     resource_size(res));
+       if (!wcss->reg_base)
+               return -ENOMEM;
 
-       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rmb");
-       wcss->rmb_base = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(wcss->rmb_base))
-               return PTR_ERR(wcss->rmb_base);
+       if (wcss->version == WCSS_IPQ8074) {
+               res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rmb");
+               wcss->rmb_base = devm_ioremap_resource(&pdev->dev, res);
+               if (IS_ERR(wcss->rmb_base))
+                       return PTR_ERR(wcss->rmb_base);
+       }
 
-       ret = of_parse_phandle_with_fixed_args(pdev->dev.of_node,
-                                              "qcom,halt-regs", 3, 0, &args);
-       if (ret < 0) {
+       syscon = of_parse_phandle(pdev->dev.of_node,
+                                 "qcom,halt-regs", 0);
+       if (!syscon) {
                dev_err(&pdev->dev, "failed to parse qcom,halt-regs\n");
                return -EINVAL;
        }
 
-       wcss->halt_map = syscon_node_to_regmap(args.np);
-       of_node_put(args.np);
+       wcss->halt_map = syscon_node_to_regmap(syscon);
+       of_node_put(syscon);
        if (IS_ERR(wcss->halt_map))
                return PTR_ERR(wcss->halt_map);
 
-       wcss->halt_q6 = args.args[0];
-       wcss->halt_wcss = args.args[1];
-       wcss->halt_nc = args.args[2];
+       ret = of_property_read_variable_u32_array(pdev->dev.of_node,
+                                                 "qcom,halt-regs",
+                                                 halt_reg, 0,
+                                                 MAX_HALT_REG);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "failed to parse qcom,halt-regs\n");
+               return -EINVAL;
+       }
+
+       wcss->halt_q6 = halt_reg[0];
+       wcss->halt_wcss = halt_reg[1];
+       wcss->halt_nc = halt_reg[2];
 
        return 0;
 }
@@ -536,14 +896,120 @@ static int q6v5_alloc_memory_region(struct q6v5_wcss *wcss)
        return 0;
 }
 
+/*
+ * Get the clocks used on QCS404; returns 0 or the devm_clk_get() error.
+ */
+static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss)
+{
+       int ret;
+
+       wcss->xo = devm_clk_get(wcss->dev, "xo");
+       if (IS_ERR(wcss->xo)) {
+               ret = PTR_ERR(wcss->xo);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get xo clock\n");
+               return ret;
+       }
+
+       wcss->gcc_abhs_cbcr = devm_clk_get(wcss->dev, "gcc_abhs_cbcr");
+       if (IS_ERR(wcss->gcc_abhs_cbcr)) {
+               ret = PTR_ERR(wcss->gcc_abhs_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get gcc abhs clock\n");
+               return ret;
+       }
+
+       wcss->gcc_axim_cbcr = devm_clk_get(wcss->dev, "gcc_axim_cbcr");
+       if (IS_ERR(wcss->gcc_axim_cbcr)) {
+               ret = PTR_ERR(wcss->gcc_axim_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get gcc axim clock\n");
+               return ret;
+       }
+
+       wcss->ahbfabric_cbcr_clk = devm_clk_get(wcss->dev, "lcc_ahbfabric_cbc");
+       if (IS_ERR(wcss->ahbfabric_cbcr_clk)) {
+               ret = PTR_ERR(wcss->ahbfabric_cbcr_clk);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get ahbfabric clock\n");
+               return ret;
+       }
+
+       wcss->lcc_csr_cbcr = devm_clk_get(wcss->dev, "tcsr_lcc_cbc");
+       if (IS_ERR(wcss->lcc_csr_cbcr)) {
+               ret = PTR_ERR(wcss->lcc_csr_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get csr cbcr clk\n");
+               return ret;
+       }
+
+       wcss->ahbs_cbcr = devm_clk_get(wcss->dev, "lcc_abhs_cbc");
+       if (IS_ERR(wcss->ahbs_cbcr)) {
+               ret = PTR_ERR(wcss->ahbs_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get ahbs_cbcr clk\n");
+               return ret;
+       }
+
+       wcss->tcm_slave_cbcr = devm_clk_get(wcss->dev, "lcc_tcm_slave_cbc");
+       if (IS_ERR(wcss->tcm_slave_cbcr)) {
+               ret = PTR_ERR(wcss->tcm_slave_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get tcm cbcr clk\n");
+               return ret;
+       }
+
+       wcss->qdsp6ss_abhm_cbcr = devm_clk_get(wcss->dev, "lcc_abhm_cbc");
+       if (IS_ERR(wcss->qdsp6ss_abhm_cbcr)) {
+               ret = PTR_ERR(wcss->qdsp6ss_abhm_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get abhm cbcr clk\n");
+               return ret;
+       }
+
+       wcss->qdsp6ss_axim_cbcr = devm_clk_get(wcss->dev, "lcc_axim_cbc");
+       if (IS_ERR(wcss->qdsp6ss_axim_cbcr)) {
+               ret = PTR_ERR(wcss->qdsp6ss_axim_cbcr);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get axim cbcr clk\n");
+               return ret;
+       }
+
+       wcss->lcc_bcr_sleep = devm_clk_get(wcss->dev, "lcc_bcr_sleep");
+       if (IS_ERR(wcss->lcc_bcr_sleep)) {
+               ret = PTR_ERR(wcss->lcc_bcr_sleep);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(wcss->dev, "failed to get bcr cbcr clk\n");
+               return ret;
+       }
+
+       return 0;
+}
+
+static int q6v5_wcss_init_regulator(struct q6v5_wcss *wcss)
+{
+       wcss->cx_supply = devm_regulator_get(wcss->dev, "cx");
+       if (IS_ERR(wcss->cx_supply))
+               return PTR_ERR(wcss->cx_supply);
+
+       regulator_set_load(wcss->cx_supply, 100000); /* NOTE(review): result ignored */
+
+       return 0;
+}
+
 static int q6v5_wcss_probe(struct platform_device *pdev)
 {
+       const struct wcss_data *desc;
        struct q6v5_wcss *wcss;
        struct rproc *rproc;
        int ret;
 
-       rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_wcss_ops,
-                           "IPQ8074/q6_fw.mdt", sizeof(*wcss));
+       desc = device_get_match_data(&pdev->dev);
+       if (!desc)
+               return -EINVAL;
+
+       rproc = rproc_alloc(&pdev->dev, pdev->name, desc->ops,
+                           desc->firmware_name, sizeof(*wcss));
        if (!rproc) {
                dev_err(&pdev->dev, "failed to allocate rproc\n");
                return -ENOMEM;
@@ -551,6 +1017,10 @@ static int q6v5_wcss_probe(struct platform_device *pdev)
 
        wcss = rproc->priv;
        wcss->dev = &pdev->dev;
+
+       /* Per-SoC settings come from the matched wcss_data entry */
+       wcss->version = desc->version;
+       wcss->requires_force_stop = desc->requires_force_stop;
 
        ret = q6v5_wcss_init_mmio(wcss, pdev);
        if (ret)
@@ -560,17 +1030,33 @@ static int q6v5_wcss_probe(struct platform_device *pdev)
        if (ret)
                goto free_rproc;
 
-       ret = q6v5_wcss_init_reset(wcss);
+       if (wcss->version == WCSS_QCS404) {
+               ret = q6v5_wcss_init_clock(wcss);
+               if (ret)
+                       goto free_rproc;
+
+               ret = q6v5_wcss_init_regulator(wcss);
+               if (ret)
+                       goto free_rproc;
+       }
+
+       ret = q6v5_wcss_init_reset(wcss, desc);
        if (ret)
                goto free_rproc;
 
-       ret = qcom_q6v5_init(&wcss->q6v5, pdev, rproc, WCSS_CRASH_REASON, NULL);
+       ret = qcom_q6v5_init(&wcss->q6v5, pdev, rproc, desc->crash_reason_smem,
+                            NULL);
        if (ret)
                goto free_rproc;
 
        qcom_add_glink_subdev(rproc, &wcss->glink_subdev, "q6wcss");
        qcom_add_ssr_subdev(rproc, &wcss->ssr_subdev, "q6wcss");
 
+       if (desc->ssctl_id)
+               wcss->sysmon = qcom_add_sysmon_subdev(rproc,
+                                                     desc->sysmon_name,
+                                                     desc->ssctl_id);
+
        ret = rproc_add(rproc);
        if (ret)
                goto free_rproc;
@@ -595,8 +1081,31 @@ static int q6v5_wcss_remove(struct platform_device *pdev)
        return 0;
 }
 
+static const struct wcss_data wcss_ipq8074_res_init = { /* no .version: stays 0 */
+       .firmware_name = "IPQ8074/q6_fw.mdt",
+       .crash_reason_smem = WCSS_CRASH_REASON,
+       .aon_reset_required = true,
+       .wcss_q6_reset_required = true,
+       .ops = &q6v5_wcss_ipq8074_ops,
+       .requires_force_stop = true, /* NOTE(review): presumably WCSS_IPQ8074 == 0; confirm */
+};
+
+static const struct wcss_data wcss_qcs404_res_init = {
+       .crash_reason_smem = WCSS_CRASH_REASON,
+       .firmware_name = "wcnss.mdt",
+       .version = WCSS_QCS404,
+       .aon_reset_required = false,
+       .wcss_q6_reset_required = false,
+       .ssr_name = "mpss", /* NOTE(review): no reader in the hunks shown; confirm use */
+       .sysmon_name = "wcnss",
+       .ssctl_id = 0x12, /* non-zero: probe registers a sysmon subdev */
+       .ops = &q6v5_wcss_qcs404_ops,
+       .requires_force_stop = false,
+};
+
 static const struct of_device_id q6v5_wcss_of_match[] = {
-       { .compatible = "qcom,ipq8074-wcss-pil" },
+       { .compatible = "qcom,ipq8074-wcss-pil", .data = &wcss_ipq8074_res_init },
+       { .compatible = "qcom,qcs404-wcss-pil", .data = &wcss_qcs404_res_init },
        { },
 };
 MODULE_DEVICE_TABLE(of, q6v5_wcss_of_match);
index 2a6a23c..5f3455a 100644 (file)
@@ -320,7 +320,7 @@ static int wcnss_stop(struct rproc *rproc)
        return ret;
 }
 
-static void *wcnss_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *wcnss_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct qcom_wcnss *wcnss = (struct qcom_wcnss *)rproc->priv;
        int offset;
@@ -530,6 +530,7 @@ static int wcnss_alloc_memory_region(struct qcom_wcnss *wcnss)
 
 static int wcnss_probe(struct platform_device *pdev)
 {
+       const char *fw_name = WCNSS_FIRMWARE_NAME;
        const struct wcnss_data *data;
        struct qcom_wcnss *wcnss;
        struct resource *res;
@@ -547,8 +548,13 @@ static int wcnss_probe(struct platform_device *pdev)
                return -ENXIO;
        }
 
+       ret = of_property_read_string(pdev->dev.of_node, "firmware-name",
+                                     &fw_name);
+       if (ret < 0 && ret != -EINVAL)
+               return ret;
+
        rproc = rproc_alloc(&pdev->dev, pdev->name, &wcnss_ops,
-                           WCNSS_FIRMWARE_NAME, sizeof(*wcnss));
+                           fw_name, sizeof(*wcnss));
        if (!rproc) {
                dev_err(&pdev->dev, "unable to allocate remoteproc\n");
                return -ENOMEM;
index b19ea30..0b8a84c 100644 (file)
@@ -32,15 +32,22 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_
                return -EFAULT;
 
        if (!strncmp(cmd, "start", len)) {
-               if (rproc->state == RPROC_RUNNING)
+               if (rproc->state == RPROC_RUNNING ||
+                   rproc->state == RPROC_ATTACHED)
                        return -EBUSY;
 
                ret = rproc_boot(rproc);
        } else if (!strncmp(cmd, "stop", len)) {
-               if (rproc->state != RPROC_RUNNING)
+               if (rproc->state != RPROC_RUNNING &&
+                   rproc->state != RPROC_ATTACHED)
                        return -EINVAL;
 
                rproc_shutdown(rproc);
+       } else if (!strncmp(cmd, "detach", len)) {
+               if (rproc->state != RPROC_ATTACHED)
+                       return -EINVAL;
+
+               ret = rproc_detach(rproc);
        } else {
                dev_err(&rproc->dev, "Unrecognized option\n");
                ret = -EINVAL;
@@ -79,11 +86,17 @@ static long rproc_device_ioctl(struct file *filp, unsigned int ioctl, unsigned l
 static int rproc_cdev_release(struct inode *inode, struct file *filp)
 {
        struct rproc *rproc = container_of(inode->i_cdev, struct rproc, cdev);
+       int ret = 0;
 
-       if (rproc->cdev_put_on_release && rproc->state == RPROC_RUNNING)
+       if (!rproc->cdev_put_on_release)
+               return 0;
+
+       if (rproc->state == RPROC_RUNNING)
                rproc_shutdown(rproc);
+       else if (rproc->state == RPROC_ATTACHED)
+               ret = rproc_detach(rproc);
 
-       return 0;
+       return ret;
 }
 
 static const struct file_operations rproc_fops = {
index ab15076..626a6b9 100644 (file)
@@ -189,13 +189,13 @@ EXPORT_SYMBOL(rproc_va_to_pa);
  * here the output of the DMA API for the carveouts, which should be more
  * correct.
  */
-void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct rproc_mem_entry *carveout;
        void *ptr = NULL;
 
        if (rproc->ops->da_to_va) {
-               ptr = rproc->ops->da_to_va(rproc, da, len);
+               ptr = rproc->ops->da_to_va(rproc, da, len, is_iomem);
                if (ptr)
                        goto out;
        }
@@ -217,6 +217,9 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
 
                ptr = carveout->va + offset;
 
+               if (is_iomem)
+                       *is_iomem = carveout->is_iomem;
+
                break;
        }
 
@@ -482,7 +485,7 @@ static int copy_dma_range_map(struct device *to, struct device *from)
 /**
  * rproc_handle_vdev() - handle a vdev fw resource
  * @rproc: the remote processor
- * @rsc: the vring resource descriptor
+ * @ptr: the vring resource descriptor
  * @offset: offset of the resource entry
  * @avail: size of available data (for sanity checking the image)
  *
@@ -507,9 +510,10 @@ static int copy_dma_range_map(struct device *to, struct device *from)
  *
  * Returns 0 on success, or an appropriate error code otherwise
  */
-static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc,
+static int rproc_handle_vdev(struct rproc *rproc, void *ptr,
                             int offset, int avail)
 {
+       struct fw_rsc_vdev *rsc = ptr;
        struct device *dev = &rproc->dev;
        struct rproc_vdev *rvdev;
        int i, ret;
@@ -627,7 +631,7 @@ void rproc_vdev_release(struct kref *ref)
 /**
  * rproc_handle_trace() - handle a shared trace buffer resource
  * @rproc: the remote processor
- * @rsc: the trace resource descriptor
+ * @ptr: the trace resource descriptor
  * @offset: offset of the resource entry
  * @avail: size of available data (for sanity checking the image)
  *
@@ -641,9 +645,10 @@ void rproc_vdev_release(struct kref *ref)
  *
  * Returns 0 on success, or an appropriate error code otherwise
  */
-static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc,
+static int rproc_handle_trace(struct rproc *rproc, void *ptr,
                              int offset, int avail)
 {
+       struct fw_rsc_trace *rsc = ptr;
        struct rproc_debug_trace *trace;
        struct device *dev = &rproc->dev;
        char name[15];
@@ -693,7 +698,7 @@ static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc,
 /**
  * rproc_handle_devmem() - handle devmem resource entry
  * @rproc: remote processor handle
- * @rsc: the devmem resource entry
+ * @ptr: the devmem resource entry
  * @offset: offset of the resource entry
  * @avail: size of available data (for sanity checking the image)
  *
@@ -716,9 +721,10 @@ static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc,
  * and not allow firmwares to request access to physical addresses that
  * are outside those ranges.
  */
-static int rproc_handle_devmem(struct rproc *rproc, struct fw_rsc_devmem *rsc,
+static int rproc_handle_devmem(struct rproc *rproc, void *ptr,
                               int offset, int avail)
 {
+       struct fw_rsc_devmem *rsc = ptr;
        struct rproc_mem_entry *mapping;
        struct device *dev = &rproc->dev;
        int ret;
@@ -896,7 +902,7 @@ static int rproc_release_carveout(struct rproc *rproc,
 /**
  * rproc_handle_carveout() - handle phys contig memory allocation requests
  * @rproc: rproc handle
- * @rsc: the resource entry
+ * @ptr: the resource entry
  * @offset: offset of the resource entry
  * @avail: size of available data (for image validation)
  *
@@ -913,9 +919,9 @@ static int rproc_release_carveout(struct rproc *rproc,
  * pressure is important; it may have a substantial impact on performance.
  */
 static int rproc_handle_carveout(struct rproc *rproc,
-                                struct fw_rsc_carveout *rsc,
-                                int offset, int avail)
+                                void *ptr, int offset, int avail)
 {
+       struct fw_rsc_carveout *rsc = ptr;
        struct rproc_mem_entry *carveout;
        struct device *dev = &rproc->dev;
 
@@ -1097,10 +1103,10 @@ EXPORT_SYMBOL(rproc_of_parse_firmware);
  * enum fw_resource_type.
  */
 static rproc_handle_resource_t rproc_loading_handlers[RSC_LAST] = {
-       [RSC_CARVEOUT] = (rproc_handle_resource_t)rproc_handle_carveout,
-       [RSC_DEVMEM] = (rproc_handle_resource_t)rproc_handle_devmem,
-       [RSC_TRACE] = (rproc_handle_resource_t)rproc_handle_trace,
-       [RSC_VDEV] = (rproc_handle_resource_t)rproc_handle_vdev,
+       [RSC_CARVEOUT] = rproc_handle_carveout,
+       [RSC_DEVMEM] = rproc_handle_devmem,
+       [RSC_TRACE] = rproc_handle_trace,
+       [RSC_VDEV] = rproc_handle_vdev,
 };
 
 /* handle firmware resource entries before booting the remote processor */
@@ -1416,7 +1422,7 @@ reset_table_ptr:
        return ret;
 }
 
-static int rproc_attach(struct rproc *rproc)
+static int __rproc_attach(struct rproc *rproc)
 {
        struct device *dev = &rproc->dev;
        int ret;
@@ -1444,7 +1450,7 @@ static int rproc_attach(struct rproc *rproc)
                goto stop_rproc;
        }
 
-       rproc->state = RPROC_RUNNING;
+       rproc->state = RPROC_ATTACHED;
 
        dev_info(dev, "remote processor %s is now attached\n", rproc->name);
 
@@ -1537,11 +1543,149 @@ disable_iommu:
        return ret;
 }
 
+static int rproc_set_rsc_table(struct rproc *rproc)
+{
+       struct resource_table *table_ptr;
+       struct device *dev = &rproc->dev;
+       size_t table_sz;
+       int ret;
+
+       table_ptr = rproc_get_loaded_rsc_table(rproc, &table_sz);
+       if (!table_ptr) {
+               /* No table is acceptable; the rproc table fields stay untouched */
+               return 0;
+       }
+
+       if (IS_ERR(table_ptr)) {
+               ret = PTR_ERR(table_ptr);
+               dev_err(dev, "can't load resource table: %d\n", ret);
+               return ret;
+       }
+
+       /*
+        * If it is possible to detach the remote processor, keep an untouched
+        * copy of the resource table.  That way we can start fresh again when
+        * the remote processor is re-attached, that is:
+        *
+        *      DETACHED -> ATTACHED -> DETACHED -> ATTACHED
+        *
+        * Freed in rproc_reset_rsc_table_on_detach() and
+        * rproc_reset_rsc_table_on_stop(); not kept when there is no detach().
+        */
+       if (rproc->ops->detach) {
+               rproc->clean_table = kmemdup(table_ptr, table_sz, GFP_KERNEL);
+               if (!rproc->clean_table)
+                       return -ENOMEM;
+       } else {
+               rproc->clean_table = NULL;
+       }
+
+       rproc->cached_table = NULL;
+       rproc->table_ptr = table_ptr;
+       rproc->table_sz = table_sz;
+
+       return 0;
+}
+
+/*
+ * Detach-time resource table handling: switch rproc->table_ptr to a private
+ * copy and restore the installed table from clean_table for a later attach.
+ * Returns 0, -EINVAL if clean_table is missing, or -ENOMEM.
+ */
+static int rproc_reset_rsc_table_on_detach(struct rproc *rproc)
+{
+       struct resource_table *table_ptr;
+
+       /* A resource table was never retrieved, nothing to do here */
+       if (!rproc->table_ptr)
+               return 0;
+
+       /*
+        * If we made it to this point a clean_table _must_ have been
+        * allocated in rproc_set_rsc_table().  If one isn't present
+        * something went really wrong and we must complain.
+        */
+       if (WARN_ON(!rproc->clean_table))
+               return -EINVAL;
+
+       /* Remember where the external entity installed the resource table */
+       table_ptr = rproc->table_ptr;
+
+       /*
+        * If we made it here the remote processor was started by another
+        * entity and a cache table doesn't exist.  As such make a copy of
+        * the resource table currently used by the remote processor and
+        * use that for the rest of the detach process.  The memory
+        * allocated here is free'd in rproc_detach().
+        */
+       rproc->cached_table = kmemdup(rproc->table_ptr,
+                                     rproc->table_sz, GFP_KERNEL);
+       if (!rproc->cached_table)
+               return -ENOMEM;
+
+       /* Use the copy for the remainder of the detach process */
+       rproc->table_ptr = rproc->cached_table;
+
+       /*
+        * Reset the memory area where the firmware loaded the resource table
+        * to its original value.  That way when we re-attach the remote
+        * processor the resource table is clean and ready to be used again.
+        */
+       memcpy(table_ptr, rproc->clean_table, rproc->table_sz);
+
+       /* Free the clean table; NULL it so a retry cannot re-free it */
+       kfree(rproc->clean_table);
+       rproc->clean_table = NULL;
+
+       return 0;
+}
+
+/*
+ * Stop-time resource table handling: make rproc->table_ptr point at a cached
+ * copy of the table, creating the copy (and freeing clean_table) if the core
+ * has none yet.  Returns 0 or -ENOMEM.
+ */
+static int rproc_reset_rsc_table_on_stop(struct rproc *rproc)
+{
+       /* A resource table was never retrieved, nothing to do here */
+       if (!rproc->table_ptr)
+               return 0;
+
+       /*
+        * If a cache table exists the remote processor was started by
+        * the remoteproc core.  That cache table should be used for
+        * the rest of the shutdown process.
+        */
+       if (rproc->cached_table)
+               goto out;
+
+       /*
+        * If we made it here the remote processor was started by another
+        * entity and a cache table doesn't exist.  As such make a copy of
+        * the resource table currently used by the remote processor and
+        * use that for the rest of the shutdown process.  The memory
+        * allocated here is free'd in rproc_shutdown().
+        */
+       rproc->cached_table = kmemdup(rproc->table_ptr,
+                                     rproc->table_sz, GFP_KERNEL);
+       if (!rproc->cached_table)
+               return -ENOMEM;
+
+       /* Free the clean table; NULL it so a later detach cannot reuse it */
+       kfree(rproc->clean_table);
+       rproc->clean_table = NULL;
+
+out:
+       /* Use the copy for the remainder of the shutdown process */
+       rproc->table_ptr = rproc->cached_table;
+       return 0;
+}
+
 /*
  * Attach to remote processor - similar to rproc_fw_boot() but without
  * the steps that deal with the firmware image.
  */
-static int rproc_actuate(struct rproc *rproc)
+static int rproc_attach(struct rproc *rproc)
 {
        struct device *dev = &rproc->dev;
        int ret;
@@ -1556,6 +1700,19 @@ static int rproc_actuate(struct rproc *rproc)
                return ret;
        }
 
+       /* Do anything that is needed to boot the remote processor */
+       ret = rproc_prepare_device(rproc);
+       if (ret) {
+               dev_err(dev, "can't prepare rproc %s: %d\n", rproc->name, ret);
+               goto disable_iommu;
+       }
+
+       ret = rproc_set_rsc_table(rproc);
+       if (ret) {
+               dev_err(dev, "can't load resource table: %d\n", ret);
+               goto unprepare_device;
+       }
+
        /* reset max_notifyid */
        rproc->max_notifyid = -1;
 
@@ -1570,7 +1727,7 @@ static int rproc_actuate(struct rproc *rproc)
        ret = rproc_handle_resources(rproc, rproc_loading_handlers);
        if (ret) {
                dev_err(dev, "Failed to process resources: %d\n", ret);
-               goto disable_iommu;
+               goto unprepare_device;
        }
 
        /* Allocate carveout resources associated to rproc */
@@ -1581,7 +1738,7 @@ static int rproc_actuate(struct rproc *rproc)
                goto clean_up_resources;
        }
 
-       ret = rproc_attach(rproc);
+       ret = __rproc_attach(rproc);
        if (ret)
                goto clean_up_resources;
 
@@ -1589,6 +1746,9 @@ static int rproc_actuate(struct rproc *rproc)
 
 clean_up_resources:
        rproc_resource_cleanup(rproc);
+unprepare_device:
+       /* release HW resources if needed */
+       rproc_unprepare_device(rproc);
 disable_iommu:
        rproc_disable_iommu(rproc);
        return ret;
@@ -1642,11 +1802,20 @@ static int rproc_stop(struct rproc *rproc, bool crashed)
        struct device *dev = &rproc->dev;
        int ret;
 
+       /* No need to continue if a stop() operation has not been provided */
+       if (!rproc->ops->stop)
+               return -EINVAL;
+
        /* Stop any subdevices for the remote processor */
        rproc_stop_subdevices(rproc, crashed);
 
        /* the installed resource table is no longer accessible */
-       rproc->table_ptr = rproc->cached_table;
+       ret = rproc_reset_rsc_table_on_stop(rproc);
+       if (ret) {
+               dev_err(dev, "can't reset resource table: %d\n", ret);
+               return ret;
+       }
+
 
        /* power off the remote processor */
        ret = rproc->ops->stop(rproc);
@@ -1659,19 +1828,48 @@ static int rproc_stop(struct rproc *rproc, bool crashed)
 
        rproc->state = RPROC_OFFLINE;
 
-       /*
-        * The remote processor has been stopped and is now offline, which means
-        * that the next time it is brought back online the remoteproc core will
-        * be responsible to load its firmware.  As such it is no longer
-        * autonomous.
-        */
-       rproc->autonomous = false;
-
        dev_info(dev, "stopped remote processor %s\n", rproc->name);
 
        return 0;
 }
 
+/*
+ * __rproc_detach(): undo __rproc_attach(); returns 0 or a negative error code
+ */
+static int __rproc_detach(struct rproc *rproc)
+{
+       struct device *dev = &rproc->dev;
+       int ret;
+
+       /* No need to continue if a detach() operation has not been provided */
+       if (!rproc->ops->detach)
+               return -EINVAL;
+
+       /* Stop any subdevices for the remote processor */
+       rproc_stop_subdevices(rproc, false);
+
+       /* the installed resource table is no longer accessible */
+       ret = rproc_reset_rsc_table_on_detach(rproc);
+       if (ret) {
+               dev_err(dev, "can't reset resource table: %d\n", ret);
+               return ret;
+       }
+
+       /* Tell the remote processor the core isn't available anymore */
+       ret = rproc->ops->detach(rproc);
+       if (ret) {
+               dev_err(dev, "can't detach from rproc: %d\n", ret);
+               return ret;
+       }
+
+       rproc_unprepare_subdevices(rproc);
+
+       rproc->state = RPROC_DETACHED;
+
+       dev_info(dev, "detached remote processor %s\n", rproc->name);
+
+       return 0;
+}
 
 /**
  * rproc_trigger_recovery() - recover a remoteproc
@@ -1802,7 +2000,7 @@ int rproc_boot(struct rproc *rproc)
        if (rproc->state == RPROC_DETACHED) {
                dev_info(dev, "attaching to %s\n", rproc->name);
 
-               ret = rproc_actuate(rproc);
+               ret = rproc_attach(rproc);
        } else {
                dev_info(dev, "powering up %s\n", rproc->name);
 
@@ -1884,6 +2082,65 @@ out:
 }
 EXPORT_SYMBOL(rproc_shutdown);
 
+/**
+ * rproc_detach() - Detach the remote processor from the remoteproc core
+ * @rproc: the remote processor
+ *
+ * Detach a remote processor (previously attached to with rproc_attach()).
+ * In case @rproc is still being used by an additional user(s), then
+ * this function will just decrement the power refcount and return 0,
+ * without disconnecting the device.
+ *
+ * Function rproc_detach() calls __rproc_detach() in order to let a remote
+ * processor know that services provided by the application processor are
+ * no longer available.  From there it should be possible to remove the
+ * platform driver and even power cycle the application processor (if the HW
+ * supports it) without needing to switch off the remote processor.
+ *
+ * Returns 0 on success, or the error returned when the lock cannot be taken
+ * or __rproc_detach() fails (the power refcount is then restored).
+ */
+int rproc_detach(struct rproc *rproc)
+{
+       struct device *dev = &rproc->dev;
+       int ret;
+
+       ret = mutex_lock_interruptible(&rproc->lock);
+       if (ret) {
+               dev_err(dev, "can't lock rproc %s: %d\n", rproc->name, ret);
+               return ret;
+       }
+
+       /* if the remote proc is still needed, bail out */
+       if (!atomic_dec_and_test(&rproc->power)) {
+               ret = 0;
+               goto out;
+       }
+
+       ret = __rproc_detach(rproc);
+       if (ret) {
+               atomic_inc(&rproc->power);
+               goto out;
+       }
+
+       /* clean up all acquired resources */
+       rproc_resource_cleanup(rproc);
+
+       /* release HW resources if needed */
+       rproc_unprepare_device(rproc);
+
+       rproc_disable_iommu(rproc);
+
+       /* Free the copy of the resource table */
+       kfree(rproc->cached_table);
+       rproc->cached_table = NULL;
+       rproc->table_ptr = NULL;
+out:
+       mutex_unlock(&rproc->lock);
+       return ret;
+}
+EXPORT_SYMBOL(rproc_detach);
+
 /**
  * rproc_get_by_phandle() - find a remote processor by phandle
  * @phandle: phandle to the rproc
@@ -2077,16 +2334,6 @@ int rproc_add(struct rproc *rproc)
        if (ret < 0)
                return ret;
 
-       /*
-        * Remind ourselves the remote processor has been attached to rather
-        * than booted by the remoteproc core.  This is important because the
-        * RPROC_DETACHED state will be lost as soon as the remote processor
-        * has been attached to.  Used in firmware_show() and reset in
-        * rproc_stop().
-        */
-       if (rproc->state == RPROC_DETACHED)
-               rproc->autonomous = true;
-
        /* if rproc is marked always-on, request it to boot */
        if (rproc->auto_boot) {
                ret = rproc_trigger_auto_boot(rproc);
@@ -2347,10 +2594,8 @@ int rproc_del(struct rproc *rproc)
        if (!rproc)
                return -EINVAL;
 
-       /* if rproc is marked always-on, rproc_add() booted it */
        /* TODO: make sure this works with rproc->power > 1 */
-       if (rproc->auto_boot)
-               rproc_shutdown(rproc);
+       rproc_shutdown(rproc);
 
        mutex_lock(&rproc->lock);
        rproc->state = RPROC_DELETED;
@@ -2492,7 +2737,11 @@ static int rproc_panic_handler(struct notifier_block *nb, unsigned long event,
 
        rcu_read_lock();
        list_for_each_entry_rcu(rproc, &rproc_list, node) {
-               if (!rproc->ops->panic || rproc->state != RPROC_RUNNING)
+               if (!rproc->ops->panic)
+                       continue;
+
+               if (rproc->state != RPROC_RUNNING &&
+                   rproc->state != RPROC_ATTACHED)
                        continue;
 
                d = rproc->ops->panic(rproc);
index 81ec154..aee657c 100644 (file)
@@ -153,18 +153,22 @@ static void rproc_copy_segment(struct rproc *rproc, void *dest,
                               size_t offset, size_t size)
 {
        void *ptr;
+       bool is_iomem = false; /* da_to_va() callbacks may leave it unset */
 
        if (segment->dump) {
                segment->dump(rproc, segment, dest, offset, size);
        } else {
-               ptr = rproc_da_to_va(rproc, segment->da + offset, size);
+               ptr = rproc_da_to_va(rproc, segment->da + offset, size, &is_iomem);
                if (!ptr) {
                        dev_err(&rproc->dev,
                                "invalid copy request for segment %pad with offset %zu and size %zu)\n",
                                &segment->da, offset, size);
                        memset(dest, 0xff, size);
                } else {
-                       memcpy(dest, ptr, size);
+                       if (is_iomem)
+                               memcpy_fromio(dest, ptr, size);
+                       else
+                               memcpy(dest, ptr, size);
                }
        }
 }
index 7e58453..b5a1e3b 100644 (file)
@@ -132,7 +132,7 @@ static ssize_t rproc_trace_read(struct file *filp, char __user *userbuf,
        char buf[100];
        int len;
 
-       va = rproc_da_to_va(data->rproc, trace->da, trace->len);
+       va = rproc_da_to_va(data->rproc, trace->da, trace->len, NULL);
 
        if (!va) {
                len = scnprintf(buf, sizeof(buf), "Trace %s not available\n",
index df68d87..1142358 100644 (file)
@@ -175,6 +175,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
                u64 offset = elf_phdr_get_p_offset(class, phdr);
                u32 type = elf_phdr_get_p_type(class, phdr);
                void *ptr;
+               bool is_iomem = false; /* da_to_va() callbacks may leave it unset */
 
                if (type != PT_LOAD)
                        continue;
@@ -204,7 +205,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
                }
 
                /* grab the kernel address for this device address */
-               ptr = rproc_da_to_va(rproc, da, memsz);
+               ptr = rproc_da_to_va(rproc, da, memsz, &is_iomem);
                if (!ptr) {
                        dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da,
                                memsz);
@@ -213,8 +214,12 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
                }
 
                /* put the segment where the remote processor expects it */
-               if (filesz)
-                       memcpy(ptr, elf_data + offset, filesz);
+               /* ptr is the destination: I/O memory needs memcpy_toio() */
+               if (filesz && is_iomem)
+                       memcpy_toio((void __iomem *)ptr, elf_data + offset,
+                                   filesz);
+               else if (filesz)
+                       memcpy(ptr, elf_data + offset, filesz);
 
                /*
                 * Zero out remaining memory for this segment.
@@ -223,8 +228,12 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
                 * did this for us. albeit harmless, we may consider removing
                 * this.
                 */
-               if (memsz > filesz)
-                       memset(ptr + filesz, 0, memsz - filesz);
+               if (memsz > filesz) {
+                       if (is_iomem)
+                               memset_io((void __iomem *)(ptr + filesz), 0, memsz - filesz);
+                       else
+                               memset(ptr + filesz, 0, memsz - filesz);
+               }
        }
 
        return ret;
@@ -377,6 +386,6 @@ struct resource_table *rproc_elf_find_loaded_rsc_table(struct rproc *rproc,
                return NULL;
        }
 
-       return rproc_da_to_va(rproc, sh_addr, sh_size);
+       return rproc_da_to_va(rproc, sh_addr, sh_size, NULL);
 }
 EXPORT_SYMBOL(rproc_elf_find_loaded_rsc_table);
index c340028..a328e63 100644 (file)
@@ -84,7 +84,7 @@ static inline void  rproc_char_device_remove(struct rproc *rproc)
 void rproc_free_vring(struct rproc_vring *rvring);
 int rproc_alloc_vring(struct rproc_vdev *rvdev, int i);
 
-void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len);
+void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
 phys_addr_t rproc_va_to_pa(void *cpu_addr);
 int rproc_trigger_recovery(struct rproc *rproc);
 
@@ -177,6 +177,16 @@ struct resource_table *rproc_find_loaded_rsc_table(struct rproc *rproc,
        return NULL;
 }
 
+static inline
+struct resource_table *rproc_get_loaded_rsc_table(struct rproc *rproc,
+                                                 size_t *size)
+{
+       if (rproc->ops->get_loaded_rsc_table)
+               return rproc->ops->get_loaded_rsc_table(rproc, size);
+
+       return NULL;
+}
+
 static inline
 bool rproc_u64_fit_in_size_t(u64 val)
 {
index 1dbef89..ea8b89f 100644 (file)
@@ -15,7 +15,7 @@ static ssize_t recovery_show(struct device *dev,
 {
        struct rproc *rproc = to_rproc(dev);
 
-       return sprintf(buf, "%s", rproc->recovery_disabled ? "disabled\n" : "enabled\n");
+       return sysfs_emit(buf, "%s", rproc->recovery_disabled ? "disabled\n" : "enabled\n");
 }
 
 /*
@@ -82,7 +82,7 @@ static ssize_t coredump_show(struct device *dev,
 {
        struct rproc *rproc = to_rproc(dev);
 
-       return sprintf(buf, "%s\n", rproc_coredump_str[rproc->dump_conf]);
+       return sysfs_emit(buf, "%s\n", rproc_coredump_str[rproc->dump_conf]);
 }
 
 /*
@@ -138,11 +138,8 @@ static ssize_t firmware_show(struct device *dev, struct device_attribute *attr,
         * If the remote processor has been started by an external
         * entity we have no idea of what image it is running.  As such
         * simply display a generic string rather then rproc->firmware.
-        *
-        * Here we rely on the autonomous flag because a remote processor
-        * may have been attached to and currently in a running state.
         */
-       if (rproc->autonomous)
+       if (rproc->state == RPROC_ATTACHED)
                firmware = "unknown";
 
        return sprintf(buf, "%s\n", firmware);
@@ -172,6 +169,7 @@ static const char * const rproc_state_string[] = {
        [RPROC_RUNNING]         = "running",
        [RPROC_CRASHED]         = "crashed",
        [RPROC_DELETED]         = "deleted",
+       [RPROC_ATTACHED]        = "attached",
        [RPROC_DETACHED]        = "detached",
        [RPROC_LAST]            = "invalid",
 };
@@ -196,17 +194,24 @@ static ssize_t state_store(struct device *dev,
        int ret = 0;
 
        if (sysfs_streq(buf, "start")) {
-               if (rproc->state == RPROC_RUNNING)
+               if (rproc->state == RPROC_RUNNING ||
+                   rproc->state == RPROC_ATTACHED)
                        return -EBUSY;
 
                ret = rproc_boot(rproc);
                if (ret)
                        dev_err(&rproc->dev, "Boot failed: %d\n", ret);
        } else if (sysfs_streq(buf, "stop")) {
-               if (rproc->state != RPROC_RUNNING)
+               if (rproc->state != RPROC_RUNNING &&
+                   rproc->state != RPROC_ATTACHED)
                        return -EINVAL;
 
                rproc_shutdown(rproc);
+       } else if (sysfs_streq(buf, "detach")) {
+               if (rproc->state != RPROC_ATTACHED)
+                       return -EINVAL;
+
+               ret = rproc_detach(rproc);
        } else {
                dev_err(&rproc->dev, "Unrecognised option: %s\n", buf);
                ret = -EINVAL;
index 09bcb4d..22096ad 100644 (file)
@@ -174,7 +174,7 @@ static int slim_rproc_stop(struct rproc *rproc)
        return 0;
 }
 
-static void *slim_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *slim_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct st_slim_rproc *slim_rproc = rproc->priv;
        void *va = NULL;
index ccb3c14..7353f9e 100644 (file)
@@ -28,7 +28,7 @@
 #define RELEASE_BOOT           1
 
 #define MBOX_NB_VQ             2
-#define MBOX_NB_MBX            3
+#define MBOX_NB_MBX            4
 
 #define STM32_SMC_RCC          0x82001000
 #define STM32_SMC_REG_WRITE    0x1
@@ -38,6 +38,7 @@
 #define STM32_MBX_VQ1          "vq1"
 #define STM32_MBX_VQ1_ID       1
 #define STM32_MBX_SHUTDOWN     "shutdown"
+#define STM32_MBX_DETACH       "detach"
 
 #define RSC_TBL_SIZE           1024
 
@@ -207,16 +208,7 @@ static int stm32_rproc_mbox_idx(struct rproc *rproc, const unsigned char *name)
        return -EINVAL;
 }
 
-static int stm32_rproc_elf_load_rsc_table(struct rproc *rproc,
-                                         const struct firmware *fw)
-{
-       if (rproc_elf_load_rsc_table(rproc, fw))
-               dev_warn(&rproc->dev, "no resource table found for this firmware\n");
-
-       return 0;
-}
-
-static int stm32_rproc_parse_memory_regions(struct rproc *rproc)
+static int stm32_rproc_prepare(struct rproc *rproc)
 {
        struct device *dev = rproc->dev.parent;
        struct device_node *np = dev->of_node;
@@ -274,12 +266,10 @@ static int stm32_rproc_parse_memory_regions(struct rproc *rproc)
 
 static int stm32_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw)
 {
-       int ret = stm32_rproc_parse_memory_regions(rproc);
-
-       if (ret)
-               return ret;
+       if (rproc_elf_load_rsc_table(rproc, fw))
+               dev_warn(&rproc->dev, "no resource table found for this firmware\n");
 
-       return stm32_rproc_elf_load_rsc_table(rproc, fw);
+       return 0;
 }
 
 static irqreturn_t stm32_rproc_wdg(int irq, void *data)
@@ -347,6 +337,15 @@ static const struct stm32_mbox stm32_rproc_mbox[MBOX_NB_MBX] = {
                        .tx_done = NULL,
                        .tx_tout = 500, /* 500 ms time out */
                },
+       },
+       {
+               .name = STM32_MBX_DETACH,
+               .vq_id = -1,
+               .client = {
+                       .tx_block = true,
+                       .tx_done = NULL,
+                       .tx_tout = 200, /* 200 ms time out to detach should be fair enough */
+               },
        }
 };
 
@@ -472,6 +471,25 @@ static int stm32_rproc_attach(struct rproc *rproc)
        return stm32_rproc_set_hold_boot(rproc, true);
 }
 
+static int stm32_rproc_detach(struct rproc *rproc)
+{
+       struct stm32_rproc *ddata = rproc->priv;
+       int err, dummy_data, idx;
+
+       /* Inform the remote processor of the detach */
+       idx = stm32_rproc_mbox_idx(rproc, STM32_MBX_DETACH);
+       if (idx >= 0 && ddata->mb[idx].chan) {
+               /* A dummy data is sent to allow to block on transmit */
+               err = mbox_send_message(ddata->mb[idx].chan,
+                                       &dummy_data);
+               if (err < 0)
+                       dev_warn(&rproc->dev, "warning: remote FW detach without ack\n");
+       }
+
+       /* Allow remote processor to auto-reboot */
+       return stm32_rproc_set_hold_boot(rproc, false);
+}
+
 static int stm32_rproc_stop(struct rproc *rproc)
 {
        struct stm32_rproc *ddata = rproc->priv;
@@ -546,14 +564,89 @@ static void stm32_rproc_kick(struct rproc *rproc, int vqid)
        }
 }
 
+/* Translate device address @da into *@pa using ddata->rmems; returns 0 or -EINVAL. */
+static int stm32_rproc_da_to_pa(struct rproc *rproc, u64 da, phys_addr_t *pa)
+{
+       struct stm32_rproc *ddata = rproc->priv;
+       struct device *dev = rproc->dev.parent;
+       struct stm32_rproc_mem *p_mem;
+       unsigned int i;
+
+       for (i = 0; i < ddata->nb_rmems; i++) {
+               p_mem = &ddata->rmems[i];
+               /* Skip regions whose [dev_addr, dev_addr + size) window does not hold da */
+               if (da < p_mem->dev_addr ||
+                   da >= p_mem->dev_addr + p_mem->size)
+                       continue;
+               /* Keep the offset inside the region, rebased onto its bus address */
+               *pa = da - p_mem->dev_addr + p_mem->bus_addr;
+               dev_dbg(dev, "da %llx to pa %pap\n", da, pa);
+
+               return 0;
+       }
+
+       dev_err(dev, "can't translate da %llx\n", da);
+
+       return -EINVAL;
+}
+
+static struct resource_table *
+stm32_rproc_get_loaded_rsc_table(struct rproc *rproc, size_t *table_sz)
+{
+       struct stm32_rproc *ddata = rproc->priv;
+       struct device *dev = rproc->dev.parent;
+       phys_addr_t rsc_pa;
+       u32 rsc_da;
+       int err;
+
+       /* The resource table has already been mapped, nothing to do */
+       if (ddata->rsc_va)
+               goto done;
+       /* Fetch the table's device address through the rsctbl regmap */
+       err = regmap_read(ddata->rsctbl.map, ddata->rsctbl.reg, &rsc_da);
+       if (err) {
+               dev_err(dev, "failed to read rsc tbl addr\n");
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (!rsc_da)
+               /* no rsc table */
+               return ERR_PTR(-ENOENT);
+       /* The mapping below needs a physical address, not a device address */
+       err = stm32_rproc_da_to_pa(rproc, rsc_da, &rsc_pa);
+       if (err)
+               return ERR_PTR(err);
+       /* RSC_TBL_SIZE is a plain int constant, so it is printed with %x, not %zx */
+       ddata->rsc_va = devm_ioremap_wc(dev, rsc_pa, RSC_TBL_SIZE);
+       if (IS_ERR_OR_NULL(ddata->rsc_va)) {
+               dev_err(dev, "Unable to map memory region: %pa+%x\n",
+                       &rsc_pa, RSC_TBL_SIZE);
+               ddata->rsc_va = NULL;
+               return ERR_PTR(-ENOMEM);
+       }
+
+done:
+       /*
+        * Assuming the resource table fits in 1kB is fair.
+        * Notice for the detach, that this 1 kB memory area has to be reserved in the coprocessor
+        * firmware for the resource table. On detach, the remoteproc core re-initializes this
+        * entire area by overwriting it with the initial values stored in rproc->clean_table.
+        */
+       *table_sz = RSC_TBL_SIZE;
+       return (struct resource_table *)ddata->rsc_va;
+}
+
 static const struct rproc_ops st_rproc_ops = {
+       .prepare        = stm32_rproc_prepare,
        .start          = stm32_rproc_start,
        .stop           = stm32_rproc_stop,
        .attach         = stm32_rproc_attach,
+       .detach         = stm32_rproc_detach,
        .kick           = stm32_rproc_kick,
        .load           = rproc_elf_load_segments,
        .parse_fw       = stm32_rproc_parse_fw,
        .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table,
+       .get_loaded_rsc_table = stm32_rproc_get_loaded_rsc_table,
        .sanity_check   = rproc_elf_sanity_check,
        .get_boot_addr  = rproc_elf_get_boot_addr,
 };
@@ -695,75 +788,6 @@ static int stm32_rproc_get_m4_status(struct stm32_rproc *ddata,
        return regmap_read(ddata->m4_state.map, ddata->m4_state.reg, state);
 }
 
-static int stm32_rproc_da_to_pa(struct platform_device *pdev,
-                               struct stm32_rproc *ddata,
-                               u64 da, phys_addr_t *pa)
-{
-       struct device *dev = &pdev->dev;
-       struct stm32_rproc_mem *p_mem;
-       unsigned int i;
-
-       for (i = 0; i < ddata->nb_rmems; i++) {
-               p_mem = &ddata->rmems[i];
-
-               if (da < p_mem->dev_addr ||
-                   da >= p_mem->dev_addr + p_mem->size)
-                       continue;
-
-               *pa = da - p_mem->dev_addr + p_mem->bus_addr;
-               dev_dbg(dev, "da %llx to pa %#x\n", da, *pa);
-
-               return 0;
-       }
-
-       dev_err(dev, "can't translate da %llx\n", da);
-
-       return -EINVAL;
-}
-
-static int stm32_rproc_get_loaded_rsc_table(struct platform_device *pdev,
-                                           struct rproc *rproc,
-                                           struct stm32_rproc *ddata)
-{
-       struct device *dev = &pdev->dev;
-       phys_addr_t rsc_pa;
-       u32 rsc_da;
-       int err;
-
-       err = regmap_read(ddata->rsctbl.map, ddata->rsctbl.reg, &rsc_da);
-       if (err) {
-               dev_err(dev, "failed to read rsc tbl addr\n");
-               return err;
-       }
-
-       if (!rsc_da)
-               /* no rsc table */
-               return 0;
-
-       err = stm32_rproc_da_to_pa(pdev, ddata, rsc_da, &rsc_pa);
-       if (err)
-               return err;
-
-       ddata->rsc_va = devm_ioremap_wc(dev, rsc_pa, RSC_TBL_SIZE);
-       if (IS_ERR_OR_NULL(ddata->rsc_va)) {
-               dev_err(dev, "Unable to map memory region: %pa+%zx\n",
-                       &rsc_pa, RSC_TBL_SIZE);
-               ddata->rsc_va = NULL;
-               return -ENOMEM;
-       }
-
-       /*
-        * The resource table is already loaded in device memory, no need
-        * to work with a cached table.
-        */
-       rproc->cached_table = NULL;
-       /* Assuming the resource table fits in 1kB is fair */
-       rproc->table_sz = RSC_TBL_SIZE;
-       rproc->table_ptr = (struct resource_table *)ddata->rsc_va;
-
-       return 0;
-}
-
 static int stm32_rproc_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
@@ -797,18 +821,9 @@ static int stm32_rproc_probe(struct platform_device *pdev)
        if (ret)
                goto free_rproc;
 
-       if (state == M4_STATE_CRUN) {
+       if (state == M4_STATE_CRUN)
                rproc->state = RPROC_DETACHED;
 
-               ret = stm32_rproc_parse_memory_regions(rproc);
-               if (ret)
-                       goto free_resources;
-
-               ret = stm32_rproc_get_loaded_rsc_table(pdev, rproc, ddata);
-               if (ret)
-                       goto free_resources;
-       }
-
        rproc->has_iommu = false;
        ddata->workqueue = create_workqueue(dev_name(dev));
        if (!ddata->workqueue) {
index 863c021..fd4eb67 100644 (file)
@@ -354,7 +354,7 @@ static int k3_dsp_rproc_stop(struct rproc *rproc)
  * can be used either by the remoteproc core for loading (when using kernel
  * remoteproc loader), or by any rpmsg bus drivers.
  */
-static void *k3_dsp_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *k3_dsp_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct k3_dsp_rproc *kproc = rproc->priv;
        void __iomem *va = NULL;
index 62b5a4c..5cf8d03 100644 (file)
@@ -590,7 +590,7 @@ out:
  * present in a DSP or IPU device). The translated addresses can be used
  * either by the remoteproc core for loading, or by any rpmsg bus drivers.
  */
-static void *k3_r5_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *k3_r5_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct k3_r5_rproc *kproc = rproc->priv;
        struct k3_r5_core *core = kproc->core;
index 92d387d..484f760 100644 (file)
@@ -89,7 +89,7 @@ static int wkup_m3_rproc_stop(struct rproc *rproc)
        return error;
 }
 
-static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
        struct wkup_m3_rproc *wkupm3 = rproc->priv;
        void *va = NULL;
index 7043c7f..3e7f55e 100644 (file)
@@ -197,6 +197,7 @@ config RESET_SIMPLE
           - RCC reset controller in STM32 MCUs
           - Allwinner SoCs
           - ZTE's zx2967 family
+          - SiFive FU740 SoCs
 
 config RESET_STM32MP157
        bool "STM32MP157 Reset Driver" if COMPILE_TEST
index 27a0516..05533c7 100644 (file)
@@ -857,6 +857,7 @@ static int qcom_glink_rx_data(struct qcom_glink *glink, size_t avail)
                        dev_err(glink->dev,
                                "no intent found for channel %s intent %d",
                                channel->name, liid);
+                       ret = -ENOENT;
                        goto advance_rx;
                }
        }
@@ -1332,6 +1333,20 @@ static int qcom_glink_trysend(struct rpmsg_endpoint *ept, void *data, int len)
        return __qcom_glink_send(channel, data, len, false);
 }
 
+static int qcom_glink_sendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst)
+{
+       struct glink_channel *channel = to_glink_channel(ept);
+
+       return __qcom_glink_send(channel, data, len, true);
+}
+
+static int qcom_glink_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst)
+{
+       struct glink_channel *channel = to_glink_channel(ept);
+
+       return __qcom_glink_send(channel, data, len, false);
+}
+
 /*
  * Finds the device_node for the glink child interested in this channel.
  */
@@ -1364,7 +1379,9 @@ static const struct rpmsg_device_ops glink_device_ops = {
 static const struct rpmsg_endpoint_ops glink_endpoint_ops = {
        .destroy_ept = qcom_glink_destroy_ept,
        .send = qcom_glink_send,
+       .sendto = qcom_glink_sendto,
        .trysend = qcom_glink_trysend,
+       .trysendto = qcom_glink_trysendto,
 };
 
 static void qcom_glink_rpdev_release(struct device *dev)
index 19903de..8da1b5c 100644 (file)
@@ -974,6 +974,20 @@ static int qcom_smd_trysend(struct rpmsg_endpoint *ept, void *data, int len)
        return __qcom_smd_send(qsept->qsch, data, len, false);
 }
 
+static int qcom_smd_sendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst)
+{
+       struct qcom_smd_endpoint *qsept = to_smd_endpoint(ept);
+
+       return __qcom_smd_send(qsept->qsch, data, len, true);
+}
+
+static int qcom_smd_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst)
+{
+       struct qcom_smd_endpoint *qsept = to_smd_endpoint(ept);
+
+       return __qcom_smd_send(qsept->qsch, data, len, false);
+}
+
 static __poll_t qcom_smd_poll(struct rpmsg_endpoint *ept,
                                  struct file *filp, poll_table *wait)
 {
@@ -1038,7 +1052,9 @@ static const struct rpmsg_device_ops qcom_smd_device_ops = {
 static const struct rpmsg_endpoint_ops qcom_smd_endpoint_ops = {
        .destroy_ept = qcom_smd_destroy_ept,
        .send = qcom_smd_send,
+       .sendto = qcom_smd_sendto,
        .trysend = qcom_smd_trysend,
+       .trysendto = qcom_smd_trysendto,
        .poll = qcom_smd_poll,
 };
 
index 4bbbacd..2bebc9b 100644 (file)
@@ -127,6 +127,9 @@ static int rpmsg_eptdev_open(struct inode *inode, struct file *filp)
        struct rpmsg_device *rpdev = eptdev->rpdev;
        struct device *dev = &eptdev->dev;
 
+       if (eptdev->ept)
+               return -EBUSY;
+
        get_device(dev);
 
        ept = rpmsg_create_ept(rpdev, rpmsg_ept_cb, eptdev, eptdev->chinfo);
@@ -239,9 +242,9 @@ static ssize_t rpmsg_eptdev_write_iter(struct kiocb *iocb,
        }
 
        if (filp->f_flags & O_NONBLOCK)
-               ret = rpmsg_trysend(eptdev->ept, kbuf, len);
+               ret = rpmsg_trysendto(eptdev->ept, kbuf, len, eptdev->chinfo.dst);
        else
-               ret = rpmsg_send(eptdev->ept, kbuf, len);
+               ret = rpmsg_sendto(eptdev->ept, kbuf, len, eptdev->chinfo.dst);
 
 unlock_eptdev:
        mutex_unlock(&eptdev->ept_lock);
@@ -543,7 +546,7 @@ static struct rpmsg_driver rpmsg_chrdev_driver = {
        },
 };
 
-static int rpmsg_char_init(void)
+static int rpmsg_chrdev_init(void)
 {
        int ret;
 
@@ -569,7 +572,7 @@ static int rpmsg_char_init(void)
 
        return ret;
 }
-postcore_initcall(rpmsg_char_init);
+postcore_initcall(rpmsg_chrdev_init);
 
 static void rpmsg_chrdev_exit(void)
 {
index e87d4cf..8e49a3b 100644 (file)
@@ -813,14 +813,57 @@ static void rpmsg_xmit_done(struct virtqueue *svq)
        wake_up_interruptible(&vrp->sendq);
 }
 
+/*
+ * Called to expose to user a /dev/rpmsg_ctrlX interface allowing to
+ * create endpoint-to-endpoint communication without associated RPMsg channel.
+ * The endpoints are attached to the ctrldev RPMsg device.
+ */
+static struct rpmsg_device *rpmsg_virtio_add_ctrl_dev(struct virtio_device *vdev)
+{
+       struct virtproc_info *vrp = vdev->priv;
+       struct virtio_rpmsg_channel *vch;
+       struct rpmsg_device *rpdev_ctrl;
+       int err = 0;
+
+       vch = kzalloc(sizeof(*vch), GFP_KERNEL);
+       if (!vch)
+               return ERR_PTR(-ENOMEM);
+
+       /* Link the channel to the vrp */
+       vch->vrp = vrp;
+
+       /* Assign public information to the rpmsg_device */
+       rpdev_ctrl = &vch->rpdev;
+       rpdev_ctrl->ops = &virtio_rpmsg_ops;
+
+       rpdev_ctrl->dev.parent = &vrp->vdev->dev;
+       rpdev_ctrl->dev.release = virtio_rpmsg_release_device;
+       rpdev_ctrl->little_endian = virtio_is_little_endian(vrp->vdev);
+
+       err = rpmsg_chrdev_register_device(rpdev_ctrl);
+       if (err) {
+               kfree(vch);
+               return ERR_PTR(err);
+       }
+
+       return rpdev_ctrl;
+}
+
+static void rpmsg_virtio_del_ctrl_dev(struct rpmsg_device *rpdev_ctrl)
+{
+       if (!rpdev_ctrl)
+               return;
+       kfree(to_virtio_rpmsg_channel(rpdev_ctrl));
+}
+
 static int rpmsg_probe(struct virtio_device *vdev)
 {
        vq_callback_t *vq_cbs[] = { rpmsg_recv_done, rpmsg_xmit_done };
        static const char * const names[] = { "input", "output" };
        struct virtqueue *vqs[2];
        struct virtproc_info *vrp;
-       struct virtio_rpmsg_channel *vch;
-       struct rpmsg_device *rpdev_ns;
+       struct virtio_rpmsg_channel *vch = NULL;
+       struct rpmsg_device *rpdev_ns, *rpdev_ctrl;
        void *bufs_va;
        int err = 0, i;
        size_t total_buf_space;
@@ -894,12 +937,18 @@ static int rpmsg_probe(struct virtio_device *vdev)
 
        vdev->priv = vrp;
 
+       rpdev_ctrl = rpmsg_virtio_add_ctrl_dev(vdev);
+       if (IS_ERR(rpdev_ctrl)) {
+               err = PTR_ERR(rpdev_ctrl);
+               goto free_coherent;
+       }
+
        /* if supported by the remote processor, enable the name service */
        if (virtio_has_feature(vdev, VIRTIO_RPMSG_F_NS)) {
                vch = kzalloc(sizeof(*vch), GFP_KERNEL);
                if (!vch) {
                        err = -ENOMEM;
-                       goto free_coherent;
+                       goto free_ctrldev;
                }
 
                /* Link the channel to our vrp */
@@ -915,7 +964,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
 
                err = rpmsg_ns_register_device(rpdev_ns);
                if (err)
-                       goto free_coherent;
+                       goto free_vch;
        }
 
        /*
@@ -939,8 +988,11 @@ static int rpmsg_probe(struct virtio_device *vdev)
 
        return 0;
 
-free_coherent:
+free_vch:
        kfree(vch);
+free_ctrldev:
+       rpmsg_virtio_del_ctrl_dev(rpdev_ctrl);
+free_coherent:
        dma_free_coherent(vdev->dev.parent, total_buf_space,
                          bufs_va, vrp->bufs_dma);
 vqs_del:
index 773386b..d8c13fd 100644 (file)
@@ -1339,6 +1339,7 @@ config RTC_DRV_DIGICOLOR
 config RTC_DRV_IMXDI
        tristate "Freescale IMX DryIce Real Time Clock"
        depends on ARCH_MXC
+       depends on OF
        help
           Support for Freescale IMX DryIce RTC
 
@@ -1906,7 +1907,7 @@ config RTC_DRV_HID_SENSOR_TIME
 
 config RTC_DRV_GOLDFISH
        tristate "Goldfish Real Time Clock"
-       depends on OF && HAS_IOMEM
+       depends on HAS_IOMEM
        help
          Say yes to enable RTC driver for the Goldfish based virtual platform.
 
index dcb34c7..9a2bd49 100644 (file)
@@ -545,7 +545,7 @@ EXPORT_SYMBOL_GPL(rtc_alarm_irq_enable);
 
 int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
 {
-       int rc = 0, err;
+       int err;
 
        err = mutex_lock_interruptible(&rtc->ops_lock);
        if (err)
@@ -561,17 +561,21 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
        if (rtc->uie_rtctimer.enabled == enabled)
                goto out;
 
-       if (rtc->uie_unsupported) {
-               err = -EINVAL;
-               goto out;
+       if (rtc->uie_unsupported || !test_bit(RTC_FEATURE_ALARM, rtc->features)) {
+               mutex_unlock(&rtc->ops_lock);
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+               return rtc_dev_update_irq_enable_emul(rtc, enabled);
+#else
+               return -EINVAL;
+#endif
        }
 
        if (enabled) {
                struct rtc_time tm;
                ktime_t now, onesec;
 
-               rc = __rtc_read_time(rtc, &tm);
-               if (rc)
+               err = __rtc_read_time(rtc, &tm);
+               if (err)
                        goto out;
                onesec = ktime_set(1, 0);
                now = rtc_tm_to_ktime(tm);
@@ -585,24 +589,6 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
 out:
        mutex_unlock(&rtc->ops_lock);
 
-       /*
-        * __rtc_read_time() failed, this probably means that the RTC time has
-        * never been set or less probably there is a transient error on the
-        * bus. In any case, avoid enabling emulation has this will fail when
-        * reading the time too.
-        */
-       if (rc)
-               return rc;
-
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
-       /*
-        * Enable emulation if the driver returned -EINVAL to signal that it has
-        * been configured without interrupts or they are not available at the
-        * moment.
-        */
-       if (err == -EINVAL)
-               err = rtc_dev_update_irq_enable_emul(rtc, enabled);
-#endif
        return err;
 }
 EXPORT_SYMBOL_GPL(rtc_update_irq_enable);
index b20d8f2..a9b3555 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/bcd.h>
 #include <linux/of.h>
 #include <linux/regmap.h>
+#include <linux/bitfield.h>
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 
 
 #define ABEOZ9_SEC_LEN                 7
 
+#define ABEOZ9_REG_ALARM_SEC           0x10
+#define ABEOZ9_BIT_ALARM_SEC           GENMASK(6, 0)
+#define ABEOZ9_REG_ALARM_MIN           0x11
+#define ABEOZ9_BIT_ALARM_MIN           GENMASK(6, 0)
+#define ABEOZ9_REG_ALARM_HOURS         0x12
+#define ABEOZ9_BIT_ALARM_HOURS_PM      BIT(5)
+#define ABEOZ9_BIT_ALARM_HOURS         GENMASK(4, 0)
+#define ABEOZ9_REG_ALARM_DAYS          0x13
+#define ABEOZ9_BIT_ALARM_DAYS          GENMASK(5, 0)
+#define ABEOZ9_REG_ALARM_WEEKDAYS      0x14
+#define ABEOZ9_BIT_ALARM_WEEKDAYS      GENMASK(2, 0)
+#define ABEOZ9_REG_ALARM_MONTHS                0x15
+#define ABEOZ9_BIT_ALARM_MONTHS                GENMASK(4, 0)
+#define ABEOZ9_REG_ALARM_YEARS         0x16
+
+#define ABEOZ9_ALARM_LEN               7
+#define ABEOZ9_BIT_ALARM_AE            BIT(7)
+
 #define ABEOZ9_REG_REG_TEMP            0x20
 #define ABEOZ953_TEMP_MAX              120
 #define ABEOZ953_TEMP_MIN              -60
@@ -186,6 +205,98 @@ static int abeoz9_rtc_set_time(struct device *dev, struct rtc_time *tm)
        return abeoz9_reset_validity(regmap);
 }
 
+static int abeoz9_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
+{
+       struct abeoz9_rtc_data *data = dev_get_drvdata(dev);
+       struct regmap *regmap = data->regmap;
+       u8 regs[ABEOZ9_ALARM_LEN];
+       u8 val[2];
+       int ret;
+
+       ret = abeoz9_check_validity(dev);
+       if (ret)
+               return ret;
+
+       ret = regmap_bulk_read(regmap, ABEOZ9_REG_CTRL_INT, val, sizeof(val));
+       if (ret)
+               return ret;
+
+       alarm->enabled = val[0] & ABEOZ9_REG_CTRL_INT_AIE;
+       alarm->pending = val[1] & ABEOZ9_REG_CTRL_INT_FLAG_AF;
+
+       ret = regmap_bulk_read(regmap, ABEOZ9_REG_ALARM_SEC, regs, sizeof(regs));
+       if (ret)
+               return ret;
+
+       alarm->time.tm_sec = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_SEC, regs[0]));
+       alarm->time.tm_min = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_MIN, regs[1]));
+       alarm->time.tm_hour = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_HOURS, regs[2]));
+       if (FIELD_GET(ABEOZ9_BIT_ALARM_HOURS_PM, regs[2]))
+               alarm->time.tm_hour += 12;
+
+       alarm->time.tm_mday = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_DAYS, regs[3]));
+
+       return 0;
+}
+
+static int abeoz9_rtc_alarm_irq_enable(struct device *dev, u32 enable)
+{
+       struct abeoz9_rtc_data *data = dev_get_drvdata(dev);
+
+       return regmap_update_bits(data->regmap, ABEOZ9_REG_CTRL_INT,
+                                 ABEOZ9_REG_CTRL_INT_AIE,
+                                 FIELD_PREP(ABEOZ9_REG_CTRL_INT_AIE, enable));
+}
+
+static int abeoz9_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
+{
+       struct abeoz9_rtc_data *data = dev_get_drvdata(dev);
+       u8 regs[ABEOZ9_ALARM_LEN] = {0};
+       int ret;
+
+       ret = regmap_update_bits(data->regmap, ABEOZ9_REG_CTRL_INT_FLAG,
+                                ABEOZ9_REG_CTRL_INT_FLAG_AF, 0);
+       if (ret)
+               return ret;
+
+       regs[0] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_SEC,
+                                                  bin2bcd(alarm->time.tm_sec));
+       regs[1] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_MIN,
+                                                  bin2bcd(alarm->time.tm_min));
+       regs[2] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_HOURS,
+                                                  bin2bcd(alarm->time.tm_hour));
+       regs[3] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_DAYS,
+                                                  bin2bcd(alarm->time.tm_mday));
+
+       ret = regmap_bulk_write(data->regmap, ABEOZ9_REG_ALARM_SEC, regs,
+                               sizeof(regs));
+       if (ret)
+               return ret;
+
+       return abeoz9_rtc_alarm_irq_enable(dev, alarm->enabled);
+}
+
+static irqreturn_t abeoz9_rtc_irq(int irq, void *dev)
+{
+       struct abeoz9_rtc_data *data = dev_get_drvdata(dev);
+       unsigned int val;
+       int ret;
+
+       ret = regmap_read(data->regmap, ABEOZ9_REG_CTRL_INT_FLAG, &val);
+       if (ret)
+               return IRQ_NONE;
+
+       if (!FIELD_GET(ABEOZ9_REG_CTRL_INT_FLAG_AF, val))
+               return IRQ_NONE;
+
+       regmap_update_bits(data->regmap, ABEOZ9_REG_CTRL_INT_FLAG,
+                          ABEOZ9_REG_CTRL_INT_FLAG_AF, 0);
+
+       rtc_update_irq(data->rtc, 1, RTC_IRQF | RTC_AF);
+
+       return IRQ_HANDLED;
+}
+
 static int abeoz9_trickle_parse_dt(struct device_node *node)
 {
        u32 ohms = 0;
@@ -258,12 +369,16 @@ static int abeoz9_rtc_setup(struct device *dev, struct device_node *node)
 
 static const struct rtc_class_ops rtc_ops = {
        .read_time = abeoz9_rtc_get_time,
-       .set_time  = abeoz9_rtc_set_time,
+       .set_time = abeoz9_rtc_set_time,
+       .read_alarm = abeoz9_rtc_read_alarm,
+       .set_alarm = abeoz9_rtc_set_alarm,
+       .alarm_irq_enable = abeoz9_rtc_alarm_irq_enable,
 };
 
 static const struct regmap_config abeoz9_rtc_regmap_config = {
        .reg_bits = 8,
        .val_bits = 8,
+       .max_register = 0x3f,
 };
 
 #if IS_REACHABLE(CONFIG_HWMON)
@@ -419,6 +534,24 @@ static int abeoz9_probe(struct i2c_client *client,
        data->rtc->ops = &rtc_ops;
        data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000;
        data->rtc->range_max = RTC_TIMESTAMP_END_2099;
+       data->rtc->uie_unsupported = 1;
+       clear_bit(RTC_FEATURE_ALARM, data->rtc->features);
+
+       if (client->irq > 0) {
+               ret = devm_request_threaded_irq(dev, client->irq, NULL,
+                                               abeoz9_rtc_irq,
+                                               IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+                                               dev_name(dev), dev);
+               if (ret) {
+                       dev_err(dev, "failed to request alarm irq\n");
+                       return ret;
+               }
+       }
+
+       if (client->irq > 0 || device_property_read_bool(dev, "wakeup-source")) {
+               ret = device_init_wakeup(dev, true);
+               set_bit(RTC_FEATURE_ALARM, data->rtc->features);
+       }
 
        ret = devm_rtc_register_device(data->rtc);
        if (ret)
index cd8e438..336cb9a 100644 (file)
@@ -169,9 +169,6 @@ enum ds_type {
 
 struct ds1307 {
        enum ds_type            type;
-       unsigned long           flags;
-#define HAS_NVRAM      0               /* bit 0 == sysfs file active */
-#define HAS_ALARM      1               /* bit 1 == irq claimed */
        struct device           *dev;
        struct regmap           *regmap;
        const char              *name;
@@ -296,7 +293,11 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t)
        t->tm_min = bcd2bin(regs[DS1307_REG_MIN] & 0x7f);
        tmp = regs[DS1307_REG_HOUR] & 0x3f;
        t->tm_hour = bcd2bin(tmp);
-       t->tm_wday = bcd2bin(regs[DS1307_REG_WDAY] & 0x07) - 1;
+       /* rx8130 is bit position, not BCD */
+       if (ds1307->type == rx_8130)
+               t->tm_wday = fls(regs[DS1307_REG_WDAY] & 0x7f);
+       else
+               t->tm_wday = bcd2bin(regs[DS1307_REG_WDAY] & 0x07) - 1;
        t->tm_mday = bcd2bin(regs[DS1307_REG_MDAY] & 0x3f);
        tmp = regs[DS1307_REG_MONTH] & 0x1f;
        t->tm_mon = bcd2bin(tmp) - 1;
@@ -343,7 +344,11 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
        regs[DS1307_REG_SECS] = bin2bcd(t->tm_sec);
        regs[DS1307_REG_MIN] = bin2bcd(t->tm_min);
        regs[DS1307_REG_HOUR] = bin2bcd(t->tm_hour);
-       regs[DS1307_REG_WDAY] = bin2bcd(t->tm_wday + 1);
+       /* rx8130 is bit position, not BCD */
+       if (ds1307->type == rx_8130)
+               regs[DS1307_REG_WDAY] = 1 << t->tm_wday;
+       else
+               regs[DS1307_REG_WDAY] = bin2bcd(t->tm_wday + 1);
        regs[DS1307_REG_MDAY] = bin2bcd(t->tm_mday);
        regs[DS1307_REG_MONTH] = bin2bcd(t->tm_mon + 1);
 
@@ -411,9 +416,6 @@ static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t)
        int                     ret;
        u8                      regs[9];
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        /* read all ALARM1, ALARM2, and status registers at once */
        ret = regmap_bulk_read(ds1307->regmap, DS1339_REG_ALARM1_SECS,
                               regs, sizeof(regs));
@@ -454,9 +456,6 @@ static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t)
        u8                      control, status;
        int                     ret;
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        dev_dbg(dev, "%s secs=%d, mins=%d, "
                "hours=%d, mday=%d, enabled=%d, pending=%d\n",
                "alarm set", t->time.tm_sec, t->time.tm_min,
@@ -512,9 +511,6 @@ static int ds1307_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
        struct ds1307           *ds1307 = dev_get_drvdata(dev);
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -ENOTTY;
-
        return regmap_update_bits(ds1307->regmap, DS1337_REG_CONTROL,
                                  DS1337_BIT_A1IE,
                                  enabled ? DS1337_BIT_A1IE : 0);
@@ -592,9 +588,6 @@ static int rx8130_read_alarm(struct device *dev, struct rtc_wkalrm *t)
        u8 ald[3], ctl[3];
        int ret;
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        /* Read alarm registers. */
        ret = regmap_bulk_read(ds1307->regmap, RX8130_REG_ALARM_MIN, ald,
                               sizeof(ald));
@@ -634,9 +627,6 @@ static int rx8130_set_alarm(struct device *dev, struct rtc_wkalrm *t)
        u8 ald[3], ctl[3];
        int ret;
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        dev_dbg(dev, "%s, sec=%d min=%d hour=%d wday=%d mday=%d mon=%d "
                "enabled=%d pending=%d\n", __func__,
                t->time.tm_sec, t->time.tm_min, t->time.tm_hour,
@@ -681,9 +671,6 @@ static int rx8130_alarm_irq_enable(struct device *dev, unsigned int enabled)
        struct ds1307 *ds1307 = dev_get_drvdata(dev);
        int ret, reg;
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        ret = regmap_read(ds1307->regmap, RX8130_REG_CONTROL0, &reg);
        if (ret < 0)
                return ret;
@@ -735,9 +722,6 @@ static int mcp794xx_read_alarm(struct device *dev, struct rtc_wkalrm *t)
        u8 regs[10];
        int ret;
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        /* Read control and alarm 0 registers. */
        ret = regmap_bulk_read(ds1307->regmap, MCP794XX_REG_CONTROL, regs,
                               sizeof(regs));
@@ -793,9 +777,6 @@ static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t)
        unsigned char regs[10];
        int wday, ret;
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        wday = mcp794xx_alm_weekday(dev, &t->time);
        if (wday < 0)
                return wday;
@@ -842,9 +823,6 @@ static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
        struct ds1307 *ds1307 = dev_get_drvdata(dev);
 
-       if (!test_bit(HAS_ALARM, &ds1307->flags))
-               return -EINVAL;
-
        return regmap_update_bits(ds1307->regmap, MCP794XX_REG_CONTROL,
                                  MCP794XX_BIT_ALM0_EN,
                                  enabled ? MCP794XX_BIT_ALM0_EN : 0);
@@ -1641,7 +1619,7 @@ static int ds3231_clks_register(struct ds1307 *ds1307)
                 * Interrupt signal due to alarm conditions and square-wave
                 * output share same pin, so don't initialize both.
                 */
-               if (i == DS3231_CLK_SQW && test_bit(HAS_ALARM, &ds1307->flags))
+               if (i == DS3231_CLK_SQW && test_bit(RTC_FEATURE_ALARM, ds1307->rtc->features))
                        continue;
 
                init.name = ds3231_clks_names[i];
@@ -1964,15 +1942,15 @@ static int ds1307_probe(struct i2c_client *client,
                             bin2bcd(tmp));
        }
 
-       if (want_irq || ds1307_can_wakeup_device) {
-               device_set_wakeup_capable(ds1307->dev, true);
-               set_bit(HAS_ALARM, &ds1307->flags);
-       }
-
        ds1307->rtc = devm_rtc_allocate_device(ds1307->dev);
        if (IS_ERR(ds1307->rtc))
                return PTR_ERR(ds1307->rtc);
 
+       if (want_irq || ds1307_can_wakeup_device)
+               device_set_wakeup_capable(ds1307->dev, true);
+       else
+               clear_bit(RTC_FEATURE_ALARM, ds1307->rtc->features);
+
        if (ds1307_can_wakeup_device && !want_irq) {
                dev_info(ds1307->dev,
                         "'wakeup-source' is set, request for an IRQ is disabled!\n");
@@ -1988,7 +1966,7 @@ static int ds1307_probe(struct i2c_client *client,
                if (err) {
                        client->irq = 0;
                        device_set_wakeup_capable(ds1307->dev, false);
-                       clear_bit(HAS_ALARM, &ds1307->flags);
+                       clear_bit(RTC_FEATURE_ALARM, ds1307->rtc->features);
                        dev_err(ds1307->dev, "unable to request IRQ!\n");
                } else {
                        dev_dbg(ds1307->dev, "got IRQ %d\n", client->irq);
index bda8843..1109cad 100644 (file)
@@ -104,12 +104,6 @@ rtc_write(uint8_t val, uint32_t reg)
        writeb(val, ds1511_base + (reg * reg_spacing));
 }
 
-static inline void
-rtc_write_alarm(uint8_t val, enum ds1511reg reg)
-{
-       rtc_write((val | 0x80), reg);
-}
-
 static noinline uint8_t
 rtc_read(enum ds1511reg reg)
 {
index 57cc09d..c0df49f 100644 (file)
@@ -310,6 +310,7 @@ static const struct of_device_id ftm_rtc_match[] = {
        { .compatible = "fsl,lx2160a-ftm-alarm", },
        { },
 };
+MODULE_DEVICE_TABLE(of, ftm_rtc_match);
 
 static const struct acpi_device_id ftm_imx_acpi_ids[] = {
        {"NXP0014",},
index cc9fbab..814d516 100644 (file)
@@ -80,16 +80,6 @@ static int imx_sc_rtc_alarm_irq_enable(struct device *dev, unsigned int enable)
        return imx_scu_irq_group_enable(SC_IRQ_GROUP_RTC, SC_IRQ_RTC, enable);
 }
 
-static int imx_sc_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
-{
-       /*
-        * SCU firmware does NOT provide read alarm API, but .read_alarm
-        * callback is required by RTC framework to support alarm function,
-        * so just return here.
-        */
-       return 0;
-}
-
 static int imx_sc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
        struct imx_sc_msg_timer_rtc_set_alarm msg;
@@ -127,7 +117,6 @@ static int imx_sc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 static const struct rtc_class_ops imx_sc_rtc_ops = {
        .read_time = imx_sc_rtc_read_time,
        .set_time = imx_sc_rtc_set_time,
-       .read_alarm = imx_sc_rtc_read_alarm,
        .set_alarm = imx_sc_rtc_set_alarm,
        .alarm_irq_enable = imx_sc_rtc_alarm_irq_enable,
 };
index c2692da..c1806f4 100644 (file)
@@ -840,19 +840,17 @@ static int __exit dryice_rtc_remove(struct platform_device *pdev)
        return 0;
 }
 
-#ifdef CONFIG_OF
 static const struct of_device_id dryice_dt_ids[] = {
        { .compatible = "fsl,imx25-rtc" },
        { /* sentinel */ }
 };
 
 MODULE_DEVICE_TABLE(of, dryice_dt_ids);
-#endif
 
 static struct platform_driver dryice_rtc_driver = {
        .driver = {
                   .name = "imxdi_rtc",
-                  .of_match_table = of_match_ptr(dryice_dt_ids),
+                  .of_match_table = dryice_dt_ids,
                   },
        .remove = __exit_p(dryice_rtc_remove),
 };
index 1d2e99a..f0f6b9b 100644 (file)
@@ -421,7 +421,7 @@ static int m48t59_rtc_probe(struct platform_device *pdev)
        /* Try to get irq number. We also can work in
         * the mode without IRQ.
         */
-       m48t59->irq = platform_get_irq(pdev, 0);
+       m48t59->irq = platform_get_irq_optional(pdev, 0);
        if (m48t59->irq <= 0)
                m48t59->irq = NO_IRQ;
 
index db57dda..0f08f22 100644 (file)
@@ -415,7 +415,7 @@ static int mxc_rtc_probe(struct platform_device *pdev)
 static struct platform_driver mxc_rtc_driver = {
        .driver = {
                   .name        = "mxc_rtc",
-                  .of_match_table = of_match_ptr(imx_rtc_dt_ids),
+                  .of_match_table = imx_rtc_dt_ids,
        },
        .probe = mxc_rtc_probe,
 };
index dc7db24..d46e0f0 100644 (file)
@@ -786,8 +786,7 @@ static int omap_rtc_probe(struct platform_device *pdev)
        /* enable RTC functional clock */
        if (rtc->type->has_32kclk_en) {
                reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
-               rtc_writel(rtc, OMAP_RTC_OSC_REG,
-                               reg | OMAP_RTC_OSC_32KCLK_EN);
+               rtc_write(rtc, OMAP_RTC_OSC_REG, reg | OMAP_RTC_OSC_32KCLK_EN);
        }
 
        /* clear old status */
@@ -845,7 +844,7 @@ static int omap_rtc_probe(struct platform_device *pdev)
                reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
                reg &= ~OMAP_RTC_OSC_OSC32K_GZ_DISABLE;
                reg |= OMAP_RTC_OSC_32KCLK_EN | OMAP_RTC_OSC_SEL_32KCLK_SRC;
-               rtc_writel(rtc, OMAP_RTC_OSC_REG, reg);
+               rtc_write(rtc, OMAP_RTC_OSC_REG, reg);
        }
 
        rtc->type->lock(rtc);
index aef6c1e..82becae 100644 (file)
@@ -478,6 +478,7 @@ static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063)
 {
        struct clk *clk;
        struct clk_init_data init;
+       struct device_node *node = pcf85063->rtc->dev.parent->of_node;
 
        init.name = "pcf85063-clkout";
        init.ops = &pcf85063_clkout_ops;
@@ -487,15 +488,13 @@ static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063)
        pcf85063->clkout_hw.init = &init;
 
        /* optional override of the clockname */
-       of_property_read_string(pcf85063->rtc->dev.of_node,
-                               "clock-output-names", &init.name);
+       of_property_read_string(node, "clock-output-names", &init.name);
 
        /* register the clock */
        clk = devm_clk_register(&pcf85063->rtc->dev, &pcf85063->clkout_hw);
 
        if (!IS_ERR(clk))
-               of_clk_add_provider(pcf85063->rtc->dev.of_node,
-                                   of_clk_src_simple_get, clk);
+               of_clk_add_provider(node, of_clk_src_simple_get, clk);
 
        return clk;
 }
index 5e1e7b2..740e213 100644 (file)
@@ -8,12 +8,15 @@
 #include <linux/module.h>
 #include <linux/rtc.h>
 #include <linux/of.h>
-
-#define DRIVER_NAME "rtc-pcf8523"
+#include <linux/pm_wakeirq.h>
 
 #define REG_CONTROL1 0x00
 #define REG_CONTROL1_CAP_SEL BIT(7)
 #define REG_CONTROL1_STOP    BIT(5)
+#define REG_CONTROL1_AIE    BIT(1)
+
+#define REG_CONTROL2 0x01
+#define REG_CONTROL2_AF BIT(3)
 
 #define REG_CONTROL3 0x02
 #define REG_CONTROL3_PM_BLD BIT(7) /* battery low detection disabled */
 #define REG_MONTHS   0x08
 #define REG_YEARS    0x09
 
+#define REG_MINUTE_ALARM       0x0a
+#define REG_HOUR_ALARM         0x0b
+#define REG_DAY_ALARM          0x0c
+#define REG_WEEKDAY_ALARM      0x0d
+#define ALARM_DIS BIT(7)
+
 #define REG_OFFSET   0x0e
 #define REG_OFFSET_MODE BIT(7)
 
+#define REG_TMR_CLKOUT_CTRL 0x0f
+
+struct pcf8523 {
+       struct rtc_device *rtc;         /* RTC class device, stored by probe */
+       struct i2c_client *client;      /* I2C client used for register access */
+};
+
 static int pcf8523_read(struct i2c_client *client, u8 reg, u8 *valuep)
 {
        struct i2c_msg msgs[2];
@@ -140,6 +156,27 @@ static int pcf8523_set_pm(struct i2c_client *client, u8 pm)
        return 0;
 }
 
+/*
+ * Threaded interrupt handler; dev_id is the i2c_client passed at request
+ * time.
+ *
+ * Reads CONTROL2 and, when the alarm flag (AF) is set, writes the register
+ * back with AF cleared and reports an alarm event to the RTC core. A failed
+ * register read is reported as handled; a read without AF set is reported
+ * as not ours.
+ */
+static irqreturn_t pcf8523_irq(int irq, void *dev_id)
+{
+       struct pcf8523 *priv = i2c_get_clientdata(dev_id);
+       u8 ctrl2;
+
+       if (pcf8523_read(priv->client, REG_CONTROL2, &ctrl2) < 0)
+               return IRQ_HANDLED;
+
+       if (!(ctrl2 & REG_CONTROL2_AF))
+               return IRQ_NONE;
+
+       ctrl2 &= ~REG_CONTROL2_AF;
+       pcf8523_write(priv->client, REG_CONTROL2, ctrl2);
+       rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF);
+
+       return IRQ_HANDLED;
+}
+
 static int pcf8523_stop_rtc(struct i2c_client *client)
 {
        u8 value;
@@ -259,11 +296,118 @@ static int pcf8523_rtc_set_time(struct device *dev, struct rtc_time *tm)
        return pcf8523_start_rtc(client);
 }
 
+/*
+ * Read back the programmed alarm.
+ *
+ * The four alarm registers starting at REG_MINUTE_ALARM are fetched in one
+ * transfer (register address write followed by a read) and decoded from
+ * BCD; tm_sec is always reported as 0. The enabled state is taken from the
+ * AIE bit in CONTROL1 and the pending state from the AF bit in CONTROL2.
+ *
+ * Returns 0 on success or a negative error code from the register access.
+ */
+static int pcf8523_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *tm)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       u8 first_reg = REG_MINUTE_ALARM;
+       u8 alarm[4];
+       u8 ctrl;
+       int err;
+       struct i2c_msg xfer[2] = {
+               {
+                       .addr = client->addr,
+                       .flags = 0,
+                       .len = 1,
+                       .buf = &first_reg,
+               }, {
+                       .addr = client->addr,
+                       .flags = I2C_M_RD,
+                       .len = sizeof(alarm),
+                       .buf = alarm,
+               },
+       };
+
+       err = i2c_transfer(client->adapter, xfer, ARRAY_SIZE(xfer));
+       if (err < 0)
+               return err;
+
+       tm->time.tm_sec = 0;
+       tm->time.tm_min = bcd2bin(alarm[0] & 0x7F);
+       tm->time.tm_hour = bcd2bin(alarm[1] & 0x3F);
+       tm->time.tm_mday = bcd2bin(alarm[2] & 0x3F);
+       tm->time.tm_wday = bcd2bin(alarm[3] & 0x7);
+
+       /* Alarm interrupt enable lives in CONTROL1. */
+       err = pcf8523_read(client, REG_CONTROL1, &ctrl);
+       if (err < 0)
+               return err;
+       tm->enabled = !!(ctrl & REG_CONTROL1_AIE);
+
+       /* Alarm flag lives in CONTROL2. */
+       err = pcf8523_read(client, REG_CONTROL2, &ctrl);
+       if (err < 0)
+               return err;
+       tm->pending = !!(ctrl & REG_CONTROL2_AF);
+
+       return 0;
+}
+
+/*
+ * Enable or disable the alarm interrupt (AIE bit in CONTROL1).
+ *
+ * Performs a read-modify-write so that every other CONTROL1 bit (such as
+ * CAP_SEL and STOP) keeps its current value.
+ *
+ * Returns 0 on success or the negative error from the register access.
+ */
+static int pcf8523_irq_enable(struct device *dev, unsigned int enabled)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       u8 value;
+       int err;
+
+       err = pcf8523_read(client, REG_CONTROL1, &value);
+       if (err < 0)
+               return err;
+
+       /*
+        * Clear only AIE. Masking with the bit itself (without the
+        * complement) would keep AIE set and zero the rest of the register.
+        */
+       value &= ~REG_CONTROL1_AIE;
+
+       if (enabled)
+               value |= REG_CONTROL1_AIE;
+
+       err = pcf8523_write(client, REG_CONTROL1, value);
+       if (err < 0)
+               return err;
+
+       return 0;
+}
+
+/*
+ * Program the alarm.
+ *
+ * The alarm interrupt is disabled and CONTROL2 is written as 0 before the
+ * alarm registers are rewritten. The alarm has no seconds, so a time with
+ * non-zero seconds is rounded up to the next full minute (tm->time is
+ * updated in place). Minute, hour and day-of-month are written as BCD,
+ * followed by ALARM_DIS as the fourth data byte. The interrupt is enabled
+ * again afterwards only when tm->enabled is set.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int pcf8523_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *tm)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct i2c_msg xfer;
+       u8 buf[5];
+       int err;
+
+       err = pcf8523_irq_enable(dev, 0);
+       if (err)
+               return err;
+
+       err = pcf8523_write(client, REG_CONTROL2, 0);
+       if (err < 0)
+               return err;
+
+       /* The alarm has no seconds, round up to nearest minute */
+       if (tm->time.tm_sec) {
+               time64_t base = rtc_tm_to_time64(&tm->time);
+
+               rtc_time64_to_tm(base + 60 - tm->time.tm_sec, &tm->time);
+       }
+
+       /* First byte is the start register, the rest is register data. */
+       buf[0] = REG_MINUTE_ALARM;
+       buf[1] = bin2bcd(tm->time.tm_min);
+       buf[2] = bin2bcd(tm->time.tm_hour);
+       buf[3] = bin2bcd(tm->time.tm_mday);
+       buf[4] = ALARM_DIS;
+
+       xfer.addr = client->addr;
+       xfer.flags = 0;
+       xfer.len = sizeof(buf);
+       xfer.buf = buf;
+
+       err = i2c_transfer(client->adapter, &xfer, 1);
+       if (err < 0)
+               return err;
+
+       if (!tm->enabled)
+               return 0;
+
+       return pcf8523_irq_enable(dev, tm->enabled);
+}
+
 #ifdef CONFIG_RTC_INTF_DEV
 static int pcf8523_rtc_ioctl(struct device *dev, unsigned int cmd,
                             unsigned long arg)
 {
        struct i2c_client *client = to_i2c_client(dev);
+       unsigned int flags = 0;
+       u8 value;
        int ret;
 
        switch (cmd) {
@@ -272,9 +416,16 @@ static int pcf8523_rtc_ioctl(struct device *dev, unsigned int cmd,
                if (ret < 0)
                        return ret;
                if (ret)
-                       ret = RTC_VL_BACKUP_LOW;
+                       flags |= RTC_VL_BACKUP_LOW;
 
-               return put_user(ret, (unsigned int __user *)arg);
+               ret = pcf8523_read(client, REG_SECONDS, &value);
+               if (ret < 0)
+                       return ret;
+
+               if (value & REG_SECONDS_OS)
+                       flags |= RTC_VL_DATA_INVALID;
+
+               return put_user(flags, (unsigned int __user *)arg);
 
        default:
                return -ENOIOCTLCMD;
@@ -322,6 +473,9 @@ static int pcf8523_rtc_set_offset(struct device *dev, long offset)
 static const struct rtc_class_ops pcf8523_rtc_ops = {
        .read_time = pcf8523_rtc_read_time,
        .set_time = pcf8523_rtc_set_time,
+       .read_alarm = pcf8523_rtc_read_alarm,
+       .set_alarm = pcf8523_rtc_set_alarm,
+       .alarm_irq_enable = pcf8523_irq_enable,
        .ioctl = pcf8523_rtc_ioctl,
        .read_offset = pcf8523_rtc_read_offset,
        .set_offset = pcf8523_rtc_set_offset,
@@ -330,12 +484,21 @@ static const struct rtc_class_ops pcf8523_rtc_ops = {
 static int pcf8523_probe(struct i2c_client *client,
                         const struct i2c_device_id *id)
 {
+       struct pcf8523 *pcf8523;
        struct rtc_device *rtc;
+       bool wakeup_source = false;
        int err;
 
        if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C))
                return -ENODEV;
 
+       pcf8523 = devm_kzalloc(&client->dev, sizeof(struct pcf8523), GFP_KERNEL);
+       if (!pcf8523)
+               return -ENOMEM;
+
+       i2c_set_clientdata(client, pcf8523);
+       pcf8523->client = client;
+
        err = pcf8523_load_capacitance(client);
        if (err < 0)
                dev_warn(&client->dev, "failed to set xtal load capacitance: %d",
@@ -349,9 +512,32 @@ static int pcf8523_probe(struct i2c_client *client,
        if (IS_ERR(rtc))
                return PTR_ERR(rtc);
 
+       pcf8523->rtc = rtc;
        rtc->ops = &pcf8523_rtc_ops;
        rtc->range_min = RTC_TIMESTAMP_BEGIN_2000;
        rtc->range_max = RTC_TIMESTAMP_END_2099;
+       rtc->uie_unsupported = 1;
+
+       if (client->irq > 0) {
+               err = pcf8523_write(client, REG_TMR_CLKOUT_CTRL, 0x38);
+               if (err < 0)
+                       return err;
+
+               err = devm_request_threaded_irq(&client->dev, client->irq,
+                                               NULL, pcf8523_irq,
+                                               IRQF_SHARED | IRQF_ONESHOT | IRQF_TRIGGER_LOW,
+                                               dev_name(&rtc->dev), client);
+               if (err)
+                       return err;
+
+               dev_pm_set_wake_irq(&client->dev, client->irq);
+       }
+
+#ifdef CONFIG_OF
+       wakeup_source = of_property_read_bool(client->dev.of_node, "wakeup-source");
+#endif
+       if (client->irq > 0 || wakeup_source)
+               device_init_wakeup(&client->dev, true);
 
        return devm_rtc_register_device(rtc);
 }
@@ -373,7 +559,7 @@ MODULE_DEVICE_TABLE(of, pcf8523_of_match);
 
 static struct i2c_driver pcf8523_driver = {
        .driver = {
-               .name = DRIVER_NAME,
+               .name = "rtc-pcf8523",
                .of_match_table = of_match_ptr(pcf8523_of_match),
        },
        .probe = pcf8523_probe,
index eb20659..29a1c65 100644 (file)
@@ -445,6 +445,16 @@ static const struct pm8xxx_rtc_regs pm8941_regs = {
        .alarm_en       = BIT(7),
 };
 
+static const struct pm8xxx_rtc_regs pmk8350_regs = { /* for "qcom,pmk8350-rtc" */
+       .ctrl           = 0x6146,
+       .write          = 0x6140,
+       .read           = 0x6148,
+       .alarm_rw       = 0x6240,
+       .alarm_ctrl     = 0x6246,
+       .alarm_ctrl2    = 0x6248,
+       .alarm_en       = BIT(7),
+};
+
 /*
  * Hardcoded RTC bases until IORESOURCE_REG mapping is figured out
  */
@@ -453,6 +463,7 @@ static const struct of_device_id pm8xxx_id_table[] = {
        { .compatible = "qcom,pm8018-rtc", .data = &pm8921_regs },
        { .compatible = "qcom,pm8058-rtc", .data = &pm8058_regs },
        { .compatible = "qcom,pm8941-rtc", .data = &pm8941_regs },
+       { .compatible = "qcom,pmk8350-rtc", .data = &pmk8350_regs },
        { },
 };
 MODULE_DEVICE_TABLE(of, pm8xxx_id_table);
index 0c48d98..12c8073 100644 (file)
@@ -320,7 +320,7 @@ static int rv3028_get_time(struct device *dev, struct rtc_time *tm)
        tm->tm_sec  = bcd2bin(date[RV3028_SEC] & 0x7f);
        tm->tm_min  = bcd2bin(date[RV3028_MIN] & 0x7f);
        tm->tm_hour = bcd2bin(date[RV3028_HOUR] & 0x3f);
-       tm->tm_wday = ilog2(date[RV3028_WDAY] & 0x7f);
+       tm->tm_wday = date[RV3028_WDAY] & 0x7f;
        tm->tm_mday = bcd2bin(date[RV3028_DAY] & 0x3f);
        tm->tm_mon  = bcd2bin(date[RV3028_MONTH] & 0x1f) - 1;
        tm->tm_year = bcd2bin(date[RV3028_YEAR]) + 100;
@@ -337,7 +337,7 @@ static int rv3028_set_time(struct device *dev, struct rtc_time *tm)
        date[RV3028_SEC]   = bin2bcd(tm->tm_sec);
        date[RV3028_MIN]   = bin2bcd(tm->tm_min);
        date[RV3028_HOUR]  = bin2bcd(tm->tm_hour);
-       date[RV3028_WDAY]  = 1 << (tm->tm_wday);
+       date[RV3028_WDAY]  = tm->tm_wday;
        date[RV3028_DAY]   = bin2bcd(tm->tm_mday);
        date[RV3028_MONTH] = bin2bcd(tm->tm_mon + 1);
        date[RV3028_YEAR]  = bin2bcd(tm->tm_year - 100);
index 79161d4..f4d4250 100644 (file)
@@ -447,6 +447,12 @@ static int rx6110_i2c_probe(struct i2c_client *client,
        return rx6110_probe(rx6110, &client->dev);
 }
 
+static const struct acpi_device_id rx6110_i2c_acpi_match[] = {
+       { "SECC6110" }, /* ACPI ID matched by the I2C driver */
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, rx6110_i2c_acpi_match);
+
 static const struct i2c_device_id rx6110_i2c_id[] = {
        { "rx6110", 0 },
        { }
@@ -456,6 +462,7 @@ MODULE_DEVICE_TABLE(i2c, rx6110_i2c_id);
 static struct i2c_driver rx6110_i2c_driver = {
        .driver = {
                .name = RX6110_DRIVER_NAME,
+               .acpi_match_table = rx6110_i2c_acpi_match,
        },
        .probe          = rx6110_i2c_probe,
        .id_table       = rx6110_i2c_id,
index 80b66f1..038269a 100644 (file)
@@ -713,16 +713,10 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info)
 static int s5m_rtc_probe(struct platform_device *pdev)
 {
        struct sec_pmic_dev *s5m87xx = dev_get_drvdata(pdev->dev.parent);
-       struct sec_platform_data *pdata = s5m87xx->pdata;
        struct s5m_rtc_info *info;
        const struct regmap_config *regmap_cfg;
        int ret, alarm_irq;
 
-       if (!pdata) {
-               dev_err(pdev->dev.parent, "Platform data not supplied\n");
-               return -ENODEV;
-       }
-
        info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
        if (!info)
                return -ENOMEM;
index 833daeb..ee721e5 100644 (file)
@@ -153,12 +153,12 @@ static void rtc_wait_not_busy(struct spear_rtc_config *config)
 static irqreturn_t spear_rtc_irq(int irq, void *dev_id)
 {
        struct spear_rtc_config *config = dev_id;
-       unsigned long flags, events = 0;
+       unsigned long events = 0;
        unsigned int irq_data;
 
-       spin_lock_irqsave(&config->lock, flags);
+       spin_lock(&config->lock);
        irq_data = readl(config->ioaddr + STATUS_REG);
-       spin_unlock_irqrestore(&config->lock, flags);
+       spin_unlock(&config->lock);
 
        if ((irq_data & RTC_INT_MASK)) {
                spear_rtc_clear_interrupt(config);
index 288abb1..bc89c62 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/rtc.h>
 #include <linux/bcd.h>
 #include <linux/math64.h>
+#include <linux/property.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/tps65910.h>
index 8a957d3..74026f6 100644 (file)
@@ -273,7 +273,7 @@ static bool rtc_does_wakealarm(struct rtc_device *rtc)
        if (!device_can_wakeup(rtc->dev.parent))
                return false;
 
-       return rtc->ops->set_alarm != NULL;
+       return !!test_bit(RTC_FEATURE_ALARM, rtc->features);
 }
 
 static umode_t rtc_attr_is_visible(struct kobject *kobj,
index ca24a78..7365121 100644 (file)
@@ -52,7 +52,7 @@
 #define DASD_ECKD_CCW_RCD               0xFA
 #define DASD_ECKD_CCW_DSO               0xF7
 
-/* Define Subssystem Function / Orders */
+/* Define Subsystem Function / Orders */
 #define DSO_ORDER_RAS                   0x81
 
 /*
 #define DASD_ECKD_PG_GROUPED            0x10
 
 /*
- * Size that is reportet for large volumes in the old 16-bit no_cyl field
+ * Size that is reported for large volumes in the old 16-bit no_cyl field
  */
 #define LV_COMPAT_CYL 0xFFFE
 
@@ -555,7 +555,7 @@ struct dasd_dso_ras_ext_range {
 } __packed;
 
 /*
- * Define Subsytem Operation - Release Allocated Space
+ * Define Subsystem Operation - Release Allocated Space
  */
 struct dasd_dso_ras_data {
        __u8 order;
@@ -676,7 +676,7 @@ struct dasd_eckd_private {
        struct dasd_ext_pool_sum eps;
        u32 real_cyl;
 
-       /* alias managemnet */
+       /* alias management */
        struct dasd_uid uid;
        struct alias_pav_group *pavgroup;
        struct alias_lcu *lcu;
index 3f02602..84f659c 100644 (file)
@@ -1532,8 +1532,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process)
        switch (action) {
        case IO_SCH_ORPH_UNREG:
        case IO_SCH_UNREG:
-               if (!cdev)
-                       css_sch_device_unregister(sch);
+               css_sch_device_unregister(sch);
                break;
        case IO_SCH_ORPH_ATTACH:
        case IO_SCH_UNREG_ATTACH:
index ab42fea..77ccb96 100644 (file)
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* NCR (or Symbios) 53c700 and 53c700-66 Driver
  *
index c9f8c49..2df347c 100644 (file)
@@ -1,5 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* Driver for 53c700 and 53c700-66 chips from NCR and Symbios
  *
index cb74ab1..9b89c26 100644 (file)
@@ -1058,9 +1058,3 @@ static void __exit exit_ch_module(void)
 
 module_init(init_ch_module);
 module_exit(exit_ch_module);
-
-/*
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
index dc36531..222593b 100644 (file)
@@ -1649,8 +1649,7 @@ static int read_vpd(struct cxlflash_cfg *cfg, u64 wwpn[])
        }
 
        /* Get the read only section offset */
-       ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size,
-                                   PCI_VPD_LRDT_RO_DATA);
+       ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
        if (unlikely(ro_start < 0)) {
                dev_err(dev, "%s: VPD Read-only data not found\n", __func__);
                rc = -ENODEV;
index 5d9eeac..45ec9f1 100644 (file)
@@ -616,6 +616,7 @@ static const struct file_operations esas2r_proc_fops = {
 };
 
 static const struct proc_ops esas2r_proc_ops = {
+       .proc_lseek             = default_llseek,
        .proc_ioctl             = esas2r_proc_ioctl,
 #ifdef CONFIG_COMPAT
        .proc_compat_ioctl      = compat_ptr_ioctl,
index 1a3c534..bc33d54 100644 (file)
@@ -7099,23 +7099,3 @@ ips_init_phase2(int index)
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IBM ServeRAID Adapter Driver " IPS_VER_STRING);
 MODULE_VERSION(IPS_VER_STRING);
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
index 6c0678f..65edf00 100644 (file)
@@ -1211,23 +1211,3 @@ typedef struct {
       IPS_COMPAT_TAMPA, \
       IPS_COMPAT_KEYWEST \
    }
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
index de71d24..6d14a7a 100644 (file)
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* PARISC LASI driver for the 53c700 chip
  *
index 01a1bfb..f0ef8f7 100644 (file)
@@ -781,5 +781,3 @@ typedef struct {
 } __attribute__ ((packed)) mbox_sgl32;
 
 #endif         // _MRAID_MBOX_DEFS_H_
-
-/* vim: set ts=8 sw=8 tw=78: */
index 3a7596e..2ad0aa2 100644 (file)
@@ -282,5 +282,3 @@ struct mraid_pci_blk {
 };
 
 #endif // _MEGA_COMMON_H_
-
-// vim: set ts=8 sw=8 tw=78:
index b1a2d35..145fde3 100644 (file)
@@ -4068,5 +4068,3 @@ megaraid_sysfs_show_ldnum(struct device *dev, struct device_attribute *attr, cha
  */
 module_init(megaraid_init);
 module_exit(megaraid_exit);
-
-/* vim: set ts=8 sw=8 tw=78 ai si: */
index 3e4347c..d2fe7f6 100644 (file)
@@ -230,5 +230,3 @@ typedef struct {
 #define WROUTDOOR(rdev, value) writel(value, (rdev)->baseaddr + 0x2C)
 
 #endif // _MEGARAID_H_
-
-// vim: set ts=8 sw=8 tw=78:
index 8f35174..928da90 100644 (file)
@@ -4403,15 +4403,3 @@ MODULE_FIRMWARE("qlogic/1040.bin");
 MODULE_FIRMWARE("qlogic/1280.bin");
 MODULE_FIRMWARE("qlogic/12160.bin");
 MODULE_VERSION(QLA1280_VERSION);
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
index f1553a4..0ffdb8f 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/genhd.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/msdos_partition.h>
 #include <asm/unaligned.h>
 
index 97c6f81..678651b 100644 (file)
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* SNI RM driver
  *
index 4274bd1..96f74a1 100644 (file)
@@ -46,9 +46,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 {
 #ifdef CONFIG_FSL_PAMU
        struct device *dev = pcfg->dev;
-       int window_count = 1;
-       struct iommu_domain_geometry geom_attr;
-       struct pamu_stash_attribute stash_attr;
        int ret;
 
        pcfg->iommu_domain = iommu_domain_alloc(&platform_bus_type);
@@ -56,38 +53,9 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
                dev_err(dev, "%s(): iommu_domain_alloc() failed", __func__);
                goto no_iommu;
        }
-       geom_attr.aperture_start = 0;
-       geom_attr.aperture_end =
-               ((dma_addr_t)1 << min(8 * sizeof(dma_addr_t), (size_t)36)) - 1;
-       geom_attr.force_aperture = true;
-       ret = iommu_domain_set_attr(pcfg->iommu_domain, DOMAIN_ATTR_GEOMETRY,
-                                   &geom_attr);
+       ret = fsl_pamu_configure_l1_stash(pcfg->iommu_domain, cpu);
        if (ret < 0) {
-               dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__,
-                       ret);
-               goto out_domain_free;
-       }
-       ret = iommu_domain_set_attr(pcfg->iommu_domain, DOMAIN_ATTR_WINDOWS,
-                                   &window_count);
-       if (ret < 0) {
-               dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__,
-                       ret);
-               goto out_domain_free;
-       }
-       stash_attr.cpu = cpu;
-       stash_attr.cache = PAMU_ATTR_CACHE_L1;
-       ret = iommu_domain_set_attr(pcfg->iommu_domain,
-                                   DOMAIN_ATTR_FSL_PAMU_STASH,
-                                   &stash_attr);
-       if (ret < 0) {
-               dev_err(dev, "%s(): iommu_domain_set_attr() = %d",
-                       __func__, ret);
-               goto out_domain_free;
-       }
-       ret = iommu_domain_window_enable(pcfg->iommu_domain, 0, 0, 1ULL << 36,
-                                        IOMMU_READ | IOMMU_WRITE);
-       if (ret < 0) {
-               dev_err(dev, "%s(): iommu_domain_window_enable() = %d",
+               dev_err(dev, "%s(): fsl_pamu_configure_l1_stash() = %d",
                        __func__, ret);
                goto out_domain_free;
        }
@@ -97,14 +65,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
                        ret);
                goto out_domain_free;
        }
-       ret = iommu_domain_set_attr(pcfg->iommu_domain,
-                                   DOMAIN_ATTR_FSL_PAMU_ENABLE,
-                                   &window_count);
-       if (ret < 0) {
-               dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__,
-                       ret);
-               goto out_detach_device;
-       }
 
 no_iommu:
 #endif
@@ -113,8 +73,6 @@ no_iommu:
        return;
 
 #ifdef CONFIG_FSL_PAMU
-out_detach_device:
-       iommu_detach_device(pcfg->iommu_domain, NULL);
 out_domain_free:
        iommu_domain_free(pcfg->iommu_domain);
        pcfg->iommu_domain = NULL;
@@ -169,15 +127,8 @@ static void qman_portal_update_sdest(const struct qm_portal_config *pcfg,
                                                        unsigned int cpu)
 {
 #ifdef CONFIG_FSL_PAMU /* TODO */
-       struct pamu_stash_attribute stash_attr;
-       int ret;
-
        if (pcfg->iommu_domain) {
-               stash_attr.cpu = cpu;
-               stash_attr.cache = PAMU_ATTR_CACHE_L1;
-               ret = iommu_domain_set_attr(pcfg->iommu_domain,
-                               DOMAIN_ATTR_FSL_PAMU_STASH, &stash_attr);
-               if (ret < 0) {
+               if (fsl_pamu_configure_l1_stash(pcfg->iommu_domain, cpu) < 0) {
                        dev_err(pcfg->dev,
                                "Failed to update pamu stash setting\n");
                        return;
index dffe3ba..e61b91d 100644 (file)
@@ -254,10 +254,8 @@ static int amlogic_thermal_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, pdata);
 
        base = devm_platform_ioremap_resource(pdev, 0);
-       if (IS_ERR(base)) {
-               dev_err(dev, "failed to get io address\n");
+       if (IS_ERR(base))
                return PTR_ERR(base);
-       }
 
        pdata->regmap = devm_regmap_init_mmio(dev, base,
                                              pdata->data->regmap_config);
index 3199977..c8e4344 100644 (file)
@@ -184,7 +184,6 @@ static int bcm2835_thermal_probe(struct platform_device *pdev)
        data->regs = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(data->regs)) {
                err = PTR_ERR(data->regs);
-               dev_err(&pdev->dev, "Could not get registers: %d\n", err);
                return err;
        }
 
index 10af334..eeb4e4b 100644 (file)
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpu_cooling.h>
+#include <linux/device.h>
 #include <linux/energy_model.h>
 #include <linux/err.h>
 #include <linux/export.h>
-#include <linux/idr.h>
 #include <linux/pm_opp.h>
 #include <linux/pm_qos.h>
 #include <linux/slab.h>
@@ -50,8 +50,6 @@ struct time_in_idle {
 
 /**
  * struct cpufreq_cooling_device - data for cooling device with cpufreq
- * @id: unique integer value corresponding to each cpufreq_cooling_device
- *     registered.
  * @last_load: load measured by the latest call to cpufreq_get_requested_power()
  * @cpufreq_state: integer value representing the current state of cpufreq
  *     cooling devices.
@@ -61,7 +59,6 @@ struct time_in_idle {
  * @cdev: thermal_cooling_device pointer to keep track of the
  *     registered cooling device.
  * @policy: cpufreq policy.
- * @node: list_head to link all cpufreq_cooling_device together.
  * @idle_time: idle time stats
  * @qos_req: PM QoS contraint to apply
  *
@@ -69,23 +66,17 @@ struct time_in_idle {
  * cpufreq_cooling_device.
  */
 struct cpufreq_cooling_device {
-       int id;
        u32 last_load;
        unsigned int cpufreq_state;
        unsigned int max_level;
        struct em_perf_domain *em;
        struct cpufreq_policy *policy;
-       struct list_head node;
 #ifndef CONFIG_SMP
        struct time_in_idle *idle_time;
 #endif
        struct freq_qos_request qos_req;
 };
 
-static DEFINE_IDA(cpufreq_ida);
-static DEFINE_MUTEX(cooling_list_lock);
-static LIST_HEAD(cpufreq_cdev_list);
-
 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
 /**
  * get_level: Find the level for a particular frequency
@@ -125,7 +116,7 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 {
        int i;
 
-       for (i = cpufreq_cdev->max_level; i >= 0; i--) {
+       for (i = cpufreq_cdev->max_level; i > 0; i--) {
                if (power >= cpufreq_cdev->em->table[i].power)
                        break;
        }
@@ -528,11 +519,11 @@ __cpufreq_cooling_register(struct device_node *np,
 {
        struct thermal_cooling_device *cdev;
        struct cpufreq_cooling_device *cpufreq_cdev;
-       char dev_name[THERMAL_NAME_LENGTH];
        unsigned int i;
        struct device *dev;
        int ret;
        struct thermal_cooling_device_ops *cooling_ops;
+       char *name;
 
        dev = get_cpu_device(policy->cpu);
        if (unlikely(!dev)) {
@@ -567,16 +558,6 @@ __cpufreq_cooling_register(struct device_node *np,
        /* max_level is an index, not a counter */
        cpufreq_cdev->max_level = i - 1;
 
-       ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
-       if (ret < 0) {
-               cdev = ERR_PTR(ret);
-               goto free_idle_time;
-       }
-       cpufreq_cdev->id = ret;
-
-       snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
-                cpufreq_cdev->id);
-
        cooling_ops = &cpufreq_cooling_ops;
 
 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
@@ -591,7 +572,7 @@ __cpufreq_cooling_register(struct device_node *np,
                pr_err("%s: unsorted frequency tables are not supported\n",
                       __func__);
                cdev = ERR_PTR(-EINVAL);
-               goto remove_ida;
+               goto free_idle_time;
        }
 
        ret = freq_qos_add_request(&policy->constraints,
@@ -601,24 +582,25 @@ __cpufreq_cooling_register(struct device_node *np,
                pr_err("%s: Failed to add freq constraint (%d)\n", __func__,
                       ret);
                cdev = ERR_PTR(ret);
-               goto remove_ida;
+               goto free_idle_time;
        }
 
-       cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev,
+       cdev = ERR_PTR(-ENOMEM);
+       name = kasprintf(GFP_KERNEL, "cpufreq-%s", dev_name(dev));
+       if (!name)
+               goto remove_qos_req;
+
+       cdev = thermal_of_cooling_device_register(np, name, cpufreq_cdev,
                                                  cooling_ops);
+       kfree(name);
+
        if (IS_ERR(cdev))
                goto remove_qos_req;
 
-       mutex_lock(&cooling_list_lock);
-       list_add(&cpufreq_cdev->node, &cpufreq_cdev_list);
-       mutex_unlock(&cooling_list_lock);
-
        return cdev;
 
 remove_qos_req:
        freq_qos_remove_request(&cpufreq_cdev->qos_req);
-remove_ida:
-       ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
 free_idle_time:
        free_idle_time(cpufreq_cdev);
 free_cdev:
@@ -706,13 +688,8 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
        cpufreq_cdev = cdev->devdata;
 
-       mutex_lock(&cooling_list_lock);
-       list_del(&cpufreq_cdev->node);
-       mutex_unlock(&cooling_list_lock);
-
        thermal_cooling_device_unregister(cdev);
        freq_qos_remove_request(&cpufreq_cdev->qos_req);
-       ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
        free_idle_time(cpufreq_cdev);
        kfree(cpufreq_cdev);
 }
index 7ecab4b..4f41102 100644 (file)
@@ -9,9 +9,9 @@
 
 #include <linux/cpu_cooling.h>
 #include <linux/cpuidle.h>
+#include <linux/device.h>
 #include <linux/err.h>
 #include <linux/idle_inject.h>
-#include <linux/idr.h>
 #include <linux/of_device.h>
 #include <linux/slab.h>
 #include <linux/thermal.h>
@@ -26,8 +26,6 @@ struct cpuidle_cooling_device {
        unsigned long state;
 };
 
-static DEFINE_IDA(cpuidle_ida);
-
 /**
  * cpuidle_cooling_runtime - Running time computation
  * @idle_duration_us: CPU idle time to inject in microseconds
@@ -174,10 +172,11 @@ static int __cpuidle_cooling_register(struct device_node *np,
        struct idle_inject_device *ii_dev;
        struct cpuidle_cooling_device *idle_cdev;
        struct thermal_cooling_device *cdev;
+       struct device *dev;
        unsigned int idle_duration_us = TICK_USEC;
        unsigned int latency_us = UINT_MAX;
-       char dev_name[THERMAL_NAME_LENGTH];
-       int id, ret;
+       char *name;
+       int ret;
 
        idle_cdev = kzalloc(sizeof(*idle_cdev), GFP_KERNEL);
        if (!idle_cdev) {
@@ -185,16 +184,10 @@ static int __cpuidle_cooling_register(struct device_node *np,
                goto out;
        }
 
-       id = ida_simple_get(&cpuidle_ida, 0, 0, GFP_KERNEL);
-       if (id < 0) {
-               ret = id;
-               goto out_kfree;
-       }
-
        ii_dev = idle_inject_register(drv->cpumask);
        if (!ii_dev) {
                ret = -EINVAL;
-               goto out_id;
+               goto out_kfree;
        }
 
        of_property_read_u32(np, "duration-us", &idle_duration_us);
@@ -205,24 +198,32 @@ static int __cpuidle_cooling_register(struct device_node *np,
 
        idle_cdev->ii_dev = ii_dev;
 
-       snprintf(dev_name, sizeof(dev_name), "thermal-idle-%d", id);
+       dev = get_cpu_device(cpumask_first(drv->cpumask));
 
-       cdev = thermal_of_cooling_device_register(np, dev_name, idle_cdev,
+       name = kasprintf(GFP_KERNEL, "idle-%s", dev_name(dev));
+       if (!name) {
+               ret = -ENOMEM;
+               goto out_unregister;
+       }
+
+       cdev = thermal_of_cooling_device_register(np, name, idle_cdev,
                                                  &cpuidle_cooling_ops);
        if (IS_ERR(cdev)) {
                ret = PTR_ERR(cdev);
-               goto out_unregister;
+               goto out_kfree_name;
        }
 
        pr_debug("%s: Idle injection set with idle duration=%u, latency=%u\n",
-                dev_name, idle_duration_us, latency_us);
+                name, idle_duration_us, latency_us);
+
+       kfree(name);
 
        return 0;
 
+out_kfree_name:
+       kfree(name);
 out_unregister:
        idle_inject_unregister(ii_dev);
-out_id:
-       ida_simple_remove(&cpuidle_ida, id);
 out_kfree:
        kfree(idle_cdev);
 out:
index fed3121..3a788ac 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/devfreq_cooling.h>
 #include <linux/energy_model.h>
 #include <linux/export.h>
-#include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/pm_opp.h>
 #include <linux/pm_qos.h>
 #define HZ_PER_KHZ             1000
 #define SCALE_ERROR_MITIGATION 100
 
-static DEFINE_IDA(devfreq_ida);
-
 /**
  * struct devfreq_cooling_device - Devfreq cooling device
- * @id:                unique integer value corresponding to each
  *             devfreq_cooling_device registered.
  * @cdev:      Pointer to associated thermal cooling device.
  * @devfreq:   Pointer to associated devfreq device.
@@ -51,7 +47,6 @@ static DEFINE_IDA(devfreq_ida);
  * @em_pd:             Energy Model for the associated Devfreq device
  */
 struct devfreq_cooling_device {
-       int id;
        struct thermal_cooling_device *cdev;
        struct devfreq *devfreq;
        unsigned long cooling_state;
@@ -363,7 +358,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
        struct thermal_cooling_device *cdev;
        struct device *dev = df->dev.parent;
        struct devfreq_cooling_device *dfc;
-       char dev_name[THERMAL_NAME_LENGTH];
+       char *name;
        int err, num_opps;
 
        dfc = kzalloc(sizeof(*dfc), GFP_KERNEL);
@@ -407,30 +402,27 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
        if (err < 0)
                goto free_table;
 
-       err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL);
-       if (err < 0)
+       err = -ENOMEM;
+       name = kasprintf(GFP_KERNEL, "devfreq-%s", dev_name(dev));
+       if (!name)
                goto remove_qos_req;
 
-       dfc->id = err;
-
-       snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id);
-
-       cdev = thermal_of_cooling_device_register(np, dev_name, dfc,
+       cdev = thermal_of_cooling_device_register(np, name, dfc,
                                                  &devfreq_cooling_ops);
+       kfree(name);
+
        if (IS_ERR(cdev)) {
                err = PTR_ERR(cdev);
                dev_err(dev,
                        "Failed to register devfreq cooling device (%d)\n",
                        err);
-               goto release_ida;
+               goto remove_qos_req;
        }
 
        dfc->cdev = cdev;
 
        return cdev;
 
-release_ida:
-       ida_simple_remove(&devfreq_ida, dfc->id);
 remove_qos_req:
        dev_pm_qos_remove_request(&dfc->req_max_freq);
 free_table:
@@ -527,7 +519,6 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev)
        dev = dfc->devfreq->dev.parent;
 
        thermal_cooling_device_unregister(dfc->cdev);
-       ida_simple_remove(&devfreq_ida, dfc->id);
        dev_pm_qos_remove_request(&dfc->req_max_freq);
 
        em_dev_unregister_perf_domain(dev);
index aaa0718..1e5abf4 100644 (file)
@@ -82,6 +82,8 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
        int total_instance = 0;
        int cur_trip_level = get_trip_level(tz);
 
+       mutex_lock(&tz->lock);
+
        list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
                if (instance->trip != trip)
                        continue;
@@ -105,11 +107,12 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
                instance->target = get_target_state(tz, cdev, percentage,
                                                    cur_trip_level);
 
-               mutex_lock(&instance->cdev->lock);
-               instance->cdev->updated = false;
-               mutex_unlock(&instance->cdev->lock);
-               thermal_cdev_update(cdev);
+               mutex_lock(&cdev->lock);
+               __thermal_cdev_update(cdev);
+               mutex_unlock(&cdev->lock);
        }
+
+       mutex_unlock(&tz->lock);
        return 0;
 }
 
index 92acae5..13e3757 100644 (file)
@@ -301,9 +301,8 @@ power_actor_set_power(struct thermal_cooling_device *cdev,
 
        instance->target = clamp_val(state, instance->lower, instance->upper);
        mutex_lock(&cdev->lock);
-       cdev->updated = false;
+       __thermal_cdev_update(cdev);
        mutex_unlock(&cdev->lock);
-       thermal_cdev_update(cdev);
 
        return 0;
 }
@@ -374,9 +373,11 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
         */
        extra_power = min(extra_power, capped_extra_power);
        if (capped_extra_power > 0)
-               for (i = 0; i < num_actors; i++)
-                       granted_power[i] += (extra_actor_power[i] *
-                                       extra_power) / capped_extra_power;
+               for (i = 0; i < num_actors; i++) {
+                       u64 extra_range = (u64)extra_actor_power[i] * extra_power;
+                       granted_power[i] += DIV_ROUND_CLOSEST_ULL(extra_range,
+                                                        capped_extra_power);
+               }
 }
 
 static int allocate_power(struct thermal_zone_device *tz,
@@ -569,22 +570,33 @@ static void reset_pid_controller(struct power_allocator_params *params)
        params->prev_err = 0;
 }
 
-static void allow_maximum_power(struct thermal_zone_device *tz)
+static void allow_maximum_power(struct thermal_zone_device *tz, bool update)
 {
        struct thermal_instance *instance;
        struct power_allocator_params *params = tz->governor_data;
+       u32 req_power;
 
        mutex_lock(&tz->lock);
        list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+               struct thermal_cooling_device *cdev = instance->cdev;
+
                if ((instance->trip != params->trip_max_desired_temperature) ||
                    (!cdev_is_power_actor(instance->cdev)))
                        continue;
 
                instance->target = 0;
                mutex_lock(&instance->cdev->lock);
-               instance->cdev->updated = false;
+               /*
+                * Call for updating the cooling devices local stats and avoid
+                * periods of dozen of seconds when those have not been
+                * maintained.
+                */
+               cdev->ops->get_requested_power(cdev, &req_power);
+
+               if (update)
+                       __thermal_cdev_update(instance->cdev);
+
                mutex_unlock(&instance->cdev->lock);
-               thermal_cdev_update(instance->cdev);
        }
        mutex_unlock(&tz->lock);
 }
@@ -698,6 +710,7 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
        int ret;
        int switch_on_temp, control_temp;
        struct power_allocator_params *params = tz->governor_data;
+       bool update;
 
        /*
         * We get called for every trip point but we only need to do
@@ -709,9 +722,10 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
        ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
                                     &switch_on_temp);
        if (!ret && (tz->temperature < switch_on_temp)) {
+               update = (tz->last_temperature >= switch_on_temp);
                tz->passive = 0;
                reset_pid_controller(params);
-               allow_maximum_power(tz);
+               allow_maximum_power(tz, update);
                return 0;
        }
 
index ee05950..9a21ac0 100644 (file)
@@ -1,7 +1,7 @@
 /*
- * Hisilicon thermal sensor driver
+ * HiSilicon thermal sensor driver
  *
- * Copyright (c) 2014-2015 Hisilicon Limited.
+ * Copyright (c) 2014-2015 HiSilicon Limited.
  * Copyright (c) 2014-2015 Linaro Limited.
  *
  * Xinwei Kong <kong.kongxinwei@hisilicon.com>
@@ -572,10 +572,8 @@ static int hisi_thermal_probe(struct platform_device *pdev)
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        data->regs = devm_ioremap_resource(dev, res);
-       if (IS_ERR(data->regs)) {
-               dev_err(dev, "failed to get io address\n");
+       if (IS_ERR(data->regs))
                return PTR_ERR(data->regs);
-       }
 
        ret = data->ops->probe(data);
        if (ret)
@@ -672,5 +670,5 @@ module_platform_driver(hisi_thermal_driver);
 
 MODULE_AUTHOR("Xinwei Kong <kong.kongxinwei@hisilicon.com>");
 MODULE_AUTHOR("Leo Yan <leo.yan@linaro.org>");
-MODULE_DESCRIPTION("Hisilicon thermal driver");
+MODULE_DESCRIPTION("HiSilicon thermal driver");
 MODULE_LICENSE("GPL v2");
index ce4f592..e4299ca 100644 (file)
@@ -79,3 +79,14 @@ config INTEL_PCH_THERMAL
          Enable this to support thermal reporting on certain intel PCHs.
          Thermal reporting device will provide temperature reading,
          programmable trip points and other information.
+
+config INTEL_TCC_COOLING
+       tristate "Intel TCC offset cooling Driver"
+       depends on X86
+       help
+         Enable this to support system cooling by adjusting the effective TCC
+         activation temperature via the TCC Offset register, which is widely
+         supported on modern Intel platforms.
+         Note that, on different platforms, the behavior might be different
+         on how fast the setting takes effect, and how much the CPU frequency
+         is reduced.
index ff2ad30..5ff2afa 100644 (file)
@@ -10,4 +10,5 @@ obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL) += intel_quark_dts_thermal.o
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
 obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o
 obj-$(CONFIG_INTEL_PCH_THERMAL)        += intel_pch_thermal.o
+obj-$(CONFIG_INTEL_TCC_COOLING)        += intel_tcc_cooling.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/drivers/thermal/intel/intel_tcc_cooling.c b/drivers/thermal/intel/intel_tcc_cooling.c
new file mode 100644 (file)
index 0000000..8ec10d5
--- /dev/null
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * cooling device driver that activates the processor throttling by
+ * programming the TCC Offset register.
+ * Copyright (c) 2021, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/thermal.h>
+#include <asm/cpu_device_id.h>
+
+#define TCC_SHIFT 24
+#define TCC_MASK       (0x3fULL<<24)
+#define TCC_PROGRAMMABLE       BIT(30)
+
+static struct thermal_cooling_device *tcc_cdev;
+
+static int tcc_get_max_state(struct thermal_cooling_device *cdev, unsigned long
+                            *state)
+{
+       *state = TCC_MASK >> TCC_SHIFT;
+       return 0;
+}
+
+static int tcc_offset_update(int tcc)
+{
+       u64 val;
+       int err;
+
+       err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val);
+       if (err)
+               return err;
+
+       val &= ~TCC_MASK;
+       val |= tcc << TCC_SHIFT;
+
+       err = wrmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, val);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+static int tcc_get_cur_state(struct thermal_cooling_device *cdev, unsigned long
+                            *state)
+{
+       u64 val;
+       int err;
+
+       err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val);
+       if (err)
+               return err;
+
+       *state = (val & TCC_MASK) >> TCC_SHIFT;
+       return 0;
+}
+
+static int tcc_set_cur_state(struct thermal_cooling_device *cdev, unsigned long
+                            state)
+{
+       return tcc_offset_update(state);
+}
+
+static const struct thermal_cooling_device_ops tcc_cooling_ops = {
+       .get_max_state = tcc_get_max_state,
+       .get_cur_state = tcc_get_cur_state,
+       .set_cur_state = tcc_set_cur_state,
+};
+
+static const struct x86_cpu_id tcc_ids[] __initconst = {
+       X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, NULL),
+       {}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, tcc_ids);
+
+static int __init tcc_cooling_init(void)
+{
+       int ret;
+       u64 val;
+       const struct x86_cpu_id *id;
+
+       int err;
+
+       id = x86_match_cpu(tcc_ids);
+       if (!id)
+               return -ENODEV;
+
+       err = rdmsrl_safe(MSR_PLATFORM_INFO, &val);
+       if (err)
+               return err;
+
+       if (!(val & TCC_PROGRAMMABLE))
+               return -ENODEV;
+
+       pr_info("Programmable TCC Offset detected\n");
+
+       tcc_cdev =
+           thermal_cooling_device_register("TCC Offset", NULL,
+                                           &tcc_cooling_ops);
+       if (IS_ERR(tcc_cdev)) {
+               ret = PTR_ERR(tcc_cdev);
+               return ret;
+       }
+       return 0;
+}
+
+module_init(tcc_cooling_init)
+
+static void __exit tcc_cooling_exit(void)
+{
+       thermal_cooling_device_unregister(tcc_cdev);
+}
+
+module_exit(tcc_cooling_exit)
+
+MODULE_DESCRIPTION("TCC offset cooling device Driver");
+MODULE_AUTHOR("Zhang Rui <rui.zhang@intel.com>");
+MODULE_LICENSE("GPL v2");
index 149c6d7..97e8678 100644 (file)
@@ -573,12 +573,12 @@ static int raw_to_mcelsius_v1(struct mtk_thermal *mt, int sensno, s32 raw)
 
 static int raw_to_mcelsius_v2(struct mtk_thermal *mt, int sensno, s32 raw)
 {
-       s32 format_1 = 0;
-       s32 format_2 = 0;
-       s32 g_oe = 1;
-       s32 g_gain = 1;
-       s32 g_x_roomt = 0;
-       s32 tmp = 0;
+       s32 format_1;
+       s32 format_2;
+       s32 g_oe;
+       s32 g_gain;
+       s32 g_x_roomt;
+       s32 tmp;
 
        if (raw == 0)
                return 0;
index 6dc879f..7419e19 100644 (file)
@@ -17,6 +17,7 @@
 
 #include "../thermal_core.h"
 
+#define QPNP_TM_REG_DIG_MAJOR          0x01
 #define QPNP_TM_REG_TYPE               0x04
 #define QPNP_TM_REG_SUBTYPE            0x05
 #define QPNP_TM_REG_STATUS             0x08
 
 #define ALARM_CTRL_FORCE_ENABLE                BIT(7)
 
-/*
- * Trip point values based on threshold control
- * 0 = {105 C, 125 C, 145 C}
- * 1 = {110 C, 130 C, 150 C}
- * 2 = {115 C, 135 C, 155 C}
- * 3 = {120 C, 140 C, 160 C}
-*/
-#define TEMP_STAGE_STEP                        20000   /* Stage step: 20.000 C */
-#define TEMP_STAGE_HYSTERESIS          2000
+#define THRESH_COUNT                   4
+#define STAGE_COUNT                    3
+
+/* Over-temperature trip point values in mC */
+static const long temp_map_gen1[THRESH_COUNT][STAGE_COUNT] = {
+       { 105000, 125000, 145000 },
+       { 110000, 130000, 150000 },
+       { 115000, 135000, 155000 },
+       { 120000, 140000, 160000 },
+};
+
+static const long temp_map_gen2_v1[THRESH_COUNT][STAGE_COUNT] = {
+       {  90000, 110000, 140000 },
+       {  95000, 115000, 145000 },
+       { 100000, 120000, 150000 },
+       { 105000, 125000, 155000 },
+};
 
-#define TEMP_THRESH_MIN                        105000  /* Threshold Min: 105 C */
-#define TEMP_THRESH_STEP               5000    /* Threshold step: 5 C */
+#define TEMP_THRESH_STEP               5000 /* Threshold step: 5 C */
 
 #define THRESH_MIN                     0
 #define THRESH_MAX                     3
 
-/* Stage 2 Threshold Min: 125 C */
-#define STAGE2_THRESHOLD_MIN           125000
-/* Stage 2 Threshold Max: 140 C */
-#define STAGE2_THRESHOLD_MAX           140000
+#define TEMP_STAGE_HYSTERESIS          2000
 
 /* Temperature in Milli Celsius reported during stage 0 if no ADC is present */
 #define DEFAULT_TEMP                   37000
@@ -77,6 +82,7 @@ struct qpnp_tm_chip {
        bool                            initialized;
 
        struct iio_channel              *adc;
+       const long                      (*temp_map)[THRESH_COUNT][STAGE_COUNT];
 };
 
 /* This array maps from GEN2 alarm state to GEN1 alarm stage */
@@ -100,6 +106,23 @@ static int qpnp_tm_write(struct qpnp_tm_chip *chip, u16 addr, u8 data)
        return regmap_write(chip->map, chip->base + addr, data);
 }
 
+/**
+ * qpnp_tm_decode_temp() - return temperature in mC corresponding to the
+ *             specified over-temperature stage
+ * @chip:              Pointer to the qpnp_tm chip
+ * @stage:             Over-temperature stage
+ *
+ * Return: temperature in mC
+ */
+static long qpnp_tm_decode_temp(struct qpnp_tm_chip *chip, unsigned int stage)
+{
+       if (!chip->temp_map || chip->thresh >= THRESH_COUNT || stage == 0 ||
+           stage > STAGE_COUNT)
+               return 0;
+
+       return (*chip->temp_map)[chip->thresh][stage - 1];
+}
+
 /**
  * qpnp_tm_get_temp_stage() - return over-temperature stage
  * @chip:              Pointer to the qpnp_tm chip
@@ -149,14 +172,12 @@ static int qpnp_tm_update_temp_no_adc(struct qpnp_tm_chip *chip)
 
        if (stage_new > stage_old) {
                /* increasing stage, use lower bound */
-               chip->temp = (stage_new - 1) * TEMP_STAGE_STEP +
-                            chip->thresh * TEMP_THRESH_STEP +
-                            TEMP_STAGE_HYSTERESIS + TEMP_THRESH_MIN;
+               chip->temp = qpnp_tm_decode_temp(chip, stage_new)
+                               + TEMP_STAGE_HYSTERESIS;
        } else if (stage_new < stage_old) {
                /* decreasing stage, use upper bound */
-               chip->temp = stage_new * TEMP_STAGE_STEP +
-                            chip->thresh * TEMP_THRESH_STEP -
-                            TEMP_STAGE_HYSTERESIS + TEMP_THRESH_MIN;
+               chip->temp = qpnp_tm_decode_temp(chip, stage_new + 1)
+                               - TEMP_STAGE_HYSTERESIS;
        }
 
        chip->stage = stage;
@@ -199,26 +220,28 @@ static int qpnp_tm_get_temp(void *data, int *temp)
 static int qpnp_tm_update_critical_trip_temp(struct qpnp_tm_chip *chip,
                                             int temp)
 {
-       u8 reg;
+       long stage2_threshold_min = (*chip->temp_map)[THRESH_MIN][1];
+       long stage2_threshold_max = (*chip->temp_map)[THRESH_MAX][1];
        bool disable_s2_shutdown = false;
+       u8 reg;
 
        WARN_ON(!mutex_is_locked(&chip->lock));
 
        /*
         * Default: S2 and S3 shutdown enabled, thresholds at
-        * 105C/125C/145C, monitoring at 25Hz
+        * lowest threshold set, monitoring at 25Hz
         */
        reg = SHUTDOWN_CTRL1_RATE_25HZ;
 
        if (temp == THERMAL_TEMP_INVALID ||
-           temp < STAGE2_THRESHOLD_MIN) {
+           temp < stage2_threshold_min) {
                chip->thresh = THRESH_MIN;
                goto skip;
        }
 
-       if (temp <= STAGE2_THRESHOLD_MAX) {
+       if (temp <= stage2_threshold_max) {
                chip->thresh = THRESH_MAX -
-                       ((STAGE2_THRESHOLD_MAX - temp) /
+                       ((stage2_threshold_max - temp) /
                         TEMP_THRESH_STEP);
                disable_s2_shutdown = true;
        } else {
@@ -326,9 +349,7 @@ static int qpnp_tm_init(struct qpnp_tm_chip *chip)
                ? chip->stage : alarm_state_map[chip->stage];
 
        if (stage)
-               chip->temp = chip->thresh * TEMP_THRESH_STEP +
-                            (stage - 1) * TEMP_STAGE_STEP +
-                            TEMP_THRESH_MIN;
+               chip->temp = qpnp_tm_decode_temp(chip, stage);
 
        crit_temp = qpnp_tm_get_critical_trip_temp(chip);
        ret = qpnp_tm_update_critical_trip_temp(chip, crit_temp);
@@ -350,7 +371,7 @@ static int qpnp_tm_probe(struct platform_device *pdev)
 {
        struct qpnp_tm_chip *chip;
        struct device_node *node;
-       u8 type, subtype;
+       u8 type, subtype, dig_major;
        u32 res;
        int ret, irq;
 
@@ -400,6 +421,12 @@ static int qpnp_tm_probe(struct platform_device *pdev)
                return ret;
        }
 
+       ret = qpnp_tm_read(chip, QPNP_TM_REG_DIG_MAJOR, &dig_major);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "could not read dig_major\n");
+               return ret;
+       }
+
        if (type != QPNP_TM_TYPE || (subtype != QPNP_TM_SUBTYPE_GEN1
                                     && subtype != QPNP_TM_SUBTYPE_GEN2)) {
                dev_err(&pdev->dev, "invalid type 0x%02x or subtype 0x%02x\n",
@@ -408,6 +435,10 @@ static int qpnp_tm_probe(struct platform_device *pdev)
        }
 
        chip->subtype = subtype;
+       if (subtype == QPNP_TM_SUBTYPE_GEN2 && dig_major >= 1)
+               chip->temp_map = &temp_map_gen2_v1;
+       else
+               chip->temp_map = &temp_map_gen1;
 
        /*
         * Register the sensor before initializing the hardware to be able to
index 2a28a5a..67c1748 100644 (file)
@@ -10,8 +10,6 @@
 #include <linux/thermal.h>
 #include "tsens.h"
 
-#define CAL_MDEGC              30000
-
 #define CONFIG_ADDR            0x3640
 #define CONFIG_ADDR_8660       0x3620
 /* CONFIG_ADDR bitmasks */
 #define CONFIG_SHIFT_8660      28
 #define CONFIG_MASK_8660       (3 << CONFIG_SHIFT_8660)
 
-#define STATUS_CNTL_ADDR_8064  0x3660
 #define CNTL_ADDR              0x3620
 /* CNTL_ADDR bitmasks */
 #define EN                     BIT(0)
 #define SW_RST                 BIT(1)
-#define SENSOR0_EN             BIT(3)
+
+#define MEASURE_PERIOD         BIT(18)
 #define SLP_CLK_ENA            BIT(26)
 #define SLP_CLK_ENA_8660       BIT(24)
-#define MEASURE_PERIOD         1
 #define SENSOR0_SHIFT          3
 
-/* INT_STATUS_ADDR bitmasks */
-#define MIN_STATUS_MASK                BIT(0)
-#define LOWER_STATUS_CLR       BIT(1)
-#define UPPER_STATUS_CLR       BIT(2)
-#define MAX_STATUS_MASK                BIT(3)
-
 #define THRESHOLD_ADDR         0x3624
-/* THRESHOLD_ADDR bitmasks */
-#define THRESHOLD_MAX_LIMIT_SHIFT      24
-#define THRESHOLD_MIN_LIMIT_SHIFT      16
-#define THRESHOLD_UPPER_LIMIT_SHIFT    8
-#define THRESHOLD_LOWER_LIMIT_SHIFT    0
-
-/* Initial temperature threshold values */
-#define LOWER_LIMIT_TH         0x50
-#define UPPER_LIMIT_TH         0xdf
-#define MIN_LIMIT_TH           0x0
-#define MAX_LIMIT_TH           0xff
-
-#define S0_STATUS_ADDR         0x3628
+
 #define INT_STATUS_ADDR                0x363c
-#define TRDY_MASK              BIT(7)
-#define TIMEOUT_US             100
+
+#define S0_STATUS_OFF          0x3628
+#define S1_STATUS_OFF          0x362c
+#define S2_STATUS_OFF          0x3630
+#define S3_STATUS_OFF          0x3634
+#define S4_STATUS_OFF          0x3638
+#define S5_STATUS_OFF          0x3664  /* Sensors 5-10 found on apq8064/msm8960 */
+#define S6_STATUS_OFF          0x3668
+#define S7_STATUS_OFF          0x366c
+#define S8_STATUS_OFF          0x3670
+#define S9_STATUS_OFF          0x3674
+#define S10_STATUS_OFF         0x3678
+
+/* Original slope values minus 350, to compensate for mC to C inaccuracy */
+static u32 tsens_msm8960_slope[] = {
+                       826, 826, 804, 826,
+                       761, 782, 782, 849,
+                       782, 849, 782
+                       };
 
 static int suspend_8960(struct tsens_priv *priv)
 {
@@ -115,17 +111,34 @@ static int resume_8960(struct tsens_priv *priv)
 static int enable_8960(struct tsens_priv *priv, int id)
 {
        int ret;
-       u32 reg, mask;
+       u32 reg, mask = BIT(id);
 
        ret = regmap_read(priv->tm_map, CNTL_ADDR, &reg);
        if (ret)
                return ret;
 
-       mask = BIT(id + SENSOR0_SHIFT);
+       /* HARDWARE BUG:
+        * On platforms with more than 6 sensors, all remaining sensors (6-10)
+        * must be enabled together, otherwise undefined results are expected
+        * (Sensor 6-7 disabled, Sensor 3 disabled...). In the original driver
+        * all the sensors are enabled in one step, hence this bug is not
+        * triggered.
+        */
+       if (id > 5)
+               mask = GENMASK(10, 6);
+
+       mask <<= SENSOR0_SHIFT;
+
+       /* Sensors already enabled. Skip. */
+       if ((reg & mask) == mask)
+               return 0;
+
        ret = regmap_write(priv->tm_map, CNTL_ADDR, reg | SW_RST);
        if (ret)
                return ret;
 
+       reg |= MEASURE_PERIOD;
+
        if (priv->num_sensors > 1)
                reg |= mask | SLP_CLK_ENA | EN;
        else
@@ -162,63 +175,11 @@ static void disable_8960(struct tsens_priv *priv)
        regmap_write(priv->tm_map, CNTL_ADDR, reg_cntl);
 }
 
-static int init_8960(struct tsens_priv *priv)
-{
-       int ret, i;
-       u32 reg_cntl;
-
-       priv->tm_map = dev_get_regmap(priv->dev, NULL);
-       if (!priv->tm_map)
-               return -ENODEV;
-
-       /*
-        * The status registers for each sensor are discontiguous
-        * because some SoCs have 5 sensors while others have more
-        * but the control registers stay in the same place, i.e
-        * directly after the first 5 status registers.
-        */
-       for (i = 0; i < priv->num_sensors; i++) {
-               if (i >= 5)
-                       priv->sensor[i].status = S0_STATUS_ADDR + 40;
-               priv->sensor[i].status += i * 4;
-       }
-
-       reg_cntl = SW_RST;
-       ret = regmap_update_bits(priv->tm_map, CNTL_ADDR, SW_RST, reg_cntl);
-       if (ret)
-               return ret;
-
-       if (priv->num_sensors > 1) {
-               reg_cntl |= SLP_CLK_ENA | (MEASURE_PERIOD << 18);
-               reg_cntl &= ~SW_RST;
-               ret = regmap_update_bits(priv->tm_map, CONFIG_ADDR,
-                                        CONFIG_MASK, CONFIG);
-       } else {
-               reg_cntl |= SLP_CLK_ENA_8660 | (MEASURE_PERIOD << 16);
-               reg_cntl &= ~CONFIG_MASK_8660;
-               reg_cntl |= CONFIG_8660 << CONFIG_SHIFT_8660;
-       }
-
-       reg_cntl |= GENMASK(priv->num_sensors - 1, 0) << SENSOR0_SHIFT;
-       ret = regmap_write(priv->tm_map, CNTL_ADDR, reg_cntl);
-       if (ret)
-               return ret;
-
-       reg_cntl |= EN;
-       ret = regmap_write(priv->tm_map, CNTL_ADDR, reg_cntl);
-       if (ret)
-               return ret;
-
-       return 0;
-}
-
 static int calibrate_8960(struct tsens_priv *priv)
 {
        int i;
        char *data;
-
-       ssize_t num_read = priv->num_sensors;
-       struct tsens_sensor *s = priv->sensor;
+       u32 p1[11];
 
        data = qfprom_read(priv->dev, "calib");
        if (IS_ERR(data))
@@ -226,60 +187,96 @@ static int calibrate_8960(struct tsens_priv *priv)
        if (IS_ERR(data))
                return PTR_ERR(data);
 
-       for (i = 0; i < num_read; i++, s++)
-               s->offset = data[i];
+       for (i = 0; i < priv->num_sensors; i++) {
+               p1[i] = data[i];
+               priv->sensor[i].slope = tsens_msm8960_slope[i];
+       }
+
+       compute_intercept_slope(priv, p1, NULL, ONE_PT_CALIB);
 
        kfree(data);
 
        return 0;
 }
 
-/* Temperature on y axis and ADC-code on x-axis */
-static inline int code_to_mdegC(u32 adc_code, const struct tsens_sensor *s)
-{
-       int slope, offset;
-
-       slope = thermal_zone_get_slope(s->tzd);
-       offset = CAL_MDEGC - slope * s->offset;
-
-       return adc_code * slope + offset;
-}
-
-static int get_temp_8960(const struct tsens_sensor *s, int *temp)
-{
-       int ret;
-       u32 code, trdy;
-       struct tsens_priv *priv = s->priv;
-       unsigned long timeout;
-
-       timeout = jiffies + usecs_to_jiffies(TIMEOUT_US);
-       do {
-               ret = regmap_read(priv->tm_map, INT_STATUS_ADDR, &trdy);
-               if (ret)
-                       return ret;
-               if (!(trdy & TRDY_MASK))
-                       continue;
-               ret = regmap_read(priv->tm_map, s->status, &code);
-               if (ret)
-                       return ret;
-               *temp = code_to_mdegC(code, s);
-               return 0;
-       } while (time_before(jiffies, timeout));
-
-       return -ETIMEDOUT;
-}
+static const struct reg_field tsens_8960_regfields[MAX_REGFIELDS] = {
+       /* ----- SROT ------ */
+       /* No VERSION information */
+
+       /* CNTL */
+       [TSENS_EN]     = REG_FIELD(CNTL_ADDR,  0, 0),
+       [TSENS_SW_RST] = REG_FIELD(CNTL_ADDR,  1, 1),
+       /* 8960 has 5 sensors, 8660 has 11, we only handle 5 */
+       [SENSOR_EN]    = REG_FIELD(CNTL_ADDR,  3, 7),
+
+       /* ----- TM ------ */
+       /* INTERRUPT ENABLE */
+       /* NO INTERRUPT ENABLE */
+
+       /* Single UPPER/LOWER TEMPERATURE THRESHOLD for all sensors */
+       [LOW_THRESH_0]   = REG_FIELD(THRESHOLD_ADDR,  0,  7),
+       [UP_THRESH_0]    = REG_FIELD(THRESHOLD_ADDR,  8, 15),
+       /* MIN_THRESH_0 and MAX_THRESH_0 are not present in the regfield ids.
+        * Recycle CRIT_THRESH_0 and 1 to set the required regs to hardcoded temps:
+        * MIN_THRESH_0 -> CRIT_THRESH_1
+        * MAX_THRESH_0 -> CRIT_THRESH_0
+        */
+       [CRIT_THRESH_1]   = REG_FIELD(THRESHOLD_ADDR, 16, 23),
+       [CRIT_THRESH_0]   = REG_FIELD(THRESHOLD_ADDR, 24, 31),
+
+       /* UPPER/LOWER INTERRUPT [CLEAR/STATUS] */
+       /* 1 == clear, 0 == normal operation */
+       [LOW_INT_CLEAR_0]   = REG_FIELD(CNTL_ADDR,  9,  9),
+       [UP_INT_CLEAR_0]    = REG_FIELD(CNTL_ADDR, 10, 10),
+
+       /* NO CRITICAL INTERRUPT SUPPORT on 8960 */
+
+       /* Sn_STATUS */
+       [LAST_TEMP_0]  = REG_FIELD(S0_STATUS_OFF,  0,  7),
+       [LAST_TEMP_1]  = REG_FIELD(S1_STATUS_OFF,  0,  7),
+       [LAST_TEMP_2]  = REG_FIELD(S2_STATUS_OFF,  0,  7),
+       [LAST_TEMP_3]  = REG_FIELD(S3_STATUS_OFF,  0,  7),
+       [LAST_TEMP_4]  = REG_FIELD(S4_STATUS_OFF,  0,  7),
+       [LAST_TEMP_5]  = REG_FIELD(S5_STATUS_OFF,  0,  7),
+       [LAST_TEMP_6]  = REG_FIELD(S6_STATUS_OFF,  0,  7),
+       [LAST_TEMP_7]  = REG_FIELD(S7_STATUS_OFF,  0,  7),
+       [LAST_TEMP_8]  = REG_FIELD(S8_STATUS_OFF,  0,  7),
+       [LAST_TEMP_9]  = REG_FIELD(S9_STATUS_OFF,  0,  7),
+       [LAST_TEMP_10] = REG_FIELD(S10_STATUS_OFF, 0,  7),
+
+       /* No VALID field on 8960 */
+       /* TSENS_INT_STATUS bits: 1 == threshold violated */
+       [MIN_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 0, 0),
+       [LOWER_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 1, 1),
+       [UPPER_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 2, 2),
+       /* No CRITICAL field on 8960 */
+       [MAX_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 3, 3),
+
+       /* TRDY: 1=ready, 0=in progress */
+       [TRDY] = REG_FIELD(INT_STATUS_ADDR, 7, 7),
+};
 
 static const struct tsens_ops ops_8960 = {
-       .init           = init_8960,
+       .init           = init_common,
        .calibrate      = calibrate_8960,
-       .get_temp       = get_temp_8960,
+       .get_temp       = get_temp_common,
        .enable         = enable_8960,
        .disable        = disable_8960,
        .suspend        = suspend_8960,
        .resume         = resume_8960,
 };
 
+static struct tsens_features tsens_8960_feat = {
+       .ver_major      = VER_0,
+       .crit_int       = 0,
+       .adc            = 1,
+       .srot_split     = 0,
+       .max_sensors    = 11,
+};
+
 struct tsens_plat_data data_8960 = {
        .num_sensors    = 11,
        .ops            = &ops_8960,
+       .feat           = &tsens_8960_feat,
+       .fields         = tsens_8960_regfields,
 };
index 4ffa2e2..f136cb3 100644 (file)
 
 #define BIT_APPEND             0x3
 
+/* eeprom layout data for mdm9607 */
+#define MDM9607_BASE0_MASK     0x000000ff
+#define MDM9607_BASE1_MASK     0x000ff000
+#define MDM9607_BASE0_SHIFT    0
+#define MDM9607_BASE1_SHIFT    12
+
+#define MDM9607_S0_P1_MASK     0x00003f00
+#define MDM9607_S1_P1_MASK     0x03f00000
+#define MDM9607_S2_P1_MASK     0x0000003f
+#define MDM9607_S3_P1_MASK     0x0003f000
+#define MDM9607_S4_P1_MASK     0x0000003f
+
+#define MDM9607_S0_P2_MASK     0x000fc000
+#define MDM9607_S1_P2_MASK     0xfc000000
+#define MDM9607_S2_P2_MASK     0x00000fc0
+#define MDM9607_S3_P2_MASK     0x00fc0000
+#define MDM9607_S4_P2_MASK     0x00000fc0
+
+#define MDM9607_S0_P1_SHIFT    8
+#define MDM9607_S1_P1_SHIFT    20
+#define MDM9607_S2_P1_SHIFT    0
+#define MDM9607_S3_P1_SHIFT    12
+#define MDM9607_S4_P1_SHIFT    0
+
+#define MDM9607_S0_P2_SHIFT    14
+#define MDM9607_S1_P2_SHIFT    26
+#define MDM9607_S2_P2_SHIFT    6
+#define MDM9607_S3_P2_SHIFT    18
+#define MDM9607_S4_P2_SHIFT    6
+
+#define MDM9607_CAL_SEL_MASK   0x00700000
+#define MDM9607_CAL_SEL_SHIFT  20
+
 static int calibrate_8916(struct tsens_priv *priv)
 {
        int base0 = 0, base1 = 0, i;
@@ -452,7 +485,56 @@ static int calibrate_8974(struct tsens_priv *priv)
        return 0;
 }
 
-/* v0.1: 8916, 8939, 8974 */
+static int calibrate_9607(struct tsens_priv *priv)
+{
+       int base, i;
+       u32 p1[5], p2[5];
+       int mode = 0;
+       u32 *qfprom_cdata;
+
+       qfprom_cdata = (u32 *)qfprom_read(priv->dev, "calib");
+       if (IS_ERR(qfprom_cdata))
+               return PTR_ERR(qfprom_cdata);
+
+       mode = (qfprom_cdata[2] & MDM9607_CAL_SEL_MASK) >> MDM9607_CAL_SEL_SHIFT;
+       dev_dbg(priv->dev, "calibration mode is %d\n", mode);
+
+       switch (mode) {
+       case TWO_PT_CALIB:
+               base = (qfprom_cdata[2] & MDM9607_BASE1_MASK) >> MDM9607_BASE1_SHIFT;
+               p2[0] = (qfprom_cdata[0] & MDM9607_S0_P2_MASK) >> MDM9607_S0_P2_SHIFT;
+               p2[1] = (qfprom_cdata[0] & MDM9607_S1_P2_MASK) >> MDM9607_S1_P2_SHIFT;
+               p2[2] = (qfprom_cdata[1] & MDM9607_S2_P2_MASK) >> MDM9607_S2_P2_SHIFT;
+               p2[3] = (qfprom_cdata[1] & MDM9607_S3_P2_MASK) >> MDM9607_S3_P2_SHIFT;
+               p2[4] = (qfprom_cdata[2] & MDM9607_S4_P2_MASK) >> MDM9607_S4_P2_SHIFT;
+               for (i = 0; i < priv->num_sensors; i++)
+                       p2[i] = ((base + p2[i]) << 2);
+               fallthrough;
+       case ONE_PT_CALIB2:
+               base = (qfprom_cdata[0] & MDM9607_BASE0_MASK);
+               p1[0] = (qfprom_cdata[0] & MDM9607_S0_P1_MASK) >> MDM9607_S0_P1_SHIFT;
+               p1[1] = (qfprom_cdata[0] & MDM9607_S1_P1_MASK) >> MDM9607_S1_P1_SHIFT;
+               p1[2] = (qfprom_cdata[1] & MDM9607_S2_P1_MASK) >> MDM9607_S2_P1_SHIFT;
+               p1[3] = (qfprom_cdata[1] & MDM9607_S3_P1_MASK) >> MDM9607_S3_P1_SHIFT;
+               p1[4] = (qfprom_cdata[2] & MDM9607_S4_P1_MASK) >> MDM9607_S4_P1_SHIFT;
+               for (i = 0; i < priv->num_sensors; i++)
+                       p1[i] = ((base + p1[i]) << 2);
+               break;
+       default:
+               for (i = 0; i < priv->num_sensors; i++) {
+                       p1[i] = 500;
+                       p2[i] = 780;
+               }
+               break;
+       }
+
+       compute_intercept_slope(priv, p1, p2, mode);
+       kfree(qfprom_cdata);
+
+       return 0;
+}
+
+/* v0.1: 8916, 8939, 8974, 9607 */
 
 static struct tsens_features tsens_v0_1_feat = {
        .ver_major      = VER_0_1,
@@ -540,3 +622,17 @@ struct tsens_plat_data data_8974 = {
        .feat           = &tsens_v0_1_feat,
        .fields = tsens_v0_1_regfields,
 };
+
+static const struct tsens_ops ops_9607 = {
+       .init           = init_common,
+       .calibrate      = calibrate_9607,
+       .get_temp       = get_temp_common,
+};
+
+struct tsens_plat_data data_9607 = {
+       .num_sensors    = 5,
+       .ops            = &ops_9607,
+       .hw_ids         = (unsigned int []){ 0, 1, 2, 3, 4 },
+       .feat           = &tsens_v0_1_feat,
+       .fields = tsens_v0_1_regfields,
+};
index 3c19a38..573e261 100644 (file)
@@ -380,11 +380,11 @@ static const struct tsens_ops ops_8976 = {
        .get_temp       = get_temp_tsens_valid,
 };
 
-/* Valid for both MSM8956 and MSM8976. Sensor ID 3 is unused. */
+/* Valid for both MSM8956 and MSM8976. */
 struct tsens_plat_data data_8976 = {
        .num_sensors    = 11,
        .ops            = &ops_8976,
-       .hw_ids         = (unsigned int[]){0, 1, 2, 4, 5, 6, 7, 8, 9, 10},
+       .hw_ids         = (unsigned int[]){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
        .feat           = &tsens_v1_feat,
        .fields         = tsens_v1_regfields,
 };
index d8ce3a6..4c7ebd1 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
+#include <linux/mfd/syscon.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/regmap.h>
@@ -85,7 +86,8 @@ void compute_intercept_slope(struct tsens_priv *priv, u32 *p1,
                        "%s: sensor%d - data_point1:%#x data_point2:%#x\n",
                        __func__, i, p1[i], p2[i]);
 
-               priv->sensor[i].slope = SLOPE_DEFAULT;
+               if (!priv->sensor[i].slope)
+                       priv->sensor[i].slope = SLOPE_DEFAULT;
                if (mode == TWO_PT_CALIB) {
                        /*
                         * slope (m) = adc_code2 - adc_code1 (y2 - y1)/
@@ -515,6 +517,15 @@ static irqreturn_t tsens_irq_thread(int irq, void *data)
                        dev_dbg(priv->dev, "[%u] %s: no violation:  %d\n",
                                hw_id, __func__, temp);
                }
+
+               if (tsens_version(priv) < VER_0_1) {
+                       /* Constraint: There is only 1 interrupt control register for all
+                        * 11 temperature sensors, so monitoring more than 1 sensor based
+                        * on interrupts will yield inconsistent results. To overcome this
+                        * issue we monitor only sensor 0, which is the master sensor.
+                        */
+                       break;
+               }
        }
 
        return IRQ_HANDLED;
@@ -530,6 +541,13 @@ static int tsens_set_trips(void *_sensor, int low, int high)
        int high_val, low_val, cl_high, cl_low;
        u32 hw_id = s->hw_id;
 
+       if (tsens_version(priv) < VER_0_1) {
+               /* Pre v0.1 IP had a single register for each type of interrupt
+                * and threshold
+                */
+               hw_id = 0;
+       }
+
        dev_dbg(dev, "[%u] %s: proposed thresholds: (%d:%d)\n",
                hw_id, __func__, low, high);
 
@@ -584,18 +602,21 @@ int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
        u32 valid;
        int ret;
 
-       ret = regmap_field_read(priv->rf[valid_idx], &valid);
-       if (ret)
-               return ret;
-       while (!valid) {
-               /* Valid bit is 0 for 6 AHB clock cycles.
-                * At 19.2MHz, 1 AHB clock is ~60ns.
-                * We should enter this loop very, very rarely.
-                */
-               ndelay(400);
+       /* VER_0 doesn't have VALID bit */
+       if (tsens_version(priv) >= VER_0_1) {
                ret = regmap_field_read(priv->rf[valid_idx], &valid);
                if (ret)
                        return ret;
+               while (!valid) {
+                       /* Valid bit is 0 for 6 AHB clock cycles.
+                        * At 19.2MHz, 1 AHB clock is ~60ns.
+                        * We should enter this loop very, very rarely.
+                        */
+                       ndelay(400);
+                       ret = regmap_field_read(priv->rf[valid_idx], &valid);
+                       if (ret)
+                               return ret;
+               }
        }
 
        /* Valid bit is set, OK to read the temperature */
@@ -608,15 +629,29 @@ int get_temp_common(const struct tsens_sensor *s, int *temp)
 {
        struct tsens_priv *priv = s->priv;
        int hw_id = s->hw_id;
-       int last_temp = 0, ret;
+       int last_temp = 0, ret, trdy;
+       unsigned long timeout;
 
-       ret = regmap_field_read(priv->rf[LAST_TEMP_0 + hw_id], &last_temp);
-       if (ret)
-               return ret;
+       timeout = jiffies + usecs_to_jiffies(TIMEOUT_US);
+       do {
+               if (tsens_version(priv) == VER_0) {
+                       ret = regmap_field_read(priv->rf[TRDY], &trdy);
+                       if (ret)
+                               return ret;
+                       if (!trdy)
+                               continue;
+               }
 
-       *temp = code_to_degc(last_temp, s) * 1000;
+               ret = regmap_field_read(priv->rf[LAST_TEMP_0 + hw_id], &last_temp);
+               if (ret)
+                       return ret;
 
-       return 0;
+               *temp = code_to_degc(last_temp, s) * 1000;
+
+               return 0;
+       } while (time_before(jiffies, timeout));
+
+       return -ETIMEDOUT;
 }
 
 #ifdef CONFIG_DEBUG_FS
@@ -738,25 +773,42 @@ int __init init_common(struct tsens_priv *priv)
                priv->tm_offset = 0x1000;
        }
 
-       res = platform_get_resource(op, IORESOURCE_MEM, 0);
-       tm_base = devm_ioremap_resource(dev, res);
-       if (IS_ERR(tm_base)) {
-               ret = PTR_ERR(tm_base);
-               goto err_put_device;
+       if (tsens_version(priv) >= VER_0_1) {
+               res = platform_get_resource(op, IORESOURCE_MEM, 0);
+               tm_base = devm_ioremap_resource(dev, res);
+               if (IS_ERR(tm_base)) {
+                       ret = PTR_ERR(tm_base);
+                       goto err_put_device;
+               }
+
+               priv->tm_map = devm_regmap_init_mmio(dev, tm_base, &tsens_config);
+       } else { /* VER_0 share the same gcc regs using a syscon */
+               struct device *parent = priv->dev->parent;
+
+               if (parent)
+                       priv->tm_map = syscon_node_to_regmap(parent->of_node);
        }
 
-       priv->tm_map = devm_regmap_init_mmio(dev, tm_base, &tsens_config);
-       if (IS_ERR(priv->tm_map)) {
-               ret = PTR_ERR(priv->tm_map);
+       if (IS_ERR_OR_NULL(priv->tm_map)) {
+               if (!priv->tm_map)
+                       ret = -ENODEV;
+               else
+                       ret = PTR_ERR(priv->tm_map);
                goto err_put_device;
        }
 
+       /* VER_0 has only tm_map */
+       if (!priv->srot_map)
+               priv->srot_map = priv->tm_map;
+
        if (tsens_version(priv) > VER_0_1) {
                for (i = VER_MAJOR; i <= VER_STEP; i++) {
                        priv->rf[i] = devm_regmap_field_alloc(dev, priv->srot_map,
                                                              priv->fields[i]);
-                       if (IS_ERR(priv->rf[i]))
-                               return PTR_ERR(priv->rf[i]);
+                       if (IS_ERR(priv->rf[i])) {
+                               ret = PTR_ERR(priv->rf[i]);
+                               goto err_put_device;
+                       }
                }
                ret = regmap_field_read(priv->rf[VER_MINOR], &ver_minor);
                if (ret)
@@ -769,6 +821,10 @@ int __init init_common(struct tsens_priv *priv)
                ret = PTR_ERR(priv->rf[TSENS_EN]);
                goto err_put_device;
        }
+       /* On VER_0, TSENS needs to be explicitly enabled */
+       if (tsens_version(priv) == VER_0)
+               regmap_field_write(priv->rf[TSENS_EN], 1);
+
        ret = regmap_field_read(priv->rf[TSENS_EN], &enabled);
        if (ret)
                goto err_put_device;
@@ -791,6 +847,19 @@ int __init init_common(struct tsens_priv *priv)
                goto err_put_device;
        }
 
+       priv->rf[TSENS_SW_RST] =
+               devm_regmap_field_alloc(dev, priv->srot_map, priv->fields[TSENS_SW_RST]);
+       if (IS_ERR(priv->rf[TSENS_SW_RST])) {
+               ret = PTR_ERR(priv->rf[TSENS_SW_RST]);
+               goto err_put_device;
+       }
+
+       priv->rf[TRDY] = devm_regmap_field_alloc(dev, priv->tm_map, priv->fields[TRDY]);
+       if (IS_ERR(priv->rf[TRDY])) {
+               ret = PTR_ERR(priv->rf[TRDY]);
+               goto err_put_device;
+       }
+
        /* This loop might need changes if enum regfield_ids is reordered */
        for (j = LAST_TEMP_0; j <= UP_THRESH_15; j += 16) {
                for (i = 0; i < priv->feat->max_sensors; i++) {
@@ -806,7 +875,7 @@ int __init init_common(struct tsens_priv *priv)
                }
        }
 
-       if (priv->feat->crit_int) {
+       if (priv->feat->crit_int || tsens_version(priv) < VER_0_1) {
                /* Loop might need changes if enum regfield_ids is reordered */
                for (j = CRITICAL_STATUS_0; j <= CRIT_THRESH_15; j += 16) {
                        for (i = 0; i < priv->feat->max_sensors; i++) {
@@ -844,7 +913,11 @@ int __init init_common(struct tsens_priv *priv)
        }
 
        spin_lock_init(&priv->ul_lock);
-       tsens_enable_irq(priv);
+
+       /* VER_0 interrupt doesn't need to be enabled */
+       if (tsens_version(priv) >= VER_0_1)
+               tsens_enable_irq(priv);
+
        tsens_debug_init(op);
 
 err_put_device:
@@ -895,6 +968,12 @@ static SIMPLE_DEV_PM_OPS(tsens_pm_ops, tsens_suspend, tsens_resume);
 
 static const struct of_device_id tsens_table[] = {
        {
+               .compatible = "qcom,ipq8064-tsens",
+               .data = &data_8960,
+       }, {
+               .compatible = "qcom,mdm9607-tsens",
+               .data = &data_9607,
+       }, {
                .compatible = "qcom,msm8916-tsens",
                .data = &data_8916,
        }, {
@@ -943,10 +1022,19 @@ static int tsens_register_irq(struct tsens_priv *priv, char *irqname,
                if (irq == -ENXIO)
                        ret = 0;
        } else {
-               ret = devm_request_threaded_irq(&pdev->dev, irq,
-                                               NULL, thread_fn,
-                                               IRQF_ONESHOT,
-                                               dev_name(&pdev->dev), priv);
+               /* VER_0 interrupt is TRIGGER_RISING, VER_0_1 and up is ONESHOT */
+               if (tsens_version(priv) == VER_0)
+                       ret = devm_request_threaded_irq(&pdev->dev, irq,
+                                                       thread_fn, NULL,
+                                                       IRQF_TRIGGER_RISING,
+                                                       dev_name(&pdev->dev),
+                                                       priv);
+               else
+                       ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+                                                       thread_fn, IRQF_ONESHOT,
+                                                       dev_name(&pdev->dev),
+                                                       priv);
+
                if (ret)
                        dev_err(&pdev->dev, "%s: failed to get irq\n",
                                __func__);
@@ -975,6 +1063,19 @@ static int tsens_register(struct tsens_priv *priv)
                        priv->ops->enable(priv, i);
        }
 
+       /* VER_0 requires MIN and MAX THRESH to be set.
+        * These 2 regs are set using:
+        * - CRIT_THRESH_0 for MAX THRESH, hardcoded to 120°C
+        * - CRIT_THRESH_1 for MIN THRESH, hardcoded to   0°C
+        */
+       if (tsens_version(priv) < VER_0_1) {
+               regmap_field_write(priv->rf[CRIT_THRESH_0],
+                                  tsens_mC_to_hw(priv->sensor, 120000));
+
+               regmap_field_write(priv->rf[CRIT_THRESH_1],
+                                  tsens_mC_to_hw(priv->sensor, 0));
+       }
+
        ret = tsens_register_irq(priv, "uplow", tsens_irq_thread);
        if (ret < 0)
                return ret;
index f40b625..1471a2c 100644 (file)
@@ -13,6 +13,7 @@
 #define CAL_DEGC_PT2           120
 #define SLOPE_FACTOR           1000
 #define SLOPE_DEFAULT          3200
+#define TIMEOUT_US             100
 #define THRESHOLD_MAX_ADC_CODE 0x3ff
 #define THRESHOLD_MIN_ADC_CODE 0x0
 
@@ -25,7 +26,8 @@ struct tsens_priv;
 
 /* IP version numbers in ascending order */
 enum tsens_ver {
-       VER_0_1 = 0,
+       VER_0 = 0,
+       VER_0_1,
        VER_1_X,
        VER_2_X,
 };
@@ -585,7 +587,7 @@ int get_temp_common(const struct tsens_sensor *s, int *temp);
 extern struct tsens_plat_data data_8960;
 
 /* TSENS v0.1 targets */
-extern struct tsens_plat_data data_8916, data_8939, data_8974;
+extern struct tsens_plat_data data_8916, data_8939, data_8974, data_9607;
 
 /* TSENS v1 targets */
 extern struct tsens_plat_data data_tsens_v1, data_8976;
index 75c69fe..e1e4123 100644 (file)
@@ -60,7 +60,7 @@
 #define MCELSIUS(temp) ((temp) * 1000)
 #define GEN3_FUSE_MASK 0xFFF
 
-#define TSC_MAX_NUM    4
+#define TSC_MAX_NUM    5
 
 /* default THCODE values if FUSEs are missing */
 static const int thcodes[TSC_MAX_NUM][3] = {
@@ -68,6 +68,7 @@ static const int thcodes[TSC_MAX_NUM][3] = {
        { 3393, 2795, 2216 },
        { 3389, 2805, 2237 },
        { 3415, 2694, 2195 },
+       { 3356, 2724, 2244 },
 };
 
 /* Structure for thermal temperature calculation */
index 8c80bd0..d9cd23c 100644 (file)
@@ -300,7 +300,7 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
                 * or 0x8xx, so they won't be away from the default value
                 * for a lot.
                 *
-                * So here we do not return error if the calibartion data is
+                * So here we do not return error if the calibration data is
                 * not available, except the probe needs deferring.
                 */
                goto out;
@@ -418,7 +418,7 @@ static int sun8i_h3_thermal_init(struct ths_device *tmdev)
 }
 
 /*
- * Without this undocummented value, the returned temperatures would
+ * Without this undocumented value, the returned temperatures would
  * be higher than real ones by about 20C.
  */
 #define SUN50I_H6_CTRL0_UNK 0x0000002f
index 66e0639..8e303e9 100644 (file)
@@ -2118,7 +2118,6 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
        struct tegra_soctherm *tegra;
        struct thermal_zone_device *z;
        struct tsensor_shared_calib shared_calib;
-       struct resource *res;
        struct tegra_soctherm_soc *soc;
        unsigned int i;
        int err;
@@ -2140,26 +2139,20 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
 
        tegra->soc = soc;
 
-       res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
-                                          "soctherm-reg");
-       tegra->regs = devm_ioremap_resource(&pdev->dev, res);
+       tegra->regs = devm_platform_ioremap_resource_byname(pdev, "soctherm-reg");
        if (IS_ERR(tegra->regs)) {
                dev_err(&pdev->dev, "can't get soctherm registers");
                return PTR_ERR(tegra->regs);
        }
 
        if (!tegra->soc->use_ccroc) {
-               res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
-                                                  "car-reg");
-               tegra->clk_regs = devm_ioremap_resource(&pdev->dev, res);
+               tegra->clk_regs = devm_platform_ioremap_resource_byname(pdev, "car-reg");
                if (IS_ERR(tegra->clk_regs)) {
                        dev_err(&pdev->dev, "can't get car clk registers");
                        return PTR_ERR(tegra->clk_regs);
                }
        } else {
-               res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
-                                                  "ccroc-reg");
-               tegra->ccroc_regs = devm_ioremap_resource(&pdev->dev, res);
+               tegra->ccroc_regs = devm_platform_ioremap_resource_byname(pdev, "ccroc-reg");
                if (IS_ERR(tegra->ccroc_regs)) {
                        dev_err(&pdev->dev, "can't get ccroc registers");
                        return PTR_ERR(tegra->ccroc_regs);
@@ -2195,7 +2188,7 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
        if (err)
                return err;
 
-       /* calculate tsensor calibaration data */
+       /* calculate tsensor calibration data */
        for (i = 0; i < soc->num_tsensors; ++i) {
                err = tegra_calc_tsensor_calib(&soc->tsensors[i],
                                               &shared_calib,
index 996c038..d20b25f 100644 (file)
@@ -561,24 +561,6 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_update);
 
-/**
- * thermal_notify_framework - Sensor drivers use this API to notify framework
- * @tz:                thermal zone device
- * @trip:      indicates which trip point has been crossed
- *
- * This function handles the trip events from sensor drivers. It starts
- * throttling the cooling devices according to the policy configured.
- * For CRITICAL and HOT trip points, this notifies the respective drivers,
- * and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
- * The throttling policy is based on the configured platform data; if no
- * platform data is provided, this uses the step_wise throttling policy.
- */
-void thermal_notify_framework(struct thermal_zone_device *tz, int trip)
-{
-       handle_thermal_trip(tz, trip);
-}
-EXPORT_SYMBOL_GPL(thermal_notify_framework);
-
 static void thermal_zone_device_check(struct work_struct *work)
 {
        struct thermal_zone_device *tz = container_of(work, struct
@@ -960,10 +942,7 @@ __thermal_cooling_device_register(struct device_node *np,
 {
        struct thermal_cooling_device *cdev;
        struct thermal_zone_device *pos = NULL;
-       int result;
-
-       if (type && strlen(type) >= THERMAL_NAME_LENGTH)
-               return ERR_PTR(-EINVAL);
+       int ret;
 
        if (!ops || !ops->get_max_state || !ops->get_cur_state ||
            !ops->set_cur_state)
@@ -973,14 +952,17 @@ __thermal_cooling_device_register(struct device_node *np,
        if (!cdev)
                return ERR_PTR(-ENOMEM);
 
-       result = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL);
-       if (result < 0) {
-               kfree(cdev);
-               return ERR_PTR(result);
+       ret = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL);
+       if (ret < 0)
+               goto out_kfree_cdev;
+       cdev->id = ret;
+
+       cdev->type = kstrdup(type ? type : "", GFP_KERNEL);
+       if (!cdev->type) {
+               ret = -ENOMEM;
+               goto out_ida_remove;
        }
 
-       cdev->id = result;
-       strlcpy(cdev->type, type ? : "", sizeof(cdev->type));
        mutex_init(&cdev->lock);
        INIT_LIST_HEAD(&cdev->thermal_instances);
        cdev->np = np;
@@ -990,12 +972,9 @@ __thermal_cooling_device_register(struct device_node *np,
        cdev->devdata = devdata;
        thermal_cooling_device_setup_sysfs(cdev);
        dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
-       result = device_register(&cdev->device);
-       if (result) {
-               ida_simple_remove(&thermal_cdev_ida, cdev->id);
-               put_device(&cdev->device);
-               return ERR_PTR(result);
-       }
+       ret = device_register(&cdev->device);
+       if (ret)
+               goto out_kfree_type;
 
        /* Add 'this' new cdev to the global cdev list */
        mutex_lock(&thermal_list_lock);
@@ -1013,6 +992,15 @@ __thermal_cooling_device_register(struct device_node *np,
        mutex_unlock(&thermal_list_lock);
 
        return cdev;
+
+out_kfree_type:
+       kfree(cdev->type);
+       put_device(&cdev->device);
+out_ida_remove:
+       ida_simple_remove(&thermal_cdev_ida, cdev->id);
+out_kfree_cdev:
+       kfree(cdev);
+       return ERR_PTR(ret);
 }
 
 /**
@@ -1171,6 +1159,7 @@ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
        ida_simple_remove(&thermal_cdev_ida, cdev->id);
        device_del(&cdev->device);
        thermal_cooling_device_destroy_sysfs(cdev);
+       kfree(cdev->type);
        put_device(&cdev->device);
 }
 EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister);
index 86b8cef..726e327 100644 (file)
@@ -66,6 +66,7 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
 }
 
 void thermal_cdev_update(struct thermal_cooling_device *);
+void __thermal_cdev_update(struct thermal_cooling_device *cdev);
 
 /**
  * struct thermal_trip - representation of a point in temperature domain
index 7f50f41..3edd047 100644 (file)
@@ -192,18 +192,11 @@ static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,
        thermal_cooling_device_stats_update(cdev, target);
 }
 
-void thermal_cdev_update(struct thermal_cooling_device *cdev)
+void __thermal_cdev_update(struct thermal_cooling_device *cdev)
 {
        struct thermal_instance *instance;
        unsigned long target = 0;
 
-       mutex_lock(&cdev->lock);
-       /* cooling device is updated*/
-       if (cdev->updated) {
-               mutex_unlock(&cdev->lock);
-               return;
-       }
-
        /* Make sure cdev enters the deepest cooling state */
        list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
                dev_dbg(&cdev->device, "zone%d->target=%lu\n",
@@ -216,11 +209,25 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev)
 
        thermal_cdev_set_cur_state(cdev, target);
 
-       cdev->updated = true;
-       mutex_unlock(&cdev->lock);
        trace_cdev_update(cdev, target);
        dev_dbg(&cdev->device, "set to state %lu\n", target);
 }
+
+/**
+ * thermal_cdev_update - update cooling device state if needed
+ * @cdev:      pointer to struct thermal_cooling_device
+ *
+ * Update the cooling device state if there is a need.
+ */
+void thermal_cdev_update(struct thermal_cooling_device *cdev)
+{
+       mutex_lock(&cdev->lock);
+       if (!cdev->updated) {
+               __thermal_cdev_update(cdev);
+               cdev->updated = true;
+       }
+       mutex_unlock(&cdev->lock);
+}
 EXPORT_SYMBOL(thermal_cdev_update);
 
 /**
index d0bdf1e..ded1dd0 100644 (file)
@@ -54,11 +54,8 @@ static int thermal_mmio_probe(struct platform_device *pdev)
 
        resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        sensor->mmio_base = devm_ioremap_resource(&pdev->dev, resource);
-       if (IS_ERR(sensor->mmio_base)) {
-               dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n",
-                       PTR_ERR(sensor->mmio_base));
+       if (IS_ERR(sensor->mmio_base))
                return PTR_ERR(sensor->mmio_base);
-       }
 
        sensor_init_func = device_get_match_data(&pdev->dev);
        if (sensor_init_func) {
index 69ef12f..5b76f9a 100644 (file)
@@ -704,14 +704,17 @@ static int thermal_of_populate_bind_params(struct device_node *np,
 
        count = of_count_phandle_with_args(np, "cooling-device",
                                           "#cooling-cells");
-       if (!count) {
+       if (count <= 0) {
                pr_err("Add a cooling_device property with at least one device\n");
+               ret = -ENOENT;
                goto end;
        }
 
        __tcbp = kcalloc(count, sizeof(*__tcbp), GFP_KERNEL);
-       if (!__tcbp)
+       if (!__tcbp) {
+               ret = -ENOMEM;
                goto end;
+       }
 
        for (i = 0; i < count; i++) {
                ret = of_parse_phandle_with_args(np, "cooling-device",
index 8a3646e..ebe7cb7 100644 (file)
@@ -9,30 +9,29 @@
  *   Eduardo Valentin <eduardo.valentin@ti.com>
  */
 
-#include <linux/module.h>
+#include <linux/clk.h>
+#include <linux/cpu_pm.h>
+#include <linux/device.h>
+#include <linux/err.h>
 #include <linux/export.h>
+#include <linux/gpio/consumer.h>
 #include <linux/init.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
-#include <linux/clk.h>
-#include <linux/gpio/consumer.h>
-#include <linux/platform_device.h>
-#include <linux/err.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/sys_soc.h>
-#include <linux/reboot.h>
-#include <linux/of_device.h>
-#include <linux/of_platform.h>
-#include <linux/of_irq.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
-#include <linux/cpu_pm.h>
-#include <linux/device.h>
-#include <linux/pm_runtime.h>
-#include <linux/pm.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/reboot.h>
+#include <linux/spinlock.h>
+#include <linux/sys_soc.h>
+#include <linux/types.h>
 
 #include "ti-bandgap.h"
 
@@ -1143,14 +1142,10 @@ static int ti_bandgap_restore_ctxt(struct ti_bandgap *bgp)
        for (i = 0; i < bgp->conf->sensor_count; i++) {
                struct temp_sensor_registers *tsr;
                struct temp_sensor_regval *rval;
-               u32 val = 0;
 
                rval = &bgp->regval[i];
                tsr = bgp->conf->sensors[i].registers;
 
-               if (TI_BANDGAP_HAS(bgp, COUNTER))
-                       val = ti_bandgap_readl(bgp, tsr->bgap_counter);
-
                if (TI_BANDGAP_HAS(bgp, TSHUT_CONFIG))
                        ti_bandgap_writel(bgp, rval->tshut_threshold,
                                          tsr->tshut_threshold);
index ffd1e09..a503c1b 100644 (file)
@@ -14,6 +14,7 @@ config VDPA_SIM
        depends on RUNTIME_TESTING_MENU && HAS_DMA
        select DMA_OPS
        select VHOST_RING
+       select IOMMU_IOVA
        help
          Enable this module to support vDPA device simulators. These devices
          are used for testing, prototyping and development of vDPA.
@@ -25,6 +26,13 @@ config VDPA_SIM_NET
        help
          vDPA networking device simulator which loops TX traffic back to RX.
 
+config VDPA_SIM_BLOCK
+       tristate "vDPA simulator for block device"
+       depends on VDPA_SIM
+       help
+         vDPA block device simulator which terminates IO requests in a
+         memory buffer.
+
 config IFCVF
        tristate "Intel IFC VF vDPA driver"
        depends on PCI_MSI
@@ -52,4 +60,11 @@ config MLX5_VDPA_NET
          be executed by the hardware. It also supports a variety of stateless
          offloads depending on the actual device used and firmware version.
 
+config VP_VDPA
+       tristate "Virtio PCI bridge vDPA driver"
+       select VIRTIO_PCI_LIB
+       depends on PCI_MSI
+       help
+         This kernel module bridges virtio PCI device to vDPA bus.
+
 endif # VDPA
index d160e9b..67fe7f3 100644 (file)
@@ -3,3 +3,4 @@ obj-$(CONFIG_VDPA) += vdpa.o
 obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
 obj-$(CONFIG_IFCVF)    += ifcvf/
 obj-$(CONFIG_MLX5_VDPA) += mlx5/
+obj-$(CONFIG_VP_VDPA)    += virtio_pci/
index f2a128e..1a661ab 100644 (file)
@@ -202,10 +202,11 @@ static void ifcvf_add_status(struct ifcvf_hw *hw, u8 status)
        ifcvf_get_status(hw);
 }
 
-u64 ifcvf_get_features(struct ifcvf_hw *hw)
+u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
 {
        struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
        u32 features_lo, features_hi;
+       u64 features;
 
        ifc_iowrite32(0, &cfg->device_feature_select);
        features_lo = ifc_ioread32(&cfg->device_feature);
@@ -213,7 +214,26 @@ u64 ifcvf_get_features(struct ifcvf_hw *hw)
        ifc_iowrite32(1, &cfg->device_feature_select);
        features_hi = ifc_ioread32(&cfg->device_feature);
 
-       return ((u64)features_hi << 32) | features_lo;
+       features = ((u64)features_hi << 32) | features_lo;
+
+       return features;
+}
+
+u64 ifcvf_get_features(struct ifcvf_hw *hw)
+{
+       return hw->hw_features;
+}
+
+int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features)
+{
+       struct ifcvf_adapter *ifcvf = vf_to_adapter(hw);
+
+       if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) {
+               IFCVF_ERR(ifcvf->pdev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n");
+               return -EINVAL;
+       }
+
+       return 0;
 }
 
 void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
index 64696d6..0111bfd 100644 (file)
 #include <linux/pci_regs.h>
 #include <linux/vdpa.h>
 #include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_blk.h>
 #include <uapi/linux/virtio_config.h>
 #include <uapi/linux/virtio_pci.h>
 
-#define IFCVF_VENDOR_ID                0x1AF4
-#define IFCVF_DEVICE_ID                0x1041
-#define IFCVF_SUBSYS_VENDOR_ID 0x8086
-#define IFCVF_SUBSYS_DEVICE_ID 0x001A
+#define N3000_VENDOR_ID                0x1AF4
+#define N3000_DEVICE_ID                0x1041
+#define N3000_SUBSYS_VENDOR_ID 0x8086
+#define N3000_SUBSYS_DEVICE_ID 0x001A
 
-#define IFCVF_SUPPORTED_FEATURES \
+#define C5000X_PL_VENDOR_ID            0x1AF4
+#define C5000X_PL_DEVICE_ID            0x1000
+#define C5000X_PL_SUBSYS_VENDOR_ID     0x8086
+#define C5000X_PL_SUBSYS_DEVICE_ID     0x0001
+
+#define C5000X_PL_BLK_VENDOR_ID                0x1AF4
+#define C5000X_PL_BLK_DEVICE_ID                0x1001
+#define C5000X_PL_BLK_SUBSYS_VENDOR_ID 0x8086
+#define C5000X_PL_BLK_SUBSYS_DEVICE_ID 0x0002
+
+#define IFCVF_NET_SUPPORTED_FEATURES \
                ((1ULL << VIRTIO_NET_F_MAC)                     | \
                 (1ULL << VIRTIO_F_ANY_LAYOUT)                  | \
                 (1ULL << VIRTIO_F_VERSION_1)                   | \
@@ -78,6 +89,8 @@ struct ifcvf_hw {
        void __iomem *notify_base;
        u32 notify_off_multiplier;
        u64 req_features;
+       u64 hw_features;
+       u32 dev_type;
        struct virtio_pci_common_cfg __iomem *common_cfg;
        void __iomem *net_cfg;
        struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2];
@@ -116,7 +129,10 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
 void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
 void ifcvf_reset(struct ifcvf_hw *hw);
 u64 ifcvf_get_features(struct ifcvf_hw *hw);
+u64 ifcvf_get_hw_features(struct ifcvf_hw *hw);
+int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features);
 u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
 int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num);
 struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw);
+int ifcvf_probed_virtio_net(struct ifcvf_hw *hw);
 #endif /* _IFCVF_H_ */
index d555a6a..ab0ab5c 100644 (file)
@@ -14,7 +14,6 @@
 #include <linux/sysfs.h>
 #include "ifcvf_base.h"
 
-#define VERSION_STRING  "0.1"
 #define DRIVER_AUTHOR   "Intel Corporation"
 #define IFCVF_DRIVER_NAME       "ifcvf"
 
@@ -169,10 +168,23 @@ static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev)
 
 static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
 {
+       struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
        struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+       struct pci_dev *pdev = adapter->pdev;
+
        u64 features;
 
-       features = ifcvf_get_features(vf) & IFCVF_SUPPORTED_FEATURES;
+       switch (vf->dev_type) {
+       case VIRTIO_ID_NET:
+               features = ifcvf_get_features(vf) & IFCVF_NET_SUPPORTED_FEATURES;
+               break;
+       case VIRTIO_ID_BLOCK:
+               features = ifcvf_get_features(vf);
+               break;
+       default:
+               features = 0;
+               IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type);
+       }
 
        return features;
 }
@@ -180,6 +192,11 @@ static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
 static int ifcvf_vdpa_set_features(struct vdpa_device *vdpa_dev, u64 features)
 {
        struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+       int ret;
+
+       ret = ifcvf_verify_min_features(vf, features);
+       if (ret)
+               return ret;
 
        vf->req_features = features;
 
@@ -319,12 +336,17 @@ static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev)
 
 static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
 {
-       return VIRTIO_ID_NET;
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       return vf->dev_type;
 }
 
 static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
 {
-       return IFCVF_SUBSYS_VENDOR_ID;
+       struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
+       struct pci_dev *pdev = adapter->pdev;
+
+       return pdev->subsystem_vendor;
 }
 
 static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
@@ -332,6 +354,28 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
        return IFCVF_QUEUE_ALIGNMENT;
 }
 
+static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+       struct pci_dev *pdev = adapter->pdev;
+       size_t size;
+
+       switch (vf->dev_type) {
+       case VIRTIO_ID_NET:
+               size = sizeof(struct virtio_net_config);
+               break;
+       case VIRTIO_ID_BLOCK:
+               size = sizeof(struct virtio_blk_config);
+               break;
+       default:
+               size = 0;
+               IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type);
+       }
+
+       return size;
+}
+
 static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
                                  unsigned int offset,
                                  void *buf, unsigned int len)
@@ -392,6 +436,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
        .get_device_id  = ifcvf_vdpa_get_device_id,
        .get_vendor_id  = ifcvf_vdpa_get_vendor_id,
        .get_vq_align   = ifcvf_vdpa_get_vq_align,
+       .get_config_size        = ifcvf_vdpa_get_config_size,
        .get_config     = ifcvf_vdpa_get_config,
        .set_config     = ifcvf_vdpa_set_config,
        .set_config_cb  = ifcvf_vdpa_set_config_cb,
@@ -441,6 +486,19 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        pci_set_drvdata(pdev, adapter);
 
        vf = &adapter->vf;
+
+       /* This driver drives both modern virtio devices and transitional
+        * devices in modern mode.
+        * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM,
+        * so legacy devices and transitional devices in legacy
+        * mode will not work for vDPA, this driver will not
+        * drive devices with legacy interface.
+        */
+       if (pdev->device < 0x1040)
+               vf->dev_type =  pdev->subsystem_device;
+       else
+               vf->dev_type =  pdev->device - 0x1040;
+
        vf->base = pcim_iomap_table(pdev);
 
        adapter->pdev = pdev;
@@ -455,6 +513,8 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
                vf->vring[i].irq = -EINVAL;
 
+       vf->hw_features = ifcvf_get_hw_features(vf);
+
        ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2);
        if (ret) {
                IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
@@ -476,10 +536,19 @@ static void ifcvf_remove(struct pci_dev *pdev)
 }
 
 static struct pci_device_id ifcvf_pci_ids[] = {
-       { PCI_DEVICE_SUB(IFCVF_VENDOR_ID,
-               IFCVF_DEVICE_ID,
-               IFCVF_SUBSYS_VENDOR_ID,
-               IFCVF_SUBSYS_DEVICE_ID) },
+       { PCI_DEVICE_SUB(N3000_VENDOR_ID,
+                        N3000_DEVICE_ID,
+                        N3000_SUBSYS_VENDOR_ID,
+                        N3000_SUBSYS_DEVICE_ID) },
+       { PCI_DEVICE_SUB(C5000X_PL_VENDOR_ID,
+                        C5000X_PL_DEVICE_ID,
+                        C5000X_PL_SUBSYS_VENDOR_ID,
+                        C5000X_PL_SUBSYS_DEVICE_ID) },
+       { PCI_DEVICE_SUB(C5000X_PL_BLK_VENDOR_ID,
+                        C5000X_PL_BLK_DEVICE_ID,
+                        C5000X_PL_BLK_SUBSYS_VENDOR_ID,
+                        C5000X_PL_BLK_SUBSYS_DEVICE_ID) },
+
        { 0 },
 };
 MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids);
@@ -494,4 +563,3 @@ static struct pci_driver ifcvf_driver = {
 module_pci_driver(ifcvf_driver);
 
 MODULE_LICENSE("GPL v2");
-MODULE_VERSION(VERSION_STRING);
index 4d2809c..189e438 100644 (file)
@@ -1809,6 +1809,11 @@ err_setup:
        ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 }
 
+static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
+{
+       return sizeof(struct virtio_net_config);
+}
+
 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
                                 unsigned int len)
 {
@@ -1895,6 +1900,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
        .get_vendor_id = mlx5_vdpa_get_vendor_id,
        .get_status = mlx5_vdpa_get_status,
        .set_status = mlx5_vdpa_set_status,
+       .get_config_size = mlx5_vdpa_get_config_size,
        .get_config = mlx5_vdpa_get_config,
        .set_config = mlx5_vdpa_set_config,
        .get_generation = mlx5_vdpa_get_generation,
@@ -1974,23 +1980,32 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev)
        }
 }
 
-static int mlx5v_probe(struct auxiliary_device *adev,
-                      const struct auxiliary_device_id *id)
+struct mlx5_vdpa_mgmtdev {
+       struct vdpa_mgmt_dev mgtdev;
+       struct mlx5_adev *madev;
+       struct mlx5_vdpa_net *ndev;
+};
+
+static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 {
-       struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
-       struct mlx5_core_dev *mdev = madev->mdev;
+       struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
        struct virtio_net_config *config;
        struct mlx5_vdpa_dev *mvdev;
        struct mlx5_vdpa_net *ndev;
+       struct mlx5_core_dev *mdev;
        u32 max_vqs;
        int err;
 
+       if (mgtdev->ndev)
+               return -ENOSPC;
+
+       mdev = mgtdev->madev->mdev;
        /* we save one virtqueue for control virtqueue should we require it */
        max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
        max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
        ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-                                NULL);
+                                name);
        if (IS_ERR(ndev))
                return PTR_ERR(ndev);
 
@@ -2017,11 +2032,12 @@ static int mlx5v_probe(struct auxiliary_device *adev,
        if (err)
                goto err_res;
 
-       err = vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
+       mvdev->vdev.mdev = &mgtdev->mgtdev;
+       err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
        if (err)
                goto err_reg;
 
-       dev_set_drvdata(&adev->dev, ndev);
+       mgtdev->ndev = ndev;
        return 0;
 
 err_reg:
@@ -2034,11 +2050,62 @@ err_mtu:
        return err;
 }
 
+static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
+{
+       struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
+
+       _vdpa_unregister_device(dev);
+       mgtdev->ndev = NULL;
+}
+
+static const struct vdpa_mgmtdev_ops mdev_ops = {
+       .dev_add = mlx5_vdpa_dev_add,
+       .dev_del = mlx5_vdpa_dev_del,
+};
+
+static struct virtio_device_id id_table[] = {
+       { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
+       { 0 },
+};
+
+static int mlx5v_probe(struct auxiliary_device *adev,
+                      const struct auxiliary_device_id *id)
+
+{
+       struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
+       struct mlx5_core_dev *mdev = madev->mdev;
+       struct mlx5_vdpa_mgmtdev *mgtdev;
+       int err;
+
+       mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
+       if (!mgtdev)
+               return -ENOMEM;
+
+       mgtdev->mgtdev.ops = &mdev_ops;
+       mgtdev->mgtdev.device = mdev->device;
+       mgtdev->mgtdev.id_table = id_table;
+       mgtdev->madev = madev;
+
+       err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
+       if (err)
+               goto reg_err;
+
+       dev_set_drvdata(&adev->dev, mgtdev);
+
+       return 0;
+
+reg_err:
+       kfree(mgtdev);
+       return err;
+}
+
 static void mlx5v_remove(struct auxiliary_device *adev)
 {
-       struct mlx5_vdpa_dev *mvdev = dev_get_drvdata(&adev->dev);
+       struct mlx5_vdpa_mgmtdev *mgtdev;
 
-       vdpa_unregister_device(&mvdev->vdev);
+       mgtdev = dev_get_drvdata(&adev->dev);
+       vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
+       kfree(mgtdev);
 }
 
 static const struct auxiliary_device_id mlx5v_id_table[] = {
index 5cffce6..bb3f1d1 100644 (file)
@@ -75,8 +75,8 @@ static void vdpa_release_dev(struct device *d)
  * Driver should use vdpa_alloc_device() wrapper macro instead of
  * using this directly.
  *
- * Returns an error when parent/config/dma_dev is not set or fail to get
- * ida.
+ * Return: Returns an error when parent/config/dma_dev is not set or fail to get
+ *        ida.
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
@@ -157,7 +157,7 @@ static int __vdpa_register_device(struct vdpa_device *vdev, int nvqs)
  * @vdev: the vdpa device to be registered to vDPA bus
  * @nvqs: number of virtqueues supported by this device
  *
- * Returns an error when fail to add device to vDPA bus
+ * Return: Returns an error when fail to add device to vDPA bus
  */
 int _vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
@@ -174,7 +174,7 @@ EXPORT_SYMBOL_GPL(_vdpa_register_device);
  * @vdev: the vdpa device to be registered to vDPA bus
  * @nvqs: number of virtqueues supported by this device
  *
- * Returns an error when fail to add to vDPA bus
+ * Return: Returns an error when fail to add to vDPA bus
  */
 int vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
@@ -218,7 +218,7 @@ EXPORT_SYMBOL_GPL(vdpa_unregister_device);
  * @drv: the vdpa device driver to be registered
  * @owner: module owner of the driver
  *
- * Returns an err when fail to do the registration
+ * Return: Returns an err when fail to do the registration
  */
 int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner)
 {
@@ -245,6 +245,8 @@ EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
  * @mdev: Pointer to vdpa management device
  * vdpa_mgmtdev_register() register a vdpa management device which supports
  * vdpa device management.
+ * Return: 0 on success, or an error when required callback ops are not
+ *         initialized.
  */
 int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
 {
index 79d4536..d458103 100644 (file)
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
 obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o
+obj-$(CONFIG_VDPA_SIM_BLOCK) += vdpa_sim_blk.o
index 5b6b2f8..98f793b 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/vringh.h>
 #include <linux/vdpa.h>
 #include <linux/vhost_iotlb.h>
+#include <linux/iova.h>
 
 #include "vdpa_sim.h"
 
@@ -128,30 +129,57 @@ static int dir_to_perm(enum dma_data_direction dir)
        return perm;
 }
 
+static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr,
+                                   size_t size, unsigned int perm)
+{
+       struct iova *iova;
+       dma_addr_t dma_addr;
+       int ret;
+
+       /* We set the limit_pfn to the maximum (ULONG_MAX - 1) */
+       iova = alloc_iova(&vdpasim->iova, size, ULONG_MAX - 1, true);
+       if (!iova)
+               return DMA_MAPPING_ERROR;
+
+       dma_addr = iova_dma_addr(&vdpasim->iova, iova);
+
+       spin_lock(&vdpasim->iommu_lock);
+       ret = vhost_iotlb_add_range(vdpasim->iommu, (u64)dma_addr,
+                                   (u64)dma_addr + size - 1, (u64)paddr, perm);
+       spin_unlock(&vdpasim->iommu_lock);
+
+       if (ret) {
+               __free_iova(&vdpasim->iova, iova);
+               return DMA_MAPPING_ERROR;
+       }
+
+       return dma_addr;
+}
+
+static void vdpasim_unmap_range(struct vdpasim *vdpasim, dma_addr_t dma_addr,
+                               size_t size)
+{
+       spin_lock(&vdpasim->iommu_lock);
+       vhost_iotlb_del_range(vdpasim->iommu, (u64)dma_addr,
+                             (u64)dma_addr + size - 1);
+       spin_unlock(&vdpasim->iommu_lock);
+
+       free_iova(&vdpasim->iova, iova_pfn(&vdpasim->iova, dma_addr));
+}
+
 static dma_addr_t vdpasim_map_page(struct device *dev, struct page *page,
                                   unsigned long offset, size_t size,
                                   enum dma_data_direction dir,
                                   unsigned long attrs)
 {
        struct vdpasim *vdpasim = dev_to_sim(dev);
-       struct vhost_iotlb *iommu = vdpasim->iommu;
-       u64 pa = (page_to_pfn(page) << PAGE_SHIFT) + offset;
-       int ret, perm = dir_to_perm(dir);
+       phys_addr_t paddr = page_to_phys(page) + offset;
+       int perm = dir_to_perm(dir);
 
        if (perm < 0)
                return DMA_MAPPING_ERROR;
 
-       /* For simplicity, use identical mapping to avoid e.g iova
-        * allocator.
-        */
-       spin_lock(&vdpasim->iommu_lock);
-       ret = vhost_iotlb_add_range(iommu, pa, pa + size - 1,
-                                   pa, dir_to_perm(dir));
-       spin_unlock(&vdpasim->iommu_lock);
-       if (ret)
-               return DMA_MAPPING_ERROR;
-
-       return (dma_addr_t)(pa);
+       return vdpasim_map_range(vdpasim, paddr, size, perm);
 }
 
 static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr,
@@ -159,12 +187,8 @@ static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr,
                               unsigned long attrs)
 {
        struct vdpasim *vdpasim = dev_to_sim(dev);
-       struct vhost_iotlb *iommu = vdpasim->iommu;
 
-       spin_lock(&vdpasim->iommu_lock);
-       vhost_iotlb_del_range(iommu, (u64)dma_addr,
-                             (u64)dma_addr + size - 1);
-       spin_unlock(&vdpasim->iommu_lock);
+       vdpasim_unmap_range(vdpasim, dma_addr, size);
 }
 
 static void *vdpasim_alloc_coherent(struct device *dev, size_t size,
@@ -172,27 +196,22 @@ static void *vdpasim_alloc_coherent(struct device *dev, size_t size,
                                    unsigned long attrs)
 {
        struct vdpasim *vdpasim = dev_to_sim(dev);
-       struct vhost_iotlb *iommu = vdpasim->iommu;
-       void *addr = kmalloc(size, flag);
-       int ret;
+       phys_addr_t paddr;
+       void *addr;
 
-       spin_lock(&vdpasim->iommu_lock);
+       addr = kmalloc(size, flag);
        if (!addr) {
                *dma_addr = DMA_MAPPING_ERROR;
-       } else {
-               u64 pa = virt_to_phys(addr);
-
-               ret = vhost_iotlb_add_range(iommu, (u64)pa,
-                                           (u64)pa + size - 1,
-                                           pa, VHOST_MAP_RW);
-               if (ret) {
-                       *dma_addr = DMA_MAPPING_ERROR;
-                       kfree(addr);
-                       addr = NULL;
-               } else
-                       *dma_addr = (dma_addr_t)pa;
+               return NULL;
+       }
+
+       paddr = virt_to_phys(addr);
+
+       *dma_addr = vdpasim_map_range(vdpasim, paddr, size, VHOST_MAP_RW);
+       if (*dma_addr == DMA_MAPPING_ERROR) {
+               kfree(addr);
+               return NULL;
        }
-       spin_unlock(&vdpasim->iommu_lock);
 
        return addr;
 }
@@ -202,14 +221,10 @@ static void vdpasim_free_coherent(struct device *dev, size_t size,
                                  unsigned long attrs)
 {
        struct vdpasim *vdpasim = dev_to_sim(dev);
-       struct vhost_iotlb *iommu = vdpasim->iommu;
 
-       spin_lock(&vdpasim->iommu_lock);
-       vhost_iotlb_del_range(iommu, (u64)dma_addr,
-                             (u64)dma_addr + size - 1);
-       spin_unlock(&vdpasim->iommu_lock);
+       vdpasim_unmap_range(vdpasim, dma_addr, size);
 
-       kfree(phys_to_virt((uintptr_t)dma_addr));
+       kfree(vaddr);
 }
 
 static const struct dma_map_ops vdpasim_dma_ops = {
@@ -269,7 +284,15 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
                goto err_iommu;
 
        for (i = 0; i < dev_attr->nvqs; i++)
-               vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
+               vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu,
+                                &vdpasim->iommu_lock);
+
+       ret = iova_cache_get();
+       if (ret)
+               goto err_iommu;
+
+       /* For simplicity we use an IOVA allocator with byte granularity */
+       init_iova_domain(&vdpasim->iova, 1, 0);
 
        vdpasim->vdpa.dma_dev = dev;
 
@@ -439,6 +462,13 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
        spin_unlock(&vdpasim->lock);
 }
 
+/* Return the config_size recorded in the simulator's device attributes. */
+static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
+{
+       return vdpa_to_sim(vdpa)->dev_attr.config_size;
+}
+
 static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
                             void *buf, unsigned int len)
 {
@@ -539,8 +569,17 @@ static int vdpasim_dma_unmap(struct vdpa_device *vdpa, u64 iova, u64 size)
 static void vdpasim_free(struct vdpa_device *vdpa)
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       int i;
 
        cancel_work_sync(&vdpasim->work);
+
+       for (i = 0; i < vdpasim->dev_attr.nvqs; i++) {
+               vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov);
+               vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov);
+       }
+
+       put_iova_domain(&vdpasim->iova);
+       iova_cache_put();
        kvfree(vdpasim->buffer);
        if (vdpasim->iommu)
                vhost_iotlb_free(vdpasim->iommu);
@@ -566,6 +605,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
        .get_vendor_id          = vdpasim_get_vendor_id,
        .get_status             = vdpasim_get_status,
        .set_status             = vdpasim_set_status,
+       .get_config_size        = vdpasim_get_config_size,
        .get_config             = vdpasim_get_config,
        .set_config             = vdpasim_set_config,
        .get_generation         = vdpasim_get_generation,
@@ -593,6 +633,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = {
        .get_vendor_id          = vdpasim_get_vendor_id,
        .get_status             = vdpasim_get_status,
        .set_status             = vdpasim_set_status,
+       .get_config_size        = vdpasim_get_config_size,
        .get_config             = vdpasim_get_config,
        .set_config             = vdpasim_set_config,
        .get_generation         = vdpasim_get_generation,
index 6d75444..cd58e88 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef _VDPA_SIM_H
 #define _VDPA_SIM_H
 
+#include <linux/iova.h>
 #include <linux/vringh.h>
 #include <linux/vdpa.h>
 #include <linux/virtio_byteorder.h>
@@ -57,6 +58,7 @@ struct vdpasim {
        /* virtio config according to device type */
        void *config;
        struct vhost_iotlb *iommu;
+       struct iova_domain iova;
        void *buffer;
        u32 status;
        u32 generation;
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
new file mode 100644 (file)
index 0000000..5bfe1c2
--- /dev/null
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA simulator for block device.
+ *
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2021, Red Hat Inc. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/blkdev.h>
+#include <uapi/linux/virtio_blk.h>
+
+#include "vdpa_sim.h"
+
+#define DRV_VERSION  "0.1"
+#define DRV_AUTHOR   "Max Gurtovoy <mgurtovoy@nvidia.com>"
+#define DRV_DESC     "vDPA Device Simulator for block device"
+#define DRV_LICENSE  "GPL v2"
+
+#define VDPASIM_BLK_FEATURES   (VDPASIM_FEATURES | \
+                                (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \
+                                (1ULL << VIRTIO_BLK_F_SEG_MAX)  | \
+                                (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
+                                (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \
+                                (1ULL << VIRTIO_BLK_F_MQ))
+
+#define VDPASIM_BLK_CAPACITY   0x40000
+#define VDPASIM_BLK_SIZE_MAX   0x1000
+#define VDPASIM_BLK_SEG_MAX    32
+#define VDPASIM_BLK_VQ_NUM     1
+
+static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";
+
+/* Validate a request range against the simulated disk.
+ *
+ * Returns 'true' only if @range_size does not exceed
+ * VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX bytes and the sectors
+ * starting at @start_sector fit within VDPASIM_BLK_CAPACITY.
+ */
+static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size)
+{
+       u64 nr_sectors = range_size >> SECTOR_SHIFT;
+
+       /* start_sector is tested before the subtraction so that the
+        * subtraction cannot wrap around.
+        */
+       return range_size <= VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX &&
+              start_sector <= VDPASIM_BLK_CAPACITY &&
+              nr_sectors <= VDPASIM_BLK_CAPACITY - start_sector;
+}
+
+/* Handle one request from @vq.
+ *
+ * The 'out iov' must start with a struct virtio_blk_outhdr; whatever
+ * follows it is the payload of a write. The 'in iov' receives the data
+ * returned by the request, if any, and the status in its last byte.
+ *
+ * Returns 'true' if the request is handled (with or without an I/O error)
+ * and the status is correctly written in the last byte of the 'in iov',
+ * 'false' otherwise.
+ */
+static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
+                                  struct vdpasim_virtqueue *vq)
+{
+       size_t pushed = 0, to_pull, to_push;
+       struct virtio_blk_outhdr hdr;
+       ssize_t bytes;
+       loff_t offset;
+       u64 sector;
+       u8 status;
+       u32 type;
+       int ret;
+
+       ret = vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, &vq->in_iov,
+                                  &vq->head, GFP_ATOMIC);
+       if (ret != 1)
+               return false;
+
+       if (vq->out_iov.used < 1 || vq->in_iov.used < 1) {
+               dev_err(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n",
+                       vq->out_iov.used, vq->in_iov.used);
+               return false;
+       }
+
+       if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) {
+               dev_err(&vdpasim->vdpa.dev, "request in header too short\n");
+               return false;
+       }
+
+       /* The last byte is the status and we checked if the last iov has
+        * enough room for it.
+        */
+       to_push = vringh_kiov_length(&vq->in_iov) - 1;
+
+       to_pull = vringh_kiov_length(&vq->out_iov);
+
+       bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr,
+                                     sizeof(hdr));
+       if (bytes != sizeof(hdr)) {
+               dev_err(&vdpasim->vdpa.dev, "request out header too short\n");
+               return false;
+       }
+
+       /* What is left in the 'out iov' is the data to write, if any */
+       to_pull -= bytes;
+
+       type = vdpasim32_to_cpu(vdpasim, hdr.type);
+       sector = vdpasim64_to_cpu(vdpasim, hdr.sector);
+       offset = sector << SECTOR_SHIFT;
+       status = VIRTIO_BLK_S_OK;
+
+       switch (type) {
+       case VIRTIO_BLK_T_IN:
+               if (!vdpasim_blk_check_range(sector, to_push)) {
+                       dev_err(&vdpasim->vdpa.dev,
+                               "reading over the capacity - offset: 0x%llx len: 0x%zx\n",
+                               offset, to_push);
+                       status = VIRTIO_BLK_S_IOERR;
+                       break;
+               }
+
+               bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov,
+                                             vdpasim->buffer + offset,
+                                             to_push);
+               if (bytes < 0) {
+                       dev_err(&vdpasim->vdpa.dev,
+                               "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
+                               bytes, offset, to_push);
+                       status = VIRTIO_BLK_S_IOERR;
+                       break;
+               }
+
+               pushed += bytes;
+               break;
+
+       case VIRTIO_BLK_T_OUT:
+               if (!vdpasim_blk_check_range(sector, to_pull)) {
+                       dev_err(&vdpasim->vdpa.dev,
+                               "writing over the capacity - offset: 0x%llx len: 0x%zx\n",
+                               offset, to_pull);
+                       status = VIRTIO_BLK_S_IOERR;
+                       break;
+               }
+
+               bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov,
+                                             vdpasim->buffer + offset,
+                                             to_pull);
+               if (bytes < 0) {
+                       dev_err(&vdpasim->vdpa.dev,
+                               "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
+                               bytes, offset, to_pull);
+                       status = VIRTIO_BLK_S_IOERR;
+                       break;
+               }
+               break;
+
+       case VIRTIO_BLK_T_GET_ID:
+               /* Never push more than the data area can hold, otherwise
+                * the ID would spill into the byte reserved for the status
+                * and 'pushed' would exceed 'to_push'.
+                */
+               bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov,
+                                             vdpasim_blk_id,
+                                             min_t(size_t, to_push,
+                                                   VIRTIO_BLK_ID_BYTES));
+               if (bytes < 0) {
+                       dev_err(&vdpasim->vdpa.dev,
+                               "vringh_iov_push_iotlb() error: %zd\n", bytes);
+                       status = VIRTIO_BLK_S_IOERR;
+                       break;
+               }
+
+               pushed += bytes;
+               break;
+
+       default:
+               dev_warn(&vdpasim->vdpa.dev,
+                        "Unsupported request type %u\n", type);
+               status = VIRTIO_BLK_S_IOERR;
+               break;
+       }
+
+       /* If some operations fail, we need to skip the remaining bytes
+        * to put the status in the last byte
+        */
+       if (pushed < to_push)
+               vringh_kiov_advance(&vq->in_iov, to_push - pushed);
+
+       /* Last byte is the status */
+       bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &status, 1);
+       if (bytes != 1)
+               return false;
+
+       pushed += bytes;
+
+       /* Make sure data is written before advancing index */
+       smp_wmb();
+
+       vringh_complete_iotlb(&vq->vring, vq->head, pushed);
+
+       return true;
+}
+
+/* Work handler: serve the pending requests of every ready virtqueue.
+ *
+ * Holds vdpasim->lock for the whole run and does nothing unless
+ * VIRTIO_CONFIG_S_DRIVER_OK is set in the device status.
+ */
+static void vdpasim_blk_work(struct work_struct *work)
+{
+       struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
+       int i;
+
+       spin_lock(&vdpasim->lock);
+
+       if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+               goto out;
+
+       for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) {
+               struct vdpasim_virtqueue *vq = &vdpasim->vqs[i];
+
+               if (!vq->ready)
+                       continue;
+
+               /* Keep going until no further request can be handled */
+               while (vdpasim_blk_handle_req(vdpasim, vq)) {
+                       /* Make sure used is visible before raising the interrupt. */
+                       smp_wmb();
+
+                       local_bh_disable();
+                       if (vringh_need_notify_iotlb(&vq->vring) > 0)
+                               vringh_notify(&vq->vring);
+                       local_bh_enable();
+               }
+       }
+out:
+       spin_unlock(&vdpasim->lock);
+}
+
+/* Fill @config, interpreted as a struct virtio_blk_config, with the
+ * parameters of the simulated device. Fields not assigned below are zero.
+ */
+static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config)
+{
+       struct virtio_blk_config *cfg = config;
+
+       memset(cfg, 0, sizeof(*cfg));
+
+       cfg->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY);
+       cfg->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE);
+       cfg->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX);
+       cfg->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX);
+       cfg->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM);
+       cfg->min_io_size = cpu_to_vdpasim16(vdpasim, 1);
+       cfg->opt_io_size = cpu_to_vdpasim32(vdpasim, 1);
+}
+
+/* Release callback of the management parent device. It is intentionally
+ * empty: vdpasim_blk_mgmtdev is a static object, there is nothing to free.
+ */
+static void vdpasim_blk_mgmtdev_release(struct device *dev)
+{
+}
+
+/* Parent device registered at module init and used as the device of the
+ * vDPA management device below.
+ */
+static struct device vdpasim_blk_mgmtdev = {
+       .init_name = "vdpasim_blk",
+       .release = vdpasim_blk_mgmtdev_release,
+};
+
+/* Management 'dev_add' callback: create a block simulator instance called
+ * @name and register it with the vDPA core.
+ *
+ * Returns 0 on success, the error from vdpasim_create() or the error from
+ * _vdpa_register_device(); in the latter case the reference on the new
+ * device is dropped.
+ */
+static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+{
+       struct vdpasim_dev_attr attr = {
+               .mgmt_dev = mdev,
+               .name = name,
+               .id = VIRTIO_ID_BLOCK,
+               .supported_features = VDPASIM_BLK_FEATURES,
+               .nvqs = VDPASIM_BLK_VQ_NUM,
+               .config_size = sizeof(struct virtio_blk_config),
+               .get_config = vdpasim_blk_get_config,
+               .work_fn = vdpasim_blk_work,
+               .buffer_size = VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+       };
+       struct vdpasim *sim;
+       int err;
+
+       sim = vdpasim_create(&attr);
+       if (IS_ERR(sim))
+               return PTR_ERR(sim);
+
+       err = _vdpa_register_device(&sim->vdpa, VDPASIM_BLK_VQ_NUM);
+       if (err)
+               put_device(&sim->vdpa.dev);
+
+       return err;
+}
+
+/* Management 'dev_del' callback: unregister @dev from the vDPA core. */
+static void vdpasim_blk_dev_del(struct vdpa_mgmt_dev *mdev,
+                               struct vdpa_device *dev)
+{
+       struct vdpasim *sim = container_of(dev, struct vdpasim, vdpa);
+       struct vdpa_device *vdpa = &sim->vdpa;
+
+       _vdpa_unregister_device(vdpa);
+}
+
+/* Callbacks used to add and delete block simulator devices. */
+static const struct vdpa_mgmtdev_ops vdpasim_blk_mgmtdev_ops = {
+       .dev_add = vdpasim_blk_dev_add,
+       .dev_del = vdpasim_blk_dev_del
+};
+
+/* Device IDs handled by this management device: virtio block, any vendor.
+ * The zeroed entry ends the table.
+ */
+static struct virtio_device_id id_table[] = {
+       { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+       { 0 },
+};
+
+/* Management device registered with the vDPA core at module init. */
+static struct vdpa_mgmt_dev mgmt_dev = {
+       .device = &vdpasim_blk_mgmtdev,
+       .id_table = id_table,
+       .ops = &vdpasim_blk_mgmtdev_ops,
+};
+
+/* Module init: register the management parent device, then register the
+ * vDPA management device that sits on top of it.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int __init vdpasim_blk_init(void)
+{
+       int ret;
+
+       ret = device_register(&vdpasim_blk_mgmtdev);
+       if (ret) {
+               /* device_register() keeps a reference on the device even
+                * when it fails; it has to be dropped with put_device().
+                */
+               put_device(&vdpasim_blk_mgmtdev);
+               return ret;
+       }
+
+       ret = vdpa_mgmtdev_register(&mgmt_dev);
+       if (ret)
+               goto parent_err;
+
+       return 0;
+
+parent_err:
+       device_unregister(&vdpasim_blk_mgmtdev);
+       return ret;
+}
+
+/* Module exit: undo vdpasim_blk_init() in reverse order. */
+static void __exit vdpasim_blk_exit(void)
+{
+       vdpa_mgmtdev_unregister(&mgmt_dev);
+       device_unregister(&vdpasim_blk_mgmtdev);
+}
+
+module_init(vdpasim_blk_init)
+module_exit(vdpasim_blk_exit)
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/drivers/vdpa/virtio_pci/Makefile b/drivers/vdpa/virtio_pci/Makefile
new file mode 100644 (file)
index 0000000..231088d
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VP_VDPA) += vp_vdpa.o
diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
new file mode 100644 (file)
index 0000000..c76ebb5
--- /dev/null
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bridge driver for modern virtio-pci device
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ * Based on virtio_pci_modern.c.
+ */
+
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_pci_modern.h>
+
+#define VP_VDPA_QUEUE_MAX 256
+#define VP_VDPA_DRIVER_NAME "vp_vdpa"
+#define VP_VDPA_NAME_SIZE 256
+
+/* Per-virtqueue state kept by the driver. */
+struct vp_vring {
+       /* Mapped notification area, written to kick the queue */
+       void __iomem *notify;
+       /* Name passed when requesting the queue interrupt */
+       char msix_name[VP_VDPA_NAME_SIZE];
+       /* Callback invoked from the queue interrupt handler */
+       struct vdpa_callback cb;
+       /* Address of the notification area reported to the vDPA core */
+       resource_size_t notify_pa;
+       /* Requested irq, VIRTIO_MSI_NO_VECTOR while none is requested */
+       int irq;
+};
+
+/* Driver state of one vp_vdpa device. */
+struct vp_vdpa {
+       /* Embedded vDPA device; vdpa_to_vp() maps it back to this struct */
+       struct vdpa_device vdpa;
+       /* Modern virtio-pci device this driver bridges */
+       struct virtio_pci_modern_device mdev;
+       /* Array of 'queues' per-virtqueue entries */
+       struct vp_vring *vring;
+       /* Callback invoked from the config interrupt handler */
+       struct vdpa_callback config_cb;
+       /* Name passed when requesting the config interrupt */
+       char msix_name[VP_VDPA_NAME_SIZE];
+       /* Requested config irq, VIRTIO_MSI_NO_VECTOR while none is requested */
+       int config_irq;
+       /* Number of virtqueues reported by the device */
+       int queues;
+       /* Number of allocated irq vectors, 0 while none are allocated */
+       int vectors;
+};
+
+/* Map a vdpa_device back to the vp_vdpa that embeds it. */
+static struct vp_vdpa *vdpa_to_vp(struct vdpa_device *vdpa)
+{
+       return container_of(vdpa, struct vp_vdpa, vdpa);
+}
+
+/* Return the modern virtio-pci device behind @vdpa. */
+static struct virtio_pci_modern_device *vdpa_to_mdev(struct vdpa_device *vdpa)
+{
+       return &vdpa_to_vp(vdpa)->mdev;
+}
+
+/* Return the features reported by vp_modern_get_features(). */
+static u64 vp_vdpa_get_features(struct vdpa_device *vdpa)
+{
+       return vp_modern_get_features(vdpa_to_mdev(vdpa));
+}
+
+/* Pass @features to vp_modern_set_features(). Always returns 0. */
+static int vp_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
+{
+       vp_modern_set_features(vdpa_to_mdev(vdpa), features);
+       return 0;
+}
+
+/* Return the device status reported by vp_modern_get_status(). */
+static u8 vp_vdpa_get_status(struct vdpa_device *vdpa)
+{
+       return vp_modern_get_status(vdpa_to_mdev(vdpa));
+}
+
+/* Release every interrupt currently requested for @vp_vdpa, then the irq
+ * vectors themselves.
+ *
+ * Entries still marked VIRTIO_MSI_NO_VECTOR are skipped, so this is also
+ * used to unwind a partially completed vp_vdpa_request_irq().
+ */
+static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa)
+{
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       struct pci_dev *pdev = mdev->pci_dev;
+       int i;
+
+       for (i = 0; i < vp_vdpa->queues; i++) {
+               if (vp_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) {
+                       /* Detach the vector from the queue before freeing the irq */
+                       vp_modern_queue_vector(mdev, i, VIRTIO_MSI_NO_VECTOR);
+                       devm_free_irq(&pdev->dev, vp_vdpa->vring[i].irq,
+                                     &vp_vdpa->vring[i]);
+                       vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+               }
+       }
+
+       if (vp_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) {
+               vp_modern_config_vector(mdev, VIRTIO_MSI_NO_VECTOR);
+               devm_free_irq(&pdev->dev, vp_vdpa->config_irq, vp_vdpa);
+               vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+       }
+
+       /* 'vectors' is non-zero only while irq vectors are allocated */
+       if (vp_vdpa->vectors) {
+               pci_free_irq_vectors(pdev);
+               vp_vdpa->vectors = 0;
+       }
+}
+
+/* Virtqueue interrupt handler; @arg is the struct vp_vring of the queue.
+ * Forwards to the queue callback when one is set.
+ */
+static irqreturn_t vp_vdpa_vq_handler(int irq, void *arg)
+{
+       struct vp_vring *vq = arg;
+
+       if (!vq->cb.callback)
+               return IRQ_HANDLED;
+
+       return vq->cb.callback(vq->cb.private);
+}
+
+/* Config interrupt handler; @arg is the struct vp_vdpa of the device.
+ * Forwards to the config callback when one is set.
+ */
+static irqreturn_t vp_vdpa_config_handler(int irq, void *arg)
+{
+       struct vp_vdpa *vp = arg;
+
+       if (!vp->config_cb.callback)
+               return IRQ_HANDLED;
+
+       return vp->config_cb.callback(vp->config_cb.private);
+}
+
+/* Allocate one MSI-X vector per virtqueue plus one for config changes,
+ * request the matching interrupts and program the vectors into the device.
+ *
+ * Returns 0 on success. If the vectors cannot be allocated the result of
+ * pci_alloc_irq_vectors() is returned; if requesting an interrupt fails,
+ * everything set up so far is released through vp_vdpa_free_irq() and the
+ * error is returned.
+ */
+static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa)
+{
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       struct pci_dev *pdev = mdev->pci_dev;
+       int i, ret, irq;
+       int queues = vp_vdpa->queues;
+       int vectors = queues + 1;
+
+       ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX);
+       if (ret != vectors) {
+               dev_err(&pdev->dev,
+                       "vp_vdpa: fail to allocate irq vectors want %d but %d\n",
+                       vectors, ret);
+               return ret;
+       }
+
+       vp_vdpa->vectors = vectors;
+
+       /* Vectors 0..queues-1 serve the virtqueues */
+       for (i = 0; i < queues; i++) {
+               /* The name identifies the interrupt, so no trailing newline */
+               snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE,
+                       "vp-vdpa[%s]-%d", pci_name(pdev), i);
+               irq = pci_irq_vector(pdev, i);
+               ret = devm_request_irq(&pdev->dev, irq,
+                                      vp_vdpa_vq_handler,
+                                      0, vp_vdpa->vring[i].msix_name,
+                                      &vp_vdpa->vring[i]);
+               if (ret) {
+                       dev_err(&pdev->dev,
+                               "vp_vdpa: fail to request irq for vq %d\n", i);
+                       goto err;
+               }
+               vp_modern_queue_vector(mdev, i, i);
+               vp_vdpa->vring[i].irq = irq;
+       }
+
+       /* The last vector serves config change notifications */
+       snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config",
+                pci_name(pdev));
+       irq = pci_irq_vector(pdev, queues);
+       ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_config_handler, 0,
+                              vp_vdpa->msix_name, vp_vdpa);
+       if (ret) {
+               dev_err(&pdev->dev,
+                       "vp_vdpa: fail to request irq for config: %d\n", ret);
+               goto err;
+       }
+       vp_modern_config_vector(mdev, queues);
+       vp_vdpa->config_irq = irq;
+
+       return 0;
+err:
+       vp_vdpa_free_irq(vp_vdpa);
+       return ret;
+}
+
+/* Write @status to the device, managing interrupts around DRIVER_OK:
+ * they are requested before DRIVER_OK becomes set and freed after it
+ * becomes clear.
+ */
+static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+       struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       u8 s = vp_vdpa_get_status(vdpa);
+
+       /* NOTE(review): the result of vp_vdpa_request_irq() is not checked,
+        * so the status is written even if no interrupt could be set up.
+        */
+       if (status & VIRTIO_CONFIG_S_DRIVER_OK &&
+           !(s & VIRTIO_CONFIG_S_DRIVER_OK)) {
+               vp_vdpa_request_irq(vp_vdpa);
+       }
+
+       vp_modern_set_status(mdev, status);
+
+       if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+           (s & VIRTIO_CONFIG_S_DRIVER_OK))
+               vp_vdpa_free_irq(vp_vdpa);
+}
+
+/* Return the fixed maximum virtqueue size, VP_VDPA_QUEUE_MAX. */
+static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+       return VP_VDPA_QUEUE_MAX;
+}
+
+/* Getting the virtqueue state is not implemented; always fails. */
+static int vp_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid,
+                               struct vdpa_vq_state *state)
+{
+       /* Note that this is not supported by virtio specification, so
+        * we return -EOPNOTSUPP here. This means we can't support live
+        * migration, vhost device start/stop.
+        */
+       return -EOPNOTSUPP;
+}
+
+/* Setting the virtqueue state is not implemented; always fails. */
+static int vp_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid,
+                               const struct vdpa_vq_state *state)
+{
+       /* Note that this is not supported by virtio specification, so
+        * we return -EOPNOTSUPP here. This means we can't support live
+        * migration, vhost device start/stop.
+        */
+       return -EOPNOTSUPP;
+}
+
+/* Store a copy of @cb as the callback of virtqueue @qid. */
+static void vp_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid,
+                             struct vdpa_callback *cb)
+{
+       struct vp_vring *vring = &vdpa_to_vp(vdpa)->vring[qid];
+
+       vring->cb = *cb;
+}
+
+/* Pass @ready for virtqueue @qid to vp_modern_set_queue_enable(). */
+static void vp_vdpa_set_vq_ready(struct vdpa_device *vdpa,
+                                u16 qid, bool ready)
+{
+       vp_modern_set_queue_enable(vdpa_to_mdev(vdpa), qid, ready);
+}
+
+/* Return what vp_modern_get_queue_enable() reports for virtqueue @qid. */
+static bool vp_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid)
+{
+       return vp_modern_get_queue_enable(vdpa_to_mdev(vdpa), qid);
+}
+
+/* Pass the size @num of virtqueue @qid to vp_modern_set_queue_size(). */
+static void vp_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid,
+                              u32 num)
+{
+       vp_modern_set_queue_size(vdpa_to_mdev(vdpa), qid, num);
+}
+
+/* Pass the three area addresses of virtqueue @qid to
+ * vp_modern_queue_address(). Always returns 0.
+ */
+static int vp_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid,
+                                 u64 desc_area, u64 driver_area,
+                                 u64 device_area)
+{
+       vp_modern_queue_address(vdpa_to_mdev(vdpa), qid, desc_area,
+                               driver_area, device_area);
+       return 0;
+}
+
+/* Kick virtqueue @qid by writing its index to its notification area. */
+static void vp_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid)
+{
+       struct vp_vring *vring = &vdpa_to_vp(vdpa)->vring[qid];
+
+       vp_iowrite16(qid, vring->notify);
+}
+
+/* Return the value reported by vp_modern_generation(). */
+static u32 vp_vdpa_get_generation(struct vdpa_device *vdpa)
+{
+       return vp_modern_generation(vdpa_to_mdev(vdpa));
+}
+
+/* Return the device ID recorded in the modern device. */
+static u32 vp_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+       return vdpa_to_mdev(vdpa)->id.device;
+}
+
+/* Return the vendor ID recorded in the modern device. */
+static u32 vp_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+       return vdpa_to_mdev(vdpa)->id.vendor;
+}
+
+/* Return the virtqueue alignment, fixed to PAGE_SIZE. */
+static u32 vp_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+       return PAGE_SIZE;
+}
+
+/* Return the device_len recorded in the modern device. */
+static size_t vp_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+       return vdpa_to_mdev(vdpa)->device_len;
+}
+
+/* Read @len bytes of device config space, starting at @offset, into @buf.
+ *
+ * The whole read is repeated until config_generation is the same before
+ * and after it.
+ */
+static void vp_vdpa_get_config(struct vdpa_device *vdpa,
+                              unsigned int offset,
+                              void *buf, unsigned int len)
+{
+       struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       u8 old, new;
+       u8 *p;
+       /* Unsigned like @len, so the comparison below is not mixed-sign */
+       unsigned int i;
+
+       do {
+               old = vp_ioread8(&mdev->common->config_generation);
+               p = buf;
+               for (i = 0; i < len; i++)
+                       *p++ = vp_ioread8(mdev->device + offset + i);
+
+               new = vp_ioread8(&mdev->common->config_generation);
+       } while (old != new);
+}
+
+/* Write @len bytes from @buf to device config space, starting at @offset,
+ * one byte at a time.
+ */
+static void vp_vdpa_set_config(struct vdpa_device *vdpa,
+                              unsigned int offset, const void *buf,
+                              unsigned int len)
+{
+       struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       const u8 *p = buf;
+       /* Unsigned like @len, so the comparison below is not mixed-sign */
+       unsigned int i;
+
+       for (i = 0; i < len; i++)
+               vp_iowrite8(*p++, mdev->device + offset + i);
+}
+
+/* Store a copy of @cb as the config change callback. */
+static void vp_vdpa_set_config_cb(struct vdpa_device *vdpa,
+                                 struct vdpa_callback *cb)
+{
+       vdpa_to_vp(vdpa)->config_cb = *cb;
+}
+
+/* Describe the notification area of virtqueue @qid: its recorded address
+ * and a size equal to the device's notify_offset_multiplier.
+ */
+static struct vdpa_notification_area
+vp_vdpa_get_vq_notification(struct vdpa_device *vdpa, u16 qid)
+{
+       struct vp_vdpa *vp = vdpa_to_vp(vdpa);
+       struct vdpa_notification_area area = {
+               .addr = vp->vring[qid].notify_pa,
+               .size = vp->mdev.notify_offset_multiplier,
+       };
+
+       return area;
+}
+
+/* vDPA config operations implemented by this driver; passed to
+ * vdpa_alloc_device() in vp_vdpa_probe().
+ */
+static const struct vdpa_config_ops vp_vdpa_ops = {
+       .get_features   = vp_vdpa_get_features,
+       .set_features   = vp_vdpa_set_features,
+       .get_status     = vp_vdpa_get_status,
+       .set_status     = vp_vdpa_set_status,
+       .get_vq_num_max = vp_vdpa_get_vq_num_max,
+       .get_vq_state   = vp_vdpa_get_vq_state,
+       .get_vq_notification = vp_vdpa_get_vq_notification,
+       .set_vq_state   = vp_vdpa_set_vq_state,
+       .set_vq_cb      = vp_vdpa_set_vq_cb,
+       .set_vq_ready   = vp_vdpa_set_vq_ready,
+       .get_vq_ready   = vp_vdpa_get_vq_ready,
+       .set_vq_num     = vp_vdpa_set_vq_num,
+       .set_vq_address = vp_vdpa_set_vq_address,
+       .kick_vq        = vp_vdpa_kick_vq,
+       .get_generation = vp_vdpa_get_generation,
+       .get_device_id  = vp_vdpa_get_device_id,
+       .get_vendor_id  = vp_vdpa_get_vendor_id,
+       .get_vq_align   = vp_vdpa_get_vq_align,
+       .get_config_size = vp_vdpa_get_config_size,
+       .get_config     = vp_vdpa_get_config,
+       .set_config     = vp_vdpa_set_config,
+       .set_config_cb  = vp_vdpa_set_config_cb,
+};
+
+/* devres action registered in vp_vdpa_probe(); @data is the pci_dev whose
+ * irq vectors are freed.
+ */
+static void vp_vdpa_free_irq_vectors(void *data)
+{
+       pci_free_irq_vectors(data);
+}
+
+/* Probe a modern virtio-pci device and register it on the vDPA bus.
+ *
+ * Enables the PCI device, allocates the vDPA device, probes the modern
+ * virtio-pci layout, maps the notification area of every virtqueue and
+ * finally registers the device.
+ *
+ * Returns 0 on success or a negative error code. Once the vDPA device has
+ * been allocated, every failure drops it with put_device().
+ */
+static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct virtio_pci_modern_device *mdev;
+       struct device *dev = &pdev->dev;
+       struct vp_vdpa *vp_vdpa;
+       int ret, i;
+
+       ret = pcim_enable_device(pdev);
+       if (ret)
+               return ret;
+
+       vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
+                                   dev, &vp_vdpa_ops, NULL);
+       /* Failure is reported as an error pointer, not as NULL */
+       if (IS_ERR(vp_vdpa)) {
+               dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
+               return PTR_ERR(vp_vdpa);
+       }
+
+       mdev = &vp_vdpa->mdev;
+       mdev->pci_dev = pdev;
+
+       ret = vp_modern_probe(mdev);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to probe modern PCI device\n");
+               goto err;
+       }
+
+       pci_set_master(pdev);
+       pci_set_drvdata(pdev, vp_vdpa);
+
+       vp_vdpa->vdpa.dma_dev = &pdev->dev;
+       vp_vdpa->queues = vp_modern_get_num_queues(mdev);
+
+       ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev);
+       if (ret) {
+               dev_err(&pdev->dev,
+                       "Failed for adding devres for freeing irq vectors\n");
+               goto err;
+       }
+
+       vp_vdpa->vring = devm_kcalloc(&pdev->dev, vp_vdpa->queues,
+                                     sizeof(*vp_vdpa->vring),
+                                     GFP_KERNEL);
+       if (!vp_vdpa->vring) {
+               ret = -ENOMEM;
+               dev_err(&pdev->dev, "Fail to allocate virtqueues\n");
+               goto err;
+       }
+
+       for (i = 0; i < vp_vdpa->queues; i++) {
+               vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+               vp_vdpa->vring[i].notify =
+                       vp_modern_map_vq_notify(mdev, i,
+                                               &vp_vdpa->vring[i].notify_pa);
+               if (!vp_vdpa->vring[i].notify) {
+                       /* 'ret' is still 0 here; without an explicit error
+                        * the probe would report success.
+                        */
+                       ret = -EINVAL;
+                       dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i);
+                       goto err;
+               }
+       }
+       vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+
+       ret = vdpa_register_device(&vp_vdpa->vdpa, vp_vdpa->queues);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to register to vdpa bus\n");
+               goto err;
+       }
+
+       return 0;
+
+err:
+       put_device(&vp_vdpa->vdpa.dev);
+       return ret;
+}
+
+/* PCI remove callback: unregister the vDPA device stored as driver data
+ * by vp_vdpa_probe(), then tear down the modern virtio-pci device.
+ */
+static void vp_vdpa_remove(struct pci_dev *pdev)
+{
+       struct vp_vdpa *vp_vdpa = pci_get_drvdata(pdev);
+
+       vdpa_unregister_device(&vp_vdpa->vdpa);
+       vp_modern_remove(&vp_vdpa->mdev);
+}
+
+/* PCI driver definition. There is no static ID table, so the driver only
+ * matches device IDs added dynamically.
+ */
+static struct pci_driver vp_vdpa_driver = {
+       .name           = "vp-vdpa",
+       .id_table       = NULL, /* only dynamic ids */
+       .probe          = vp_vdpa_probe,
+       .remove         = vp_vdpa_remove,
+};
+
+module_pci_driver(vp_vdpa_driver);
+
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_DESCRIPTION("vp-vdpa");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
index 0729632..a0747c3 100644 (file)
@@ -2248,7 +2248,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
        int ret;
        bool resv_msi, msi_remap;
        phys_addr_t resv_msi_base = 0;
-       struct iommu_domain_geometry geo;
+       struct iommu_domain_geometry *geo;
        LIST_HEAD(iova_copy);
        LIST_HEAD(group_resv_regions);
 
@@ -2316,10 +2316,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
        }
 
        if (iommu->nesting) {
-               int attr = 1;
-
-               ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
-                                           &attr);
+               ret = iommu_enable_nesting(domain->domain);
                if (ret)
                        goto out_domain;
        }
@@ -2329,10 +2326,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
                goto out_domain;
 
        /* Get aperture info */
-       iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
-
-       if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
-                                    geo.aperture_end)) {
+       geo = &domain->domain->geometry;
+       if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
+                                    geo->aperture_end)) {
                ret = -EINVAL;
                goto out_detach;
        }
@@ -2355,8 +2351,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
        if (ret)
                goto out_detach;
 
-       ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
-                                    geo.aperture_end);
+       ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
+                                    geo->aperture_end);
        if (ret)
                goto out_detach;
 
@@ -2489,7 +2485,6 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
                                   struct list_head *iova_copy)
 {
        struct vfio_domain *domain;
-       struct iommu_domain_geometry geo;
        struct vfio_iova *node;
        dma_addr_t start = 0;
        dma_addr_t end = (dma_addr_t)~0;
@@ -2498,12 +2493,12 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
                return;
 
        list_for_each_entry(domain, &iommu->domain_list, next) {
-               iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
-                                     &geo);
-               if (geo.aperture_start > start)
-                       start = geo.aperture_start;
-               if (geo.aperture_end < end)
-                       end = geo.aperture_end;
+               struct iommu_domain_geometry *geo = &domain->domain->geometry;
+
+               if (geo->aperture_start > start)
+                       start = geo->aperture_start;
+               if (geo->aperture_end < end)
+                       end = geo->aperture_end;
        }
 
        /* Modify aperture limits. The new aper is either same or bigger */
index bfa4c6e..fb41db3 100644 (file)
 #include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/iommu.h>
 #include <linux/uuid.h>
 #include <linux/vdpa.h>
 #include <linux/nospec.h>
 #include <linux/vhost.h>
-#include <linux/virtio_net.h>
 
 #include "vhost.h"
 
@@ -188,13 +188,8 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
 static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
                                      struct vhost_vdpa_config *c)
 {
-       long size = 0;
-
-       switch (v->virtio_id) {
-       case VIRTIO_ID_NET:
-               size = sizeof(struct virtio_net_config);
-               break;
-       }
+       struct vdpa_device *vdpa = v->vdpa;
+       long size = vdpa->config->get_config_size(vdpa);
 
        if (c->len == 0)
                return -EINVAL;
@@ -836,18 +831,14 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
 static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
 {
        struct vdpa_iova_range *range = &v->range;
-       struct iommu_domain_geometry geo;
        struct vdpa_device *vdpa = v->vdpa;
        const struct vdpa_config_ops *ops = vdpa->config;
 
        if (ops->get_iova_range) {
                *range = ops->get_iova_range(vdpa);
-       } else if (v->domain &&
-                  !iommu_domain_get_attr(v->domain,
-                  DOMAIN_ATTR_GEOMETRY, &geo) &&
-                  geo.force_aperture) {
-               range->first = geo.aperture_start;
-               range->last = geo.aperture_end;
+       } else if (v->domain && v->domain->geometry.force_aperture) {
+               range->first = v->domain->geometry.aperture_start;
+               range->last = v->domain->geometry.aperture_end;
        } else {
                range->first = 0;
                range->last = ULLONG_MAX;
@@ -993,6 +984,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
        if (vma->vm_end - vma->vm_start != notify.size)
                return -ENOTSUPP;
 
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &vhost_vdpa_vm_ops;
        return 0;
 }
@@ -1027,10 +1019,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
        int minor;
        int r;
 
-       /* Currently, we only accept the network devices. */
-       if (ops->get_device_id(vdpa) != VIRTIO_ID_NET)
-               return -ENOTSUPP;
-
        v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
        if (!v)
                return -ENOMEM;
index 85d85fa..4af8fa2 100644 (file)
@@ -75,6 +75,34 @@ static inline int __vringh_get_head(const struct vringh *vrh,
        return head;
 }
 
+/**
+ * vringh_kiov_advance - skip bytes from vringh_kiov
+ * @iov: an iov passed to vringh_getdesc_*() (updated as we consume)
+ * @len: the maximum length to advance
+ *
+ * Consumes up to @len bytes starting at element iov->i.  Stops early, without
+ * reporting it, once all iov->used elements have been consumed.
+ */
+void vringh_kiov_advance(struct vringh_kiov *iov, size_t len)
+{
+       while (len && iov->i < iov->used) {
+               /* Never step past the end of the current element. */
+               size_t partlen = min(iov->iov[iov->i].iov_len, len);
+
+               /*
+                * The current element is shrunk in place; iov->consumed
+                * remembers by how much so that it can be restored below.
+                */
+               iov->consumed += partlen;
+               iov->iov[iov->i].iov_len -= partlen;
+               iov->iov[iov->i].iov_base += partlen;
+
+               if (!iov->iov[iov->i].iov_len) {
+                       /*
+                        * Fix up old iov element then increment: the element
+                        * is exhausted, so put back its original base and
+                        * length before moving on to the next one.
+                        */
+                       iov->iov[iov->i].iov_len = iov->consumed;
+                       iov->iov[iov->i].iov_base -= iov->consumed;
+
+                       iov->consumed = 0;
+                       iov->i++;
+               }
+
+               len -= partlen;
+       }
+}
+EXPORT_SYMBOL(vringh_kiov_advance);
+
 /* Copy some bytes to/from the iovec.  Returns num copied. */
 static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
                                      struct vringh_kiov *iov,
@@ -95,19 +123,8 @@ static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
                done += partlen;
                len -= partlen;
                ptr += partlen;
-               iov->consumed += partlen;
-               iov->iov[iov->i].iov_len -= partlen;
-               iov->iov[iov->i].iov_base += partlen;
 
-               if (!iov->iov[iov->i].iov_len) {
-                       /* Fix up old iov element then increment. */
-                       iov->iov[iov->i].iov_len = iov->consumed;
-                       iov->iov[iov->i].iov_base -= iov->consumed;
-
-                       
-                       iov->consumed = 0;
-                       iov->i++;
-               }
+               vringh_kiov_advance(iov, partlen);
        }
        return done;
 }
@@ -290,9 +307,9 @@ __vringh_iov(struct vringh *vrh, u16 i,
                return -EINVAL;
 
        if (riov)
-               riov->i = riov->used = 0;
+               riov->i = riov->used = riov->consumed = 0;
        if (wiov)
-               wiov->i = wiov->used = 0;
+               wiov->i = wiov->used = wiov->consumed = 0;
 
        for (;;) {
                void *addr;
@@ -662,7 +679,10 @@ EXPORT_SYMBOL(vringh_init_user);
  * *head will be vrh->vring.num.  You may be able to ignore an invalid
  * descriptor, but there's not much you can do with an invalid ring.
  *
- * Note that you may need to clean up riov and wiov, even on error!
+ * Note that you can reuse riov and wiov with subsequent calls. Content is
+ * overwritten and memory reallocated if more space is needed.
+ * When you no longer need riov and wiov, you should clean them up by
+ * calling vringh_iov_cleanup() to release the memory, even on error!
  */
 int vringh_getdesc_user(struct vringh *vrh,
                        struct vringh_iov *riov,
@@ -932,7 +952,10 @@ EXPORT_SYMBOL(vringh_init_kern);
  * *head will be vrh->vring.num.  You may be able to ignore an invalid
  * descriptor, but there's not much you can do with an invalid ring.
  *
- * Note that you may need to clean up riov and wiov, even on error!
+ * Note that you can reuse riov and wiov with subsequent calls. Content is
+ * overwritten and memory reallocated if more space is needed.
+ * When you no longer need riov and wiov, you should clean them up by
+ * calling vringh_kiov_cleanup() to release the memory, even on error!
  */
 int vringh_getdesc_kern(struct vringh *vrh,
                        struct vringh_kiov *riov,
@@ -1074,6 +1097,8 @@ static int iotlb_translate(const struct vringh *vrh,
        int ret = 0;
        u64 s = 0;
 
+       spin_lock(vrh->iotlb_lock);
+
        while (len > s) {
                u64 size, pa, pfn;
 
@@ -1103,6 +1128,8 @@ static int iotlb_translate(const struct vringh *vrh,
                ++ret;
        }
 
+       spin_unlock(vrh->iotlb_lock);
+
        return ret;
 }
 
@@ -1262,10 +1289,13 @@ EXPORT_SYMBOL(vringh_init_iotlb);
  * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
  * @vrh: the vring
  * @iotlb: iotlb associated with this vring
+ * @iotlb_lock: spinlock to synchronize the iotlb accesses
  */
-void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb)
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb,
+                     spinlock_t *iotlb_lock)
 {
        vrh->iotlb = iotlb;
+       vrh->iotlb_lock = iotlb_lock;
 }
 EXPORT_SYMBOL(vringh_set_iotlb);
 
@@ -1285,7 +1315,10 @@ EXPORT_SYMBOL(vringh_set_iotlb);
  * *head will be vrh->vring.num.  You may be able to ignore an invalid
  * descriptor, but there's not much you can do with an invalid ring.
  *
- * Note that you may need to clean up riov and wiov, even on error!
+ * Note that you can reuse riov and wiov with subsequent calls. Content is
+ * overwritten and memory reallocated if more space is needed.
+ * When you no longer need riov and wiov, you should clean them up by
+ * calling vringh_kiov_cleanup() to release the memory, even on error!
  */
 int vringh_getdesc_iotlb(struct vringh *vrh,
                         struct vringh_kiov *riov,
index a385342..4325bf7 100644 (file)
@@ -2608,12 +2608,3 @@ EXPORT_SYMBOL(matroxfb_register_driver);
 EXPORT_SYMBOL(matroxfb_unregister_driver);
 EXPORT_SYMBOL(matroxfb_wait_for_sync);
 EXPORT_SYMBOL(matroxfb_enable_irq);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
index 1e8a38a..e2757ff 100644 (file)
@@ -1451,13 +1451,3 @@ MODULE_DESCRIPTION("Legacy VGA framebuffer device driver");
 MODULE_LICENSE("GPL");
 module_init(vga16fb_init);
 module_exit(vga16fb_exit);
-
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
index f1964ea..e21e1e8 100644 (file)
@@ -1524,7 +1524,8 @@ static const struct file_operations ne_enclave_fops = {
  *                       enclave file descriptor to be further used for enclave
  *                       resources handling e.g. memory regions and CPUs.
  * @ne_pci_dev :       Private data associated with the PCI device.
- * @slot_uid:          Generated unique slot id associated with an enclave.
+ * @slot_uid:          User pointer to store the generated unique slot id
+ *                     associated with an enclave to.
  *
  * Context: Process context. This function is called with the ne_pci_dev enclave
  *         mutex held.
@@ -1532,7 +1533,7 @@ static const struct file_operations ne_enclave_fops = {
  * * Enclave fd on success.
  * * Negative return value on failure.
  */
-static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 *slot_uid)
+static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid)
 {
        struct ne_pci_dev_cmd_reply cmd_reply = {};
        int enclave_fd = -1;
@@ -1634,7 +1635,18 @@ static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 *slot_uid)
 
        list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list);
 
-       *slot_uid = ne_enclave->slot_uid;
+       if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) {
+               /*
+                * As we're holding the only reference to 'enclave_file', fput()
+                * will call ne_enclave_release() which will do a proper cleanup
+                * of all so far allocated resources, leaving only the unused fd
+                * for us to free.
+                */
+               fput(enclave_file);
+               put_unused_fd(enclave_fd);
+
+               return -EFAULT;
+       }
 
        fd_install(enclave_fd, enclave_file);
 
@@ -1671,34 +1683,13 @@ static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        switch (cmd) {
        case NE_CREATE_VM: {
                int enclave_fd = -1;
-               struct file *enclave_file = NULL;
                struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
-               int rc = -EINVAL;
-               u64 slot_uid = 0;
+               u64 __user *slot_uid = (void __user *)arg;
 
                mutex_lock(&ne_pci_dev->enclaves_list_mutex);
-
-               enclave_fd = ne_create_vm_ioctl(ne_pci_dev, &slot_uid);
-               if (enclave_fd < 0) {
-                       rc = enclave_fd;
-
-                       mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
-
-                       return rc;
-               }
-
+               enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid);
                mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
 
-               if (copy_to_user((void __user *)arg, &slot_uid, sizeof(slot_uid))) {
-                       enclave_file = fget(enclave_fd);
-                       /* Decrement file refs to have release() called. */
-                       fput(enclave_file);
-                       fput(enclave_file);
-                       put_unused_fd(enclave_fd);
-
-                       return -EFAULT;
-               }
-
                return enclave_fd;
        }
 
index 8985fc2..510e931 100644 (file)
@@ -734,7 +734,7 @@ static void report_free_page_func(struct work_struct *work)
 #ifdef CONFIG_BALLOON_COMPACTION
 /*
  * virtballoon_migratepage - perform the balloon page migration on behalf of
- *                          a compation thread.     (called under page lock)
+ *                          a compaction thread.     (called under page lock)
  * @vb_dev_info: the balloon device
  * @newpage: page that will replace the isolated page after migration finishes.
  * @page   : the isolated (old) page that is about to be migrated to newpage.
index fbd4ebc..30654d3 100644 (file)
@@ -192,7 +192,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 
        struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
        struct virtqueue *vq;
-       u16 num, off;
+       u16 num;
        int err;
 
        if (index >= vp_modern_get_num_queues(mdev))
@@ -208,9 +208,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
                return ERR_PTR(-EINVAL);
        }
 
-       /* get offset of notification word for this vq */
-       off = vp_modern_get_queue_notify_off(mdev, index);
-
        info->msix_vector = msix_vec;
 
        /* create the vring */
@@ -227,27 +224,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
                                virtqueue_get_avail_addr(vq),
                                virtqueue_get_used_addr(vq));
 
-       if (mdev->notify_base) {
-               /* offset should not wrap */
-               if ((u64)off * mdev->notify_offset_multiplier + 2
-                   > mdev->notify_len) {
-                       dev_warn(&mdev->pci_dev->dev,
-                                "bad notification offset %u (x %u) "
-                                "for queue %u > %zd",
-                                off, mdev->notify_offset_multiplier,
-                                index, mdev->notify_len);
-                       err = -EINVAL;
-                       goto err_map_notify;
-               }
-               vq->priv = (void __force *)mdev->notify_base +
-                       off * mdev->notify_offset_multiplier;
-       } else {
-               vq->priv = (void __force *)vp_modern_map_capability(mdev,
-                                                         mdev->notify_map_cap, 2, 2,
-                                                         off * mdev->notify_offset_multiplier, 2,
-                                                         NULL);
-       }
-
+       vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL);
        if (!vq->priv) {
                err = -ENOMEM;
                goto err_map_notify;
index cbd6674..54f2970 100644 (file)
  * @start: start from the capability
  * @size: map size
  * @len: the length that is actually mapped
+ * @pa: physical address of the capability
  *
  * Returns the io address of for the part of the capability
  */
-void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
-                                      size_t minlen,
-                                      u32 align,
-                                      u32 start, u32 size,
-                                      size_t *len)
+static void __iomem *
+vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+                        size_t minlen, u32 align, u32 start, u32 size,
+                        size_t *len, resource_size_t *pa)
 {
        struct pci_dev *dev = mdev->pci_dev;
        u8 bar;
@@ -88,9 +88,11 @@ void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, in
                dev_err(&dev->dev,
                        "virtio_pci: unable to map virtio %u@%u on bar %i\n",
                        length, offset, bar);
+       else if (pa)
+               *pa = pci_resource_start(dev, bar) + offset;
+
        return p;
 }
-EXPORT_SYMBOL_GPL(vp_modern_map_capability);
 
 /**
  * virtio_pci_find_capability - walk capabilities to find device info.
@@ -275,12 +277,12 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
        mdev->common = vp_modern_map_capability(mdev, common,
                                      sizeof(struct virtio_pci_common_cfg), 4,
                                      0, sizeof(struct virtio_pci_common_cfg),
-                                     NULL);
+                                     NULL, NULL);
        if (!mdev->common)
                goto err_map_common;
        mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
                                             0, 1,
-                                            NULL);
+                                            NULL, NULL);
        if (!mdev->isr)
                goto err_map_isr;
 
@@ -308,7 +310,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
                mdev->notify_base = vp_modern_map_capability(mdev, notify,
                                                             2, 2,
                                                             0, notify_length,
-                                                            &mdev->notify_len);
+                                                            &mdev->notify_len,
+                                                            &mdev->notify_pa);
                if (!mdev->notify_base)
                        goto err_map_notify;
        } else {
@@ -321,7 +324,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
        if (device) {
                mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
                                                        0, PAGE_SIZE,
-                                                       &mdev->device_len);
+                                                       &mdev->device_len,
+                                                       NULL);
                if (!mdev->device)
                        goto err_map_device;
        }
@@ -584,14 +588,51 @@ EXPORT_SYMBOL_GPL(vp_modern_get_num_queues);
  *
  * Returns the notification offset for a virtqueue
  */
-u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
-                                  u16 index)
+static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+                                         u16 index)
 {
        vp_iowrite16(index, &mdev->common->queue_select);
 
        return vp_ioread16(&mdev->common->queue_notify_off);
 }
-EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off);
+
+/**
+ * vp_modern_map_vq_notify - map notification area for a specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @pa: the pointer to the physical address of the notify area; may be NULL
+ *      when the caller does not need the physical address
+ *
+ * Returns the address of the notification area.  NULL is returned when the
+ * queue's notification offset does not fit into the already mapped notify
+ * region; callers treat a NULL result as failure.
+ */
+void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+                                     u16 index, resource_size_t *pa)
+{
+       u16 off = vp_modern_get_queue_notify_off(mdev, index);
+
+       if (mdev->notify_base) {
+               /*
+                * The notify region is already mapped as a whole: validate
+                * the queue's offset and index into that mapping.
+                */
+               /* offset should not wrap */
+               if ((u64)off * mdev->notify_offset_multiplier + 2
+                       > mdev->notify_len) {
+                       dev_warn(&mdev->pci_dev->dev,
+                                "bad notification offset %u (x %u) "
+                                "for queue %u > %zd",
+                                off, mdev->notify_offset_multiplier,
+                                index, mdev->notify_len);
+                       return NULL;
+               }
+               if (pa)
+                       *pa = mdev->notify_pa +
+                             off * mdev->notify_offset_multiplier;
+               return mdev->notify_base + off * mdev->notify_offset_multiplier;
+       } else {
+               /*
+                * No shared mapping: map a 2-byte window at this queue's
+                * offset inside the notify capability; @pa is filled in by
+                * vp_modern_map_capability() when it is non-NULL.
+                */
+               return vp_modern_map_capability(mdev,
+                                      mdev->notify_map_cap, 2, 2,
+                                      off * mdev->notify_offset_multiplier, 2,
+                                      NULL, pa);
+       }
+}
+EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify);
 
 MODULE_VERSION("0.1");
 MODULE_DESCRIPTION("Modern Virtio PCI Device");
index 2b385c1..4c89afc 100644 (file)
 
 #include <trace/events/swiotlb.h>
 #define MAX_DMA_BITS 32
-/*
- * Used to do a quick range check in swiotlb_tbl_unmap_single and
- * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
- * API.
- */
 
-static char *xen_io_tlb_start, *xen_io_tlb_end;
-static unsigned long xen_io_tlb_nslabs;
 /*
  * Quick lookup value of the bus address of the IOTLB.
  */
@@ -82,11 +75,6 @@ static inline phys_addr_t xen_dma_to_phys(struct device *dev,
        return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr));
 }
 
-static inline dma_addr_t xen_virt_to_bus(struct device *dev, void *address)
-{
-       return xen_phys_to_dma(dev, virt_to_phys(address));
-}
-
 static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
 {
        unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p);
@@ -111,15 +99,12 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)
         * have the same virtual address as another address
         * in our domain. Therefore _only_ check address within our domain.
         */
-       if (pfn_valid(PFN_DOWN(paddr))) {
-               return paddr >= virt_to_phys(xen_io_tlb_start) &&
-                      paddr < virt_to_phys(xen_io_tlb_end);
-       }
+       if (pfn_valid(PFN_DOWN(paddr)))
+               return is_swiotlb_buffer(paddr);
        return 0;
 }
 
-static int
-xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+static int xen_swiotlb_fixup(void *buf, unsigned long nslabs)
 {
        int i, rc;
        int dma_bits;
@@ -145,16 +130,6 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
        } while (i < nslabs);
        return 0;
 }
-static unsigned long xen_set_nslabs(unsigned long nr_tbl)
-{
-       if (!nr_tbl) {
-               xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
-               xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
-       } else
-               xen_io_tlb_nslabs = nr_tbl;
-
-       return xen_io_tlb_nslabs << IO_TLB_SHIFT;
-}
 
 enum xen_swiotlb_err {
        XEN_SWIOTLB_UNKNOWN = 0,
@@ -177,102 +152,109 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
        }
        return "";
 }
-int __ref xen_swiotlb_init(int verbose, bool early)
+
+#define DEFAULT_NSLABS         ALIGN(SZ_64M >> IO_TLB_SHIFT, IO_TLB_SEGSIZE)
+
+/*
+ * xen_swiotlb_init - late (page allocator based) setup of the Xen swiotlb
+ *
+ * Allocates the bounce buffer with xen_get_swiotlb_free_pages(), lets
+ * xen_swiotlb_fixup() replace it with pages under 4GB and hands the result
+ * to swiotlb_late_init_with_tbl().  When allocation or fixup fails, the
+ * requested size is halved (never below 1024 slabs, i.e. 2MB) and the whole
+ * sequence is retried up to three times.
+ *
+ * Returns 0 on success, a non-zero error code otherwise.
+ */
+int __ref xen_swiotlb_init(void)
 {
-       unsigned long bytes, order;
-       int rc = -ENOMEM;
        enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
-       unsigned int repeat = 3;
+       unsigned long bytes = swiotlb_size_or_default();
+       unsigned long nslabs = bytes >> IO_TLB_SHIFT;
+       unsigned int order, repeat = 3;
+       int rc = -ENOMEM;
+       char *start;
 
-       xen_io_tlb_nslabs = swiotlb_nr_tbl();
 retry:
-       bytes = xen_set_nslabs(xen_io_tlb_nslabs);
-       order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);
-
-       /*
-        * IO TLB memory already allocated. Just use it.
-        */
-       if (io_tlb_start != 0) {
-               xen_io_tlb_start = phys_to_virt(io_tlb_start);
-               goto end;
-       }
+       m_ret = XEN_SWIOTLB_ENOMEM;
+       order = get_order(bytes);
+       /*
+        * Forget any buffer from a previous attempt (it has been freed
+        * already) so that a skipped allocation loop is seen as a failure.
+        */
+       start = NULL;
 
        /*
         * Get IO TLB memory from any location.
         */
-       if (early) {
-               xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes),
-                                                 PAGE_SIZE);
-               if (!xen_io_tlb_start)
-                       panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-                             __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
-       } else {
 #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
-               while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-                       xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order);
-                       if (xen_io_tlb_start)
-                               break;
-                       order--;
-               }
-               if (order != get_order(bytes)) {
-                       pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
-                               (PAGE_SIZE << order) >> 20);
-                       xen_io_tlb_nslabs = SLABS_PER_PAGE << order;
-                       bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
-               }
+       while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+               start = (void *)xen_get_swiotlb_free_pages(order);
+               if (start)
+                       break;
+               order--;
        }
-       if (!xen_io_tlb_start) {
-               m_ret = XEN_SWIOTLB_ENOMEM;
+       if (!start)
                goto error;
+       if (order != get_order(bytes)) {
+               pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n",
+                       (PAGE_SIZE << order) >> 20);
+               nslabs = SLABS_PER_PAGE << order;
+               bytes = nslabs << IO_TLB_SHIFT;
        }
+
        /*
         * And replace that memory with pages under 4GB.
         */
-       rc = xen_swiotlb_fixup(xen_io_tlb_start,
-                              bytes,
-                              xen_io_tlb_nslabs);
+       rc = xen_swiotlb_fixup(start, nslabs);
        if (rc) {
-               if (early)
-                       memblock_free(__pa(xen_io_tlb_start),
-                                     PAGE_ALIGN(bytes));
-               else {
-                       free_pages((unsigned long)xen_io_tlb_start, order);
-                       xen_io_tlb_start = NULL;
-               }
+               free_pages((unsigned long)start, order);
                m_ret = XEN_SWIOTLB_EFIXUP;
                goto error;
        }
-       if (early) {
-               if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
-                        verbose))
-                       panic("Cannot allocate SWIOTLB buffer");
-               rc = 0;
-       } else
-               rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);
-
-end:
-       xen_io_tlb_end = xen_io_tlb_start + bytes;
-       if (!rc)
-               swiotlb_set_max_segment(PAGE_SIZE);
-
-       return rc;
+       rc = swiotlb_late_init_with_tbl(start, nslabs);
+       /*
+        * NOTE(review): 'start' is not released on this failure path; confirm
+        * whether swiotlb_late_init_with_tbl() leaves ownership with the
+        * caller when it fails.
+        */
+       if (rc)
+               return rc;
+       swiotlb_set_max_segment(PAGE_SIZE);
+       return 0;
 error:
        if (repeat--) {
-               xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */
-                                       (xen_io_tlb_nslabs >> 1));
+               /* Min is 2MB */
+               nslabs = max(1024UL, (nslabs >> 1));
+               /* Keep 'bytes' in sync, the retry derives 'order' from it. */
+               bytes = nslabs << IO_TLB_SHIFT;
                pr_info("Lowering to %luMB\n",
-                       (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20);
+                       (nslabs << IO_TLB_SHIFT) >> 20);
                goto retry;
        }
        pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc);
-       if (early)
-               panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc);
-       else
-               free_pages((unsigned long)xen_io_tlb_start, order);
+       /*
+        * Nothing to free here: the allocation-failure path never obtained a
+        * buffer and the fixup-failure path has already released it.
+        */
        return rc;
 }
 
+#ifdef CONFIG_X86
+/*
+ * xen_swiotlb_init_early - boot-time (memblock based) setup of the Xen swiotlb
+ *
+ * Allocates the bounce buffer from memblock, lets xen_swiotlb_fixup() replace
+ * it with pages under 4GB and registers it via swiotlb_init_with_tbl().  A
+ * failing fixup frees the buffer, halves the size (never below 1024 slabs,
+ * i.e. 2MB) and retries, up to three times.  Every unrecoverable failure
+ * ends in panic(), so the function has no error return.
+ */
+void __init xen_swiotlb_init_early(void)
+{
+       unsigned long bytes = swiotlb_size_or_default();
+       unsigned long nslabs = bytes >> IO_TLB_SHIFT;
+       unsigned int repeat = 3;
+       char *start;
+       int rc;
+
+retry:
+       /*
+        * Get IO TLB memory from any location.
+        */
+       start = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
+       if (!start)
+               panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+                     __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
+
+       /*
+        * And replace that memory with pages under 4GB.
+        */
+       rc = xen_swiotlb_fixup(start, nslabs);
+       if (rc) {
+               /* Give the buffer back before retrying with a smaller size. */
+               memblock_free(__pa(start), PAGE_ALIGN(bytes));
+               if (repeat--) {
+                       /* Min is 2MB */
+                       nslabs = max(1024UL, (nslabs >> 1));
+                       /* 'bytes' sizes the next memblock_alloc(), keep it in sync. */
+                       bytes = nslabs << IO_TLB_SHIFT;
+                       pr_info("Lowering to %luMB\n", bytes >> 20);
+                       goto retry;
+               }
+               panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc);
+       }
+
+       if (swiotlb_init_with_tbl(start, nslabs, false))
+               panic("Cannot allocate SWIOTLB buffer");
+       swiotlb_set_max_segment(PAGE_SIZE);
+}
+#endif /* CONFIG_X86 */
+
 static void *
 xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                           dma_addr_t *dma_handle, gfp_t flags,
@@ -406,7 +388,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
         * Ensure that the address returned is DMA'ble
         */
        if (unlikely(!dma_capable(dev, dev_addr, size, true))) {
-               swiotlb_tbl_unmap_single(dev, map, size, size, dir,
+               swiotlb_tbl_unmap_single(dev, map, size, dir,
                                attrs | DMA_ATTR_SKIP_CPU_SYNC);
                return DMA_MAPPING_ERROR;
        }
@@ -445,7 +427,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 
        /* NOTE: We use dev_addr here, not paddr! */
        if (is_xen_swiotlb_buffer(hwdev, dev_addr))
-               swiotlb_tbl_unmap_single(hwdev, paddr, size, size, dir, attrs);
+               swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
 }
 
 static void
@@ -462,7 +444,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
        }
 
        if (is_xen_swiotlb_buffer(dev, dma_addr))
-               swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
+               swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
 }
 
 static void
@@ -472,7 +454,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
        phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr);
 
        if (is_xen_swiotlb_buffer(dev, dma_addr))
-               swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+               swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
        if (!dev_is_dma_coherent(dev)) {
                if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
@@ -560,7 +542,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
 static int
 xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-       return xen_virt_to_bus(hwdev, xen_io_tlb_end - 1) <= mask;
+       return xen_phys_to_dma(hwdev, io_tlb_default_mem->end - 1) <= mask;
 }
 
 const struct dma_map_ops xen_swiotlb_dma_ops = {
index 39def02..cdb9950 100644 (file)
@@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = {
        NULL,
 };
 
-static struct attribute_group v9fs_attr_group = {
+static const struct attribute_group v9fs_attr_group = {
        .attrs = v9fs_attrs,
 };
 
index 649f04f..59c32c9 100644 (file)
@@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                 * to work.
                 */
                writeback_fid = v9fs_writeback_fid(file_dentry(file));
-               if (IS_ERR(fid)) {
-                       err = PTR_ERR(fid);
+               if (IS_ERR(writeback_fid)) {
+                       err = PTR_ERR(writeback_fid);
                        mutex_unlock(&v9inode->v_mutex);
                        goto out_error;
                }
index 97e7b77..141a856 100644 (file)
@@ -223,10 +223,13 @@ config TMPFS_INODE64
 
          If unsure, say N.
 
+config ARCH_SUPPORTS_HUGETLBFS
+       def_bool n
+
 config HUGETLBFS
        bool "HugeTLB file system support"
        depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
-                  SYS_SUPPORTS_HUGETLBFS || BROKEN
+                  ARCH_SUPPORTS_HUGETLBFS || BROKEN
        help
          hugetlbfs is a filesystem backing for HugeTLB pages, based on
          ramfs. For architectures that support it, say Y here and read
@@ -335,8 +338,8 @@ config NFS_COMMON
        default y
 
 config NFS_V4_2_SSC_HELPER
-       tristate
-       default y if NFS_V4=y || NFS_FS=y
+       bool
+       default y if NFS_V4_2
 
 source "net/sunrpc/Kconfig"
 source "fs/ceph/Kconfig"
index c6f1c8c..06fb7a9 100644 (file)
@@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 config BINFMT_FLAT_OLD_ALWAYS_RAM
        bool
 
+config BINFMT_FLAT_NO_DATA_START_OFFSET
+       bool
+
 config BINFMT_FLAT_OLD
        bool "Enable support for very old legacy flat binaries"
        depends on BINFMT_FLAT
index 117df15..9fbe5a5 100644 (file)
@@ -1419,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 
        afs_op_set_vnode(op, 0, dvnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->file[0].update_ctime = true;
        op->dentry      = dentry;
        op->create.mode = S_IFDIR | mode;
@@ -1500,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 
        afs_op_set_vnode(op, 0, dvnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->file[0].update_ctime = true;
 
        op->dentry      = dentry;
@@ -1636,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 
        afs_op_set_vnode(op, 0, dvnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->file[0].update_ctime = true;
 
        /* Try to make sure we have a callback promise on the victim. */
@@ -1718,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
 
        afs_op_set_vnode(op, 0, dvnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->file[0].update_ctime = true;
 
        op->dentry      = dentry;
@@ -1792,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
        afs_op_set_vnode(op, 0, dvnode);
        afs_op_set_vnode(op, 1, vnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->file[0].update_ctime = true;
        op->file[1].update_ctime = true;
 
@@ -1987,6 +1992,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
        afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
        op->file[0].dv_delta = 1;
        op->file[1].dv_delta = 1;
+       op->file[0].modification = true;
+       op->file[1].modification = true;
        op->file[0].update_ctime = true;
        op->file[1].update_ctime = true;
 
index 04f75a4..dae9a57 100644 (file)
@@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
        afs_op_set_vnode(op, 1, dvnode);
        op->file[0].dv_delta = 1;
        op->file[1].dv_delta = 1;
+       op->file[0].modification = true;
+       op->file[1].modification = true;
        op->file[0].update_ctime = true;
        op->file[1].update_ctime = true;
 
@@ -201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
        afs_op_set_vnode(op, 0, dvnode);
        afs_op_set_vnode(op, 1, vnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->file[0].update_ctime = true;
        op->file[1].op_unlinked = true;
        op->file[1].update_ctime = true;
index 2cb0951..d222dfb 100644 (file)
@@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *
                vp->cb_break_before     = afs_calc_vnode_cb_break(vnode);
                if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
                        op->flags       |= AFS_OPERATION_CUR_ONLY;
+               if (vp->modification)
+                       set_bit(AFS_VNODE_MODIFYING, &vnode->flags);
        }
 
        if (vp->fid.vnode)
@@ -225,6 +227,10 @@ int afs_put_operation(struct afs_operation *op)
 
        if (op->ops && op->ops->put)
                op->ops->put(op);
+       if (op->file[0].modification)
+               clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags);
+       if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
+               clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
        if (op->file[0].put_vnode)
                iput(&op->file[0].vnode->vfs_inode);
        if (op->file[1].put_vnode)
index 3a129b9..80b6c8d 100644 (file)
@@ -294,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
                        op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
                }
        } else if (vp->scb.have_status) {
-               if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version &&
-                   vp->speculative)
+               if (vp->speculative &&
+                   (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) ||
+                    vp->dv_before != vnode->status.data_version))
                        /* Ignore the result of a speculative bulk status fetch
                         * if it splits around a modification op, thereby
                         * appearing to regress the data version.
@@ -911,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
        }
        op->ctime = attr->ia_ctime;
        op->file[0].update_ctime = 1;
+       op->file[0].modification = true;
 
        op->ops = &afs_setattr_operation;
        ret = afs_do_sync_operation(op);
index 52157a0..5ed416f 100644 (file)
@@ -645,6 +645,7 @@ struct afs_vnode {
 #define AFS_VNODE_PSEUDODIR    7               /* set if Vnode is a pseudo directory */
 #define AFS_VNODE_NEW_CONTENT  8               /* Set if file has new content (create/trunc-0) */
 #define AFS_VNODE_SILLY_DELETED        9               /* Set if file has been silly-deleted */
+#define AFS_VNODE_MODIFYING    10              /* Set if we're performing a modification op */
 
        struct list_head        wb_keys;        /* List of keys available for writeback */
        struct list_head        pending_locks;  /* locks waiting to be granted */
@@ -762,6 +763,7 @@ struct afs_vnode_param {
        bool                    set_size:1;     /* Must update i_size */
        bool                    op_unlinked:1;  /* True if file was unlinked by op */
        bool                    speculative:1;  /* T if speculative status fetch (no vnode lock) */
+       bool                    modification:1; /* Set if the content gets modified */
 };
 
 /*
index dc66ff1..3edb620 100644 (file)
@@ -377,6 +377,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
 
        afs_op_set_vnode(op, 0, vnode);
        op->file[0].dv_delta = 1;
+       op->file[0].modification = true;
        op->store.write_iter = iter;
        op->store.pos = pos;
        op->store.size = size;
index 054f97b..918826e 100644 (file)
@@ -87,6 +87,7 @@ struct autofs_wait_queue {
        autofs_wqt_t wait_queue_token;
        /* We use the following to see what we are waiting for */
        struct qstr name;
+       u32 offset;
        u32 dev;
        u64 ino;
        kuid_t uid;
index a1c7701..b3fefd6 100644 (file)
@@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry,
                return NULL;
        }
 
-       if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
+       if (d_is_symlink(dentry)) {
                pr_debug("checking symlink %p %pd\n", dentry, dentry);
 
                /* Forced expire, user space handles busy mounts */
index 5ced859..16b5fca 100644 (file)
@@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
        while (wq) {
                nwq = wq->next;
                wq->status = -ENOENT; /* Magic is gone - report failure */
-               kfree(wq->name.name);
+               kfree(wq->name.name - wq->offset);
                wq->name.name = NULL;
                wq->wait_ctr--;
                wake_up_interruptible(&wq->queue);
@@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi,
        fput(pipe);
 }
 
-static int autofs_getpath(struct autofs_sb_info *sbi,
-                         struct dentry *dentry, char *name)
-{
-       struct dentry *root = sbi->sb->s_root;
-       struct dentry *tmp;
-       char *buf;
-       char *p;
-       int len;
-       unsigned seq;
-
-rename_retry:
-       buf = name;
-       len = 0;
-
-       seq = read_seqbegin(&rename_lock);
-       rcu_read_lock();
-       spin_lock(&sbi->fs_lock);
-       for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
-               len += tmp->d_name.len + 1;
-
-       if (!len || --len > NAME_MAX) {
-               spin_unlock(&sbi->fs_lock);
-               rcu_read_unlock();
-               if (read_seqretry(&rename_lock, seq))
-                       goto rename_retry;
-               return 0;
-       }
-
-       *(buf + len) = '\0';
-       p = buf + len - dentry->d_name.len;
-       strncpy(p, dentry->d_name.name, dentry->d_name.len);
-
-       for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) {
-               *(--p) = '/';
-               p -= tmp->d_name.len;
-               strncpy(p, tmp->d_name.name, tmp->d_name.len);
-       }
-       spin_unlock(&sbi->fs_lock);
-       rcu_read_unlock();
-       if (read_seqretry(&rename_lock, seq))
-               goto rename_retry;
-
-       return len;
-}
-
 static struct autofs_wait_queue *
 autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
 {
@@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
        struct qstr qstr;
        char *name;
        int status, ret, type;
+       unsigned int offset = 0;
        pid_t pid;
        pid_t tgid;
 
@@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi,
                return -ENOMEM;
 
        /* If this is a direct mount request create a dummy name */
-       if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
+       if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) {
+               qstr.name = name;
                qstr.len = sprintf(name, "%p", dentry);
-       else {
-               qstr.len = autofs_getpath(sbi, dentry, name);
-               if (!qstr.len) {
+       } else {
+               char *p = dentry_path_raw(dentry, name, NAME_MAX);
+               if (IS_ERR(p)) {
                        kfree(name);
                        return -ENOENT;
                }
+               qstr.name = ++p; // skip the leading slash
+               qstr.len = strlen(p);
+               offset = p - name;
        }
-       qstr.name = name;
-       qstr.hash = full_name_hash(dentry, name, qstr.len);
+       qstr.hash = full_name_hash(dentry, qstr.name, qstr.len);
 
        if (mutex_lock_interruptible(&sbi->wq_mutex)) {
-               kfree(qstr.name);
+               kfree(name);
                return -EINTR;
        }
 
@@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
        if (ret <= 0) {
                if (ret != -EINTR)
                        mutex_unlock(&sbi->wq_mutex);
-               kfree(qstr.name);
+               kfree(name);
                return ret;
        }
 
@@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
                /* Create a new wait queue */
                wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
                if (!wq) {
-                       kfree(qstr.name);
+                       kfree(name);
                        mutex_unlock(&sbi->wq_mutex);
                        return -ENOMEM;
                }
@@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
                sbi->queues = wq;
                init_waitqueue_head(&wq->queue);
                memcpy(&wq->name, &qstr, sizeof(struct qstr));
+               wq->offset = offset;
                wq->dev = autofs_get_dev(sbi);
                wq->ino = autofs_get_ino(sbi);
                wq->uid = current_uid();
@@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
                         (unsigned long) wq->wait_queue_token, wq->name.len,
                         wq->name.name, notify);
                mutex_unlock(&sbi->wq_mutex);
-               kfree(qstr.name);
+               kfree(name);
        }
 
        /*
@@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi,
        }
 
        *wql = wq->next;        /* Unlink from chain */
-       kfree(wq->name.name);
+       kfree(wq->name.name - wq->offset);
        wq->name.name = NULL;   /* Do not wait on this queue */
        wq->status = status;
        wake_up(&wq->queue);
index b9c658e..a1072c6 100644 (file)
 #define        MAX_SHARED_LIBS                 (1)
 #endif
 
+#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
+#define DATA_START_OFFSET_WORDS                (0)
+#else
+#define DATA_START_OFFSET_WORDS                (MAX_SHARED_LIBS)
+#endif
+
 struct lib_info {
        struct {
                unsigned long start_code;               /* Start of text segment */
@@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm,
                        goto err;
                }
 
-               len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+               len = data_len + extra +
+                       DATA_START_OFFSET_WORDS * sizeof(unsigned long);
                len = PAGE_ALIGN(len);
                realdatastart = vm_mmap(NULL, 0, len,
                        PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm,
                        goto err;
                }
                datapos = ALIGN(realdatastart +
-                               MAX_SHARED_LIBS * sizeof(unsigned long),
+                               DATA_START_OFFSET_WORDS * sizeof(unsigned long),
                                FLAT_DATA_ALIGN);
 
                pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm,
                memp_size = len;
        } else {
 
-               len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
+               len = text_len + data_len + extra +
+                       DATA_START_OFFSET_WORDS * sizeof(u32);
                len = PAGE_ALIGN(len);
                textpos = vm_mmap(NULL, 0, len,
                        PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm,
 
                realdatastart = textpos + ntohl(hdr->data_start);
                datapos = ALIGN(realdatastart +
-                               MAX_SHARED_LIBS * sizeof(u32),
+                               DATA_START_OFFSET_WORDS * sizeof(u32),
                                FLAT_DATA_ALIGN);
 
                reloc = (__be32 __user *)
@@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm,
                        ret = result;
                        pr_err("Unable to read code+data+bss, errno %d\n", ret);
                        vm_munmap(textpos, text_len + data_len + extra +
-                               MAX_SHARED_LIBS * sizeof(u32));
+                                 DATA_START_OFFSET_WORDS * sizeof(u32));
                        goto err;
                }
        }
index a5244e0..b8abccd 100644 (file)
@@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+       if (mapping_empty(mapping))
                return;
 
        invalidate_bh_lrus();
@@ -1677,6 +1677,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
+       size_t shorted = 0;
        ssize_t ret;
 
        if (bdev_read_only(I_BDEV(bd_inode)))
@@ -1694,12 +1695,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
                return -EOPNOTSUPP;
 
-       iov_iter_truncate(from, size - iocb->ki_pos);
+       size -= iocb->ki_pos;
+       if (iov_iter_count(from) > size) {
+               shorted = iov_iter_count(from) - size;
+               iov_iter_truncate(from, size);
+       }
 
        blk_start_plug(&plug);
        ret = __generic_file_write_iter(iocb, from);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
+       iov_iter_reexpand(from, iov_iter_count(from) + shorted);
        blk_finish_plug(&plug);
        return ret;
 }
@@ -1711,13 +1717,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        loff_t pos = iocb->ki_pos;
+       size_t shorted = 0;
+       ssize_t ret;
 
        if (pos >= size)
                return 0;
 
        size -= pos;
-       iov_iter_truncate(to, size);
-       return generic_file_read_iter(iocb, to);
+       if (iov_iter_count(to) > size) {
+               shorted = iov_iter_count(to) - size;
+               iov_iter_truncate(to, size);
+       }
+
+       ret = generic_file_read_iter(iocb, to);
+       iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_read_iter);
 
index 17f93fd..2bea01d 100644 (file)
@@ -591,16 +591,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                free_extent_map(em);
 
                if (page->index == end_index) {
-                       char *userpage;
                        size_t zero_offset = offset_in_page(isize);
 
                        if (zero_offset) {
                                int zeros;
                                zeros = PAGE_SIZE - zero_offset;
-                               userpage = kmap_atomic(page);
-                               memset(userpage + zero_offset, 0, zeros);
+                               memzero_page(page, zero_offset, zeros);
                                flush_dcache_page(page);
-                               kunmap_atomic(userpage);
                        }
                }
 
index f2d1bb2..074a78a 100644 (file)
@@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        }
 
        if (page->index == last_byte >> PAGE_SHIFT) {
-               char *userpage;
                size_t zero_offset = offset_in_page(last_byte);
 
                if (zero_offset) {
                        iosize = PAGE_SIZE - zero_offset;
-                       userpage = kmap_atomic(page);
-                       memset(userpage + zero_offset, 0, iosize);
+                       memzero_page(page, zero_offset, iosize);
                        flush_dcache_page(page);
-                       kunmap_atomic(userpage);
                }
        }
        begin_page_read(fs_info, page);
@@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                u64 disk_bytenr;
 
                if (cur >= last_byte) {
-                       char *userpage;
                        struct extent_state *cached = NULL;
 
                        iosize = PAGE_SIZE - pg_offset;
-                       userpage = kmap_atomic(page);
-                       memset(userpage + pg_offset, 0, iosize);
+                       memzero_page(page, pg_offset, iosize);
                        flush_dcache_page(page);
-                       kunmap_atomic(userpage);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
                        unlock_extent_cached(tree, cur,
@@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 
                /* we've found a hole, just zero and go on */
                if (block_start == EXTENT_MAP_HOLE) {
-                       char *userpage;
                        struct extent_state *cached = NULL;
 
-                       userpage = kmap_atomic(page);
-                       memset(userpage + pg_offset, 0, iosize);
+                       memzero_page(page, pg_offset, iosize);
                        flush_dcache_page(page);
-                       kunmap_atomic(userpage);
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
@@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        }
 
        if (page->index == end_index) {
-               char *userpage;
-
-               userpage = kmap_atomic(page);
-               memset(userpage + pg_offset, 0,
-                      PAGE_SIZE - pg_offset);
-               kunmap_atomic(userpage);
+               memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
                flush_dcache_page(page);
        }
 
index b21d491..4af3360 100644 (file)
@@ -646,17 +646,12 @@ again:
                if (!ret) {
                        unsigned long offset = offset_in_page(total_compressed);
                        struct page *page = pages[nr_pages - 1];
-                       char *kaddr;
 
                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
-                       if (offset) {
-                               kaddr = kmap_atomic(page);
-                               memset(kaddr + offset, 0,
-                                      PAGE_SIZE - offset);
-                               kunmap_atomic(kaddr);
-                       }
+                       if (offset)
+                               memzero_page(page, offset, PAGE_SIZE - offset);
                        will_compress = 1;
                }
        }
@@ -4833,7 +4828,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
-       char *kaddr;
        bool only_release_metadata = false;
        u32 blocksize = fs_info->sectorsize;
        pgoff_t index = from >> PAGE_SHIFT;
@@ -4925,15 +4919,13 @@ again:
        if (offset != blocksize) {
                if (!len)
                        len = blocksize - offset;
-               kaddr = kmap(page);
                if (front)
-                       memset(kaddr + (block_start - page_offset(page)),
-                               0, offset);
+                       memzero_page(page, (block_start - page_offset(page)),
+                                    offset);
                else
-                       memset(kaddr + (block_start - page_offset(page)) +  offset,
-                               0, len);
+                       memzero_page(page, (block_start - page_offset(page)) + offset,
+                                    len);
                flush_dcache_page(page);
-               kunmap(page);
        }
        ClearPageChecked(page);
        set_page_dirty(page);
@@ -6832,11 +6824,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
         * cover that region here.
         */
 
-       if (max_size + pg_offset < PAGE_SIZE) {
-               char *map = kmap(page);
-               memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
-               kunmap(page);
-       }
+       if (max_size + pg_offset < PAGE_SIZE)
+               memzero_page(page,  pg_offset + max_size,
+                            PAGE_SIZE - max_size - pg_offset);
        kfree(tmp);
        return ret;
 }
@@ -8506,7 +8496,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
-       char *kaddr;
        unsigned long zero_start;
        loff_t size;
        vm_fault_t ret;
@@ -8620,10 +8609,8 @@ again:
                zero_start = PAGE_SIZE;
 
        if (zero_start != PAGE_SIZE) {
-               kaddr = kmap(page);
-               memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
+               memzero_page(page, zero_start, PAGE_SIZE - zero_start);
                flush_dcache_page(page);
-               kunmap(page);
        }
        ClearPageChecked(page);
        set_page_dirty(page);
index f4ec06b..3928ecc 100644 (file)
@@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
         * So what's in the range [500, 4095] corresponds to zeroes.
         */
        if (datal < block_size) {
-               char *map;
-
-               map = kmap(page);
-               memset(map + datal, 0, block_size - datal);
+               memzero_page(page, datal, block_size - datal);
                flush_dcache_page(page);
-               kunmap(page);
        }
 
        SetPageUptodate(page);
index d524acf..c3fa7d3 100644 (file)
@@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
        unsigned long bytes_left;
        unsigned long total_out = 0;
        unsigned long pg_offset = 0;
-       char *kaddr;
 
        destlen = min_t(unsigned long, destlen, PAGE_SIZE);
        bytes_left = destlen;
@@ -455,9 +454,7 @@ next:
         * end of the inline extent (destlen) to the end of the page
         */
        if (pg_offset < destlen) {
-               kaddr = kmap_atomic(dest_page);
-               memset(kaddr + pg_offset, 0, destlen - pg_offset);
-               kunmap_atomic(kaddr);
+               memzero_page(dest_page, pg_offset, destlen - pg_offset);
        }
        return ret;
 }
index 8e9626d..3e26b46 100644 (file)
@@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
        size_t ret2;
        unsigned long total_out = 0;
        unsigned long pg_offset = 0;
-       char *kaddr;
 
        stream = ZSTD_initDStream(
                        ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
        ret = 0;
 finish:
        if (pg_offset < destlen) {
-               kaddr = kmap_atomic(dest_page);
-               memset(kaddr + pg_offset, 0, destlen - pg_offset);
-               kunmap_atomic(kaddr);
+               memzero_page(dest_page, pg_offset, destlen - pg_offset);
        }
        return ret;
 }
index 0cb7ffd..ea48c01 100644 (file)
@@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
        pgoff_t index;
        int sizebits;
 
-       sizebits = -1;
-       do {
-               sizebits++;
-       } while ((size << sizebits) < PAGE_SIZE);
-
+       sizebits = PAGE_SHIFT - __ffs(size);
        index = block >> sizebits;
 
        /*
@@ -1264,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh)
        int i;
 
        check_irqs_on();
+       /*
+        * the refcount of buffer_head in bh_lru prevents dropping the
+        * attached page(i.e., try_to_free_buffers) so it could cause
+        * failing page migration.
+        * Skip putting upcoming bh into bh_lru until migration is done.
+        */
+       if (lru_cache_disabled())
+               return;
+
        bh_lru_lock();
 
        b = this_cpu_ptr(&bh_lrus);
@@ -1404,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block,
 }
 EXPORT_SYMBOL(__bread_gfp);
 
+static void __invalidate_bh_lrus(struct bh_lru *b)
+{
+       int i;
+
+       for (i = 0; i < BH_LRU_SIZE; i++) {
+               brelse(b->bhs[i]);
+               b->bhs[i] = NULL;
+       }
+}
 /*
  * invalidate_bh_lrus() is called rarely - but not only at unmount.
  * This doesn't race because it runs in each cpu either in irq
@@ -1412,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp);
 static void invalidate_bh_lru(void *arg)
 {
        struct bh_lru *b = &get_cpu_var(bh_lrus);
-       int i;
 
-       for (i = 0; i < BH_LRU_SIZE; i++) {
-               brelse(b->bhs[i]);
-               b->bhs[i] = NULL;
-       }
+       __invalidate_bh_lrus(b);
        put_cpu_var(bh_lrus);
 }
 
-static bool has_bh_in_lru(int cpu, void *dummy)
+bool has_bh_in_lru(int cpu, void *dummy)
 {
        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
        int i;
@@ -1440,6 +1450,16 @@ void invalidate_bh_lrus(void)
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
+void invalidate_bh_lrus_cpu(int cpu)
+{
+       struct bh_lru *b;
+
+       bh_lru_lock();
+       b = per_cpu_ptr(&bh_lrus, cpu);
+       __invalidate_bh_lrus(b);
+       bh_lru_unlock();
+}
+
 void set_bh_page(struct buffer_head *bh,
                struct page *page, unsigned long offset)
 {
index 471e401..94df854 100644 (file)
@@ -6,6 +6,7 @@ config CEPH_FS
        select LIBCRC32C
        select CRYPTO_AES
        select CRYPTO
+       select NETFS_SUPPORT
        default n
        help
          Choose Y or M here to include support for mounting the
index 26e6643..c1570fa 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/signal.h>
 #include <linux/iversion.h>
 #include <linux/ktime.h>
+#include <linux/netfs.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -61,6 +62,9 @@
        (CONGESTION_ON_THRESH(congestion_kb) -                          \
         (CONGESTION_ON_THRESH(congestion_kb) >> 2))
 
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+                                       struct page *page, void **_fsdata);
+
 static inline struct ceph_snap_context *page_snap_context(struct page *page)
 {
        if (PagePrivate(page))
@@ -124,8 +128,7 @@ static int ceph_set_page_dirty(struct page *page)
         * PagePrivate so that we get invalidatepage callback.
         */
        BUG_ON(PagePrivate(page));
-       page->private = (unsigned long)snapc;
-       SetPagePrivate(page);
+       attach_page_private(page, snapc);
 
        ret = __set_page_dirty_nobuffers(page);
        WARN_ON(!PageLocked(page));
@@ -144,19 +147,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
 {
        struct inode *inode;
        struct ceph_inode_info *ci;
-       struct ceph_snap_context *snapc = page_snap_context(page);
+       struct ceph_snap_context *snapc;
+
+       wait_on_page_fscache(page);
 
        inode = page->mapping->host;
        ci = ceph_inode(inode);
 
-       if (offset != 0 || length != PAGE_SIZE) {
+       if (offset != 0 || length != thp_size(page)) {
                dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
                     inode, page, page->index, offset, length);
                return;
        }
 
-       ceph_invalidate_fscache_page(inode, page);
-
        WARN_ON(!PageLocked(page));
        if (!PagePrivate(page))
                return;
@@ -164,333 +167,222 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
        dout("%p invalidatepage %p idx %lu full dirty page\n",
             inode, page, page->index);
 
+       snapc = detach_page_private(page);
        ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
        ceph_put_snap_context(snapc);
-       page->private = 0;
-       ClearPagePrivate(page);
 }
 
-static int ceph_releasepage(struct page *page, gfp_t g)
+static int ceph_releasepage(struct page *page, gfp_t gfp)
 {
        dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
             page, page->index, PageDirty(page) ? "" : "not ");
 
-       /* Can we release the page from the cache? */
-       if (!ceph_release_fscache_page(page, g))
-               return 0;
-
+       if (PageFsCache(page)) {
+               if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
+                       return 0;
+               wait_on_page_fscache(page);
+       }
        return !PagePrivate(page);
 }
 
-/* read a single page, without unlocking it. */
-static int ceph_do_readpage(struct file *filp, struct page *page)
+static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
 {
-       struct inode *inode = file_inode(filp);
+       struct inode *inode = rreq->mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_osd_client *osdc = &fsc->client->osdc;
-       struct ceph_osd_request *req;
-       struct ceph_vino vino = ceph_vino(inode);
-       int err = 0;
-       u64 off = page_offset(page);
-       u64 len = PAGE_SIZE;
-
-       if (off >= i_size_read(inode)) {
-               zero_user_segment(page, 0, PAGE_SIZE);
-               SetPageUptodate(page);
-               return 0;
-       }
-
-       if (ci->i_inline_version != CEPH_INLINE_NONE) {
-               /*
-                * Uptodate inline data should have been added
-                * into page cache while getting Fcr caps.
-                */
-               if (off == 0)
-                       return -EINVAL;
-               zero_user_segment(page, 0, PAGE_SIZE);
-               SetPageUptodate(page);
-               return 0;
-       }
-
-       err = ceph_readpage_from_fscache(inode, page);
-       if (err == 0)
-               return -EINPROGRESS;
-
-       dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
-            vino.ino, vino.snap, filp, off, len, page, page->index);
-       req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
-                                   CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
-                                   ci->i_truncate_seq, ci->i_truncate_size,
-                                   false);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
+       struct ceph_file_layout *lo = &ci->i_layout;
+       u32 blockoff;
+       u64 blockno;
 
-       osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
+       /* Expand the start downward */
+       blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+       rreq->start = blockno * lo->stripe_unit;
+       rreq->len += blockoff;
 
-       err = ceph_osdc_start_request(osdc, req, false);
-       if (!err)
-               err = ceph_osdc_wait_request(osdc, req);
-
-       ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
-                                req->r_end_latency, err);
-
-       ceph_osdc_put_request(req);
-       dout("readpage result %d\n", err);
-
-       if (err == -ENOENT)
-               err = 0;
-       if (err < 0) {
-               ceph_fscache_readpage_cancel(inode, page);
-               if (err == -EBLOCKLISTED)
-                       fsc->blocklisted = true;
-               goto out;
-       }
-       if (err < PAGE_SIZE)
-               /* zero fill remainder of page */
-               zero_user_segment(page, err, PAGE_SIZE);
-       else
-               flush_dcache_page(page);
-
-       SetPageUptodate(page);
-       ceph_readpage_to_fscache(inode, page);
-
-out:
-       return err < 0 ? err : 0;
+       /* Now, round up the length to the next block */
+       rreq->len = roundup(rreq->len, lo->stripe_unit);
 }
 
-static int ceph_readpage(struct file *filp, struct page *page)
+static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
 {
-       int r = ceph_do_readpage(filp, page);
-       if (r != -EINPROGRESS)
-               unlock_page(page);
-       else
-               r = 0;
-       return r;
+       struct inode *inode = subreq->rreq->mapping->host;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       u64 objno, objoff;
+       u32 xlen;
+
+       /* Truncate the extent at the end of the current block */
+       ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+                                     &objno, &objoff, &xlen);
+       subreq->len = min(xlen, fsc->mount_options->rsize);
+       return true;
 }
 
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req)
+static void finish_netfs_read(struct ceph_osd_request *req)
 {
-       struct inode *inode = req->r_inode;
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_osd_data *osd_data;
-       int rc = req->r_result <= 0 ? req->r_result : 0;
-       int bytes = req->r_result >= 0 ? req->r_result : 0;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+       struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+       struct netfs_read_subrequest *subreq = req->r_priv;
        int num_pages;
-       int i;
+       int err = req->r_result;
 
-       dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
-       if (rc == -EBLOCKLISTED)
-               ceph_inode_to_client(inode)->blocklisted = true;
+       ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
+                                req->r_end_latency, err);
 
-       /* unlock all pages, zeroing any data we didn't read */
-       osd_data = osd_req_op_extent_osd_data(req, 0);
-       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
-       num_pages = calc_pages_for((u64)osd_data->alignment,
-                                       (u64)osd_data->length);
-       for (i = 0; i < num_pages; i++) {
-               struct page *page = osd_data->pages[i];
-
-               if (rc < 0 && rc != -ENOENT) {
-                       ceph_fscache_readpage_cancel(inode, page);
-                       goto unlock;
-               }
-               if (bytes < (int)PAGE_SIZE) {
-                       /* zero (remainder of) page */
-                       int s = bytes < 0 ? 0 : bytes;
-                       zero_user_segment(page, s, PAGE_SIZE);
-               }
-               dout("finish_read %p uptodate %p idx %lu\n", inode, page,
-                    page->index);
-               flush_dcache_page(page);
-               SetPageUptodate(page);
-               ceph_readpage_to_fscache(inode, page);
-unlock:
-               unlock_page(page);
-               put_page(page);
-               bytes -= PAGE_SIZE;
-       }
+       dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
+            subreq->len, i_size_read(req->r_inode));
 
-       ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
-                                req->r_end_latency, rc);
+       /* no object means success but no data */
+       if (err == -ENOENT)
+               err = 0;
+       else if (err == -EBLOCKLISTED)
+               fsc->blocklisted = true;
+
+       if (err >= 0 && err < subreq->len)
+               __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+
+       netfs_subreq_terminated(subreq, err, true);
 
-       kfree(osd_data->pages);
+       num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
+       ceph_put_page_vector(osd_data->pages, num_pages, false);
+       iput(req->r_inode);
 }
 
-/*
- * start an async read(ahead) operation.  return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
-                     struct list_head *page_list, int max)
+static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
 {
-       struct ceph_osd_client *osdc =
-               &ceph_inode_to_client(inode)->client->osdc;
+       struct netfs_read_request *rreq = subreq->rreq;
+       struct inode *inode = rreq->mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct page *page = lru_to_page(page_list);
-       struct ceph_vino vino;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_osd_request *req;
-       u64 off;
-       u64 len;
-       int i;
+       struct ceph_vino vino = ceph_vino(inode);
+       struct iov_iter iter;
        struct page **pages;
-       pgoff_t next_index;
-       int nr_pages = 0;
-       int got = 0;
-       int ret = 0;
-
-       if (!rw_ctx) {
-               /* caller of readpages does not hold buffer and read caps
-                * (fadvise, madvise and readahead cases) */
-               int want = CEPH_CAP_FILE_CACHE;
-               ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
-                                       true, &got);
-               if (ret < 0) {
-                       dout("start_read %p, error getting cap\n", inode);
-               } else if (!(got & want)) {
-                       dout("start_read %p, no cache cap\n", inode);
-                       ret = 0;
-               }
-               if (ret <= 0) {
-                       if (got)
-                               ceph_put_cap_refs(ci, got);
-                       while (!list_empty(page_list)) {
-                               page = lru_to_page(page_list);
-                               list_del(&page->lru);
-                               put_page(page);
-                       }
-                       return ret;
-               }
-       }
-
-       off = (u64) page_offset(page);
+       size_t page_off;
+       int err = 0;
+       u64 len = subreq->len;
 
-       /* count pages */
-       next_index = page->index;
-       list_for_each_entry_reverse(page, page_list, lru) {
-               if (page->index != next_index)
-                       break;
-               nr_pages++;
-               next_index++;
-               if (max && nr_pages == max)
-                       break;
-       }
-       len = nr_pages << PAGE_SHIFT;
-       dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
-            off, len);
-       vino = ceph_vino(inode);
-       req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
-                                   0, 1, CEPH_OSD_OP_READ,
-                                   CEPH_OSD_FLAG_READ, NULL,
-                                   ci->i_truncate_seq, ci->i_truncate_size,
-                                   false);
+       req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
+                       0, 1, CEPH_OSD_OP_READ,
+                       CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
+                       NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
        if (IS_ERR(req)) {
-               ret = PTR_ERR(req);
+               err = PTR_ERR(req);
+               req = NULL;
                goto out;
        }
 
-       /* build page vector */
-       nr_pages = calc_pages_for(0, len);
-       pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               goto out_put;
-       }
-       for (i = 0; i < nr_pages; ++i) {
-               page = list_entry(page_list->prev, struct page, lru);
-               BUG_ON(PageLocked(page));
-               list_del(&page->lru);
-
-               dout("start_read %p adding %p idx %lu\n", inode, page,
-                    page->index);
-               if (add_to_page_cache_lru(page, &inode->i_data, page->index,
-                                         GFP_KERNEL)) {
-                       ceph_fscache_uncache_page(inode, page);
-                       put_page(page);
-                       dout("start_read %p add_to_page_cache failed %p\n",
-                            inode, page);
-                       nr_pages = i;
-                       if (nr_pages > 0) {
-                               len = nr_pages << PAGE_SHIFT;
-                               osd_req_op_extent_update(req, 0, len);
-                               break;
-                       }
-                       goto out_pages;
-               }
-               pages[i] = page;
+       dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+       iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+       err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
+       if (err < 0) {
+               dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
+               goto out;
        }
+
+       /* should always give us a page-aligned read */
+       WARN_ON_ONCE(page_off);
+       len = err;
+
        osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
-       req->r_callback = finish_read;
+       req->r_callback = finish_netfs_read;
+       req->r_priv = subreq;
        req->r_inode = inode;
+       ihold(inode);
 
-       dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
-       ret = ceph_osdc_start_request(osdc, req, false);
-       if (ret < 0)
-               goto out_pages;
+       err = ceph_osdc_start_request(req->r_osdc, req, false);
+       if (err)
+               iput(inode);
+out:
        ceph_osdc_put_request(req);
+       if (err)
+               netfs_subreq_terminated(subreq, err, false);
+       dout("%s: result %d\n", __func__, err);
+}
 
-       /* After adding locked pages to page cache, the inode holds cache cap.
-        * So we can drop our cap refs. */
-       if (got)
-               ceph_put_cap_refs(ci, got);
+static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file)
+{
+}
 
-       return nr_pages;
+static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+{
+       struct inode *inode = mapping->host;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int got = (uintptr_t)priv;
 
-out_pages:
-       for (i = 0; i < nr_pages; ++i) {
-               ceph_fscache_readpage_cancel(inode, pages[i]);
-               unlock_page(pages[i]);
-       }
-       ceph_put_page_vector(pages, nr_pages, false);
-out_put:
-       ceph_osdc_put_request(req);
-out:
        if (got)
                ceph_put_cap_refs(ci, got);
-       return ret;
 }
 
+const struct netfs_read_request_ops ceph_netfs_read_ops = {
+       .init_rreq              = ceph_init_rreq,
+       .is_cache_enabled       = ceph_is_cache_enabled,
+       .begin_cache_operation  = ceph_begin_cache_operation,
+       .issue_op               = ceph_netfs_issue_op,
+       .expand_readahead       = ceph_netfs_expand_readahead,
+       .clamp_length           = ceph_netfs_clamp_length,
+       .check_write_begin      = ceph_netfs_check_write_begin,
+       .cleanup                = ceph_readahead_cleanup,
+};
 
-/*
- * Read multiple pages.  Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
-                         struct list_head *page_list, unsigned nr_pages)
+/* Read a single page: inline-data cases unlock it here, all others go to netfs_readpage(). */
+static int ceph_readpage(struct file *file, struct page *page)
 {
        struct inode *inode = file_inode(file);
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_file_info *fi = file->private_data;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_vino vino = ceph_vino(inode);
+       u64 off = page_offset(page);
+       u64 len = thp_size(page);
+
+       if (ci->i_inline_version != CEPH_INLINE_NONE) {
+               /*
+                * Uptodate inline data should have been added
+                * into page cache while getting Fcr caps.
+                */
+               if (off == 0) {
+                       unlock_page(page);
+                       return -EINVAL;
+               }
+               zero_user_segment(page, 0, thp_size(page));
+               SetPageUptodate(page);
+               unlock_page(page);
+               return 0;
+       }
+
+       dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
+            vino.ino, vino.snap, file, off, len, page, page->index);
+
+       return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL);
+}
+
+static void ceph_readahead(struct readahead_control *ractl)
+{
+       struct inode *inode = file_inode(ractl->file);
+       struct ceph_file_info *fi = ractl->file->private_data;
        struct ceph_rw_context *rw_ctx;
-       int rc = 0;
-       int max = 0;
+       int got = 0;
+       int ret = 0;
 
        if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
-               return -EINVAL;
+               return;
 
-       rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
-                                        &nr_pages);
+       rw_ctx = ceph_find_rw_context(fi);
+       if (!rw_ctx) {
+               /*
+                * readahead callers do not necessarily hold Fcb caps
+                * (e.g. fadvise, madvise).
+                */
+               int want = CEPH_CAP_FILE_CACHE;
 
-       if (rc == 0)
-               goto out;
+               ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+               if (ret < 0)
+                       dout("start_read %p, error getting cap\n", inode);
+               else if (!(got & want))
+                       dout("start_read %p, no cache cap\n", inode);
 
-       rw_ctx = ceph_find_rw_context(fi);
-       max = fsc->mount_options->rsize >> PAGE_SHIFT;
-       dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
-            inode, file, rw_ctx, nr_pages, max);
-       while (!list_empty(page_list)) {
-               rc = start_read(inode, rw_ctx, page_list, max);
-               if (rc < 0)
-                       goto out;
+               if (ret <= 0)
+                       return;
        }
-out:
-       ceph_fscache_readpages_cancel(inode, page_list);
-
-       dout("readpages %p file %p ret %d\n", inode, file, rc);
-       return rc;
+       netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
 }
 
 struct ceph_writeback_ctl
@@ -585,8 +477,8 @@ static u64 get_writepages_data_length(struct inode *inode,
                spin_unlock(&ci->i_ceph_lock);
                WARN_ON(!found);
        }
-       if (end > page_offset(page) + PAGE_SIZE)
-               end = page_offset(page) + PAGE_SIZE;
+       if (end > page_offset(page) + thp_size(page))
+               end = page_offset(page) + thp_size(page);
        return end > start ? end - start : 0;
 }
 
@@ -604,7 +496,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        struct ceph_snap_context *snapc, *oldest;
        loff_t page_off = page_offset(page);
        int err;
-       loff_t len = PAGE_SIZE;
+       loff_t len = thp_size(page);
        struct ceph_writeback_ctl ceph_wbc;
        struct ceph_osd_client *osdc = &fsc->client->osdc;
        struct ceph_osd_request *req;
@@ -632,7 +524,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        /* is this a partial page at end of file? */
        if (page_off >= ceph_wbc.i_size) {
                dout("%p page eof %llu\n", page, ceph_wbc.i_size);
-               page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
+               page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
                return 0;
        }
 
@@ -658,7 +550,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        }
 
        /* it may be a short write due to an object boundary */
-       WARN_ON_ONCE(len > PAGE_SIZE);
+       WARN_ON_ONCE(len > thp_size(page));
        osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
        dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
 
@@ -667,7 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        if (!err)
                err = ceph_osdc_wait_request(osdc, req);
 
-       ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+       ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                  req->r_end_latency, err);
 
        ceph_osdc_put_request(req);
@@ -695,8 +587,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                dout("writepage cleaned page %p\n", page);
                err = 0;  /* vfs expects us to return 0 */
        }
-       page->private = 0;
-       ClearPagePrivate(page);
+       oldest = detach_page_private(page);
+       WARN_ON_ONCE(oldest != snapc);
        end_page_writeback(page);
        ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
        ceph_put_snap_context(snapc);  /* page's reference */
@@ -755,7 +647,7 @@ static void writepages_finish(struct ceph_osd_request *req)
                ceph_clear_error_write(ci);
        }
 
-       ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+       ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                  req->r_end_latency, rc);
 
        /*
@@ -788,11 +680,9 @@ static void writepages_finish(struct ceph_osd_request *req)
                                clear_bdi_congested(inode_to_bdi(inode),
                                                    BLK_RW_ASYNC);
 
-                       ceph_put_snap_context(page_snap_context(page));
-                       page->private = 0;
-                       ClearPagePrivate(page);
-                       dout("unlocking %p\n", page);
+                       ceph_put_snap_context(detach_page_private(page));
                        end_page_writeback(page);
+                       dout("unlocking %p\n", page);
 
                        if (remove_page)
                                generic_error_remove_page(inode->i_mapping,
@@ -949,7 +839,7 @@ get_more_pages:
                                    page_offset(page) >= i_size_read(inode)) &&
                                    clear_page_dirty_for_io(page))
                                        mapping->a_ops->invalidatepage(page,
-                                                               0, PAGE_SIZE);
+                                                               0, thp_size(page));
                                unlock_page(page);
                                continue;
                        }
@@ -1038,7 +928,7 @@ get_more_pages:
                        pages[locked_pages++] = page;
                        pvec.pages[i] = NULL;
 
-                       len += PAGE_SIZE;
+                       len += thp_size(page);
                }
 
                /* did we get anything? */
@@ -1087,7 +977,7 @@ new_request:
                        BUG_ON(IS_ERR(req));
                }
                BUG_ON(len < page_offset(pages[locked_pages - 1]) +
-                            PAGE_SIZE - offset);
+                            thp_size(page) - offset);
 
                req->r_callback = writepages_finish;
                req->r_inode = inode;
@@ -1117,7 +1007,7 @@ new_request:
                        }
 
                        set_page_writeback(pages[i]);
-                       len += PAGE_SIZE;
+                       len += thp_size(page);
                }
 
                if (ceph_wbc.size_stable) {
@@ -1126,7 +1016,7 @@ new_request:
                        /* writepages_finish() clears writeback pages
                         * according to the data length, so make sure
                         * data length covers all locked pages */
-                       u64 min_len = len + 1 - PAGE_SIZE;
+                       u64 min_len = len + 1 - thp_size(page);
                        len = get_writepages_data_length(inode, pages[i - 1],
                                                         offset);
                        len = max(len, min_len);
@@ -1302,6 +1192,31 @@ ceph_find_incompatible(struct page *page)
        return NULL;
 }
 
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+                                       struct page *page, void **_fsdata)
+{
+       struct inode *inode = file_inode(file);
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_snap_context *snapc;
+
+       snapc = ceph_find_incompatible(page);
+       if (snapc) {
+               int r;
+
+               unlock_page(page);
+               put_page(page);
+               if (IS_ERR(snapc))
+                       return PTR_ERR(snapc);
+
+               ceph_queue_writeback(inode);
+               r = wait_event_killable(ci->i_cap_wq,
+                                       context_is_writeable_or_written(inode, snapc));
+               ceph_put_snap_context(snapc);
+               return r == 0 ? -EAGAIN : r;
+       }
+       return 0;
+}
+
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
@@ -1312,75 +1227,47 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 {
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_snap_context *snapc;
        struct page *page = NULL;
        pgoff_t index = pos >> PAGE_SHIFT;
-       int pos_in_page = pos & ~PAGE_MASK;
-       int r = 0;
+       int r;
 
-       dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
-
-       for (;;) {
+       /*
+        * Uninlining should have already been done and everything updated, EXCEPT
+        * for inline_version sent to the MDS.
+        */
+       if (ci->i_inline_version != CEPH_INLINE_NONE) {
                page = grab_cache_page_write_begin(mapping, index, flags);
-               if (!page) {
-                       r = -ENOMEM;
-                       break;
-               }
-
-               snapc = ceph_find_incompatible(page);
-               if (snapc) {
-                       if (IS_ERR(snapc)) {
-                               r = PTR_ERR(snapc);
-                               break;
-                       }
-                       unlock_page(page);
-                       put_page(page);
-                       page = NULL;
-                       ceph_queue_writeback(inode);
-                       r = wait_event_killable(ci->i_cap_wq,
-                                               context_is_writeable_or_written(inode, snapc));
-                       ceph_put_snap_context(snapc);
-                       if (r != 0)
-                               break;
-                       continue;
-               }
-
-               if (PageUptodate(page)) {
-                       dout(" page %p already uptodate\n", page);
-                       break;
-               }
+               if (!page)
+                       return -ENOMEM;
 
                /*
-                * In some cases we don't need to read at all:
-                * - full page write
-                * - write that lies completely beyond EOF
-                * - write that covers the the page from start to EOF or beyond it
+                * The inline_version on a new inode is set to 1. If that's the
+                * case, then the page is brand new and isn't yet Uptodate.
                 */
-               if ((pos_in_page == 0 && len == PAGE_SIZE) ||
-                   (pos >= i_size_read(inode)) ||
-                   (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
-                       zero_user_segments(page, 0, pos_in_page,
-                                          pos_in_page + len, PAGE_SIZE);
-                       break;
+               r = 0;
+               if (index == 0 && ci->i_inline_version != 1) {
+                       if (!PageUptodate(page)) {
+                               WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
+                                         ci->i_inline_version);
+                               r = -EINVAL;
+                       }
+                       goto out;
                }
-
-               /*
-                * We need to read it. If we get back -EINPROGRESS, then the page was
-                * handed off to fscache and it will be unlocked when the read completes.
-                * Refind the page in that case so we can reacquire the page lock. Otherwise
-                * we got a hard error or the read was completed synchronously.
-                */
-               r = ceph_do_readpage(file, page);
-               if (r != -EINPROGRESS)
-                       break;
+               zero_user_segment(page, 0, thp_size(page));
+               SetPageUptodate(page);
+               goto out;
        }
 
+       r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL,
+                             &ceph_netfs_read_ops, NULL);
+out:
+       if (r == 0)
+               wait_on_page_fscache(page);
        if (r < 0) {
-               if (page) {
-                       unlock_page(page);
+               if (page)
                        put_page(page);
-               }
        } else {
+               WARN_ON_ONCE(!PageLocked(page));
                *pagep = page;
        }
        return r;
@@ -1438,7 +1325,7 @@ static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
 
 const struct address_space_operations ceph_aops = {
        .readpage = ceph_readpage,
-       .readpages = ceph_readpages,
+       .readahead = ceph_readahead,
        .writepage = ceph_writepage,
        .writepages = ceph_writepages_start,
        .write_begin = ceph_write_begin,
@@ -1470,7 +1357,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
        struct inode *inode = file_inode(vma->vm_file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_file_info *fi = vma->vm_file->private_data;
-       struct page *pinned_page = NULL;
        loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
        int want, got, err;
        sigset_t oldset;
@@ -1478,21 +1364,20 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
 
        ceph_block_sigs(&oldset);
 
-       dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
-            inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
+       dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
+            inode, ceph_vinop(inode), off);
        if (fi->fmode & CEPH_FILE_MODE_LAZY)
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
 
        got = 0;
-       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
-                           &got, &pinned_page);
+       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
        if (err < 0)
                goto out_restore;
 
-       dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
-            inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
+       dout("filemap_fault %p %llu got cap refs on %s\n",
+            inode, off, ceph_cap_string(got));
 
        if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
            ci->i_inline_version == CEPH_INLINE_NONE) {
@@ -1500,14 +1385,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
                ceph_add_rw_context(fi, &rw_ctx);
                ret = filemap_fault(vmf);
                ceph_del_rw_context(fi, &rw_ctx);
-               dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n",
-                       inode, off, (size_t)PAGE_SIZE,
-                               ceph_cap_string(got), ret);
+               dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
+                    inode, off, ceph_cap_string(got), ret);
        } else
                err = -EAGAIN;
 
-       if (pinned_page)
-               put_page(pinned_page);
        ceph_put_cap_refs(ci, got);
 
        if (err != -EAGAIN)
@@ -1542,8 +1424,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
                vmf->page = page;
                ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
 out_inline:
-               dout("filemap_fault %p %llu~%zd read inline data ret %x\n",
-                    inode, off, (size_t)PAGE_SIZE, ret);
+               dout("filemap_fault %p %llu read inline data ret %x\n",
+                    inode, off, ret);
        }
 out_restore:
        ceph_restore_sigs(&oldset);
@@ -1553,9 +1435,6 @@ out_restore:
        return ret;
 }
 
-/*
- * Reuse write_begin here for simplicity.
- */
 static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
@@ -1591,10 +1470,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
                        goto out_free;
        }
 
-       if (off + PAGE_SIZE <= size)
-               len = PAGE_SIZE;
+       if (off + thp_size(page) <= size)
+               len = thp_size(page);
        else
-               len = size & ~PAGE_MASK;
+               len = offset_in_thp(page, size);
 
        dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
             inode, ceph_vinop(inode), off, len, size);
@@ -1604,8 +1483,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
                want = CEPH_CAP_FILE_BUFFER;
 
        got = 0;
-       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
-                           &got, NULL);
+       err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
        if (err < 0)
                goto out_free;
 
@@ -1832,7 +1710,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
        if (!err)
                err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
-       ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+       ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                  req->r_end_latency, err);
 
 out_put:
@@ -2057,6 +1935,10 @@ int ceph_pool_perm_check(struct inode *inode, int need)
        s64 pool;
        int ret, flags;
 
+       /* Only need to do this for regular files */
+       if (!S_ISREG(inode->i_mode))
+               return 0;
+
        if (ci->i_vino.snap != CEPH_NOSNAP) {
                /*
                 * Pool permission check needs to write to the first object.
index 2f5cb6b..9cfadbb 100644 (file)
@@ -173,7 +173,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
 
        ci->fscache = NULL;
 
-       fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
        fscache_relinquish_cookie(cookie, &ci->i_vino, false);
 }
 
@@ -194,7 +193,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
                dout("fscache_file_set_cookie %p %p disabling cache\n",
                     inode, filp);
                fscache_disable_cookie(ci->fscache, &ci->i_vino, false);
-               fscache_uncache_all_inode_pages(ci->fscache, inode);
        } else {
                fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode),
                                      ceph_fscache_can_enable, inode);
@@ -205,108 +203,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
        }
 }
 
-static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error)
-{
-       if (!error)
-               SetPageUptodate(page);
-
-       unlock_page(page);
-}
-
-static inline bool cache_valid(struct ceph_inode_info *ci)
-{
-       return ci->i_fscache_gen == ci->i_rdcache_gen;
-}
-
-
-/* Atempt to read from the fscache,
- *
- * This function is called from the readpage_nounlock context. DO NOT attempt to
- * unlock the page here (or in the callback).
- */
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-
-       if (!cache_valid(ci))
-               return -ENOBUFS;
-
-       ret = fscache_read_or_alloc_page(ci->fscache, page,
-                                        ceph_readpage_from_fscache_complete, NULL,
-                                        GFP_KERNEL);
-
-       switch (ret) {
-               case 0: /* Page found */
-                       dout("page read submitted\n");
-                       return 0;
-               case -ENOBUFS: /* Pages were not found, and can't be */
-               case -ENODATA: /* Pages were not found */
-                       dout("page/inode not in cache\n");
-                       return ret;
-               default:
-                       dout("%s: unknown error ret = %i\n", __func__, ret);
-                       return ret;
-       }
-}
-
-int ceph_readpages_from_fscache(struct inode *inode,
-                                 struct address_space *mapping,
-                                 struct list_head *pages,
-                                 unsigned *nr_pages)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-
-       if (!cache_valid(ci))
-               return -ENOBUFS;
-
-       ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
-                                         ceph_readpage_from_fscache_complete,
-                                         NULL, mapping_gfp_mask(mapping));
-
-       switch (ret) {
-               case 0: /* All pages found */
-                       dout("all-page read submitted\n");
-                       return 0;
-               case -ENOBUFS: /* Some pages were not found, and can't be */
-               case -ENODATA: /* some pages were not found */
-                       dout("page/inode not in cache\n");
-                       return ret;
-               default:
-                       dout("%s: unknown error ret = %i\n", __func__, ret);
-                       return ret;
-       }
-}
-
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-
-       if (!PageFsCache(page))
-               return;
-
-       if (!cache_valid(ci))
-               return;
-
-       ret = fscache_write_page(ci->fscache, page, i_size_read(inode),
-                                GFP_KERNEL);
-       if (ret)
-                fscache_uncache_page(ci->fscache, page);
-}
-
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-
-       if (!PageFsCache(page))
-               return;
-
-       fscache_wait_on_page_write(ci->fscache, page);
-       fscache_uncache_page(ci->fscache, page);
-}
-
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
 {
        if (fscache_cookie_valid(fsc->fscache)) {
@@ -329,24 +225,3 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
        }
        fsc->fscache = NULL;
 }
-
-/*
- * caller should hold CEPH_CAP_FILE_{RD,CACHE}
- */
-void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
-{
-       if (cache_valid(ci))
-               return;
-
-       /* resue i_truncate_mutex. There should be no pending
-        * truncate while the caller holds CEPH_CAP_FILE_RD */
-       mutex_lock(&ci->i_truncate_mutex);
-       if (!cache_valid(ci)) {
-               if (fscache_check_consistency(ci->fscache, &ci->i_vino))
-                       fscache_invalidate(ci->fscache);
-               spin_lock(&ci->i_ceph_lock);
-               ci->i_fscache_gen = ci->i_rdcache_gen;
-               spin_unlock(&ci->i_ceph_lock);
-       }
-       mutex_unlock(&ci->i_truncate_mutex);
-}
index 89dbdd1..1409d61 100644 (file)
@@ -9,6 +9,8 @@
 #ifndef _CEPH_CACHE_H
 #define _CEPH_CACHE_H
 
+#include <linux/netfs.h>
+
 #ifdef CONFIG_CEPH_FSCACHE
 
 extern struct fscache_netfs ceph_cache_netfs;
@@ -29,54 +31,37 @@ int ceph_readpages_from_fscache(struct inode *inode,
                                struct address_space *mapping,
                                struct list_head *pages,
                                unsigned *nr_pages);
-void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
-void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
 
 static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
        ci->fscache = NULL;
-       ci->i_fscache_gen = 0;
 }
 
-static inline void ceph_fscache_invalidate(struct inode *inode)
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
 {
-       fscache_invalidate(ceph_inode(inode)->fscache);
+       return ci->fscache;
 }
 
-static inline void ceph_fscache_uncache_page(struct inode *inode,
-                                            struct page *page)
+static inline void ceph_fscache_invalidate(struct inode *inode)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return fscache_uncache_page(ci->fscache, page);
+       fscache_invalidate(ceph_inode(inode)->fscache);
 }
 
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
-       struct inode* inode = page->mapping->host;
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return fscache_maybe_release_page(ci->fscache, page, gfp);
-}
+       struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode));
 
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
-                                               struct page *page)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
-               __fscache_uncache_page(ci->fscache, page);
+       if (!cookie)
+               return false;
+       return fscache_cookie_enabled(cookie);
 }
 
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
-                                                struct list_head *pages)
+static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return fscache_readpages_cancel(ci->fscache, pages);
-}
+       struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
 
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
-{
-       ci->i_fscache_gen = ci->i_rdcache_gen - 1;
+       return fscache_begin_read_operation(rreq, cookie);
 }
-
 #else
 
 static inline int ceph_fscache_register(void)
@@ -102,6 +87,11 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
 {
 }
 
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+       return NULL;
+}
+
 static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
 {
 }
@@ -115,62 +105,19 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode,
 {
 }
 
-static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci)
-{
-}
-
-static inline void ceph_fscache_uncache_page(struct inode *inode,
-                                            struct page *pages)
-{
-}
-
-static inline int ceph_readpage_from_fscache(struct inode* inode,
-                                            struct page *page)
-{
-       return -ENOBUFS;
-}
-
-static inline int ceph_readpages_from_fscache(struct inode *inode,
-                                             struct address_space *mapping,
-                                             struct list_head *pages,
-                                             unsigned *nr_pages)
-{
-       return -ENOBUFS;
-}
-
-static inline void ceph_readpage_to_fscache(struct inode *inode,
-                                           struct page *page)
-{
-}
-
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
 
-static inline void ceph_invalidate_fscache_page(struct inode *inode,
-                                               struct page *page)
+static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
+       return false;
 }
 
-static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
-{
-       return 1;
-}
-
-static inline void ceph_fscache_readpage_cancel(struct inode *inode,
-                                               struct page *page)
-{
-}
-
-static inline void ceph_fscache_readpages_cancel(struct inode *inode,
-                                                struct list_head *pages)
-{
-}
-
-static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci)
+static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
 {
+       return -ENOBUFS;
 }
-
 #endif
 
-#endif
+#endif /* _CEPH_CACHE_H */
index 3c03fa3..a5e93b1 100644 (file)
@@ -1390,7 +1390,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
        arg->flush_tid = flush_tid;
        arg->oldest_flush_tid = oldest_flush_tid;
 
-       arg->size = inode->i_size;
+       arg->size = i_size_read(inode);
        ci->i_reported_size = arg->size;
        arg->max_size = ci->i_wanted_max_size;
        if (cap == ci->i_auth_cap) {
@@ -1867,6 +1867,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
        u32 invalidating_gen = ci->i_rdcache_gen;
 
        spin_unlock(&ci->i_ceph_lock);
+       ceph_fscache_invalidate(inode);
        invalidate_mapping_pages(&inode->i_data, 0, -1);
        spin_lock(&ci->i_ceph_lock);
 
@@ -1884,7 +1885,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
 
 bool __ceph_should_report_size(struct ceph_inode_info *ci)
 {
-       loff_t size = ci->vfs_inode.i_size;
+       loff_t size = i_size_read(&ci->vfs_inode);
        /* mds will adjust max size according to the reported size */
        if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
                return false;
@@ -2730,10 +2731,6 @@ again:
                                *got = need | want;
                        else
                                *got = need;
-                       if (S_ISREG(inode->i_mode) &&
-                           (need & CEPH_CAP_FILE_RD) &&
-                           !(*got & CEPH_CAP_FILE_CACHE))
-                               ceph_disable_fscache_readpage(ci);
                        ceph_take_cap_refs(ci, *got, true);
                        ret = 1;
                }
@@ -2858,8 +2855,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
  * due to a small max_size, make sure we check_max_size (and possibly
  * ask the mds) so we don't get hung up indefinitely.
  */
-int ceph_get_caps(struct file *filp, int need, int want,
-                 loff_t endoff, int *got, struct page **pinned_page)
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
 {
        struct ceph_file_info *fi = filp->private_data;
        struct inode *inode = file_inode(filp);
@@ -2957,11 +2953,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
                        struct page *page =
                                find_get_page(inode->i_mapping, 0);
                        if (page) {
-                               if (PageUptodate(page)) {
-                                       *pinned_page = page;
-                                       break;
-                               }
+                               bool uptodate = PageUptodate(page);
+
                                put_page(page);
+                               if (uptodate)
+                                       break;
                        }
                        /*
                         * drop cap refs first because getattr while
@@ -2983,11 +2979,6 @@ int ceph_get_caps(struct file *filp, int need, int want,
                }
                break;
        }
-
-       if (S_ISREG(ci->vfs_inode.i_mode) &&
-           (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
-               ceph_fscache_revalidate_cookie(ci);
-
        *got = _got;
        return 0;
 }
@@ -3308,7 +3299,7 @@ static void handle_cap_grant(struct inode *inode,
        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
             inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
-               inode->i_size);
+               i_size_read(inode));
 
 
        /*
index 66989c8..425f335 100644 (file)
@@ -162,34 +162,34 @@ static int metric_show(struct seq_file *s, void *p)
        seq_printf(s, "item          total       avg_lat(us)     min_lat(us)     max_lat(us)     stdev(us)\n");
        seq_printf(s, "-----------------------------------------------------------------------------------\n");
 
-       spin_lock(&m->read_latency_lock);
+       spin_lock(&m->read_metric_lock);
        total = m->total_reads;
        sum = m->read_latency_sum;
        avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
        min = m->read_latency_min;
        max = m->read_latency_max;
        sq = m->read_latency_sq_sum;
-       spin_unlock(&m->read_latency_lock);
+       spin_unlock(&m->read_metric_lock);
        CEPH_METRIC_SHOW("read", total, avg, min, max, sq);
 
-       spin_lock(&m->write_latency_lock);
+       spin_lock(&m->write_metric_lock);
        total = m->total_writes;
        sum = m->write_latency_sum;
        avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
        min = m->write_latency_min;
        max = m->write_latency_max;
        sq = m->write_latency_sq_sum;
-       spin_unlock(&m->write_latency_lock);
+       spin_unlock(&m->write_metric_lock);
        CEPH_METRIC_SHOW("write", total, avg, min, max, sq);
 
-       spin_lock(&m->metadata_latency_lock);
+       spin_lock(&m->metadata_metric_lock);
        total = m->total_metadatas;
        sum = m->metadata_latency_sum;
        avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
        min = m->metadata_latency_min;
        max = m->metadata_latency_max;
        sq = m->metadata_latency_sq_sum;
-       spin_unlock(&m->metadata_latency_lock);
+       spin_unlock(&m->metadata_metric_lock);
        CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq);
 
        seq_printf(s, "\n");
index f7a790e..5624fae 100644 (file)
@@ -631,10 +631,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        switch (whence) {
        case SEEK_CUR:
                offset += file->f_pos;
+               break;
        case SEEK_SET:
                break;
        case SEEK_END:
                retval = -EOPNOTSUPP;
+               goto out;
        default:
                goto out;
        }
@@ -665,8 +667,8 @@ out:
 /*
  * Handle lookups for the hidden .snap directory.
  */
-int ceph_handle_snapdir(struct ceph_mds_request *req,
-                       struct dentry *dentry, int err)
+struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+                                  struct dentry *dentry, int err)
 {
        struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
        struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
@@ -674,18 +676,17 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
        /* .snap dir? */
        if (err == -ENOENT &&
            ceph_snap(parent) == CEPH_NOSNAP &&
-           strcmp(dentry->d_name.name,
-                  fsc->mount_options->snapdir_name) == 0) {
+           strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
+               struct dentry *res;
                struct inode *inode = ceph_get_snapdir(parent);
-               if (IS_ERR(inode))
-                       return PTR_ERR(inode);
-               dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
-                    dentry, dentry, inode);
-               BUG_ON(!d_unhashed(dentry));
-               d_add(dentry, inode);
-               err = 0;
+
+               res = d_splice_alias(inode, dentry);
+               dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
+                    dentry, dentry, inode, res);
+               if (res)
+                       dentry = res;
        }
-       return err;
+       return dentry;
 }
 
 /*
@@ -741,6 +742,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
        struct ceph_mds_request *req;
+       struct dentry *res;
        int op;
        int mask;
        int err;
@@ -791,7 +793,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
        req->r_parent = dir;
        set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc, NULL, req);
-       err = ceph_handle_snapdir(req, dentry, err);
+       res = ceph_handle_snapdir(req, dentry, err);
+       if (IS_ERR(res)) {
+               err = PTR_ERR(res);
+       } else {
+               dentry = res;
+               err = 0;
+       }
        dentry = ceph_finish_lookup(req, dentry, err);
        ceph_mdsc_put_request(req);  /* will dput(dentry) */
        dout("lookup result=%p\n", dentry);
index f22156e..65540a4 100644 (file)
@@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
 
        vino.ino = ino;
        vino.snap = CEPH_NOSNAP;
+
+       if (ceph_vino_is_reserved(vino))
+               return ERR_PTR(-ESTALE);
+
        inode = ceph_find_inode(sb, vino);
        if (!inode) {
                struct ceph_mds_request *req;
@@ -178,8 +182,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
                return ERR_CAST(inode);
        /* We need LINK caps to reliably check i_nlink */
        err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
-       if (err)
+       if (err) {
+               iput(inode);
                return ERR_PTR(err);
+       }
        /* -ESTALE if inode as been unlinked and no file is open */
        if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
                iput(inode);
@@ -212,6 +218,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
                vino.ino = sfh->ino;
                vino.snap = sfh->snapid;
        }
+
+       if (ceph_vino_is_reserved(vino))
+               return ERR_PTR(-ESTALE);
+
        inode = ceph_find_inode(sb, vino);
        if (inode)
                return d_obtain_alias(inode);
index 209535d..77fc037 100644 (file)
@@ -739,9 +739,12 @@ retry:
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
-       err = ceph_handle_snapdir(req, dentry, err);
-       if (err)
+       dentry = ceph_handle_snapdir(req, dentry, err);
+       if (IS_ERR(dentry)) {
+               err = PTR_ERR(dentry);
                goto out_req;
+       }
+       err = 0;
 
        if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
@@ -892,7 +895,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
                if (!ret)
                        ret = ceph_osdc_wait_request(osdc, req);
 
-               ceph_update_read_latency(&fsc->mdsc->metric,
+               ceph_update_read_metrics(&fsc->mdsc->metric,
                                         req->r_start_latency,
                                         req->r_end_latency,
                                         ret);
@@ -1034,16 +1037,6 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
        dout("ceph_aio_complete_req %p rc %d bytes %u\n",
             inode, rc, osd_data->bvec_pos.iter.bi_size);
 
-       /* r_start_latency == 0 means the request was not submitted */
-       if (req->r_start_latency) {
-               if (aio_req->write)
-                       ceph_update_write_latency(metric, req->r_start_latency,
-                                                 req->r_end_latency, rc);
-               else
-                       ceph_update_read_latency(metric, req->r_start_latency,
-                                                req->r_end_latency, rc);
-       }
-
        if (rc == -EOLDSNAPC) {
                struct ceph_aio_work *aio_work;
                BUG_ON(!aio_req->write);
@@ -1086,6 +1079,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
                }
        }
 
+       /* r_start_latency == 0 means the request was not submitted */
+       if (req->r_start_latency) {
+               if (aio_req->write)
+                       ceph_update_write_metrics(metric, req->r_start_latency,
+                                                 req->r_end_latency, rc);
+               else
+                       ceph_update_read_metrics(metric, req->r_start_latency,
+                                                req->r_end_latency, rc);
+       }
+
        put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
                  aio_req->should_dirty);
        ceph_osdc_put_request(req);
@@ -1290,10 +1293,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
                if (write)
-                       ceph_update_write_latency(metric, req->r_start_latency,
+                       ceph_update_write_metrics(metric, req->r_start_latency,
                                                  req->r_end_latency, ret);
                else
-                       ceph_update_read_latency(metric, req->r_start_latency,
+                       ceph_update_read_metrics(metric, req->r_start_latency,
                                                 req->r_end_latency, ret);
 
                size = i_size_read(inode);
@@ -1467,7 +1470,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                if (!ret)
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
-               ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
+               ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
                                          req->r_end_latency, ret);
 out:
                ceph_osdc_put_request(req);
@@ -1510,7 +1513,6 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
        size_t len = iov_iter_count(to);
        struct inode *inode = file_inode(filp);
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct page *pinned_page = NULL;
        bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
        ssize_t ret;
        int want, got = 0;
@@ -1529,8 +1531,7 @@ again:
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
-       ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
-                           &got, &pinned_page);
+       ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
        if (ret < 0) {
                if (iocb->ki_flags & IOCB_DIRECT)
                        ceph_end_io_direct(inode);
@@ -1571,10 +1572,6 @@ again:
 
        dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
             inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
-       if (pinned_page) {
-               put_page(pinned_page);
-               pinned_page = NULL;
-       }
        ceph_put_cap_refs(ci, got);
 
        if (direct_lock)
@@ -1753,8 +1750,7 @@ retry_snap:
        else
                want = CEPH_CAP_FILE_BUFFER;
        got = 0;
-       err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
-                           &got, NULL);
+       err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
        if (err < 0)
                goto out;
 
@@ -2083,7 +2079,7 @@ static long ceph_fallocate(struct file *file, int mode,
        else
                want = CEPH_CAP_FILE_BUFFER;
 
-       ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+       ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
        if (ret < 0)
                goto unlock;
 
@@ -2121,7 +2117,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got,
 
 retry_caps:
        ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
-                           dst_endoff, dst_got, NULL);
+                           dst_endoff, dst_got);
        if (ret < 0)
                return ret;
 
@@ -2143,7 +2139,7 @@ retry_caps:
                        return ret;
                }
                ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
-                                   CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
+                                   CEPH_CAP_FILE_SHARED, -1, src_got);
                if (ret < 0)
                        return ret;
                /*... drop src_ci caps too, and retry */
index 689e3ff..e1c63ad 100644 (file)
@@ -56,6 +56,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
        struct inode *inode;
 
+       if (ceph_vino_is_reserved(vino))
+               return ERR_PTR(-EREMOTEIO);
+
        inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
                             ceph_set_ino_cb, &vino);
        if (!inode)
@@ -99,14 +102,15 @@ struct inode *ceph_get_snapdir(struct inode *parent)
        inode->i_mtime = parent->i_mtime;
        inode->i_ctime = parent->i_ctime;
        inode->i_atime = parent->i_atime;
-       inode->i_op = &ceph_snapdir_iops;
-       inode->i_fop = &ceph_snapdir_fops;
-       ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
        ci->i_rbytes = 0;
        ci->i_btime = ceph_inode(parent)->i_btime;
 
-       if (inode->i_state & I_NEW)
+       if (inode->i_state & I_NEW) {
+               inode->i_op = &ceph_snapdir_iops;
+               inode->i_fop = &ceph_snapdir_fops;
+               ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
                unlock_new_inode(inode);
+       }
 
        return inode;
 }
@@ -628,10 +632,11 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int queue_trunc = 0;
+       loff_t isize = i_size_read(inode);
 
        if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
-           (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
-               dout("size %lld -> %llu\n", inode->i_size, size);
+           (truncate_seq == ci->i_truncate_seq && size > isize)) {
+               dout("size %lld -> %llu\n", isize, size);
                if (size > 0 && S_ISDIR(inode->i_mode)) {
                        pr_err("fill_file_size non-zero size for directory\n");
                        size = 0;
@@ -925,6 +930,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
                        ci->i_rfiles = le64_to_cpu(info->rfiles);
                        ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
                        ci->i_dir_pin = iinfo->dir_pin;
+                       ci->i_rsnaps = iinfo->rsnaps;
                        ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
                }
        }
@@ -1818,7 +1824,7 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
        bool ret;
 
        spin_lock(&ci->i_ceph_lock);
-       dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+       dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
        i_size_write(inode, size);
        inode->i_blocks = calc_inode_blocks(size);
 
@@ -1894,6 +1900,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
        orig_gen = ci->i_rdcache_gen;
        spin_unlock(&ci->i_ceph_lock);
 
+       ceph_fscache_invalidate(inode);
        if (invalidate_inode_pages2(inode->i_mapping) < 0) {
                pr_err("invalidate_pages %p fails\n", inode);
        }
@@ -2124,20 +2131,19 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                }
        }
        if (ia_valid & ATTR_SIZE) {
-               dout("setattr %p size %lld -> %lld\n", inode,
-                    inode->i_size, attr->ia_size);
-               if ((issued & CEPH_CAP_FILE_EXCL) &&
-                   attr->ia_size > inode->i_size) {
+               loff_t isize = i_size_read(inode);
+
+               dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+               if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) {
                        i_size_write(inode, attr->ia_size);
                        inode->i_blocks = calc_inode_blocks(attr->ia_size);
                        ci->i_reported_size = attr->ia_size;
                        dirtied |= CEPH_CAP_FILE_EXCL;
                        ia_valid |= ATTR_MTIME;
                } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-                          attr->ia_size != inode->i_size) {
+                          attr->ia_size != isize) {
                        req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-                       req->r_args.setattr.old_size =
-                               cpu_to_le64(inode->i_size);
+                       req->r_args.setattr.old_size = cpu_to_le64(isize);
                        mask |= CEPH_SETATTR_SIZE;
                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
@@ -2247,7 +2253,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                return err;
 
        if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size > max(inode->i_size, fsc->max_file_size))
+           attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
                return -EFBIG;
 
        if ((attr->ia_valid & ATTR_SIZE) &&
index 97602ea..c456509 100644 (file)
@@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
 }
 
 /**
- * ceph_end_io_direct - declare the file is being used for direct i/o
+ * ceph_start_io_direct - declare the file is being used for direct i/o
  * @inode: file inode
  *
  * Declare that a direct I/O operation is about to start, and ensure
index d87bd85..e5af591 100644 (file)
@@ -176,6 +176,13 @@ static int parse_reply_info_in(void **p, void *end,
                        memset(&info->snap_btime, 0, sizeof(info->snap_btime));
                }
 
+               /* snapshot count, remains zero for v<=3 */
+               if (struct_v >= 4) {
+                       ceph_decode_64_safe(p, end, info->rsnaps, bad);
+               } else {
+                       info->rsnaps = 0;
+               }
+
                *p = end;
        } else {
                if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
@@ -214,7 +221,7 @@ static int parse_reply_info_in(void **p, void *end,
                }
 
                info->dir_pin = -ENODATA;
-               /* info->snap_btime remains zero */
+               /* info->snap_btime and info->rsnaps remain zero */
        }
        return 0;
 bad:
@@ -433,6 +440,13 @@ static int ceph_parse_deleg_inos(void **p, void *end,
 
                ceph_decode_64_safe(p, end, start, bad);
                ceph_decode_64_safe(p, end, len, bad);
+
+               /* Don't accept a delegation of system inodes */
+               if (start < CEPH_INO_SYSTEM_BASE) {
+                       pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+                                       start, len);
+                       continue;
+               }
                while (len--) {
                        int err = xa_insert(&s->s_delegated_inos, ino = start++,
                                            DELEGATED_INO_AVAILABLE,
@@ -3306,7 +3320,7 @@ out_err:
        /* kick calling process */
        complete_request(mdsc, req);
 
-       ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency,
+       ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
                                     req->r_end_latency, err);
 out:
        ceph_mdsc_put_request(req);
@@ -3780,7 +3794,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
                rec.v1.cap_id = cpu_to_le64(cap->cap_id);
                rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
                rec.v1.issued = cpu_to_le32(cap->issued);
-               rec.v1.size = cpu_to_le64(inode->i_size);
+               rec.v1.size = cpu_to_le64(i_size_read(inode));
                ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
                ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
                rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
index eaa7c54..15c11a0 100644 (file)
@@ -88,6 +88,7 @@ struct ceph_mds_reply_info_in {
        s32 dir_pin;
        struct ceph_timespec btime;
        struct ceph_timespec snap_btime;
+       u64 rsnaps;
        u64 change_attr;
 };
 
index 5ec94bd..28b6b42 100644 (file)
@@ -17,6 +17,9 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
        struct ceph_metric_write_latency *write;
        struct ceph_metric_metadata_latency *meta;
        struct ceph_metric_dlease *dlease;
+       struct ceph_opened_files *files;
+       struct ceph_pinned_icaps *icaps;
+       struct ceph_opened_inodes *inodes;
        struct ceph_client_metric *m = &mdsc->metric;
        u64 nr_caps = atomic64_read(&m->total_caps);
        struct ceph_msg *msg;
@@ -26,7 +29,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
        s32 len;
 
        len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
-             + sizeof(*meta) + sizeof(*dlease);
+             + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
+             + sizeof(*icaps) + sizeof(*inodes);
 
        msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
        if (!msg) {
@@ -95,6 +99,38 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
        dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
        items++;
 
+       sum = percpu_counter_sum(&m->total_inodes);
+
+       /* encode the opened files metric */
+       files = (struct ceph_opened_files *)(dlease + 1);
+       files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
+       files->ver = 1;
+       files->compat = 1;
+       files->data_len = cpu_to_le32(sizeof(*files) - 10);
+       files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
+       files->total = cpu_to_le64(sum);
+       items++;
+
+       /* encode the pinned icaps metric */
+       icaps = (struct ceph_pinned_icaps *)(files + 1);
+       icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
+       icaps->ver = 1;
+       icaps->compat = 1;
+       icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10);
+       icaps->pinned_icaps = cpu_to_le64(nr_caps);
+       icaps->total = cpu_to_le64(sum);
+       items++;
+
+       /* encode the opened inodes metric */
+       inodes = (struct ceph_opened_inodes *)(icaps + 1);
+       inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
+       inodes->ver = 1;
+       inodes->compat = 1;
+       inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10);
+       inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
+       inodes->total = cpu_to_le64(sum);
+       items++;
+
        put_unaligned_le32(items, &head->num);
        msg->front.iov_len = len;
        msg->hdr.version = cpu_to_le16(1);
@@ -183,21 +219,21 @@ int ceph_metric_init(struct ceph_client_metric *m)
        if (ret)
                goto err_i_caps_mis;
 
-       spin_lock_init(&m->read_latency_lock);
+       spin_lock_init(&m->read_metric_lock);
        m->read_latency_sq_sum = 0;
        m->read_latency_min = KTIME_MAX;
        m->read_latency_max = 0;
        m->total_reads = 0;
        m->read_latency_sum = 0;
 
-       spin_lock_init(&m->write_latency_lock);
+       spin_lock_init(&m->write_metric_lock);
        m->write_latency_sq_sum = 0;
        m->write_latency_min = KTIME_MAX;
        m->write_latency_max = 0;
        m->total_writes = 0;
        m->write_latency_sum = 0;
 
-       spin_lock_init(&m->metadata_latency_lock);
+       spin_lock_init(&m->metadata_metric_lock);
        m->metadata_latency_sq_sum = 0;
        m->metadata_latency_min = KTIME_MAX;
        m->metadata_latency_max = 0;
@@ -274,7 +310,7 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump,
        *sq_sump += sq;
 }
 
-void ceph_update_read_latency(struct ceph_client_metric *m,
+void ceph_update_read_metrics(struct ceph_client_metric *m,
                              ktime_t r_start, ktime_t r_end,
                              int rc)
 {
@@ -283,14 +319,14 @@ void ceph_update_read_latency(struct ceph_client_metric *m,
        if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
                return;
 
-       spin_lock(&m->read_latency_lock);
+       spin_lock(&m->read_metric_lock);
        __update_latency(&m->total_reads, &m->read_latency_sum,
                         &m->read_latency_min, &m->read_latency_max,
                         &m->read_latency_sq_sum, lat);
-       spin_unlock(&m->read_latency_lock);
+       spin_unlock(&m->read_metric_lock);
 }
 
-void ceph_update_write_latency(struct ceph_client_metric *m,
+void ceph_update_write_metrics(struct ceph_client_metric *m,
                               ktime_t r_start, ktime_t r_end,
                               int rc)
 {
@@ -299,14 +335,14 @@ void ceph_update_write_latency(struct ceph_client_metric *m,
        if (unlikely(rc && rc != -ETIMEDOUT))
                return;
 
-       spin_lock(&m->write_latency_lock);
+       spin_lock(&m->write_metric_lock);
        __update_latency(&m->total_writes, &m->write_latency_sum,
                         &m->write_latency_min, &m->write_latency_max,
                         &m->write_latency_sq_sum, lat);
-       spin_unlock(&m->write_latency_lock);
+       spin_unlock(&m->write_metric_lock);
 }
 
-void ceph_update_metadata_latency(struct ceph_client_metric *m,
+void ceph_update_metadata_metrics(struct ceph_client_metric *m,
                                  ktime_t r_start, ktime_t r_end,
                                  int rc)
 {
@@ -315,9 +351,9 @@ void ceph_update_metadata_latency(struct ceph_client_metric *m,
        if (unlikely(rc && rc != -ENOENT))
                return;
 
-       spin_lock(&m->metadata_latency_lock);
+       spin_lock(&m->metadata_metric_lock);
        __update_latency(&m->total_metadatas, &m->metadata_latency_sum,
                         &m->metadata_latency_min, &m->metadata_latency_max,
                         &m->metadata_latency_sq_sum, lat);
-       spin_unlock(&m->metadata_latency_lock);
+       spin_unlock(&m->metadata_metric_lock);
 }
index af6038f..e984eb2 100644 (file)
@@ -14,8 +14,11 @@ enum ceph_metric_type {
        CLIENT_METRIC_TYPE_WRITE_LATENCY,
        CLIENT_METRIC_TYPE_METADATA_LATENCY,
        CLIENT_METRIC_TYPE_DENTRY_LEASE,
+       CLIENT_METRIC_TYPE_OPENED_FILES,
+       CLIENT_METRIC_TYPE_PINNED_ICAPS,
+       CLIENT_METRIC_TYPE_OPENED_INODES,
 
-       CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+       CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES,
 };
 
 /*
@@ -28,6 +31,9 @@ enum ceph_metric_type {
        CLIENT_METRIC_TYPE_WRITE_LATENCY,       \
        CLIENT_METRIC_TYPE_METADATA_LATENCY,    \
        CLIENT_METRIC_TYPE_DENTRY_LEASE,        \
+       CLIENT_METRIC_TYPE_OPENED_FILES,        \
+       CLIENT_METRIC_TYPE_PINNED_ICAPS,        \
+       CLIENT_METRIC_TYPE_OPENED_INODES,       \
                                                \
        CLIENT_METRIC_TYPE_MAX,                 \
 }
@@ -94,6 +100,42 @@ struct ceph_metric_dlease {
        __le64 total;
 } __packed;
 
+/* metric opened files header; filled in by ceph_mdsc_send_metrics() */
+struct ceph_opened_files {
+       __le32 type;     /* ceph metric type (CLIENT_METRIC_TYPE_OPENED_FILES) */
+
+       __u8  ver;       /* the sender sets this to 1 */
+       __u8  compat;    /* the sender sets this to 1 */
+
+       __le32 data_len; /* payload length: sizeof(opened_files) + sizeof(total) */
+       __le64 opened_files; /* read from the client metric's opened_files counter */
+       __le64 total;        /* percpu sum of the client metric's total_inodes */
+} __packed;
+
+/* metric pinned i_caps header; filled in by ceph_mdsc_send_metrics() */
+struct ceph_pinned_icaps {
+       __le32 type;     /* ceph metric type (CLIENT_METRIC_TYPE_PINNED_ICAPS) */
+
+       __u8  ver;       /* the sender sets this to 1 */
+       __u8  compat;    /* the sender sets this to 1 */
+
+       __le32 data_len; /* payload length: sizeof(pinned_icaps) + sizeof(total) */
+       __le64 pinned_icaps; /* read from the client metric's total_caps counter */
+       __le64 total;        /* percpu sum of the client metric's total_inodes */
+} __packed;
+
+/* metric opened inodes header; filled in by ceph_mdsc_send_metrics() */
+struct ceph_opened_inodes {
+       __le32 type;     /* ceph metric type (CLIENT_METRIC_TYPE_OPENED_INODES) */
+
+       __u8  ver;       /* the sender sets this to 1 */
+       __u8  compat;    /* the sender sets this to 1 */
+
+       __le32 data_len; /* payload length: sizeof(opened_inodes) + sizeof(total) */
+       __le64 opened_inodes; /* percpu sum of the client metric's opened_inodes */
+       __le64 total;         /* percpu sum of the client metric's total_inodes */
+} __packed;
+
 struct ceph_metric_head {
        __le32 num;     /* the number of metrics that will be sent */
 } __packed;
@@ -108,21 +150,21 @@ struct ceph_client_metric {
        struct percpu_counter i_caps_hit;
        struct percpu_counter i_caps_mis;
 
-       spinlock_t read_latency_lock;
+       spinlock_t read_metric_lock;
        u64 total_reads;
        ktime_t read_latency_sum;
        ktime_t read_latency_sq_sum;
        ktime_t read_latency_min;
        ktime_t read_latency_max;
 
-       spinlock_t write_latency_lock;
+       spinlock_t write_metric_lock;
        u64 total_writes;
        ktime_t write_latency_sum;
        ktime_t write_latency_sq_sum;
        ktime_t write_latency_min;
        ktime_t write_latency_max;
 
-       spinlock_t metadata_latency_lock;
+       spinlock_t metadata_metric_lock;
        u64 total_metadatas;
        ktime_t metadata_latency_sum;
        ktime_t metadata_latency_sq_sum;
@@ -162,13 +204,13 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
        percpu_counter_inc(&m->i_caps_mis);
 }
 
-extern void ceph_update_read_latency(struct ceph_client_metric *m,
+extern void ceph_update_read_metrics(struct ceph_client_metric *m,
                                     ktime_t r_start, ktime_t r_end,
                                     int rc);
-extern void ceph_update_write_latency(struct ceph_client_metric *m,
+extern void ceph_update_write_metrics(struct ceph_client_metric *m,
                                      ktime_t r_start, ktime_t r_end,
                                      int rc);
-extern void ceph_update_metadata_latency(struct ceph_client_metric *m,
+extern void ceph_update_metadata_metrics(struct ceph_client_metric *m,
                                         ktime_t r_start, ktime_t r_end,
                                         int rc);
 #endif /* _FS_CEPH_MDS_METRIC_H */
index 0728b01..4ce1805 100644 (file)
@@ -605,7 +605,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
        struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 
        BUG_ON(capsnap->writing);
-       capsnap->size = inode->i_size;
+       capsnap->size = i_size_read(inode);
        capsnap->mtime = inode->i_mtime;
        capsnap->atime = inode->i_atime;
        capsnap->ctime = inode->i_ctime;
index c48bb30..db80d89 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/ceph/libceph.h>
 
 #ifdef CONFIG_CEPH_FSCACHE
+#define FSCACHE_USE_NEW_IO_API
 #include <linux/fscache.h>
 #endif
 
@@ -333,7 +334,7 @@ struct ceph_inode_info {
 
        /* for dirs */
        struct timespec64 i_rctime;
-       u64 i_rbytes, i_rfiles, i_rsubdirs;
+       u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps;
        u64 i_files, i_subdirs;
 
        /* quotas */
@@ -427,7 +428,6 @@ struct ceph_inode_info {
 
 #ifdef CONFIG_CEPH_FSCACHE
        struct fscache_cookie *fscache;
-       u32 i_fscache_gen;
 #endif
        errseq_t i_meta_err;
 
@@ -529,10 +529,34 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
                ci->i_vino.snap == pvino->snap;
 }
 
+/*
+ * The MDS reserves a set of inodes for its own usage. These should never
+ * be accessible by clients, and so the MDS has no reason to ever hand these
+ * out. The range is [CEPH_MDS_INO_MDSDIR_OFFSET, CEPH_INO_SYSTEM_BASE).
+ *
+ * These come from src/mds/mdstypes.h in the ceph sources.
+ */
+#define CEPH_MAX_MDS           0x100
+#define CEPH_NUM_STRAY         10
+#define CEPH_MDS_INO_MDSDIR_OFFSET     (1 * CEPH_MAX_MDS) /* 0x100: lowest reserved ino */
+#define CEPH_INO_SYSTEM_BASE           ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) /* 0x1000: first ino past the reserved range */
+
+static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
+{ /* returns true, after a rate-limited WARN, when vino.ino is MDS-reserved */
+       if (vino.ino < CEPH_INO_SYSTEM_BASE &&
+           vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { /* half-open: OFFSET <= ino < BASE */
+               WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino);
+               return true; /* ceph_get_inode() maps this to -EREMOTEIO, ceph_find_inode() to NULL */
+       }
+       return false; /* only vino.ino is examined; vino.snap plays no part */
+}
 
 static inline struct inode *ceph_find_inode(struct super_block *sb,
                                            struct ceph_vino vino)
 {
+       if (ceph_vino_is_reserved(vino))
+               return NULL;
+
        /*
         * NB: The hashval will be run through the fs/inode.c hash function
         * anyway, so there is no need to squash the inode number down to
@@ -1156,7 +1180,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
                                      int mds, int drop, int unless);
 
 extern int ceph_get_caps(struct file *filp, int need, int want,
-                        loff_t endoff, int *got, struct page **pinned_page);
+                        loff_t endoff, int *got);
 extern int ceph_try_get_caps(struct inode *inode,
                             int need, int want, bool nonblock, int *got);
 
@@ -1193,7 +1217,7 @@ extern const struct dentry_operations ceph_dentry_ops;
 
 extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern int ceph_handle_snapdir(struct ceph_mds_request *req,
+extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
                               struct dentry *dentry, int err);
 extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                         struct dentry *dentry, int err);
index 02f59bc..1242db8 100644 (file)
@@ -233,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
        return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
 }
 
+static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val,
+                                         size_t size)
+{ /* vxattr callback for the "rsnaps" entry in ceph_dir_vxattrs[] */
+       return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps); /* i_rsnaps is copied from the MDS reply in ceph_fill_inode() */
+}
+
 static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
                                        size_t size)
 {
@@ -384,6 +390,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
        XATTR_RSTAT_FIELD(dir, rentries),
        XATTR_RSTAT_FIELD(dir, rfiles),
        XATTR_RSTAT_FIELD(dir, rsubdirs),
+       XATTR_RSTAT_FIELD(dir, rsnaps),
        XATTR_RSTAT_FIELD(dir, rbytes),
        XATTR_RSTAT_FIELD(dir, rctime),
        {
index 2a5325a..9c45b3a 100644 (file)
@@ -55,6 +55,7 @@
 #define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */
 #define CIFS_MOUNT_RO_CACHE    0x20000000  /* assumes share will not change */
 #define CIFS_MOUNT_RW_CACHE    0x40000000  /* assumes only client accessing */
+#define CIFS_MOUNT_SHUTDOWN    0x80000000
 
 struct cifs_sb_info {
        struct rb_root tlink_tree;
index 153d5c8..4a97fe1 100644 (file)
@@ -57,6 +57,12 @@ struct smb_query_info {
        /* char buffer[]; */
 } __packed;
 
+/*
+ * Dumping the commonly used 16 byte (e.g. CCM and GCM128) keys is still supported
+ * for backlevel compatibility, but is not sufficient for dumping the less
+ * frequently used GCM256 (32 byte) keys (see the newer "CIFS_DUMP_FULL_KEY"
+ * ioctl for dumping decryption info for GCM256 mounts)
+ */
 struct smb3_key_debug_info {
        __u64   Suid;
        __u16   cipher_type;
@@ -65,6 +71,18 @@ struct smb3_key_debug_info {
        __u8    smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
 } __packed;
 
+/*
+ * Argument of the CIFS_DUMP_FULL_KEY ioctl: dumping the full keys (32 byte
+ * encrypt/decrypt keys instead of 16) is needed if GCM256 is negotiated
+ */
+struct smb3_full_key_debug_info {
+       __u64   Suid;
+       __u16   cipher_type;
+       __u8    auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
+       __u8    smb3encryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */
+       __u8    smb3decryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */
+} __packed;
+
 struct smb3_notify {
        __u32   completion_filter;
        bool    watch_tree;
@@ -78,3 +96,20 @@ struct smb3_notify {
 #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
 #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
 #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
+#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
+#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32)
+
+/*
+ * Flags for going down operation
+ */
+#define CIFS_GOING_FLAGS_DEFAULT                0x0     /* going down */
+#define CIFS_GOING_FLAGS_LOGFLUSH               0x1     /* flush log but not data */
+#define CIFS_GOING_FLAGS_NOLOGFLUSH             0x2     /* don't flush log nor data */
+
+static inline bool cifs_forced_shutdown(struct cifs_sb_info *sbi)
+{ /* reports whether the mount has been flagged as shut down */
+       if (CIFS_MOUNT_SHUTDOWN & sbi->mnt_cifs_flags) /* bit 0x80000000 of the mount flags */
+               return true; /* e.g. cifs_atomic_open() then fails with -EIO */
+       else
+               return false;
+}
index 5f2c139..d7ea9c5 100644 (file)
@@ -75,7 +75,7 @@ bool enable_oplocks = true;
 bool linuxExtEnabled = true;
 bool lookupCacheEnabled = true;
 bool disable_legacy_dialects; /* false by default */
-bool enable_gcm_256;  /* false by default, change when more servers support it */
+bool enable_gcm_256 = true;
 bool require_gcm_256; /* false by default */
 unsigned int global_secflags = CIFSSEC_DEF;
 /* unsigned int ntlmv2_support = 0; */
@@ -133,6 +133,7 @@ struct workqueue_struct     *cifsiod_wq;
 struct workqueue_struct        *decrypt_wq;
 struct workqueue_struct        *fileinfo_put_wq;
 struct workqueue_struct        *cifsoplockd_wq;
+struct workqueue_struct *deferredclose_wq;
 __u32 cifs_lock_secret;
 
 /*
@@ -390,6 +391,8 @@ cifs_alloc_inode(struct super_block *sb)
        /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
        INIT_LIST_HEAD(&cifs_inode->openFileList);
        INIT_LIST_HEAD(&cifs_inode->llist);
+       INIT_LIST_HEAD(&cifs_inode->deferred_closes);
+       spin_lock_init(&cifs_inode->deferred_lock);
        return &cifs_inode->vfs_inode;
 }
 
@@ -860,13 +863,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
                goto out;
        }
 
-       /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */
-       kfree(cifs_sb->ctx->UNC);
-       cifs_sb->ctx->UNC = NULL;
-       kfree(cifs_sb->ctx->prepath);
-       cifs_sb->ctx->prepath = NULL;
-
-       rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC);
+       rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL);
        if (rc) {
                root = ERR_PTR(rc);
                goto out;
@@ -1637,9 +1634,16 @@ init_cifs(void)
                goto out_destroy_fileinfo_put_wq;
        }
 
+       deferredclose_wq = alloc_workqueue("deferredclose",
+                                          WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+       if (!deferredclose_wq) {
+               rc = -ENOMEM;
+               goto out_destroy_cifsoplockd_wq;
+       }
+
        rc = cifs_fscache_register();
        if (rc)
-               goto out_destroy_cifsoplockd_wq;
+               goto out_destroy_deferredclose_wq;
 
        rc = cifs_init_inodecache();
        if (rc)
@@ -1707,6 +1711,8 @@ out_destroy_inodecache:
        cifs_destroy_inodecache();
 out_unreg_fscache:
        cifs_fscache_unregister();
+out_destroy_deferredclose_wq:
+       destroy_workqueue(deferredclose_wq);
 out_destroy_cifsoplockd_wq:
        destroy_workqueue(cifsoplockd_wq);
 out_destroy_fileinfo_put_wq:
@@ -1741,6 +1747,7 @@ exit_cifs(void)
        cifs_destroy_mids();
        cifs_destroy_inodecache();
        cifs_fscache_unregister();
+       destroy_workqueue(deferredclose_wq);
        destroy_workqueue(cifsoplockd_wq);
        destroy_workqueue(decrypt_wq);
        destroy_workqueue(fileinfo_put_wq);
index b23a0ee..d88b4b5 100644 (file)
@@ -1154,6 +1154,14 @@ struct cifs_pending_open {
        __u32 oplock;
 };
 
+struct cifs_deferred_close {
+       struct list_head dlist; /* presumably linked on cifsInodeInfo.deferred_closes - TODO confirm */
+       struct tcon_link *tlink;
+       __u16  netfid;         /* NOTE(review): netfid, persistent_fid and volatile_fid look */
+       __u64  persistent_fid; /* like the handle ids of the file whose close is deferred;  */
+       __u64  volatile_fid;   /* confirm against cifs_add_deferred_close()                */
+};
+
 /*
  * This info hangs off the cifsFileInfo structure, pointed to by llist.
  * This is used to track byte stream locks on the file
@@ -1248,6 +1256,9 @@ struct cifsFileInfo {
        struct cifs_search_info srch_inf;
        struct work_struct oplock_break; /* work for oplock breaks */
        struct work_struct put; /* work for the final part of _put */
+       struct delayed_work deferred;
+       bool oplock_break_received; /* Flag to indicate oplock break */
+       bool deferred_scheduled;
 };
 
 struct cifs_io_parms {
@@ -1392,6 +1403,7 @@ struct cifsInodeInfo {
 #define CIFS_INO_DELETE_PENDING                  (3) /* delete pending on server */
 #define CIFS_INO_INVALID_MAPPING         (4) /* pagecache is invalid */
 #define CIFS_INO_LOCK                    (5) /* lock bit for synchronization */
+#define CIFS_INO_MODIFIED_ATTR            (6) /* Indicate change in mtime/ctime */
        unsigned long flags;
        spinlock_t writers_lock;
        unsigned int writers;           /* Number of writers on this inode */
@@ -1404,6 +1416,8 @@ struct cifsInodeInfo {
        struct fscache_cookie *fscache;
 #endif
        struct inode vfs_inode;
+       struct list_head deferred_closes; /* list of deferred closes */
+       spinlock_t deferred_lock; /* protection on deferred list */
 };
 
 static inline struct cifsInodeInfo *
@@ -1871,11 +1885,14 @@ extern bool disable_legacy_dialects;  /* forbid vers=1.0 and vers=2.0 mounts */
 
 void cifs_oplock_break(struct work_struct *work);
 void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
+void smb2_deferred_work_close(struct work_struct *work);
 
+extern const struct slow_work_ops cifs_oplock_break_ops;
 extern struct workqueue_struct *cifsiod_wq;
 extern struct workqueue_struct *decrypt_wq;
 extern struct workqueue_struct *fileinfo_put_wq;
 extern struct workqueue_struct *cifsoplockd_wq;
+extern struct workqueue_struct *deferredclose_wq;
 extern __u32 cifs_lock_secret;
 
 extern mempool_t *cifs_mid_poolp;
index a79d500..d30cba4 100644 (file)
@@ -267,6 +267,19 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
                                         struct tcon_link *tlink,
                                         struct cifs_pending_open *open);
 extern void cifs_del_pending_open(struct cifs_pending_open *open);
+
+extern bool cifs_is_deferred_close(struct cifsFileInfo *cfile,
+                               struct cifs_deferred_close **dclose);
+
+extern void cifs_add_deferred_close(struct cifsFileInfo *cfile,
+                               struct cifs_deferred_close *dclose);
+
+extern void cifs_del_deferred_close(struct cifsFileInfo *cfile);
+
+extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
+
+extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
+
 extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx);
 extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
                                 int from_reconnect);
index 121d8b4..495c395 100644 (file)
@@ -392,16 +392,6 @@ cifs_echo_request(struct work_struct *work)
        int rc;
        struct TCP_Server_Info *server = container_of(work,
                                        struct TCP_Server_Info, echo.work);
-       unsigned long echo_interval;
-
-       /*
-        * If we need to renegotiate, set echo interval to zero to
-        * immediately call echo service where we can renegotiate.
-        */
-       if (server->tcpStatus == CifsNeedNegotiate)
-               echo_interval = 0;
-       else
-               echo_interval = server->echo_interval;
 
        /*
         * We cannot send an echo if it is disabled.
@@ -412,7 +402,7 @@ cifs_echo_request(struct work_struct *work)
            server->tcpStatus == CifsExiting ||
            server->tcpStatus == CifsNew ||
            (server->ops->can_echo && !server->ops->can_echo(server)) ||
-           time_before(jiffies, server->lstrp + echo_interval - HZ))
+           time_before(jiffies, server->lstrp + server->echo_interval - HZ))
                goto requeue_echo;
 
        rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
@@ -476,6 +466,7 @@ server_unresponsive(struct TCP_Server_Info *server)
         */
        if ((server->tcpStatus == CifsGood ||
            server->tcpStatus == CifsNeedNegotiate) &&
+           (!server->ops->can_echo || server->ops->can_echo(server)) &&
            time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
                cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
                         (3 * server->echo_interval) / HZ);
@@ -3158,17 +3149,29 @@ out:
 int
 cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname)
 {
-       int rc = 0;
+       int rc;
 
-       smb3_parse_devname(devname, ctx);
+       if (devname) {
+               cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname);
+               rc = smb3_parse_devname(devname, ctx);
+               if (rc) {
+                       cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc);
+                       return rc;
+               }
+       }
 
        if (mntopts) {
                char *ip;
 
-               cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts);
                rc = smb3_parse_opt(mntopts, "ip", &ip);
-               if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip,
-                                                strlen(ip))) {
+               if (rc) {
+                       cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc);
+                       return rc;
+               }
+
+               rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip));
+               kfree(ip);
+               if (!rc) {
                        cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__);
                        return -EINVAL;
                }
@@ -3188,7 +3191,7 @@ cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const c
                return -EINVAL;
        }
 
-       return rc;
+       return 0;
 }
 
 static int
index c85aff8..6bcd3e8 100644 (file)
@@ -34,6 +34,7 @@
 #include "cifs_fs_sb.h"
 #include "cifs_unicode.h"
 #include "fs_context.h"
+#include "cifs_ioctl.h"
 
 static void
 renew_parental_timestamps(struct dentry *direntry)
@@ -430,6 +431,9 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
        __u32 oplock;
        struct cifsFileInfo *file_info;
 
+       if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
+               return -EIO;
+
        /*
         * Posix open is only called (at lookup time) for file create now. For
         * opens (rather than creates), because we do not know if it is a file
@@ -546,6 +550,9 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
        cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
                 inode, direntry, direntry);
 
+       if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
+               return -EIO;
+
        tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
        rc = PTR_ERR(tlink);
        if (IS_ERR(tlink))
@@ -583,6 +590,9 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
                return -EINVAL;
 
        cifs_sb = CIFS_SB(inode->i_sb);
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
                return PTR_ERR(tlink);
index 639c595..6caad10 100644 (file)
@@ -45,6 +45,7 @@
 #include "fscache.h"
 #include "smbdirect.h"
 #include "fs_context.h"
+#include "cifs_ioctl.h"
 
 static inline int cifs_convert_flags(unsigned int flags)
 {
@@ -322,9 +323,12 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
        cfile->dentry = dget(dentry);
        cfile->f_flags = file->f_flags;
        cfile->invalidHandle = false;
+       cfile->oplock_break_received = false;
+       cfile->deferred_scheduled = false;
        cfile->tlink = cifs_get_tlink(tlink);
        INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
        INIT_WORK(&cfile->put, cifsFileInfo_put_work);
+       INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close);
        mutex_init(&cfile->fh_mutex);
        spin_lock_init(&cfile->file_info_lock);
 
@@ -539,6 +543,11 @@ int cifs_open(struct inode *inode, struct file *file)
        xid = get_xid();
 
        cifs_sb = CIFS_SB(inode->i_sb);
+       if (unlikely(cifs_forced_shutdown(cifs_sb))) {
+               free_xid(xid);
+               return -EIO;
+       }
+
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink)) {
                free_xid(xid);
@@ -565,6 +574,23 @@ int cifs_open(struct inode *inode, struct file *file)
                        file->f_op = &cifs_file_direct_ops;
        }
 
+       spin_lock(&CIFS_I(inode)->deferred_lock);
+       /* Get the cached handle as SMB2 close is deferred */
+       rc = cifs_get_readable_path(tcon, full_path, &cfile);
+       if (rc == 0) {
+               if (file->f_flags == cfile->f_flags) {
+                       file->private_data = cfile;
+                       cifs_del_deferred_close(cfile);
+                       spin_unlock(&CIFS_I(inode)->deferred_lock);
+                       goto out;
+               } else {
+                       spin_unlock(&CIFS_I(inode)->deferred_lock);
+                       _cifsFileInfo_put(cfile, true, false);
+               }
+       } else {
+               spin_unlock(&CIFS_I(inode)->deferred_lock);
+       }
+
        if (server->oplocks)
                oplock = REQ_OPLOCK;
        else
@@ -846,11 +872,56 @@ reopen_error_exit:
        return rc;
 }
 
+void smb2_deferred_work_close(struct work_struct *work)
+{
+       struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, deferred.work);
+       struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
+
+       spin_lock(&cinode->deferred_lock);
+       if (!cfile->deferred_scheduled) {       /* flag already cleared: nothing to do */
+               spin_unlock(&cinode->deferred_lock);
+               return;
+       }
+       cifs_del_deferred_close(cfile);         /* drop the deferred-close list entry */
+       cfile->deferred_scheduled = false;
+       spin_unlock(&cinode->deferred_lock);
+       _cifsFileInfo_put(cfile, true, false);  /* called only after the spinlock is dropped */
+}
+
 int cifs_close(struct inode *inode, struct file *file)
 {
+       struct cifsFileInfo *cfile;
+       struct cifsInodeInfo *cinode = CIFS_I(inode);
+       struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+       struct cifs_deferred_close *dclose;
+
        if (file->private_data != NULL) {
-               _cifsFileInfo_put(file->private_data, true, false);
+               cfile = file->private_data;
                file->private_data = NULL;
+               dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
+               if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
+                   dclose) {
+                       if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags))
+                               inode->i_ctime = inode->i_mtime = current_time(inode);
+                       spin_lock(&cinode->deferred_lock);
+                       cifs_add_deferred_close(cfile, dclose);
+                       if (cfile->deferred_scheduled) {
+                               mod_delayed_work(deferredclose_wq,
+                                               &cfile->deferred, cifs_sb->ctx->acregmax);
+                       } else {
+                               /* Deferred close for files */
+                               queue_delayed_work(deferredclose_wq,
+                                               &cfile->deferred, cifs_sb->ctx->acregmax);
+                               cfile->deferred_scheduled = true;
+                               spin_unlock(&cinode->deferred_lock);
+                               return 0;
+                       }
+                       spin_unlock(&cinode->deferred_lock);
+                       _cifsFileInfo_put(cfile, true, false);
+               } else {
+                       _cifsFileInfo_put(cfile, true, false);
+                       kfree(dclose);
+               }
        }
 
        /* return code from the ->release op is always ignored */
@@ -1920,8 +1991,10 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
 
        if (total_written > 0) {
                spin_lock(&d_inode(dentry)->i_lock);
-               if (*offset > d_inode(dentry)->i_size)
+               if (*offset > d_inode(dentry)->i_size) {
                        i_size_write(d_inode(dentry), *offset);
+                       d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9;
+               }
                spin_unlock(&d_inode(dentry)->i_lock);
        }
        mark_inode_dirty_sync(d_inode(dentry));
@@ -1947,7 +2020,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
                if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
                        continue;
                if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
-                       if (!open_file->invalidHandle) {
+                       if ((!open_file->invalidHandle) &&
+                               (!open_file->oplock_break_received)) {
                                /* found a good file */
                                /* lock it so it will not be closed on us */
                                cifsFileInfo_get(open_file);
@@ -2476,6 +2550,8 @@ retry:
        if (cfile)
                cifsFileInfo_put(cfile);
        free_xid(xid);
+       /* Indication to update ctime and mtime as close is deferred */
+       set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
        return rc;
 }
 
@@ -2577,13 +2653,17 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 
        if (rc > 0) {
                spin_lock(&inode->i_lock);
-               if (pos > inode->i_size)
+               if (pos > inode->i_size) {
                        i_size_write(inode, pos);
+                       inode->i_blocks = (512 - 1 + pos) >> 9;
+               }
                spin_unlock(&inode->i_lock);
        }
 
        unlock_page(page);
        put_page(page);
+       /* Indication to update ctime and mtime as close is deferred */
+       set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
 
        return rc;
 }
@@ -4744,6 +4824,8 @@ void cifs_oplock_break(struct work_struct *work)
        struct TCP_Server_Info *server = tcon->ses->server;
        int rc = 0;
        bool purge_cache = false;
+       bool is_deferred = false;
+       struct cifs_deferred_close *dclose;
 
        wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
                        TASK_UNINTERRUPTIBLE);
@@ -4790,6 +4872,18 @@ oplock_break_ack:
                                                             cinode);
                cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
        }
+       /*
+        * When an oplock break is received and the file has no active
+        * handles, only a cached (deferred close) one, set the flag
+        * oplock_break_received so that a new open will not use the cached handle.
+        */
+       spin_lock(&CIFS_I(inode)->deferred_lock);
+       is_deferred = cifs_is_deferred_close(cfile, &dclose);
+       if (is_deferred && cfile->deferred_scheduled) {
+               cfile->oplock_break_received = true;
+               mod_delayed_work(deferredclose_wq, &cfile->deferred, 0);
+       }
+       spin_unlock(&CIFS_I(inode)->deferred_lock);
        _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
        cifs_done_oplock_break(cinode);
 }
index 3e0d016..3bcf881 100644 (file)
@@ -476,6 +476,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
 
        /* move "pos" up to delimiter or NULL */
        pos += len;
+       kfree(ctx->UNC);
        ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
        if (!ctx->UNC)
                return -ENOMEM;
@@ -486,6 +487,9 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
        if (*pos == '/' || *pos == '\\')
                pos++;
 
+       kfree(ctx->prepath);
+       ctx->prepath = NULL;
+
        /* If pos is NULL then no prepath */
        if (!*pos)
                return 0;
@@ -1642,6 +1646,7 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb)
                        cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n");
                }
        }
+       cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN;
 
        return;
 }
index 002d864..1dfa579 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/sched/signal.h>
 #include <linux/wait_bit.h>
 #include <linux/fiemap.h>
-
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -38,7 +37,7 @@
 #include "cifs_unicode.h"
 #include "fscache.h"
 #include "fs_context.h"
-
+#include "cifs_ioctl.h"
 
 static void cifs_set_ops(struct inode *inode)
 {
@@ -1610,6 +1609,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 
        cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
 
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
                return PTR_ERR(tlink);
@@ -1632,6 +1634,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
                goto unlink_out;
        }
 
+       cifs_close_all_deferred_files(tcon);
        if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
                                le64_to_cpu(tcon->fsUnixInfo.Capability))) {
                rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -1872,6 +1875,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
                 mode, inode);
 
        cifs_sb = CIFS_SB(inode->i_sb);
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
                return PTR_ERR(tlink);
@@ -1954,6 +1959,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        }
 
        cifs_sb = CIFS_SB(inode->i_sb);
+       if (unlikely(cifs_forced_shutdown(cifs_sb))) {
+               rc = -EIO;
+               goto rmdir_exit;
+       }
+
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink)) {
                rc = PTR_ERR(tlink);
@@ -2088,6 +2098,9 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
                return -EINVAL;
 
        cifs_sb = CIFS_SB(source_dir->i_sb);
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
                return PTR_ERR(tlink);
@@ -2109,6 +2122,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
                goto cifs_rename_exit;
        }
 
+       cifs_close_all_deferred_files(tcon);
        rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
                            to_name);
 
@@ -2404,6 +2418,9 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
        struct inode *inode = d_inode(dentry);
        int rc;
 
+       if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb))))
+               return -EIO;
+
        /*
         * We need to be sure that all dirty pages are written and the server
         * has actual ctime, mtime and file length.
@@ -2476,6 +2493,9 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
        struct cifsFileInfo *cfile;
        int rc;
 
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        /*
         * We need to be sure that all dirty pages are written as they
         * might fill holes on the server.
@@ -2962,6 +2982,9 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
        struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
        int rc, retries = 0;
 
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        do {
                if (pTcon->unix_ext)
                        rc = cifs_setattr_unix(direntry, attrs);
index 08d99fe..28ec8d7 100644 (file)
@@ -164,6 +164,100 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
        return rc;
 }
 
+/*
+ * CIFS_IOC_SHUTDOWN handler: set CIFS_MOUNT_SHUTDOWN on the mount behind @sb.
+ * @arg is a user pointer to a __u32 CIFS_GOING_FLAGS_* value. Returns 0 on
+ * success or when cifs_forced_shutdown() is already true, -EPERM without
+ * CAP_SYS_ADMIN, -EFAULT if @arg is unreadable, -EINVAL for unsupported flags.
+ * Flags: https://man7.org/linux/man-pages/man2/ioctl_xfs_goingdown.2.html
+ */
+static int cifs_shutdown(struct super_block *sb, unsigned long arg)
+{
+       struct cifs_sb_info *sbi = CIFS_SB(sb);
+       __u32 flags;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (get_user(flags, (__u32 __user *)arg))
+               return -EFAULT;
+
+       if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH)
+               return -EINVAL;
+
+       if (cifs_forced_shutdown(sbi))
+               return 0;
+
+       cifs_dbg(VFS, "shut down requested (%u)\n", flags);
+
+       switch (flags) {
+       /*
+        * We could add support later for default flag which requires:
+        *     "Flush all dirty data and metadata to disk"
+        * would need to call syncfs or equivalent to flush page cache for
+        * the mount and then issue fsync to server (if nostrictsync not set)
+        */
+       case CIFS_GOING_FLAGS_DEFAULT:
+               cifs_dbg(FYI, "shutdown with default flag not supported\n");
+               return -EINVAL;
+       /*
+        * FLAGS_LOGFLUSH is easy since it asks to write out metadata (not
+        * data) but metadata writes are not cached on the client, so can treat
+        * it similarly to NOLOGFLUSH
+        */
+       case CIFS_GOING_FLAGS_LOGFLUSH:
+       case CIFS_GOING_FLAGS_NOLOGFLUSH:
+               sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN;
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * CIFS_DUMP_FULL_KEY: copy the cipher type and keys of the session whose SMB
+ * id is read from @arg (0: tcon->ses) back to @arg. Returns 0 or -errno.
+ */
+static int cifs_dump_full_key(struct cifs_tcon *tcon, unsigned long arg)
+{
+       struct smb3_full_key_debug_info info;
+       struct cifs_ses *ses = tcon->ses, *pos;
+       __u64 suid;
+       int rc;
+
+       if (!smb3_encryption_required(tcon))
+               return -EOPNOTSUPP;
+
+       if (get_user(suid, (__u64 __user *)arg))
+               suid = 0;       /* unreadable id: use the default session, as before */
+
+       memset(&info, 0, sizeof(info)); /* no stale stack bytes may reach userspace */
+       spin_lock(&cifs_tcp_ses_lock);  /* copy while the looked-up session is still locked */
+       if (suid) {
+               ses = NULL;
+               list_for_each_entry(pos, &tcon->ses->server->smb_ses_list, smb_ses_list) {
+                       if (pos->Suid == suid) {
+                               ses = pos;
+                               break;
+                       }
+               }
+       }
+       if (!ses) {
+               spin_unlock(&cifs_tcp_ses_lock);
+               return -EINVAL;
+       }
+       info.cipher_type = le16_to_cpu(ses->server->cipher_type);
+       info.Suid = ses->Suid;
+       memcpy(info.auth_key, ses->auth_key.response, 16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
+       memcpy(info.smb3decryptionkey, ses->smb3decryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */);
+       memcpy(info.smb3encryptionkey, ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */);
+       spin_unlock(&cifs_tcp_ses_lock);
+
+       rc = copy_to_user((void __user *)arg, &info, sizeof(info)) ? -EFAULT : 0;
+       memzero_explicit(&info, sizeof(info));  /* do not leave key material on the stack */
+       return rc;
+}
+
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 {
        struct inode *inode = file_inode(filep);
@@ -304,6 +398,21 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        else
                                rc = 0;
                        break;
+               /*
+                * Dump full key (32 bytes instead of 16 bytes) is
+                * needed if GCM256 (stronger encryption) negotiated
+                */
+               case CIFS_DUMP_FULL_KEY:
+                       if (pSMBFile == NULL)
+                               break;
+                       if (!capable(CAP_SYS_ADMIN)) {
+                               rc = -EACCES;
+                               break;
+                       }
+                       tcon = tlink_tcon(pSMBFile->tlink);
+                       rc = cifs_dump_full_key(tcon, arg);
+
+                       break;
                case CIFS_IOC_NOTIFY:
                        if (!S_ISDIR(inode->i_mode)) {
                                /* Notify can only be done on directories */
@@ -325,6 +434,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                                rc = -EOPNOTSUPP;
                        cifs_put_tlink(tlink);
                        break;
+               case CIFS_IOC_SHUTDOWN:
+                       rc = cifs_shutdown(inode->i_sb, arg);
+                       break;
                default:
                        cifs_dbg(FYI, "unsupported ioctl\n");
                        break;
index 616e1bc..970fcf2 100644 (file)
@@ -30,6 +30,7 @@
 #include "cifs_fs_sb.h"
 #include "cifs_unicode.h"
 #include "smb2proto.h"
+#include "cifs_ioctl.h"
 
 /*
  * M-F Symlink Functions - Begin
@@ -518,6 +519,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
        struct TCP_Server_Info *server;
        struct cifsInodeInfo *cifsInode;
 
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        tlink = cifs_sb_tlink(cifs_sb);
        if (IS_ERR(tlink))
                return PTR_ERR(tlink);
@@ -679,9 +683,16 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
        struct tcon_link *tlink;
        struct cifs_tcon *pTcon;
        const char *full_path;
-       void *page = alloc_dentry_path();
+       void *page;
        struct inode *newinode = NULL;
 
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
+       page = alloc_dentry_path();
+       if (!page)
+               return -ENOMEM;
+
        xid = get_xid();
 
        tlink = cifs_sb_tlink(cifs_sb);
index c15a90e..524dbdf 100644 (file)
@@ -672,6 +672,85 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
        spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
 }
 
+/* Find cfile's entry on its inode's deferred_closes list; sets *pdclose on a hit. */
+bool cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose)
+{
+       struct cifs_deferred_close *entry;
+
+       list_for_each_entry(entry, &CIFS_I(d_inode(cfile->dentry))->deferred_closes, dlist) {
+               if (entry->netfid != cfile->fid.netfid ||
+                   entry->persistent_fid != cfile->fid.persistent_fid ||
+                   entry->volatile_fid != cfile->fid.volatile_fid)
+                       continue;       /* some fid component differs */
+               *pdclose = entry;
+               return true;
+       }
+       return false;   /* no match: *pdclose is left untouched */
+}
+
+/*
+ * Link dclose onto cfile's inode list, or free it if cfile already has one.
+ */
+void cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose)
+{
+       struct cifs_deferred_close *existing;
+
+       if (cifs_is_deferred_close(cfile, &existing)) {
+               kfree(dclose);
+               return;
+       }
+
+       dclose->tlink = cfile->tlink;
+       dclose->netfid = cfile->fid.netfid;
+       dclose->persistent_fid = cfile->fid.persistent_fid;
+       dclose->volatile_fid = cfile->fid.volatile_fid;
+       list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes);
+}
+
+/* Unlink and free cfile's deferred-close entry, if it has one. */
+void
+cifs_del_deferred_close(struct cifsFileInfo *cfile)
+{
+       struct cifs_deferred_close *entry;
+
+       if (!cifs_is_deferred_close(cfile, &entry))
+               return;         /* nothing recorded for this handle */
+
+       list_del(&entry->dlist);
+       kfree(entry);
+}
+
+/* Kick every deferred close on this inode: re-arm its work with zero delay. */
+void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
+{
+       struct cifs_deferred_close *match;      /* only needed as lookup out-arg */
+       struct cifsFileInfo *open_file;
+
+       list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+               spin_lock(&cifs_inode->deferred_lock);
+               if (cifs_is_deferred_close(open_file, &match))
+                       mod_delayed_work(deferredclose_wq, &open_file->deferred, 0);
+               spin_unlock(&cifs_inode->deferred_lock);
+       }
+}
+
+/*
+ * Expedite all pending deferred closes on a tree connection: every open file
+ * whose deferred work is still pending has it re-armed with zero delay.
+ */
+void
+cifs_close_all_deferred_files(struct cifs_tcon *tcon)
+{
+       struct cifsFileInfo *cfile;
+
+       spin_lock(&tcon->open_file_lock);
+       list_for_each_entry(cfile, &tcon->openFileList, tlist) {
+               if (delayed_work_pending(&cfile->deferred))
+                       mod_delayed_work(deferredclose_wq, &cfile->deferred, 0);
+       }
+       spin_unlock(&tcon->open_file_lock);
+}
+
 /* parses DFS refferal V3 structure
  * caller is responsible for freeing target_nodes
  * returns:
index e351b94..aa3e8ca 100644 (file)
@@ -30,6 +30,7 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 #include "cifs_unicode.h"
+#include "cifs_ioctl.h"
 
 #define MAX_EA_VALUE_SIZE CIFSMaxBufSize
 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */
@@ -421,6 +422,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
        const char *full_path;
        void *page;
 
+       if (unlikely(cifs_forced_shutdown(cifs_sb)))
+               return -EIO;
+
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
                return -EOPNOTSUPP;
 
index 9a3aed2..c039536 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset:8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs_internal.h - Internal stuff for configfs
  *
  * Based on sysfs:
index b6098e0..ac5e0c0 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c - Operations for configfs directories.
  *
  * Based on sysfs:
index da8351d..e26060d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c - operations for regular (text) files.
  *
  * Based on sysfs:
index 42c348b..eb5ec3e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c - basic inode and dentry operations.
  *
  * Based on sysfs:
index 704a435..254170a 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * item.c - library routines for handling generic config items
  *
  * Based on kobject:
index 0c6e8cf..c2d8200 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mount.c - operations for initializing and mounting configfs.
  *
  * Based on sysfs:
index 77c8543..0623c3e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.c - operations for configfs symlinks.
  *
  * Based on sysfs:
index a69e2cd..270d621 100644 (file)
@@ -326,9 +326,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
-static char *__dentry_path(struct dentry *d, char *buf, int buflen)
+static char *__dentry_path(const struct dentry *d, char *buf, int buflen)
 {
-       struct dentry *dentry;
+       const struct dentry *dentry;
        char *end, *retval;
        int len, seq = 0;
        int error = 0;
@@ -347,7 +347,7 @@ restart:
        *retval = '/';
        read_seqbegin_or_lock(&rename_lock, &seq);
        while (!IS_ROOT(dentry)) {
-               struct dentry *parent = dentry->d_parent;
+               const struct dentry *parent = dentry->d_parent;
 
                prefetch(parent);
                error = prepend_name(&end, &len, &dentry->d_name);
@@ -371,13 +371,13 @@ Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
 
-char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
 {
        return __dentry_path(dentry, buf, buflen);
 }
 EXPORT_SYMBOL(dentry_path_raw);
 
-char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
 {
        char *p = NULL;
        char *retval;
index b3d27fd..6921624 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -525,7 +525,7 @@ retry:
                dax_disassociate_entry(entry, mapping, false);
                xas_store(xas, NULL);   /* undo the PMD join */
                dax_wake_entry(xas, entry, true);
-               mapping->nrexceptional--;
+               mapping->nrpages -= PG_PMD_NR;
                entry = NULL;
                xas_set(xas, index);
        }
@@ -541,7 +541,7 @@ retry:
                dax_lock_entry(xas, entry);
                if (xas_error(xas))
                        goto out_unlock;
-               mapping->nrexceptional++;
+               mapping->nrpages += 1UL << order;
        }
 
 out_unlock:
@@ -661,7 +661,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
                goto out;
        dax_disassociate_entry(entry, mapping, trunc);
        xas_store(&xas, NULL);
-       mapping->nrexceptional--;
+       mapping->nrpages -= 1UL << dax_entry_order(entry);
        ret = 1;
 out:
        put_unlocked_entry(&xas, entry);
@@ -965,7 +965,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
        if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
                return -EIO;
 
-       if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+       if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
                return 0;
 
        trace_dax_writeback_range(inode, xas.xa_index, end_index);
index 7d24ff7..cf871a8 100644 (file)
@@ -84,6 +84,8 @@ const struct qstr empty_name = QSTR_INIT("", 0);
 EXPORT_SYMBOL(empty_name);
 const struct qstr slash_name = QSTR_INIT("/", 1);
 EXPORT_SYMBOL(slash_name);
+const struct qstr dotdot_name = QSTR_INIT("..", 2);
+EXPORT_SYMBOL(dotdot_name);
 
 /*
  * This is the single most critical data structure when it comes
index 943e523..345f806 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2004 Erez Zadok
@@ -350,7 +350,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * lower_offset_for_page
  *
  * Convert an eCryptfs page index into a lower byte offset
@@ -535,7 +535,7 @@ int ecryptfs_decrypt_page(struct page *page)
                rc = crypt_extent(crypt_stat, page, page,
                                  extent_offset, DECRYPT);
                if (rc) {
-                       printk(KERN_ERR "%s: Error encrypting extent; "
+                       printk(KERN_ERR "%s: Error decrypting extent; "
                               "rc = [%d]\n", __func__, rc);
                        goto out;
                }
@@ -627,9 +627,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
        }
 }
 
-/**
+/*
  * ecryptfs_compute_root_iv
- * @crypt_stats
  *
  * On error, sets the root IV to all 0's.
  */
@@ -1370,7 +1369,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
        return rc;
 }
 
-/**
+/*
  * ecryptfs_read_metadata
  *
  * Common entry point for reading file metadata. From here, we could
@@ -1448,7 +1447,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_encrypt_filename - encrypt filename
  *
  * CBC-encrypts the filename. We do not want to encrypt the same
@@ -1590,11 +1589,10 @@ out:
 
 struct kmem_cache *ecryptfs_key_tfm_cache;
 static struct list_head key_tfm_list;
-struct mutex key_tfm_list_mutex;
+DEFINE_MUTEX(key_tfm_list_mutex);
 
 int __init ecryptfs_init_crypto(void)
 {
-       mutex_init(&key_tfm_list_mutex);
        INIT_LIST_HEAD(&key_tfm_list);
        return 0;
 }
@@ -1877,10 +1875,11 @@ out:
 
 /**
  * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @encoded_name: The encrypted name
+ * @encoded_name_size: Length of the encrypted name
+ * @mount_crypt_stat: The crypt_stat struct associated with the file name to encode
  * @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
+ * @name_size: The length of the plaintext name
  *
  * Encrypts and encodes a filename into something that constitutes a
  * valid filename for a filesystem, with printable characters.
@@ -1992,7 +1991,7 @@ static bool is_dot_dotdot(const char *name, size_t name_size)
  * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
  * @plaintext_name: The plaintext name
  * @plaintext_name_size: The plaintext name size
- * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @sb: Ecryptfs's super_block
  * @name: The filename in cipher text
  * @name_size: The cipher text name size
  *
index 1f65e99..cf6d0e8 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  * Functions only useful for debugging.
  *
@@ -9,7 +9,7 @@
 
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * ecryptfs_dump_auth_tok - debug function to print auth toks
  *
  * This function will print the contents of an ecryptfs authentication
index 44606f0..acaa082 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2003 Erez Zadok
index e6ac78c..5f2b49e 100644 (file)
@@ -262,10 +262,7 @@ struct ecryptfs_inode_info {
  * vfsmount too. */
 struct ecryptfs_dentry_info {
        struct path lower_path;
-       union {
-               struct ecryptfs_crypt_stat *crypt_stat;
-               struct rcu_head rcu;
-       };
+       struct rcu_head rcu;
 };
 
 /**
@@ -496,12 +493,6 @@ ecryptfs_set_superblock_lower(struct super_block *sb,
        ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
 }
 
-static inline struct ecryptfs_dentry_info *
-ecryptfs_dentry_to_private(struct dentry *dentry)
-{
-       return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
-}
-
 static inline void
 ecryptfs_set_dentry_private(struct dentry *dentry,
                            struct ecryptfs_dentry_info *dentry_info)
@@ -515,12 +506,6 @@ ecryptfs_dentry_to_lower(struct dentry *dentry)
        return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
 }
 
-static inline struct vfsmount *
-ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
-{
-       return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt;
-}
-
 static inline struct path *
 ecryptfs_dentry_to_lower_path(struct dentry *dentry)
 {
@@ -528,7 +513,7 @@ ecryptfs_dentry_to_lower_path(struct dentry *dentry)
 }
 
 #define ecryptfs_printk(type, fmt, arg...) \
-        __ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+        __ecryptfs_printk(type "%s: " fmt, __func__, ## arg)
 __printf(1, 2)
 void __ecryptfs_printk(const char *fmt, ...);
 
index 5fb45d8..18d5b91 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2004 Erez Zadok
@@ -19,7 +19,7 @@
 #include <linux/fs_stack.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * ecryptfs_read_update_atime
  *
  * generic_file_read updates the atime of upper layer inode.  But, it
index 7169ea8..16d50df 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2004 Erez Zadok
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
-static struct dentry *lock_parent(struct dentry *dentry)
+static int lock_parent(struct dentry *dentry,
+                      struct dentry **lower_dentry,
+                      struct inode **lower_dir)
 {
-       struct dentry *dir;
+       struct dentry *lower_dir_dentry;
 
-       dir = dget_parent(dentry);
-       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
-       return dir;
-}
+       lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+       *lower_dir = d_inode(lower_dir_dentry);
+       *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
-static void unlock_dir(struct dentry *dir)
-{
-       inode_unlock(d_inode(dir));
-       dput(dir);
+       inode_lock_nested(*lower_dir, I_MUTEX_PARENT);
+       return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL;
 }
 
 static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
@@ -128,32 +127,29 @@ static int ecryptfs_interpose(struct dentry *lower_dentry,
 static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
                              struct inode *inode)
 {
-       struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       struct dentry *lower_dir_dentry;
-       struct inode *lower_dir_inode;
+       struct dentry *lower_dentry;
+       struct inode *lower_dir;
        int rc;
 
-       lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
-       lower_dir_inode = d_inode(lower_dir_dentry);
-       inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+       rc = lock_parent(dentry, &lower_dentry, &lower_dir);
        dget(lower_dentry);     // don't even try to make the lower negative
-       if (lower_dentry->d_parent != lower_dir_dentry)
-               rc = -EINVAL;
-       else if (d_unhashed(lower_dentry))
-               rc = -EINVAL;
-       else
-               rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry,
-                               NULL);
+       if (!rc) {
+               if (d_unhashed(lower_dentry))
+                       rc = -EINVAL;
+               else
+                       rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry,
+                                       NULL);
+       }
        if (rc) {
                printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
                goto out_unlock;
        }
-       fsstack_copy_attr_times(dir, lower_dir_inode);
+       fsstack_copy_attr_times(dir, lower_dir);
        set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
        inode->i_ctime = dir->i_ctime;
 out_unlock:
        dput(lower_dentry);
-       inode_unlock(lower_dir_inode);
+       inode_unlock(lower_dir);
        if (!rc)
                d_drop(dentry);
        return rc;
@@ -177,13 +173,13 @@ ecryptfs_do_create(struct inode *directory_inode,
 {
        int rc;
        struct dentry *lower_dentry;
-       struct dentry *lower_dir_dentry;
+       struct inode *lower_dir;
        struct inode *inode;
 
-       lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
-       lower_dir_dentry = lock_parent(lower_dentry);
-       rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
-                       mode, true);
+       rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
+       if (!rc)
+               rc = vfs_create(&init_user_ns, lower_dir,
+                               lower_dentry, mode, true);
        if (rc) {
                printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
                       "rc = [%d]\n", __func__, rc);
@@ -193,18 +189,17 @@ ecryptfs_do_create(struct inode *directory_inode,
        inode = __ecryptfs_get_inode(d_inode(lower_dentry),
                                     directory_inode->i_sb);
        if (IS_ERR(inode)) {
-               vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry),
-                          lower_dentry, NULL);
+               vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL);
                goto out_lock;
        }
-       fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
-       fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry));
+       fsstack_copy_attr_times(directory_inode, lower_dir);
+       fsstack_copy_inode_size(directory_inode, lower_dir);
 out_lock:
-       unlock_dir(lower_dir_dentry);
+       inode_unlock(lower_dir);
        return inode;
 }
 
-/**
+/*
  * ecryptfs_initialize_file
  *
  * Cause the file to be changed from a basic empty file to an ecryptfs
@@ -247,10 +242,8 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_create
- * @dir: The inode of the directory in which to create the file.
- * @dentry: The eCryptfs dentry
  * @mode: The mode of the new file.
  *
  * Creates a new file.
@@ -318,7 +311,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
        return 0;
 }
 
-/**
+/*
  * ecryptfs_lookup_interpose - Dentry interposition for a lookup
  */
 static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
@@ -431,32 +424,28 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 {
        struct dentry *lower_old_dentry;
        struct dentry *lower_new_dentry;
-       struct dentry *lower_dir_dentry;
+       struct inode *lower_dir;
        u64 file_size_save;
        int rc;
 
        file_size_save = i_size_read(d_inode(old_dentry));
        lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
-       lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
-       dget(lower_old_dentry);
-       dget(lower_new_dentry);
-       lower_dir_dentry = lock_parent(lower_new_dentry);
-       rc = vfs_link(lower_old_dentry, &init_user_ns,
-                     d_inode(lower_dir_dentry), lower_new_dentry, NULL);
+       rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir);
+       if (!rc)
+               rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir,
+                             lower_new_dentry, NULL);
        if (rc || d_really_is_negative(lower_new_dentry))
                goto out_lock;
        rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
        if (rc)
                goto out_lock;
-       fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
-       fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+       fsstack_copy_attr_times(dir, lower_dir);
+       fsstack_copy_inode_size(dir, lower_dir);
        set_nlink(d_inode(old_dentry),
                  ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
        i_size_write(d_inode(new_dentry), file_size_save);
 out_lock:
-       unlock_dir(lower_dir_dentry);
-       dput(lower_new_dentry);
-       dput(lower_old_dentry);
+       inode_unlock(lower_dir);
        return rc;
 }
 
@@ -471,14 +460,14 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
 {
        int rc;
        struct dentry *lower_dentry;
-       struct dentry *lower_dir_dentry;
+       struct inode *lower_dir;
        char *encoded_symname;
        size_t encoded_symlen;
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 
-       lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       dget(lower_dentry);
-       lower_dir_dentry = lock_parent(lower_dentry);
+       rc = lock_parent(dentry, &lower_dentry, &lower_dir);
+       if (rc)
+               goto out_lock;
        mount_crypt_stat = &ecryptfs_superblock_to_private(
                dir->i_sb)->mount_crypt_stat;
        rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
@@ -487,7 +476,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
                                                  strlen(symname));
        if (rc)
                goto out_lock;
-       rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+       rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry,
                         encoded_symname);
        kfree(encoded_symname);
        if (rc || d_really_is_negative(lower_dentry))
@@ -495,11 +484,10 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
        if (rc)
                goto out_lock;
-       fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
-       fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+       fsstack_copy_attr_times(dir, lower_dir);
+       fsstack_copy_inode_size(dir, lower_dir);
 out_lock:
-       unlock_dir(lower_dir_dentry);
-       dput(lower_dentry);
+       inode_unlock(lower_dir);
        if (d_really_is_negative(dentry))
                d_drop(dentry);
        return rc;
@@ -510,22 +498,22 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 {
        int rc;
        struct dentry *lower_dentry;
-       struct dentry *lower_dir_dentry;
+       struct inode *lower_dir;
 
-       lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       lower_dir_dentry = lock_parent(lower_dentry);
-       rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
-                      mode);
+       rc = lock_parent(dentry, &lower_dentry, &lower_dir);
+       if (!rc)
+               rc = vfs_mkdir(&init_user_ns, lower_dir,
+                              lower_dentry, mode);
        if (rc || d_really_is_negative(lower_dentry))
                goto out;
        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
        if (rc)
                goto out;
-       fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
-       fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
-       set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+       fsstack_copy_attr_times(dir, lower_dir);
+       fsstack_copy_inode_size(dir, lower_dir);
+       set_nlink(dir, lower_dir->i_nlink);
 out:
-       unlock_dir(lower_dir_dentry);
+       inode_unlock(lower_dir);
        if (d_really_is_negative(dentry))
                d_drop(dentry);
        return rc;
@@ -534,29 +522,24 @@ out:
 static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct dentry *lower_dentry;
-       struct dentry *lower_dir_dentry;
-       struct inode *lower_dir_inode;
+       struct inode *lower_dir;
        int rc;
 
-       lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
-       lower_dir_inode = d_inode(lower_dir_dentry);
-
-       inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+       rc = lock_parent(dentry, &lower_dentry, &lower_dir);
        dget(lower_dentry);     // don't even try to make the lower negative
-       if (lower_dentry->d_parent != lower_dir_dentry)
-               rc = -EINVAL;
-       else if (d_unhashed(lower_dentry))
-               rc = -EINVAL;
-       else
-               rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry);
+       if (!rc) {
+               if (d_unhashed(lower_dentry))
+                       rc = -EINVAL;
+               else
+                       rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry);
+       }
        if (!rc) {
                clear_nlink(d_inode(dentry));
-               fsstack_copy_attr_times(dir, lower_dir_inode);
-               set_nlink(dir, lower_dir_inode->i_nlink);
+               fsstack_copy_attr_times(dir, lower_dir);
+               set_nlink(dir, lower_dir->i_nlink);
        }
        dput(lower_dentry);
-       inode_unlock(lower_dir_inode);
+       inode_unlock(lower_dir);
        if (!rc)
                d_drop(dentry);
        return rc;
@@ -568,21 +551,21 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 {
        int rc;
        struct dentry *lower_dentry;
-       struct dentry *lower_dir_dentry;
+       struct inode *lower_dir;
 
-       lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       lower_dir_dentry = lock_parent(lower_dentry);
-       rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
-                      mode, dev);
+       rc = lock_parent(dentry, &lower_dentry, &lower_dir);
+       if (!rc)
+               rc = vfs_mknod(&init_user_ns, lower_dir,
+                              lower_dentry, mode, dev);
        if (rc || d_really_is_negative(lower_dentry))
                goto out;
        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
        if (rc)
                goto out;
-       fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
-       fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+       fsstack_copy_attr_times(dir, lower_dir);
+       fsstack_copy_inode_size(dir, lower_dir);
 out:
-       unlock_dir(lower_dir_dentry);
+       inode_unlock(lower_dir);
        if (d_really_is_negative(dentry))
                d_drop(dentry);
        return rc;
@@ -888,6 +871,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
 
 /**
  * ecryptfs_setattr
+ * @mnt_userns: user namespace of the target mount
  * @dentry: dentry handle to the inode to modify
  * @ia: Structure with flags of what to change and values
  *
index f6a17d2..3fe4196 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  * In-kernel key management code.  Includes functions to parse and
  * write authentication token-related packets with the underlying
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * request_key returned an error instead of a valid key address;
  * determine the type of error, make appropriate log entries, and
  * return an error code.
@@ -536,8 +536,9 @@ out:
 
 /**
  * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok_key: key containing the authentication token
  * @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
+ * @mount_crypt_stat: inode crypt_stat crypto context
  * @sig: Sig of auth_tok to find
  *
  * For now, this function simply looks at the registered auth_tok's
@@ -576,7 +577,7 @@ ecryptfs_find_auth_tok_for_sig(
        return rc;
 }
 
-/**
+/*
  * write_tag_70_packet can gobble a lot of stack space. We stuff most
  * of the function's parameters in a kmalloc'd struct to help reduce
  * eCryptfs' overall stack usage.
@@ -604,7 +605,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
        struct shash_desc *hash_desc;
 };
 
-/**
+/*
  * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
  * @filename: NULL-terminated filename string
  *
@@ -873,7 +874,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
 };
 
 /**
- * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
+ * ecryptfs_parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
  * @filename: This function kmalloc's the memory for the filename
  * @filename_size: This function sets this to the amount of memory
  *                 kmalloc'd for the filename
@@ -1172,7 +1173,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
        rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code);
        if (rc) {
                ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n",
-                               cipher_code)
+                               cipher_code);
                goto out;
        }
        crypt_stat->flags |= ECRYPTFS_KEY_VALID;
index a7c903c..ae4cb4e 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2008 International Business Machines Corp.
@@ -108,6 +108,7 @@ void ecryptfs_destroy_kthread(void)
  * @lower_file: Result of dentry_open by root on lower dentry
  * @lower_dentry: Lower dentry for file to open
  * @lower_mnt: Lower vfsmount for file to open
+ * @cred: credential to use for this call
  *
  * This function gets a r/w file opened against the lower dentry.
  *
index cdf40a5..d66bbd2 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2003 Erez Zadok
@@ -24,7 +24,7 @@
 #include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * Module parameter that defines the ecryptfs_verbosity level.
  */
 int ecryptfs_verbosity = 0;
@@ -34,7 +34,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity,
                 "Initial verbosity level (0 or 1; defaults to "
                 "0, which is Quiet)");
 
-/**
+/*
  * Module parameter that defines the number of message buffer elements
  */
 unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
@@ -43,7 +43,7 @@ module_param(ecryptfs_message_buf_len, uint, 0);
 MODULE_PARM_DESC(ecryptfs_message_buf_len,
                 "Number of message buffer elements");
 
-/**
+/*
  * Module parameter that defines the maximum guaranteed amount of time to wait
  * for a response from ecryptfsd.  The actual sleep time will be, more than
  * likely, a small amount greater than this specified value, but only less if
@@ -57,7 +57,7 @@ MODULE_PARM_DESC(ecryptfs_message_wait_timeout,
                 "sleep while waiting for a message response from "
                 "userspace");
 
-/**
+/*
  * Module parameter that is an estimate of the maximum number of users
  * that will be concurrently using eCryptfs. Set this to the right
  * value to balance performance and memory use.
@@ -80,7 +80,7 @@ void __ecryptfs_printk(const char *fmt, ...)
        va_end(args);
 }
 
-/**
+/*
  * ecryptfs_init_lower_file
  * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with
  *                   the lower dentry and the lower mount set
@@ -221,7 +221,7 @@ static void ecryptfs_init_mount_crypt_stat(
 
 /**
  * ecryptfs_parse_options
- * @sb: The ecryptfs super block
+ * @sbi: The ecryptfs super block
  * @options: The options passed to the kernel
  * @check_ruid: set to 1 if device uid should be checked against the ruid
  *
@@ -466,10 +466,10 @@ out:
 struct kmem_cache *ecryptfs_sb_info_cache;
 static struct file_system_type ecryptfs_fs_type;
 
-/**
- * ecryptfs_get_sb
- * @fs_type
- * @flags
+/*
+ * ecryptfs_mount
+ * @fs_type: The filesystem type that the superblock should belong to
+ * @flags: The flags associated with the mount
  * @dev_name: The path to mount over
  * @raw_data: The options passed into the kernel
  */
@@ -492,6 +492,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
                goto out;
        }
 
+       if (!dev_name) {
+               rc = -EINVAL;
+               err = "Device name cannot be null";
+               goto out;
+       }
+
        rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
        if (rc) {
                err = "Error parsing options";
@@ -635,7 +641,7 @@ static struct file_system_type ecryptfs_fs_type = {
 };
 MODULE_ALIAS_FS("ecryptfs");
 
-/**
+/*
  * inode_info_init_once
  *
  * Initializes the ecryptfs_inode_info_cache when it is created
index c0dfd96..6318f35 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2004-2008 International Business Machines Corp.
 
 static LIST_HEAD(ecryptfs_msg_ctx_free_list);
 static LIST_HEAD(ecryptfs_msg_ctx_alloc_list);
-static struct mutex ecryptfs_msg_ctx_lists_mux;
+static DEFINE_MUTEX(ecryptfs_msg_ctx_lists_mux);
 
 static struct hlist_head *ecryptfs_daemon_hash;
-struct mutex ecryptfs_daemon_hash_mux;
+DEFINE_MUTEX(ecryptfs_daemon_hash_mux);
 static int ecryptfs_hash_bits;
 #define ecryptfs_current_euid_hash(uid) \
        hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits)
@@ -147,7 +147,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_exorcise_daemon - Destroy the daemon struct
  *
  * Must be called ceremoniously while in possession of
@@ -181,7 +181,8 @@ out:
 }
 
 /**
- * ecryptfs_process_reponse
+ * ecryptfs_process_response
+ * @daemon: eCryptfs daemon object
  * @msg: The ecryptfs message received; the caller should sanity check
  *       msg->data_len and free the memory
  * @seq: The sequence number of the message; must match the sequence
@@ -250,6 +251,7 @@ out:
  * ecryptfs_send_message_locked
  * @data: The data to send
  * @data_len: The length of data
+ * @msg_type: Type of message
  * @msg_ctx: The message context allocated for the send
  *
  * Must be called with ecryptfs_daemon_hash_mux held.
@@ -359,7 +361,6 @@ int __init ecryptfs_init_messaging(void)
                       "too large, defaulting to [%d] users\n", __func__,
                       ecryptfs_number_of_users);
        }
-       mutex_init(&ecryptfs_daemon_hash_mux);
        mutex_lock(&ecryptfs_daemon_hash_mux);
        ecryptfs_hash_bits = 1;
        while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
@@ -383,7 +384,6 @@ int __init ecryptfs_init_messaging(void)
                rc = -ENOMEM;
                goto out;
        }
-       mutex_init(&ecryptfs_msg_ctx_lists_mux);
        mutex_lock(&ecryptfs_msg_ctx_lists_mux);
        ecryptfs_msg_counter = 0;
        for (i = 0; i < ecryptfs_message_buf_len; i++) {
index 742ece2..4e62c3c 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2008 International Business Machines Corp.
@@ -312,6 +312,7 @@ out_unlock_daemon:
 
 /**
  * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
+ * @daemon: eCryptfs daemon object
  * @data: Bytes comprising struct ecryptfs_message
  * @data_size: sizeof(struct ecryptfs_message) + data len
  * @seq: Sequence number for miscdev response packet
index 2f333a4..392e721 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  * This is where eCryptfs coordinates the symmetric encryption and
  * decryption of the file data as it passes between the lower
@@ -22,7 +22,7 @@
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
-/**
+/*
  * ecryptfs_get_locked_page
  *
  * Get one page from cache or lower f/s, return error otherwise.
@@ -41,6 +41,7 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
 /**
  * ecryptfs_writepage
  * @page: Page that is locked before this call is made
+ * @wbc: Write-back control structure
  *
  * Returns zero on success; non-zero otherwise
  *
@@ -78,7 +79,7 @@ static void strip_xattr_flag(char *page_virt,
        }
 }
 
-/**
+/*
  *   Header Extent:
  *     Octets 0-7:        Unencrypted file size (big-endian)
  *     Octets 8-15:       eCryptfs special marker
@@ -229,7 +230,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * Called with lower inode mutex held.
  */
 static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
@@ -368,7 +369,7 @@ out:
        return rc;
 }
 
-/**
+/*
  * ecryptfs_write_inode_size_to_header
  *
  * Writes the lower file size to the first 8 bytes of the header.
index 0438997..60bdcad 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 2007 International Business Machines Corp.
@@ -230,6 +230,8 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
  * ecryptfs_read_lower_page_segment
  * @page_for_ecryptfs: The page into which data for eCryptfs will be
  *                     written
+ * @page_index: Page index in @page_for_ecryptfs from which to start
+ *             writing
  * @offset_in_page: Offset in @page_for_ecryptfs from which to start
  *                  writing
  * @size: The number of bytes to write into @page_for_ecryptfs
index 6b1853f..39116af 100644 (file)
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
  * eCryptfs: Linux filesystem encryption layer
  *
  * Copyright (C) 1997-2003 Erez Zadok
@@ -81,7 +81,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 
 /**
  * ecryptfs_statfs
- * @sb: The ecryptfs super block
+ * @dentry: The ecryptfs dentry
  * @buf: The struct kstatfs to fill in with stats
  *
  * Get the filesystem statistics. Currently, we let this pass right through
@@ -108,7 +108,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 /**
  * ecryptfs_evict_inode
- * @inode - The ecryptfs inode
+ * @inode: The ecryptfs inode
  *
  * Called by iput() when the inode reference count reached zero
  * and the inode is not hashed anywhere.  Used to clear anything
@@ -123,7 +123,7 @@ static void ecryptfs_evict_inode(struct inode *inode)
        iput(ecryptfs_inode_to_lower(inode));
 }
 
-/**
+/*
  * ecryptfs_show_options
  *
  * Prints the mount options for a given superblock.
index 73138ea..1e596e1 100644 (file)
@@ -657,6 +657,12 @@ static void ep_done_scan(struct eventpoll *ep,
         */
        list_splice(txlist, &ep->rdllist);
        __pm_relax(ep->ws);
+
+       if (!list_empty(&ep->rdllist)) {
+               if (waitqueue_active(&ep->wq))
+                       wake_up(&ep->wq);
+       }
+
        write_unlock_irq(&ep->lock);
 }
 
index c6b8bba..1f69b81 100644 (file)
@@ -81,11 +81,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns
 
 struct dentry *ext2_get_parent(struct dentry *child)
 {
-       struct qstr dotdot = QSTR_INIT("..", 2);
        ino_t ino;
        int res;
 
-       res = ext2_inode_by_name(d_inode(child), &dotdot, &ino);
+       res = ext2_inode_by_name(d_inode(child), &dotdot_name, &ino);
        if (res)
                return ERR_PTR(res);
 
index e8100a9..afb9d05 100644 (file)
@@ -1814,11 +1814,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 struct dentry *ext4_get_parent(struct dentry *child)
 {
        __u32 ino;
-       static const struct qstr dotdot = QSTR_INIT("..", 2);
        struct ext4_dir_entry_2 * de;
        struct buffer_head *bh;
 
-       bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+       bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
        if (IS_ERR(bh))
                return ERR_CAST(bh);
        if (!bh)
index 62e638a..7669de7 100644 (file)
@@ -7,6 +7,13 @@ config F2FS_FS
        select CRYPTO_CRC32
        select F2FS_FS_XATTR if FS_ENCRYPTION
        select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
+       select LZ4_COMPRESS if F2FS_FS_LZ4
+       select LZ4_DECOMPRESS if F2FS_FS_LZ4
+       select LZ4HC_COMPRESS if F2FS_FS_LZ4HC
+       select LZO_COMPRESS if F2FS_FS_LZO
+       select LZO_DECOMPRESS if F2FS_FS_LZO
+       select ZSTD_COMPRESS if F2FS_FS_ZSTD
+       select ZSTD_DECOMPRESS if F2FS_FS_ZSTD
        help
          F2FS is based on Log-structured File System (LFS), which supports
          versatile "flash-friendly" features. The design has been focused on
@@ -94,8 +101,6 @@ config F2FS_FS_COMPRESSION
 config F2FS_FS_LZO
        bool "LZO compression support"
        depends on F2FS_FS_COMPRESSION
-       select LZO_COMPRESS
-       select LZO_DECOMPRESS
        default y
        help
          Support LZO compress algorithm, if unsure, say Y.
@@ -103,8 +108,6 @@ config F2FS_FS_LZO
 config F2FS_FS_LZ4
        bool "LZ4 compression support"
        depends on F2FS_FS_COMPRESSION
-       select LZ4_COMPRESS
-       select LZ4_DECOMPRESS
        default y
        help
          Support LZ4 compress algorithm, if unsure, say Y.
@@ -113,7 +116,6 @@ config F2FS_FS_LZ4HC
        bool "LZ4HC compression support"
        depends on F2FS_FS_COMPRESSION
        depends on F2FS_FS_LZ4
-       select LZ4HC_COMPRESS
        default y
        help
          Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
@@ -122,8 +124,6 @@ config F2FS_FS_LZ4HC
 config F2FS_FS_ZSTD
        bool "ZSTD compression support"
        depends on F2FS_FS_COMPRESSION
-       select ZSTD_COMPRESS
-       select ZSTD_DECOMPRESS
        default y
        help
          Support ZSTD compress algorithm, if unsure, say Y.
@@ -132,8 +132,6 @@ config F2FS_FS_LZORLE
        bool "LZO-RLE compression support"
        depends on F2FS_FS_COMPRESSION
        depends on F2FS_FS_LZO
-       select LZO_COMPRESS
-       select LZO_DECOMPRESS
        default y
        help
          Support LZO-RLE compress algorithm, if unsure, say Y.
index 965037a..239ad94 100644 (file)
@@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count)
 static inline int f2fs_acl_count(size_t size)
 {
        ssize_t s;
+
        size -= sizeof(struct f2fs_acl_header);
        s = size - 4 * sizeof(struct f2fs_acl_entry_short);
        if (s < 0) {
index be5415a..f795049 100644 (file)
@@ -719,6 +719,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
                orphan_blk = (struct f2fs_orphan_block *)page_address(page);
                for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
                        nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+
                        err = recover_orphan_inode(sbi, ino);
                        if (err) {
                                f2fs_put_page(page, 1);
@@ -1456,7 +1457,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
                        orphan_blocks);
 
        if (__remain_node_summaries(cpc->reason))
-               ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
+               ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
                                cp_payload_blks + data_sum_blocks +
                                orphan_blocks + NR_CURSEG_NODE_TYPE);
        else
@@ -1818,7 +1819,11 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
        llist_add(&req.llnode, &cprc->issue_list);
        atomic_inc(&cprc->queued_ckpt);
 
-       /* update issue_list before we wake up issue_checkpoint thread */
+       /*
+        * update issue_list before we wake up issue_checkpoint thread,
+        * this smp_mb() pairs with another barrier in ___wait_event(),
+        * see more details in comments of waitqueue_active().
+        */
        smp_mb();
 
        if (waitqueue_active(&cprc->ckpt_wait_queue))
index 77fa342..53b1378 100644 (file)
@@ -76,12 +76,6 @@ bool f2fs_is_compressed_page(struct page *page)
                return false;
        if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
                return false;
-       /*
-        * page->private may be set with pid.
-        * pid_max is enough to check if it is traced.
-        */
-       if (IS_IO_TRACED_PAGE(page))
-               return false;
 
        f2fs_bug_on(F2FS_M_SB(page->mapping),
                *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC);
@@ -896,7 +890,6 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
 
 static bool __cluster_may_compress(struct compress_ctx *cc)
 {
-       struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
        loff_t i_size = i_size_read(cc->inode);
        unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE);
        int i;
@@ -904,12 +897,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
        for (i = 0; i < cc->cluster_size; i++) {
                struct page *page = cc->rpages[i];
 
-               f2fs_bug_on(sbi, !page);
-
-               if (unlikely(f2fs_cp_error(sbi)))
-                       return false;
-               if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
-                       return false;
+               f2fs_bug_on(F2FS_I_SB(cc->inode), !page);
 
                /* beyond EOF */
                if (page->index >= nr_pages)
@@ -1353,6 +1341,7 @@ unlock_continue:
        if (fio.compr_blocks)
                f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false);
        f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true);
+       add_compr_block_stat(inode, cc->nr_cpages);
 
        set_inode_flag(cc->inode, FI_APPEND_WRITE);
        if (cc->cluster_idx == 0)
diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h
deleted file mode 100644 (file)
index e69de29..0000000
index 4e5257c..96f1a35 100644 (file)
@@ -1086,6 +1086,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
 
        for (; count > 0; dn->ofs_in_node++) {
                block_t blkaddr = f2fs_data_blkaddr(dn);
+
                if (blkaddr == NULL_ADDR) {
                        dn->data_blkaddr = NEW_ADDR;
                        __set_data_blkaddr(dn);
@@ -1722,7 +1723,7 @@ static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
        return __get_data_block(inode, iblock, bh_result, create,
                                F2FS_GET_BLOCK_DIO, NULL,
                                f2fs_rw_hint_to_seg_type(inode->i_write_hint),
-                               IS_SWAPFILE(inode) ? false : true);
+                               true);
 }
 
 static int get_data_block_dio(struct inode *inode, sector_t iblock,
@@ -1837,6 +1838,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        int ret = 0;
        bool compr_cluster = false;
        unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+       loff_t maxbytes;
 
        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
                ret = f2fs_precache_extents(inode);
@@ -1850,6 +1852,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
        inode_lock(inode);
 
+       maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+       if (start > maxbytes) {
+               ret = -EFBIG;
+               goto out;
+       }
+
+       if (len > maxbytes || (maxbytes - len) < start)
+               len = maxbytes - start;
+
        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
                ret = f2fs_xattr_fiemap(inode, fieinfo);
                goto out;
@@ -3755,6 +3766,7 @@ int f2fs_migrate_page(struct address_space *mapping,
 
        if (atomic_written) {
                struct inmem_pages *cur;
+
                list_for_each_entry(cur, &fi->inmem_pages, list)
                        if (cur->page == page) {
                                cur->page = newpage;
@@ -3780,11 +3792,64 @@ int f2fs_migrate_page(struct address_space *mapping,
 #endif
 
 #ifdef CONFIG_SWAP
+/*
+ * Walk every mapped extent of @inode and require its start (relative to
+ * main_blkaddr) and its length to be multiples of the section size.  Returns
+ * 0, -ENOENT for a hole, -EINVAL if misaligned, or the f2fs_map_blocks() error.
+ */
+static int f2fs_is_file_aligned(struct inode *inode)
+{
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       block_t main_blkaddr = SM_I(sbi)->main_blkaddr;
+       block_t cur_lblock = 0;
+       block_t last_lblock = bytes_to_blks(inode, i_size_read(inode));
+       block_t pblock;
+       unsigned long nr_pblocks;
+       unsigned int blocks_per_sec = BLKS_PER_SEC(sbi);
+       int ret;
+
+       while (cur_lblock < last_lblock) {
+               struct f2fs_map_blocks map;
+
+               memset(&map, 0, sizeof(map));
+               map.m_lblk = cur_lblock;
+               map.m_len = last_lblock - cur_lblock;
+               map.m_next_pgofs = NULL;
+               map.m_next_extent = NULL;
+               map.m_seg_type = NO_CHECK_TYPE;
+               map.m_may_create = false;
+
+               ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+               if (ret)
+                       return ret;
+
+               /* hole */
+               if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+                       f2fs_err(sbi, "Swapfile has holes");
+                       return -ENOENT;
+               }
+
+               pblock = map.m_pblk;
+               nr_pblocks = map.m_len;
+               /* modulo, not a mask: don't assume blocks_per_sec is 2^n */
+               if ((pblock - main_blkaddr) % blocks_per_sec ||
+                       nr_pblocks % blocks_per_sec) {
+                       f2fs_err(sbi, "Swapfile does not align to section");
+                       return -EINVAL;
+               }
+
+               cur_lblock += nr_pblocks;
+       }
+
+       return 0;
+}
+
 static int check_swap_activate_fast(struct swap_info_struct *sis,
                                struct file *swap_file, sector_t *span)
 {
        struct address_space *mapping = swap_file->f_mapping;
        struct inode *inode = mapping->host;
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        sector_t cur_lblock;
        sector_t last_lblock;
        sector_t pblock;
@@ -3792,8 +3857,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
        sector_t highest_pblock = 0;
        int nr_extents = 0;
        unsigned long nr_pblocks;
-       u64 len;
-       int ret;
+       unsigned int blocks_per_sec = BLKS_PER_SEC(sbi);
+       int ret = 0;
 
        /*
         * Map all the blocks into the extent list.  This code doesn't try
@@ -3801,31 +3866,41 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
         */
        cur_lblock = 0;
        last_lblock = bytes_to_blks(inode, i_size_read(inode));
-       len = i_size_read(inode);
 
-       while (cur_lblock <= last_lblock && cur_lblock < sis->max) {
+       while (cur_lblock < last_lblock && cur_lblock < sis->max) {
                struct f2fs_map_blocks map;
-               pgoff_t next_pgofs;
 
                cond_resched();
 
                memset(&map, 0, sizeof(map));
                map.m_lblk = cur_lblock;
-               map.m_len = bytes_to_blks(inode, len) - cur_lblock;
-               map.m_next_pgofs = &next_pgofs;
+               map.m_len = last_lblock - cur_lblock;
+               map.m_next_pgofs = NULL;
+               map.m_next_extent = NULL;
                map.m_seg_type = NO_CHECK_TYPE;
+               map.m_may_create = false;
 
                ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
                if (ret)
-                       goto err_out;
+                       goto out;
 
                /* hole */
-               if (!(map.m_flags & F2FS_MAP_FLAGS))
-                       goto err_out;
+               if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+                       f2fs_err(sbi, "Swapfile has holes\n");
+                       ret = -ENOENT;
+                       goto out;
+               }
 
                pblock = map.m_pblk;
                nr_pblocks = map.m_len;
 
+               if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) ||
+                               nr_pblocks & (blocks_per_sec - 1)) {
+                       f2fs_err(sbi, "Swapfile does not align to section");
+                       ret = -EINVAL;
+                       goto out;
+               }
+
                if (cur_lblock + nr_pblocks >= sis->max)
                        nr_pblocks = sis->max - cur_lblock;
 
@@ -3854,9 +3929,6 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
        sis->highest_bit = cur_lblock - 1;
 out:
        return ret;
-err_out:
-       pr_err("swapon: swapfile has holes\n");
-       return -EINVAL;
 }
 
 /* Copied from generic_swapfile_activate() to check any holes */
@@ -3865,6 +3937,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
 {
        struct address_space *mapping = swap_file->f_mapping;
        struct inode *inode = mapping->host;
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        unsigned blocks_per_page;
        unsigned long page_no;
        sector_t probe_block;
@@ -3872,11 +3945,15 @@ static int check_swap_activate(struct swap_info_struct *sis,
        sector_t lowest_block = -1;
        sector_t highest_block = 0;
        int nr_extents = 0;
-       int ret;
+       int ret = 0;
 
        if (PAGE_SIZE == F2FS_BLKSIZE)
                return check_swap_activate_fast(sis, swap_file, span);
 
+       ret = f2fs_is_file_aligned(inode);
+       if (ret)
+               goto out;
+
        blocks_per_page = bytes_to_blks(inode, PAGE_SIZE);
 
        /*
@@ -3891,13 +3968,14 @@ static int check_swap_activate(struct swap_info_struct *sis,
                unsigned block_in_page;
                sector_t first_block;
                sector_t block = 0;
-               int      err = 0;
 
                cond_resched();
 
                block = probe_block;
-               err = bmap(inode, &block);
-               if (err || !block)
+               ret = bmap(inode, &block);
+               if (ret)
+                       goto out;
+               if (!block)
                        goto bad_bmap;
                first_block = block;
 
@@ -3913,9 +3991,10 @@ static int check_swap_activate(struct swap_info_struct *sis,
                                        block_in_page++) {
 
                        block = probe_block + block_in_page;
-                       err = bmap(inode, &block);
-
-                       if (err || !block)
+                       ret = bmap(inode, &block);
+                       if (ret)
+                               goto out;
+                       if (!block)
                                goto bad_bmap;
 
                        if (block != first_block + block_in_page) {
@@ -3955,8 +4034,8 @@ reprobe:
 out:
        return ret;
 bad_bmap:
-       pr_err("swapon: swapfile has holes\n");
-       return -EINVAL;
+       f2fs_err(sbi, "Swapfile has holes\n");
+       return -ENOENT;
 }
 
 static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
index 91855d5..c03949a 100644 (file)
@@ -173,6 +173,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->util_invalid = 50 - si->util_free - si->util_valid;
        for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
                struct curseg_info *curseg = CURSEG_I(sbi, i);
+
                si->curseg[i] = curseg->segno;
                si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
                si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
@@ -300,10 +301,12 @@ get_cache:
        si->page_mem = 0;
        if (sbi->node_inode) {
                unsigned npages = NODE_MAPPING(sbi)->nrpages;
+
                si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
        }
        if (sbi->meta_inode) {
                unsigned npages = META_MAPPING(sbi)->nrpages;
+
                si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
        }
 }
index e6270a8..dc7ce79 100644 (file)
@@ -449,9 +449,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 
 struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
 {
-       struct qstr dotdot = QSTR_INIT("..", 2);
-
-       return f2fs_find_entry(dir, &dotdot, p);
+       return f2fs_find_entry(dir, &dotdot_name, p);
 }
 
 ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
@@ -473,6 +471,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
                struct page *page, struct inode *inode)
 {
        enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
+
        lock_page(page);
        f2fs_wait_on_page_writeback(page, type, true, true);
        de->ino = cpu_to_le32(inode->i_ino);
index 11a20dc..0448788 100644 (file)
@@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
 #define F2FS_MOUNT_NORECOVERY          0x04000000
 #define F2FS_MOUNT_ATGC                        0x08000000
 #define F2FS_MOUNT_MERGE_CHECKPOINT    0x10000000
+#define        F2FS_MOUNT_GC_MERGE             0x20000000
 
 #define F2FS_OPTION(sbi)       ((sbi)->mount_opt)
 #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -637,21 +638,26 @@ enum {
 #define FADVISE_MODIFIABLE_BITS        (FADVISE_COLD_BIT | FADVISE_HOT_BIT)
 
 #define file_is_cold(inode)    is_file(inode, FADVISE_COLD_BIT)
-#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
 #define file_set_cold(inode)   set_file(inode, FADVISE_COLD_BIT)
-#define file_lost_pino(inode)  set_file(inode, FADVISE_LOST_PINO_BIT)
 #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_lost_pino(inode)  set_file(inode, FADVISE_LOST_PINO_BIT)
 #define file_got_pino(inode)   clear_file(inode, FADVISE_LOST_PINO_BIT)
+
 #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT)
 #define file_set_encrypt(inode)        set_file(inode, FADVISE_ENCRYPT_BIT)
-#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
+
 #define file_enc_name(inode)   is_file(inode, FADVISE_ENC_NAME_BIT)
 #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+
 #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
 #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
+
 #define file_is_hot(inode)     is_file(inode, FADVISE_HOT_BIT)
 #define file_set_hot(inode)    set_file(inode, FADVISE_HOT_BIT)
 #define file_clear_hot(inode)  clear_file(inode, FADVISE_HOT_BIT)
+
 #define file_is_verity(inode)  is_file(inode, FADVISE_VERITY_BIT)
 #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT)
 
@@ -860,7 +866,7 @@ struct f2fs_nm_info {
        /* NAT cache management */
        struct radix_tree_root nat_root;/* root of the nat entry cache */
        struct radix_tree_root nat_set_root;/* root of the nat set cache */
-       struct rw_semaphore nat_tree_lock;      /* protect nat_tree_lock */
+       struct rw_semaphore nat_tree_lock;      /* protect nat entry tree */
        struct list_head nat_entries;   /* cached nat entry list (clean) */
        spinlock_t nat_list_lock;       /* protect clean nat entry list */
        unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
@@ -1297,14 +1303,6 @@ enum {
 #define IS_DUMMY_WRITTEN_PAGE(page)                    \
                (page_private(page) == DUMMY_WRITTEN_PAGE)
 
-#ifdef CONFIG_F2FS_IO_TRACE
-#define IS_IO_TRACED_PAGE(page)                        \
-               (page_private(page) > 0 &&              \
-                page_private(page) < (unsigned long)PID_MAX_LIMIT)
-#else
-#define IS_IO_TRACED_PAGE(page) (0)
-#endif
-
 /* For compression */
 enum compress_algorithm_type {
        COMPRESS_LZO,
@@ -1623,6 +1621,11 @@ struct f2fs_sb_info {
 #ifdef CONFIG_F2FS_FS_COMPRESSION
        struct kmem_cache *page_array_slab;     /* page array entry */
        unsigned int page_array_slab_size;      /* default page array slab size */
+
+       /* For runtime compression statistics */
+       u64 compr_written_block;
+       u64 compr_saved_block;
+       u32 compr_new_inode;
 #endif
 };
 
@@ -2215,6 +2218,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
 static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
 {
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+       void *tmp_ptr = &ckpt->sit_nat_version_bitmap;
        int offset;
 
        if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) {
@@ -2224,7 +2228,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
                 * if large_nat_bitmap feature is enabled, leave checksum
                 * protection for all nat/sit bitmaps.
                 */
-               return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32);
+               return tmp_ptr + offset + sizeof(__le32);
        }
 
        if (__cp_payload(sbi) > 0) {
@@ -2235,7 +2239,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
        } else {
                offset = (flag == NAT_BITMAP) ?
                        le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
-               return &ckpt->sit_nat_version_bitmap + offset;
+               return tmp_ptr + offset;
        }
 }
 
@@ -3302,7 +3306,6 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname);
 /*
  * node.c
  */
-struct dnode_of_data;
 struct node_info;
 
 int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
@@ -3379,6 +3382,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi);
 int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
 void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno);
 void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
 void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi);
 void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi);
@@ -3386,7 +3390,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi,
                        unsigned int *newseg, bool new_sec, int dir);
 void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
                                        unsigned int start, unsigned int end);
-void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type);
+void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
 void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3550,7 +3554,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi);
 int f2fs_start_gc_thread(struct f2fs_sb_info *sbi);
 void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
 block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force,
                        unsigned int segno);
 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
 int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
@@ -3958,6 +3962,18 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi);
 void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi);
 int __init f2fs_init_compress_cache(void);
 void f2fs_destroy_compress_cache(void);
+/* Runtime compression statistics (compr_* fields of f2fs_sb_info). */
+#define inc_compr_inode_stat(inode)                                    \
+       do {                                                            \
+               F2FS_I_SB(inode)->compr_new_inode++;                    \
+       } while (0)
+#define add_compr_block_stat(inode, blocks)                            \
+       do {                                                            \
+               struct f2fs_sb_info *sbi = F2FS_I_SB(inode);            \
+               int diff = F2FS_I(inode)->i_cluster_size - (blocks);    \
+               sbi->compr_written_block += (blocks);                   \
+               sbi->compr_saved_block += diff;                         \
+       } while (0)
 #else
 static inline bool f2fs_is_compressed_page(struct page *page) { return false; }
 static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -3986,6 +4002,7 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return
 static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
 static inline int __init f2fs_init_compress_cache(void) { return 0; }
 static inline void f2fs_destroy_compress_cache(void) { }
+#define inc_compr_inode_stat(inode)            do { } while (0)
 #endif
 
 static inline void set_compress_context(struct inode *inode)
@@ -4009,6 +4026,7 @@ static inline void set_compress_context(struct inode *inode)
        F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
        set_inode_flag(inode, FI_COMPRESSED_FILE);
        stat_inc_compr_inode(inode);
+       inc_compr_inode_stat(inode);
        f2fs_mark_inode_dirty_sync(inode, true);
 }
 
@@ -4179,8 +4197,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
                if (F2FS_IO_ALIGNED(sbi))
                        return true;
        }
-       if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) &&
-                                       !IS_SWAPFILE(inode))
+       if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
                return true;
 
        return false;
index 8a56acb..44a4650 100644 (file)
@@ -1622,9 +1622,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
        struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
                        .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
                        .m_may_create = true };
-       pgoff_t pg_end;
+       pgoff_t pg_start, pg_end;
        loff_t new_size = i_size_read(inode);
        loff_t off_end;
+       block_t expanded = 0;
        int err;
 
        err = inode_newsize_ok(inode, (len + offset));
@@ -1637,11 +1638,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 
        f2fs_balance_fs(sbi, true);
 
+       pg_start = ((unsigned long long)offset) >> PAGE_SHIFT;
        pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT;
        off_end = (offset + len) & (PAGE_SIZE - 1);
 
-       map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT;
-       map.m_len = pg_end - map.m_lblk;
+       map.m_lblk = pg_start;
+       map.m_len = pg_end - pg_start;
        if (off_end)
                map.m_len++;
 
@@ -1649,19 +1651,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
                return 0;
 
        if (f2fs_is_pinned_file(inode)) {
-               block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
-                                       sbi->log_blocks_per_seg;
-               block_t done = 0;
+               block_t sec_blks = BLKS_PER_SEC(sbi);
+               block_t sec_len = roundup(map.m_len, sec_blks);
 
-               if (map.m_len % sbi->blocks_per_seg)
-                       len += sbi->blocks_per_seg;
-
-               map.m_len = sbi->blocks_per_seg;
+               map.m_len = sec_blks;
 next_alloc:
                if (has_not_enough_free_secs(sbi, 0,
                        GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
                        down_write(&sbi->gc_lock);
-                       err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+                       err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
                        if (err && err != -ENODATA && err != -EAGAIN)
                                goto out_err;
                }
@@ -1669,7 +1667,7 @@ next_alloc:
                down_write(&sbi->pin_sem);
 
                f2fs_lock_op(sbi);
-               f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED);
+               f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
                f2fs_unlock_op(sbi);
 
                map.m_seg_type = CURSEG_COLD_DATA_PINNED;
@@ -1677,24 +1675,25 @@ next_alloc:
 
                up_write(&sbi->pin_sem);
 
-               done += map.m_len;
-               len -= map.m_len;
+               expanded += map.m_len;
+               sec_len -= map.m_len;
                map.m_lblk += map.m_len;
-               if (!err && len)
+               if (!err && sec_len)
                        goto next_alloc;
 
-               map.m_len = done;
+               map.m_len = expanded;
        } else {
                err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+               expanded = map.m_len;
        }
 out_err:
        if (err) {
                pgoff_t last_off;
 
-               if (!map.m_len)
+               if (!expanded)
                        return err;
 
-               last_off = map.m_lblk + map.m_len - 1;
+               last_off = pg_start + expanded - 1;
 
                /* update new size to the failed position */
                new_size = (last_off == pg_end) ? offset + len :
@@ -2434,7 +2433,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
                down_write(&sbi->gc_lock);
        }
 
-       ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
+       ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
 out:
        mnt_drop_write_file(filp);
        return ret;
@@ -2470,7 +2469,8 @@ do_more:
                down_write(&sbi->gc_lock);
        }
 
-       ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start));
+       ret = f2fs_gc(sbi, range->sync, true, false,
+                               GET_SEGNO(sbi, range->start));
        if (ret) {
                if (ret == -EBUSY)
                        ret = -EAGAIN;
@@ -2527,7 +2527,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 {
        struct inode *inode = file_inode(filp);
        struct f2fs_map_blocks map = { .m_next_extent = NULL,
-                                       .m_seg_type = NO_CHECK_TYPE ,
+                                       .m_seg_type = NO_CHECK_TYPE,
                                        .m_may_create = false };
        struct extent_info ei = {0, 0, 0};
        pgoff_t pg_start, pg_end, next_pgofs;
@@ -2923,7 +2923,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
                sm->last_victim[GC_CB] = end_segno + 1;
                sm->last_victim[GC_GREEDY] = end_segno + 1;
                sm->last_victim[ALLOC_NEXT] = end_segno + 1;
-               ret = f2fs_gc(sbi, true, true, start_segno);
+               ret = f2fs_gc(sbi, true, true, true, start_segno);
                if (ret == -EAGAIN)
                        ret = 0;
                else if (ret < 0)
@@ -4311,8 +4311,13 @@ write:
                clear_inode_flag(inode, FI_NO_PREALLOC);
 
                /* if we couldn't write data, we should deallocate blocks. */
-               if (preallocated && i_size_read(inode) < target_size)
+               if (preallocated && i_size_read(inode) < target_size) {
+                       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+                       down_write(&F2FS_I(inode)->i_mmap_sem);
                        f2fs_truncate(inode);
+                       up_write(&F2FS_I(inode)->i_mmap_sem);
+                       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+               }
 
                if (ret > 0)
                        f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
index 39330ad..8d1f17a 100644 (file)
@@ -31,19 +31,24 @@ static int gc_thread_func(void *data)
        struct f2fs_sb_info *sbi = data;
        struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
        wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
+       wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
        unsigned int wait_ms;
 
        wait_ms = gc_th->min_sleep_time;
 
        set_freezable();
        do {
-               bool sync_mode;
+               bool sync_mode, foreground = false;
 
                wait_event_interruptible_timeout(*wq,
                                kthread_should_stop() || freezing(current) ||
+                               waitqueue_active(fggc_wq) ||
                                gc_th->gc_wake,
                                msecs_to_jiffies(wait_ms));
 
+               if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq))
+                       foreground = true;
+
                /* give it a try one time */
                if (gc_th->gc_wake)
                        gc_th->gc_wake = 0;
@@ -90,7 +95,10 @@ static int gc_thread_func(void *data)
                        goto do_gc;
                }
 
-               if (!down_write_trylock(&sbi->gc_lock)) {
+               if (foreground) {
+                       down_write(&sbi->gc_lock);
+                       goto do_gc;
+               } else if (!down_write_trylock(&sbi->gc_lock)) {
                        stat_other_skip_bggc_count(sbi);
                        goto next;
                }
@@ -107,14 +115,22 @@ static int gc_thread_func(void *data)
                else
                        increase_sleep_time(gc_th, &wait_ms);
 do_gc:
-               stat_inc_bggc_count(sbi->stat_info);
+               if (!foreground)
+                       stat_inc_bggc_count(sbi->stat_info);
 
                sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
 
+               /* foreground GC was triggered via f2fs_balance_fs() */
+               if (foreground)
+                       sync_mode = false;
+
                /* if return value is not zero, no victim was selected */
-               if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
+               if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO))
                        wait_ms = gc_th->no_gc_sleep_time;
 
+               if (foreground)
+                       wake_up_all(&gc_th->fggc_wq);
+
                trace_f2fs_background_gc(sbi->sb, wait_ms,
                                prefree_segments(sbi), free_segments(sbi));
 
@@ -144,10 +160,11 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
        gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
        gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
 
-       gc_th->gc_wake= 0;
+       gc_th->gc_wake = 0;
 
        sbi->gc_thread = gc_th;
        init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
+       init_waitqueue_head(&sbi->gc_thread->fggc_wq);
        sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
                        "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
        if (IS_ERR(gc_th->f2fs_gc_task)) {
@@ -162,9 +179,11 @@ out:
 void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
 {
        struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
+
        if (!gc_th)
                return;
        kthread_stop(gc_th->f2fs_gc_task);
+       wake_up_all(&gc_th->fggc_wq);
        kfree(gc_th);
        sbi->gc_thread = NULL;
 }
@@ -392,10 +411,6 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
                if (p->gc_mode == GC_AT &&
                        get_valid_blocks(sbi, segno, true) == 0)
                        return;
-
-               if (p->alloc_mode == AT_SSR &&
-                       get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0)
-                       return;
        }
 
        for (i = 0; i < sbi->segs_per_sec; i++)
@@ -728,11 +743,27 @@ retry:
 
                if (sec_usage_check(sbi, secno))
                        goto next;
+
                /* Don't touch checkpointed data */
-               if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
-                                       get_ckpt_valid_blocks(sbi, segno) &&
-                                       p.alloc_mode == LFS))
-                       goto next;
+               if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+                       if (p.alloc_mode == LFS) {
+                               /*
+                                * LFS is set to find source section during GC.
+                                * The victim should have no checkpointed data.
+                                */
+                               if (get_ckpt_valid_blocks(sbi, segno, true))
+                                       goto next;
+                       } else {
+                               /*
+                                * SSR | AT_SSR are set to find a target segment
+                                * for writes, which may already be full of
+                                * checkpointed and newly written blocks.
+                                */
+                               if (!f2fs_segment_has_free_slot(sbi, segno))
+                                       goto next;
+                       }
+               }
+
                if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
                        goto next;
 
@@ -828,6 +859,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
 static void put_gc_inode(struct gc_inode_list *gc_list)
 {
        struct inode_entry *ie, *next_ie;
+
        list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
                radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
                iput(ie->inode);
@@ -952,9 +984,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
                bidx = node_ofs - 1;
        } else if (node_ofs <= indirect_blks) {
                int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
+
                bidx = node_ofs - 2 - dec;
        } else {
                int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
+
                bidx = node_ofs - 5 - dec;
        }
        return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
@@ -1120,7 +1154,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
        block_t newaddr;
        int err = 0;
        bool lfs_mode = f2fs_lfs_mode(fio.sbi);
-       int type = fio.sbi->am.atgc_enabled ?
+       int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) &&
+                               (fio.sbi->gc_mode != GC_URGENT_HIGH) ?
                                CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;
 
        /* do not read out */
@@ -1354,7 +1389,8 @@ out:
  * the victim data block is ignored.
  */
 static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
-               struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
+               struct gc_inode_list *gc_list, unsigned int segno, int gc_type,
+               bool force_migrate)
 {
        struct super_block *sb = sbi->sb;
        struct f2fs_summary *entry;
@@ -1383,8 +1419,8 @@ next_step:
                 * race condition along with SSR block allocation.
                 */
                if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
-                               get_valid_blocks(sbi, segno, true) ==
-                                                       BLKS_PER_SEC(sbi))
+                       (!force_migrate && get_valid_blocks(sbi, segno, true) ==
+                                                       BLKS_PER_SEC(sbi)))
                        return submitted;
 
                if (check_valid_map(sbi, segno, off) == 0)
@@ -1519,7 +1555,8 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
 
 static int do_garbage_collect(struct f2fs_sb_info *sbi,
                                unsigned int start_segno,
-                               struct gc_inode_list *gc_list, int gc_type)
+                               struct gc_inode_list *gc_list, int gc_type,
+                               bool force_migrate)
 {
        struct page *sum_page;
        struct f2fs_summary_block *sum;
@@ -1606,7 +1643,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
                                                                gc_type);
                else
                        submitted += gc_data_segment(sbi, sum->entries, gc_list,
-                                                       segno, gc_type);
+                                                       segno, gc_type,
+                                                       force_migrate);
 
                stat_inc_seg_count(sbi, type, gc_type);
                migrated++;
@@ -1634,7 +1672,7 @@ skip:
 }
 
 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
-                       bool background, unsigned int segno)
+                       bool background, bool force, unsigned int segno)
 {
        int gc_type = sync ? FG_GC : BG_GC;
        int sec_freed = 0, seg_freed = 0, total_freed = 0;
@@ -1696,7 +1734,7 @@ gc_more:
        if (ret)
                goto stop;
 
-       seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
+       seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force);
        if (gc_type == FG_GC &&
                seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
                sec_freed++;
@@ -1835,7 +1873,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
                        .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
                };
 
-               do_garbage_collect(sbi, segno, &gc_list, FG_GC);
+               do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
                put_gc_inode(&gc_list);
 
                if (!gc_only && get_valid_blocks(sbi, segno, true)) {
@@ -1974,7 +2012,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
 
        /* stop CP to protect MAIN_SEC in free_segment_range */
        f2fs_lock_op(sbi);
+
+       spin_lock(&sbi->stat_lock);
+       if (shrunk_blocks + valid_user_blocks(sbi) +
+               sbi->current_reserved_blocks + sbi->unusable_block_count +
+               F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
+               err = -ENOSPC;
+       spin_unlock(&sbi->stat_lock);
+
+       if (err)
+               goto out_unlock;
+
        err = free_segment_range(sbi, secs, true);
+
+out_unlock:
        f2fs_unlock_op(sbi);
        up_write(&sbi->gc_lock);
        if (err)
index 0c8dae1..3fe145e 100644 (file)
@@ -42,6 +42,12 @@ struct f2fs_gc_kthread {
 
        /* for changing gc mode */
        unsigned int gc_wake;
+
+       /* for GC_MERGE mount option */
+       wait_queue_head_t fggc_wq;              /*
+                                                * caller of f2fs_balance_fs()
+                                                * will wait on this wait queue.
+                                                */
 };
 
 struct gc_inode_list {
index 993caef..92652ca 100644 (file)
@@ -219,7 +219,8 @@ out:
 
        f2fs_put_page(page, 1);
 
-       f2fs_balance_fs(sbi, dn.node_changed);
+       if (!err)
+               f2fs_balance_fs(sbi, dn.node_changed);
 
        return err;
 }
index 349d9cb..b401f08 100644 (file)
@@ -666,6 +666,7 @@ retry:
        node_page = f2fs_get_node_page(sbi, inode->i_ino);
        if (IS_ERR(node_page)) {
                int err = PTR_ERR(node_page);
+
                if (err == -ENOMEM) {
                        cond_resched();
                        goto retry;
@@ -698,7 +699,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 
        /*
         * We need to balance fs here to prevent from producing dirty node pages
-        * during the urgent cleaning time when runing out of free sections.
+        * during the urgent cleaning time when running out of free sections.
         */
        f2fs_update_inode_page(inode);
        if (wbc && wbc->nr_to_write)
index 14bf4f6..a9cd9cf 100644 (file)
@@ -416,9 +416,9 @@ out:
 
 struct dentry *f2fs_get_parent(struct dentry *child)
 {
-       struct qstr dotdot = QSTR_INIT("..", 2);
        struct page *page;
-       unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
+       unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page);
+
        if (!ino) {
                if (IS_ERR(page))
                        return ERR_CAST(page);
@@ -628,6 +628,7 @@ static const char *f2fs_get_link(struct dentry *dentry,
                                 struct delayed_call *done)
 {
        const char *link = page_get_link(dentry, inode, done);
+
        if (!IS_ERR(link) && !*link) {
                /* this is broken symlink case */
                do_delayed_call(done);
@@ -766,6 +767,7 @@ out_fail:
 static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = d_inode(dentry);
+
        if (f2fs_empty_dir(inode))
                return f2fs_unlink(dir, dentry);
        return -ENOTEMPTY;
index 4b0e2e3..e67ce5f 100644 (file)
@@ -43,11 +43,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
 bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
        struct sysinfo val;
        unsigned long avail_ram;
        unsigned long mem_size = 0;
        bool res = false;
 
+       if (!nm_i)
+               return true;
+
        si_meminfo(&val);
 
        /* only uses low memory */
@@ -89,6 +93,10 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
                /* it allows 20% / total_ram for inmemory pages */
                mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
                res = mem_size < (val.totalram / 5);
+       } else if (type == DISCARD_CACHE) {
+               mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
+                               sizeof(struct discard_cmd)) >> PAGE_SHIFT;
+               res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
        } else {
                if (!sbi->sb->s_bdi->wb.dirty_exceeded)
                        return true;
@@ -462,6 +470,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
        /* increment version no as node is removed */
        if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
                unsigned char version = nat_get_version(e);
+
                nat_set_version(e, inc_node_version(version));
        }
 
@@ -1383,7 +1392,7 @@ repeat:
                goto out_err;
        }
 page_hit:
-       if(unlikely(nid != nid_of_node(page))) {
+       if (unlikely(nid != nid_of_node(page))) {
                f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
                          nid, nid_of_node(page), ino_of_node(page),
                          ofs_of_node(page), cpver_of_node(page),
@@ -1775,7 +1784,7 @@ continue_unlock:
 out:
        if (nwritten)
                f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
-       return ret ? -EIO: 0;
+       return ret ? -EIO : 0;
 }
 
 static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
@@ -2117,8 +2126,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi,
                                struct free_nid *i)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
-
        int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+
        if (err)
                return err;
 
@@ -2785,6 +2794,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
                struct f2fs_nat_entry raw_ne;
                nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
 
+               if (f2fs_check_nid_range(sbi, nid))
+                       continue;
+
                raw_ne = nat_in_journal(journal, i);
 
                ne = __lookup_nat_cache(nm_i, nid);
@@ -2980,6 +2992,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        while ((found = __gang_lookup_nat_set(nm_i,
                                        set_idx, SETVEC_SIZE, setvec))) {
                unsigned idx;
+
                set_idx = setvec[found - 1]->set + 1;
                for (idx = 0; idx < found; idx++)
                        __adjust_nat_entry_set(setvec[idx], &sets,
index f84541b..7a45c0f 100644 (file)
@@ -147,6 +147,7 @@ enum mem_type {
        INO_ENTRIES,    /* indicates inode entries */
        EXTENT_CACHE,   /* indicates extent cache */
        INMEM_PAGES,    /* indicates inmemory pages */
+       DISCARD_CACHE,  /* indicates memory of cached discard cmds */
        BASE_CHECK,     /* check kernel status */
 };
 
index da75d5d..422146c 100644 (file)
@@ -458,6 +458,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
        /* Get the previous summary */
        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
                struct curseg_info *curseg = CURSEG_I(sbi, i);
+
                if (curseg->segno == segno) {
                        sum = curseg->sum_blk->entries[blkoff];
                        goto got_it;
@@ -875,5 +876,5 @@ out:
 #endif
        sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
 
-       return ret ? ret: err;
+       return ret ? ret : err;
 }
index c286656..c605415 100644 (file)
@@ -186,7 +186,10 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
 {
        struct inmem_pages *new;
 
-       f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
+       if (PagePrivate(page))
+               set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
+       else
+               f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
 
        new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
 
@@ -324,23 +327,27 @@ void f2fs_drop_inmem_pages(struct inode *inode)
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct f2fs_inode_info *fi = F2FS_I(inode);
 
-       while (!list_empty(&fi->inmem_pages)) {
+       do {
                mutex_lock(&fi->inmem_lock);
+               if (list_empty(&fi->inmem_pages)) {
+                       fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
+
+                       spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+                       if (!list_empty(&fi->inmem_ilist))
+                               list_del_init(&fi->inmem_ilist);
+                       if (f2fs_is_atomic_file(inode)) {
+                               clear_inode_flag(inode, FI_ATOMIC_FILE);
+                               sbi->atomic_files--;
+                       }
+                       spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+
+                       mutex_unlock(&fi->inmem_lock);
+                       break;
+               }
                __revoke_inmem_pages(inode, &fi->inmem_pages,
                                                true, false, true);
                mutex_unlock(&fi->inmem_lock);
-       }
-
-       fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
-
-       spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
-       if (!list_empty(&fi->inmem_ilist))
-               list_del_init(&fi->inmem_ilist);
-       if (f2fs_is_atomic_file(inode)) {
-               clear_inode_flag(inode, FI_ATOMIC_FILE);
-               sbi->atomic_files--;
-       }
-       spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+       } while (1);
 }
 
 void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
@@ -503,8 +510,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
         * dir/node pages without enough free segments.
         */
        if (has_not_enough_free_secs(sbi, 0, 0)) {
-               down_write(&sbi->gc_lock);
-               f2fs_gc(sbi, false, false, NULL_SEGNO);
+               if (test_opt(sbi, GC_MERGE) && sbi->gc_thread &&
+                                       sbi->gc_thread->f2fs_gc_task) {
+                       DEFINE_WAIT(wait);
+
+                       prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait,
+                                               TASK_UNINTERRUPTIBLE);
+                       wake_up(&sbi->gc_thread->gc_wait_queue_head);
+                       io_schedule();
+                       finish_wait(&sbi->gc_thread->fggc_wq, &wait);
+               } else {
+                       down_write(&sbi->gc_lock);
+                       f2fs_gc(sbi, false, false, false, NULL_SEGNO);
+               }
        }
 }
 
@@ -653,7 +671,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
 
        llist_add(&cmd.llnode, &fcc->issue_list);
 
-       /* update issue_list before we wake up issue_flush thread */
+       /*
+        * update issue_list before we wake up issue_flush thread; this
+        * smp_mb() pairs with another barrier in ___wait_event(). See
+        * the comments on waitqueue_active() for more details.
+        */
        smp_mb();
 
        if (waitqueue_active(&fcc->flush_wait_queue))
@@ -861,7 +883,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
        mutex_lock(&dirty_i->seglist_lock);
 
        valid_blocks = get_valid_blocks(sbi, segno, false);
-       ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno);
+       ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false);
 
        if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
                ckpt_valid_blocks == usable_blocks)) {
@@ -946,7 +968,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi)
        for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
                if (get_valid_blocks(sbi, segno, false))
                        continue;
-               if (get_ckpt_valid_blocks(sbi, segno))
+               if (get_ckpt_valid_blocks(sbi, segno, false))
                        continue;
                mutex_unlock(&dirty_i->seglist_lock);
                return segno;
@@ -1095,6 +1117,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
                                struct discard_policy *dpolicy,
                                int discard_type, unsigned int granularity)
 {
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+
        /* common policy */
        dpolicy->type = discard_type;
        dpolicy->sync = true;
@@ -1114,7 +1138,9 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
                dpolicy->ordered = true;
                if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
                        dpolicy->granularity = 1;
-                       dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+                       if (atomic_read(&dcc->discard_cmd_cnt))
+                               dpolicy->max_interval =
+                                       DEF_MIN_DISCARD_ISSUE_TIME;
                }
        } else if (discard_type == DPOLICY_FORCE) {
                dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1730,8 +1756,15 @@ static int issue_discard_thread(void *data)
        set_freezable();
 
        do {
-               __init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
-                                       dcc->discard_granularity);
+               if (sbi->gc_mode == GC_URGENT_HIGH ||
+                       !f2fs_available_free_memory(sbi, DISCARD_CACHE))
+                       __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
+               else
+                       __init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
+                                               dcc->discard_granularity);
+
+               if (!atomic_read(&dcc->discard_cmd_cnt))
+                      wait_ms = dpolicy.max_interval;
 
                wait_event_interruptible_timeout(*q,
                                kthread_should_stop() || freezing(current) ||
@@ -1755,9 +1788,8 @@ static int issue_discard_thread(void *data)
                        wait_ms = dpolicy.max_interval;
                        continue;
                }
-
-               if (sbi->gc_mode == GC_URGENT_HIGH)
-                       __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
+               if (!atomic_read(&dcc->discard_cmd_cnt))
+                       continue;
 
                sb_start_intwrite(sbi->sb);
 
@@ -1765,7 +1797,7 @@ static int issue_discard_thread(void *data)
                if (issued > 0) {
                        __wait_all_discard_cmd(sbi, &dpolicy);
                        wait_ms = dpolicy.min_interval;
-               } else if (issued == -1){
+               } else if (issued == -1) {
                        wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME);
                        if (!wait_ms)
                                wait_ms = dpolicy.mid_interval;
@@ -2142,6 +2174,7 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
                                        unsigned int segno, int modified)
 {
        struct seg_entry *se = get_seg_entry(sbi, segno);
+
        se->type = type;
        if (modified)
                __mark_sit_entry_dirty(sbi, segno);
@@ -2333,6 +2366,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
 {
        struct curseg_info *curseg = CURSEG_I(sbi, type);
        void *addr = curseg->sum_blk;
+
        addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
        memcpy(addr, sum, sizeof(struct f2fs_summary));
 }
@@ -2604,22 +2638,20 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
        curseg->alloc_type = LFS;
 }
 
-static void __next_free_blkoff(struct f2fs_sb_info *sbi,
-                       struct curseg_info *seg, block_t start)
+static int __next_free_blkoff(struct f2fs_sb_info *sbi,
+                                       int segno, block_t start)
 {
-       struct seg_entry *se = get_seg_entry(sbi, seg->segno);
+       struct seg_entry *se = get_seg_entry(sbi, segno);
        int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
        unsigned long *target_map = SIT_I(sbi)->tmp_map;
        unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
        unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
-       int i, pos;
+       int i;
 
        for (i = 0; i < entries; i++)
                target_map[i] = ckpt_map[i] | cur_map[i];
 
-       pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
-
-       seg->next_blkoff = pos;
+       return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
 }
 
 /*
@@ -2631,11 +2663,18 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
                                struct curseg_info *seg)
 {
        if (seg->alloc_type == SSR)
-               __next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+               seg->next_blkoff =
+                       __next_free_blkoff(sbi, seg->segno,
+                                               seg->next_blkoff + 1);
        else
                seg->next_blkoff++;
 }
 
+bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
+{
+       return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
+}
+
 /*
  * This function always allocates a used segment(from dirty seglist) by SSR
  * manner, so it should recover the existing segment information of valid blocks
@@ -2661,7 +2700,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
 
        reset_curseg(sbi, type, 1);
        curseg->alloc_type = SSR;
-       __next_free_blkoff(sbi, curseg, 0);
+       curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0);
 
        sum_page = f2fs_get_sum_page(sbi, new_segno);
        if (IS_ERR(sum_page)) {
@@ -2893,7 +2932,8 @@ unlock:
        up_read(&SM_I(sbi)->curseg_lock);
 }
 
-static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type)
+static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
+                                               bool new_sec, bool force)
 {
        struct curseg_info *curseg = CURSEG_I(sbi, type);
        unsigned int old_segno;
@@ -2901,32 +2941,43 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type)
        if (!curseg->inited)
                goto alloc;
 
-       if (!curseg->next_blkoff &&
-               !get_valid_blocks(sbi, curseg->segno, false) &&
-               !get_ckpt_valid_blocks(sbi, curseg->segno))
-               return;
+       if (force || curseg->next_blkoff ||
+               get_valid_blocks(sbi, curseg->segno, new_sec))
+               goto alloc;
 
+       if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
+               return;
 alloc:
        old_segno = curseg->segno;
        SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
        locate_dirty_segment(sbi, old_segno);
 }
 
-void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type)
+static void __allocate_new_section(struct f2fs_sb_info *sbi,
+                                               int type, bool force)
+{
+       __allocate_new_segment(sbi, type, true, force);
+}
+
+void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
 {
+       down_read(&SM_I(sbi)->curseg_lock);
        down_write(&SIT_I(sbi)->sentry_lock);
-       __allocate_new_segment(sbi, type);
+       __allocate_new_section(sbi, type, force);
        up_write(&SIT_I(sbi)->sentry_lock);
+       up_read(&SM_I(sbi)->curseg_lock);
 }
 
 void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
 {
        int i;
 
+       down_read(&SM_I(sbi)->curseg_lock);
        down_write(&SIT_I(sbi)->sentry_lock);
        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
-               __allocate_new_segment(sbi, i);
+               __allocate_new_segment(sbi, i, false, false);
        up_write(&SIT_I(sbi)->sentry_lock);
+       up_read(&SM_I(sbi)->curseg_lock);
 }
 
 static const struct segment_allocation default_salloc_ops = {
@@ -3239,7 +3290,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
                struct inode *inode = fio->page->mapping->host;
 
                if (is_cold_data(fio->page)) {
-                       if (fio->sbi->am.atgc_enabled)
+                       if (fio->sbi->am.atgc_enabled &&
+                               (fio->io_type == FS_DATA_IO) &&
+                               (fio->sbi->gc_mode != GC_URGENT_HIGH))
                                return CURSEG_ALL_DATA_ATGC;
                        else
                                return CURSEG_COLD_DATA;
@@ -3365,12 +3418,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
                f2fs_inode_chksum_set(sbi, page);
        }
 
-       if (F2FS_IO_ALIGNED(sbi))
-               fio->retry = false;
-
        if (fio) {
                struct f2fs_bio_info *io;
 
+               if (F2FS_IO_ALIGNED(sbi))
+                       fio->retry = false;
+
                INIT_LIST_HEAD(&fio->list);
                fio->in_list = true;
                io = sbi->write_io[fio->type] + fio->temp;
@@ -3499,7 +3552,13 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
                set_sbi_flag(sbi, SBI_NEED_FSCK);
                f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
                          __func__, segno);
-               return -EFSCORRUPTED;
+               err = -EFSCORRUPTED;
+               goto drop_bio;
+       }
+
+       if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) {
+               err = -EIO;
+               goto drop_bio;
        }
 
        stat_inc_inplace_blocks(fio->sbi);
@@ -3513,6 +3572,15 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
                f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
        }
 
+       return err;
+drop_bio:
+       if (fio->bio) {
+               struct bio *bio = *(fio->bio);
+
+               bio->bi_status = BLK_STS_IOERR;
+               bio_endio(bio);
+               fio->bio = NULL;
+       }
        return err;
 }
 
@@ -3539,6 +3607,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
        struct seg_entry *se;
        int type;
        unsigned short old_blkoff;
+       unsigned char old_alloc_type;
 
        segno = GET_SEGNO(sbi, new_blkaddr);
        se = get_seg_entry(sbi, segno);
@@ -3572,6 +3641,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 
        old_cursegno = curseg->segno;
        old_blkoff = curseg->next_blkoff;
+       old_alloc_type = curseg->alloc_type;
 
        /* change the current segment */
        if (segno != curseg->segno) {
@@ -3606,6 +3676,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
                        change_curseg(sbi, type, true);
                }
                curseg->next_blkoff = old_blkoff;
+               curseg->alloc_type = old_alloc_type;
        }
 
        up_write(&sit_i->sentry_lock);
@@ -3717,6 +3788,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
 
                for (j = 0; j < blk_off; j++) {
                        struct f2fs_summary *s;
+
                        s = (struct f2fs_summary *)(kaddr + offset);
                        seg_i->sum_blk->entries[j] = *s;
                        offset += SUMMARY_SIZE;
@@ -3779,6 +3851,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
                if (__exist_node_summaries(sbi)) {
                        struct f2fs_summary *ns = &sum->entries[0];
                        int i;
+
                        for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
                                ns->version = 0;
                                ns->ofs_in_node = 0;
@@ -3880,6 +3953,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
        /* Step 3: write summary entries */
        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
                unsigned short blkoff;
+
                seg_i = CURSEG_I(sbi, i);
                if (sbi->ckpt->alloc_type[i] == SSR)
                        blkoff = sbi->blocks_per_seg;
@@ -3916,6 +3990,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
                                        block_t blkaddr, int type)
 {
        int i, end;
+
        if (IS_DATASEG(type))
                end = type + NR_CURSEG_DATA_TYPE;
        else
@@ -4499,6 +4574,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
        /* set use the current segments */
        for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
                struct curseg_info *curseg_t = CURSEG_I(sbi, type);
+
                __set_test_and_inuse(sbi, curseg_t->segno);
        }
 }
@@ -4731,7 +4807,8 @@ static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi,
 }
 
 static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
-                             void *data) {
+                             void *data)
+{
        memcpy(data, zone, sizeof(struct blk_zone));
        return 0;
 }
@@ -4783,7 +4860,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
 
        f2fs_notice(sbi, "Assign new section to curseg[%d]: "
                    "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
-       allocate_segment_by_default(sbi, type, true);
+
+       f2fs_allocate_new_section(sbi, type, true);
 
        /* check consistency of the zone curseg pointed to */
        if (check_zone_write_pointer(sbi, zbd, &zone))
@@ -4847,8 +4925,10 @@ struct check_zone_write_pointer_args {
 };
 
 static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx,
-                                     void *data) {
+                                     void *data)
+{
        struct check_zone_write_pointer_args *args;
+
        args = (struct check_zone_write_pointer_args *)data;
 
        return check_zone_write_pointer(args->sbi, args->fdev, zone);
@@ -5127,6 +5207,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
 static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
        kvfree(dirty_i->victim_secmap);
 }
 
@@ -5171,6 +5252,7 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
 static void destroy_free_segmap(struct f2fs_sb_info *sbi)
 {
        struct free_segmap_info *free_i = SM_I(sbi)->free_info;
+
        if (!free_i)
                return;
        SM_I(sbi)->free_info = NULL;
index e9a7a63..050230c 100644 (file)
@@ -172,12 +172,10 @@ enum {
 /*
  * BG_GC means the background cleaning job.
  * FG_GC means the on-demand cleaning job.
- * FORCE_FG_GC means on-demand cleaning job in background.
  */
 enum {
        BG_GC = 0,
        FG_GC,
-       FORCE_FG_GC,
 };
 
 /* for a function parameter to select a victim segment */
@@ -361,8 +359,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
 }
 
 static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
-                               unsigned int segno)
+                               unsigned int segno, bool use_section)
 {
+       if (use_section && __is_large_section(sbi)) {
+               unsigned int start_segno = START_SEGNO(segno);
+               unsigned int blocks = 0;
+               int i;
+
+               for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
+                       struct seg_entry *se = get_seg_entry(sbi, start_segno);
+
+                       blocks += se->ckpt_valid_blocks;
+               }
+               return blocks;
+       }
        return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
 }
 
index 82592b1..7d325bf 100644 (file)
@@ -151,6 +151,8 @@ enum {
        Opt_compress_chksum,
        Opt_compress_mode,
        Opt_atgc,
+       Opt_gc_merge,
+       Opt_nogc_merge,
        Opt_err,
 };
 
@@ -223,6 +225,8 @@ static match_table_t f2fs_tokens = {
        {Opt_compress_chksum, "compress_chksum"},
        {Opt_compress_mode, "compress_mode=%s"},
        {Opt_atgc, "atgc"},
+       {Opt_gc_merge, "gc_merge"},
+       {Opt_nogc_merge, "nogc_merge"},
        {Opt_err, NULL},
 };
 
@@ -555,6 +559,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 
        while ((p = strsep(&options, ",")) != NULL) {
                int token;
+
                if (!*p)
                        continue;
                /*
@@ -1073,6 +1078,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
                case Opt_atgc:
                        set_opt(sbi, ATGC);
                        break;
+               case Opt_gc_merge:
+                       set_opt(sbi, GC_MERGE);
+                       break;
+               case Opt_nogc_merge:
+                       clear_opt(sbi, GC_MERGE);
+                       break;
                default:
                        f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
                                 p);
@@ -1616,6 +1627,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq,
 #endif
 }
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
 static inline void f2fs_show_compress_options(struct seq_file *seq,
                                                        struct super_block *sb)
 {
@@ -1661,6 +1673,7 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
        else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER)
                seq_printf(seq, ",compress_mode=%s", "user");
 }
+#endif
 
 static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 {
@@ -1673,6 +1686,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
        else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF)
                seq_printf(seq, ",background_gc=%s", "off");
 
+       if (test_opt(sbi, GC_MERGE))
+               seq_puts(seq, ",gc_merge");
+
        if (test_opt(sbi, DISABLE_ROLL_FORWARD))
                seq_puts(seq, ",disable_roll_forward");
        if (test_opt(sbi, NORECOVERY))
@@ -1824,6 +1840,7 @@ static void default_options(struct f2fs_sb_info *sbi)
        set_opt(sbi, EXTENT_CACHE);
        set_opt(sbi, NOHEAP);
        clear_opt(sbi, DISABLE_CHECKPOINT);
+       set_opt(sbi, MERGE_CHECKPOINT);
        F2FS_OPTION(sbi).unusable_cap = 0;
        sbi->sb->s_flags |= SB_LAZYTIME;
        set_opt(sbi, FLUSH_MERGE);
@@ -1865,7 +1882,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
 
        while (!f2fs_time_over(sbi, DISABLE_TIME)) {
                down_write(&sbi->gc_lock);
-               err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+               err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
                if (err == -ENODATA) {
                        err = 0;
                        break;
@@ -1876,7 +1893,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
 
        ret = sync_filesystem(sbi->sb);
        if (ret || err) {
-               err = ret ? ret: err;
+               err = ret ? ret : err;
                goto restore_flag;
        }
 
@@ -1925,8 +1942,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
        struct f2fs_mount_info org_mount_opt;
        unsigned long old_sb_flags;
        int err;
-       bool need_restart_gc = false;
-       bool need_stop_gc = false;
+       bool need_restart_gc = false, need_stop_gc = false;
+       bool need_restart_ckpt = false, need_stop_ckpt = false;
+       bool need_restart_flush = false, need_stop_flush = false;
        bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
        bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
        bool no_io_align = !F2FS_IO_ALIGNED(sbi);
@@ -2035,7 +2053,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
         * option. Also sync the filesystem.
         */
        if ((*flags & SB_RDONLY) ||
-                       F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) {
+                       (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF &&
+                       !test_opt(sbi, GC_MERGE))) {
                if (sbi->gc_thread) {
                        f2fs_stop_gc_thread(sbi);
                        need_restart_gc = true;
@@ -2057,18 +2076,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
                clear_sbi_flag(sbi, SBI_IS_CLOSE);
        }
 
-       if (checkpoint_changed) {
-               if (test_opt(sbi, DISABLE_CHECKPOINT)) {
-                       err = f2fs_disable_checkpoint(sbi);
-                       if (err)
-                               goto restore_gc;
-               } else {
-                       f2fs_enable_checkpoint(sbi);
-               }
-       }
-
-       if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
-                       test_opt(sbi, MERGE_CHECKPOINT)) {
+       if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+                       !test_opt(sbi, MERGE_CHECKPOINT)) {
+               f2fs_stop_ckpt_thread(sbi);
+               need_restart_ckpt = true;
+       } else {
                err = f2fs_start_ckpt_thread(sbi);
                if (err) {
                        f2fs_err(sbi,
@@ -2076,8 +2088,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
                            err);
                        goto restore_gc;
                }
-       } else {
-               f2fs_stop_ckpt_thread(sbi);
+               need_stop_ckpt = true;
        }
 
        /*
@@ -2087,11 +2098,24 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
        if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
                clear_opt(sbi, FLUSH_MERGE);
                f2fs_destroy_flush_cmd_control(sbi, false);
+               need_restart_flush = true;
        } else {
                err = f2fs_create_flush_cmd_control(sbi);
                if (err)
-                       goto restore_gc;
+                       goto restore_ckpt;
+               need_stop_flush = true;
        }
+
+       if (checkpoint_changed) {
+               if (test_opt(sbi, DISABLE_CHECKPOINT)) {
+                       err = f2fs_disable_checkpoint(sbi);
+                       if (err)
+                               goto restore_flush;
+               } else {
+                       f2fs_enable_checkpoint(sbi);
+               }
+       }
+
 skip:
 #ifdef CONFIG_QUOTA
        /* Release old quota file names */
@@ -2106,6 +2130,21 @@ skip:
        adjust_unusable_cap_perc(sbi);
        *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
        return 0;
+restore_flush:
+       if (need_restart_flush) {
+               if (f2fs_create_flush_cmd_control(sbi))
+                       f2fs_warn(sbi, "background flush thread has stopped");
+       } else if (need_stop_flush) {
+               clear_opt(sbi, FLUSH_MERGE);
+               f2fs_destroy_flush_cmd_control(sbi, false);
+       }
+restore_ckpt:
+       if (need_restart_ckpt) {
+               if (f2fs_start_ckpt_thread(sbi))
+                       f2fs_warn(sbi, "background ckpt thread has stopped");
+       } else if (need_stop_ckpt) {
+               f2fs_stop_ckpt_thread(sbi);
+       }
 restore_gc:
        if (need_restart_gc) {
                if (f2fs_start_gc_thread(sbi))
@@ -3719,7 +3758,7 @@ try_onemore:
        sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
 
        for (i = 0; i < NR_PAGE_TYPE; i++) {
-               int n = (i == META) ? 1: NR_TEMP_TYPE;
+               int n = (i == META) ? 1 : NR_TEMP_TYPE;
                int j;
 
                sbi->write_io[i] =
@@ -3833,7 +3872,7 @@ try_onemore:
 
        /* setup checkpoint request control and start checkpoint issue thread */
        f2fs_init_ckpt_req_control(sbi);
-       if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+       if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) &&
                        test_opt(sbi, MERGE_CHECKPOINT)) {
                err = f2fs_start_ckpt_thread(sbi);
                if (err) {
@@ -3929,10 +3968,18 @@ try_onemore:
                 * previous checkpoint was not done by clean system shutdown.
                 */
                if (f2fs_hw_is_readonly(sbi)) {
-                       if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))
-                               f2fs_err(sbi, "Need to recover fsync data, but write access unavailable");
-                       else
-                               f2fs_info(sbi, "write access unavailable, skipping recovery");
+                       if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+                               err = f2fs_recover_fsync_data(sbi, true);
+                               if (err > 0) {
+                                       err = -EROFS;
+                                       f2fs_err(sbi, "Need to recover fsync data, but "
+                                               "write access unavailable, please try "
+                                               "mount w/ disable_roll_forward or norecovery");
+                               }
+                               if (err < 0)
+                                       goto free_meta;
+                       }
+                       f2fs_info(sbi, "write access unavailable, skipping recovery");
                        goto reset_checkpoint;
                }
 
@@ -3989,7 +4036,8 @@ reset_checkpoint:
         * If filesystem is not mounted as read-only then
         * do start the gc_thread.
         */
-       if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) {
+       if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF ||
+               test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) {
                /* After POR, we can run background GC thread.*/
                err = f2fs_start_gc_thread(sbi);
                if (err)
index e38a7f6..39b522e 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/unicode.h>
 #include <linux/ioprio.h>
+#include <linux/sysfs.h>
 
 #include "f2fs.h"
 #include "segment.h"
@@ -91,6 +92,13 @@ static ssize_t free_segments_show(struct f2fs_attr *a,
                        (unsigned long long)(free_segments(sbi)));
 }
 
+static ssize_t ovp_segments_show(struct f2fs_attr *a,
+               struct f2fs_sb_info *sbi, char *buf)
+{
+       return sprintf(buf, "%llu\n",
+                       (unsigned long long)(overprovision_segments(sbi)));
+}
+
 static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
                struct f2fs_sb_info *sbi, char *buf)
 {
@@ -282,6 +290,17 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
                return len;
        }
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+       if (!strcmp(a->attr.name, "compr_written_block"))
+               return sysfs_emit(buf, "%llu\n", sbi->compr_written_block);
+
+       if (!strcmp(a->attr.name, "compr_saved_block"))
+               return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block);
+
+       if (!strcmp(a->attr.name, "compr_new_inode"))
+               return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
+#endif
+
        ui = (unsigned int *)(ptr + a->offset);
 
        return sprintf(buf, "%u\n", *ui);
@@ -458,6 +477,24 @@ out:
                return count;
        }
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+       if (!strcmp(a->attr.name, "compr_written_block") ||
+               !strcmp(a->attr.name, "compr_saved_block")) {
+               if (t != 0)
+                       return -EINVAL;
+               sbi->compr_written_block = 0;
+               sbi->compr_saved_block = 0;
+               return count;
+       }
+
+       if (!strcmp(a->attr.name, "compr_new_inode")) {
+               if (t != 0)
+                       return -EINVAL;
+               sbi->compr_new_inode = 0;
+               return count;
+       }
+#endif
+
        *ui = (unsigned int)t;
 
        return count;
@@ -629,6 +666,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
 F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
 F2FS_GENERAL_RO_ATTR(dirty_segments);
 F2FS_GENERAL_RO_ATTR(free_segments);
+F2FS_GENERAL_RO_ATTR(ovp_segments);
 F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
 F2FS_GENERAL_RO_ATTR(features);
 F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
@@ -668,6 +706,9 @@ F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
 F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode);
 #endif
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
@@ -715,6 +756,7 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(ckpt_thread_ioprio),
        ATTR_LIST(dirty_segments),
        ATTR_LIST(free_segments),
+       ATTR_LIST(ovp_segments),
        ATTR_LIST(unusable),
        ATTR_LIST(lifetime_write_kbytes),
        ATTR_LIST(features),
@@ -730,6 +772,11 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(moved_blocks_foreground),
        ATTR_LIST(moved_blocks_background),
        ATTR_LIST(avg_vblocks),
+#endif
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+       ATTR_LIST(compr_written_block),
+       ATTR_LIST(compr_saved_block),
+       ATTR_LIST(compr_new_inode),
 #endif
        NULL,
 };
index a7beff2..03549b5 100644 (file)
@@ -152,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc,
                                  size_t desc_size, u64 merkle_tree_size)
 {
        struct inode *inode = file_inode(filp);
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size;
        struct fsverity_descriptor_location dloc = {
                .version = cpu_to_le32(F2FS_VERIFY_VER),
                .size = cpu_to_le32(desc_size),
                .pos = cpu_to_le64(desc_pos),
        };
-       int err = 0;
+       int err = 0, err2 = 0;
 
-       if (desc != NULL) {
-               /* Succeeded; write the verity descriptor. */
-               err = pagecache_write(inode, desc, desc_size, desc_pos);
+       /*
+        * If an error already occurred (which fs/verity/ signals by passing
+        * desc == NULL), then only clean-up is needed.
+        */
+       if (desc == NULL)
+               goto cleanup;
 
-               /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */
-               if (!err)
-                       err = filemap_write_and_wait(inode->i_mapping);
-       }
+       /* Append the verity descriptor. */
+       err = pagecache_write(inode, desc, desc_size, desc_pos);
+       if (err)
+               goto cleanup;
+
+       /*
+        * Write all pages (both data and verity metadata).  Note that this must
+        * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond
+        * i_size won't be written properly.  For crash consistency, this also
+        * must happen before the verity inode flag gets persisted.
+        */
+       err = filemap_write_and_wait(inode->i_mapping);
+       if (err)
+               goto cleanup;
+
+       /* Set the verity xattr. */
+       err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
+                           F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
+                           NULL, XATTR_CREATE);
+       if (err)
+               goto cleanup;
 
-       /* If we failed, truncate anything we wrote past i_size. */
-       if (desc == NULL || err)
-               f2fs_truncate(inode);
+       /* Finally, set the verity inode flag. */
+       file_set_verity(inode);
+       f2fs_set_inode_flags(inode);
+       f2fs_mark_inode_dirty_sync(inode, true);
 
        clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+       return 0;
 
-       if (desc != NULL && !err) {
-               err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY,
-                                   F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc),
-                                   NULL, XATTR_CREATE);
-               if (!err) {
-                       file_set_verity(inode);
-                       f2fs_set_inode_flags(inode);
-                       f2fs_mark_inode_dirty_sync(inode, true);
-               }
+cleanup:
+       /*
+        * Verity failed to be enabled, so clean up by truncating any verity
+        * metadata that was written beyond i_size (both from cache and from
+        * disk) and clearing FI_VERITY_IN_PROGRESS.
+        *
+        * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection
+        * from re-instantiating cached pages we are truncating (since unlike
+        * normal file accesses, garbage collection isn't limited by i_size).
+        */
+       down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+       truncate_inode_pages(inode->i_mapping, inode->i_size);
+       err2 = f2fs_truncate(inode);
+       if (err2) {
+               f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)",
+                        err2);
+               set_sbi_flag(sbi, SBI_NEED_FSCK);
        }
-       return err;
+       up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+       clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
+       return err ?: err2;
 }
 
 static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
index 490f843..c8f34de 100644 (file)
@@ -488,6 +488,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
                f2fs_wait_on_page_writeback(xpage, NODE, true, true);
        } else {
                struct dnode_of_data dn;
+
                set_new_dnode(&dn, inode, NULL, NULL, new_nid);
                xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
                if (IS_ERR(xpage)) {
index f7e3304..860e884 100644 (file)
@@ -771,7 +771,7 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
        /*
         * FAT data is organized as clusters, trim at the granulary of cluster.
         *
-        * fstrim_range is in byte, convert vaules to cluster index.
+        * fstrim_range is in byte, convert values to cluster index.
         * Treat sectors before data region as all used, not to trim them.
         */
        ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT);
index f633348..86dc995 100644 (file)
--- a/fs/file.c
+++ b/fs/file.c
@@ -1081,8 +1081,6 @@ out_unlock:
 
 /**
  * __receive_fd() - Install received file into file descriptor table
- *
- * @fd: fd to install into (if negative, a new fd will be allocated)
  * @file: struct file that was received from another process
  * @ufd: __user pointer to write new fd number to
  * @o_flags: the O_* flags to apply to the new fd entry
@@ -1096,7 +1094,7 @@ out_unlock:
  *
  * Returns newly install fd or -ve on error.
  */
-int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
+int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 {
        int new_fd;
        int error;
@@ -1105,32 +1103,33 @@ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flag
        if (error)
                return error;
 
-       if (fd < 0) {
-               new_fd = get_unused_fd_flags(o_flags);
-               if (new_fd < 0)
-                       return new_fd;
-       } else {
-               new_fd = fd;
-       }
+       new_fd = get_unused_fd_flags(o_flags);
+       if (new_fd < 0)
+               return new_fd;
 
        if (ufd) {
                error = put_user(new_fd, ufd);
                if (error) {
-                       if (fd < 0)
-                               put_unused_fd(new_fd);
+                       put_unused_fd(new_fd);
                        return error;
                }
        }
 
-       if (fd < 0) {
-               fd_install(new_fd, get_file(file));
-       } else {
-               error = replace_fd(new_fd, file, o_flags);
-               if (error)
-                       return error;
-       }
+       fd_install(new_fd, get_file(file));
+       __receive_sock(file);
+       return new_fd;
+}
 
-       /* Bump the sock usage counts, if any. */
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
+{
+       int error;
+
+       error = security_file_receive(file);
+       if (error)
+               return error;
+       error = replace_fd(new_fd, file, o_flags);
+       if (error)
+               return error;
        __receive_sock(file);
        return new_fd;
 }
index 6c99520..393e36b 100644 (file)
@@ -873,14 +873,13 @@ static struct dentry *fuse_get_parent(struct dentry *child)
        struct inode *inode;
        struct dentry *parent;
        struct fuse_entry_out outarg;
-       const struct qstr name = QSTR_INIT("..", 2);
        int err;
 
        if (!fc->export_support)
                return ERR_PTR(-ESTALE);
 
        err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
-                              &name, &outarg, &inode);
+                              &dotdot_name, &outarg, &inode);
        if (err) {
                if (err == -ENOENT)
                        return ERR_PTR(-ESTALE);
index 84c3810..ea7fc5c 100644 (file)
@@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
        if (mapping) {
                truncate_inode_pages_final(mapping);
                if (!gfs2_withdrawn(sdp))
-                       GLOCK_BUG_ON(gl, mapping->nrpages ||
-                                    mapping->nrexceptional);
+                       GLOCK_BUG_ON(gl, !mapping_empty(mapping));
        }
        trace_gfs2_glock_put(gl);
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
index 7b5e984..7d0c3db 100644 (file)
@@ -316,7 +316,7 @@ retry:
        if (mode & FMODE_WRITE)
                r = w = 1;
 
-       name = dentry_name(d_real(file->f_path.dentry, file->f_inode));
+       name = dentry_name(file_dentry(file));
        if (name == NULL)
                return -ENOMEM;
 
index 302f451..d92c4af 100644 (file)
@@ -356,7 +356,8 @@ struct hpfs_dirent {
   u8 no_of_acls;                       /* number of ACL's (low 3 bits) */
   u8 ix;                               /* code page index (of filename), see
                                           struct code_page_data */
-  u8 namelen, name[1];                 /* file name */
+  u8 namelen;                          /* file name length */
+  u8 name[];                           /* file name */
   /* dnode_secno down;   btree down pointer, if present,
                          follows name on next word boundary, or maybe it
                          precedes next dirent, which is on a word boundary. */
index 701c82c..a2a4233 100644 (file)
@@ -463,14 +463,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
        struct address_space *mapping = &inode->i_data;
        const pgoff_t start = lstart >> huge_page_shift(h);
        const pgoff_t end = lend >> huge_page_shift(h);
-       struct vm_area_struct pseudo_vma;
        struct pagevec pvec;
        pgoff_t next, index;
        int i, freed = 0;
        bool truncate_op = (lend == LLONG_MAX);
 
-       vma_init(&pseudo_vma, current->mm);
-       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
        pagevec_init(&pvec);
        next = start;
        while (next < end) {
@@ -482,10 +479,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
-                       u32 hash;
+                       u32 hash = 0;
 
                        index = page->index;
-                       hash = hugetlb_fault_mutex_hash(mapping, index);
                        if (!truncate_op) {
                                /*
                                 * Only need to hold the fault mutex in the
@@ -493,6 +489,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                                 * page faults.  Races are not possible in the
                                 * case of truncation.
                                 */
+                               hash = hugetlb_fault_mutex_hash(mapping, index);
                                mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        }
 
@@ -1435,7 +1432,7 @@ static int get_hstate_idx(int page_size_log)
 
        if (!h)
                return -1;
-       return h - hstates;
+       return hstate_index(h);
 }
 
 /*
index 9e192be..c93500d 100644 (file)
@@ -529,7 +529,14 @@ void clear_inode(struct inode *inode)
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
-       BUG_ON(inode->i_data.nrexceptional);
+       /*
+        * Almost always, mapping_empty(&inode->i_data) here; but there are
+        * two known and long-standing ways in which nodes may get left behind
+        * (when deep radix-tree node allocation failed partway; or when THP
+        * collapse_file() failed). Until those two known cases are cleaned up,
+        * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
+        * nor even WARN_ON(!mapping_empty).
+        */
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
index 360f813..f46acbb 100644 (file)
@@ -251,7 +251,7 @@ struct io_rsrc_data {
 struct io_buffer {
        struct list_head list;
        __u64 addr;
-       __s32 len;
+       __u32 len;
        __u16 bid;
 };
 
@@ -456,6 +456,7 @@ struct io_ring_ctx {
        spinlock_t                      rsrc_ref_lock;
        struct io_rsrc_node             *rsrc_node;
        struct io_rsrc_node             *rsrc_backup_node;
+       struct io_mapped_ubuf           *dummy_ubuf;
 
        struct io_restriction           restrictions;
 
@@ -702,7 +703,8 @@ enum {
        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 
-       REQ_F_FAIL_LINK_BIT,
+       /* first byte is taken by user flags, shift it to not overlap */
+       REQ_F_FAIL_LINK_BIT     = 8,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
        REQ_F_NOWAIT_BIT,
@@ -1157,6 +1159,12 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
                goto err;
        __hash_init(ctx->cancel_hash, 1U << hash_bits);
 
+       ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
+       if (!ctx->dummy_ubuf)
+               goto err;
+       /* set invalid range, so io_import_fixed() fails meeting it */
+       ctx->dummy_ubuf->ubuf = -1UL;
+
        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
                goto err;
@@ -1184,6 +1192,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
        return ctx;
 err:
+       kfree(ctx->dummy_ubuf);
        kfree(ctx->cancel_hash);
        kfree(ctx);
        return NULL;
@@ -3977,7 +3986,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
                        break;
 
                buf->addr = addr;
-               buf->len = pbuf->len;
+               buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = bid;
                addr += pbuf->len;
                bid++;
@@ -6503,14 +6512,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        req->work.creds = NULL;
 
        /* enforce forwards compatibility on users */
-       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
-               req->flags = 0;
+       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
                return -EINVAL;
-       }
-
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
-
        if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
                return -EACCES;
 
@@ -7539,6 +7544,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
                        io_ring_submit_lock(ctx, lock_ring);
                        spin_lock_irqsave(&ctx->completion_lock, flags);
                        io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
+                       ctx->cq_extra++;
                        io_commit_cqring(ctx);
                        spin_unlock_irqrestore(&ctx->completion_lock, flags);
                        io_cqring_ev_posted(ctx);
@@ -8111,11 +8117,13 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
        struct io_mapped_ubuf *imu = *slot;
        unsigned int i;
 
-       for (i = 0; i < imu->nr_bvecs; i++)
-               unpin_user_page(imu->bvec[i].bv_page);
-       if (imu->acct_pages)
-               io_unaccount_mem(ctx, imu->acct_pages);
-       kvfree(imu);
+       if (imu != ctx->dummy_ubuf) {
+               for (i = 0; i < imu->nr_bvecs; i++)
+                       unpin_user_page(imu->bvec[i].bv_page);
+               if (imu->acct_pages)
+                       io_unaccount_mem(ctx, imu->acct_pages);
+               kvfree(imu);
+       }
        *slot = NULL;
 }
 
@@ -8132,7 +8140,7 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
        for (i = 0; i < ctx->nr_user_bufs; i++)
                io_buffer_unmap(ctx, &ctx->user_bufs[i]);
        kfree(ctx->user_bufs);
-       kfree(ctx->buf_data);
+       io_rsrc_data_free(ctx->buf_data);
        ctx->user_bufs = NULL;
        ctx->buf_data = NULL;
        ctx->nr_user_bufs = 0;
@@ -8255,6 +8263,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
        size_t size;
        int ret, pret, nr_pages, i;
 
+       if (!iov->iov_base) {
+               *pimu = ctx->dummy_ubuf;
+               return 0;
+       }
+
        ubuf = (unsigned long) iov->iov_base;
        end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        start = ubuf >> PAGE_SHIFT;
@@ -8352,7 +8365,9 @@ static int io_buffer_validate(struct iovec *iov)
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
-       if (!iov->iov_base || !iov->iov_len)
+       if (!iov->iov_base)
+               return iov->iov_len ? -EFAULT : 0;
+       if (!iov->iov_len)
                return -EFAULT;
 
        /* arbitrary limit, but we need something */
@@ -8385,7 +8400,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                return -ENOMEM;
        ret = io_buffers_map_alloc(ctx, nr_args);
        if (ret) {
-               kfree(data);
+               io_rsrc_data_free(data);
                return ret;
        }
 
@@ -8402,6 +8417,10 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                ret = io_buffer_validate(&iov);
                if (ret)
                        break;
+               if (!iov.iov_base && tag) {
+                       ret = -EINVAL;
+                       break;
+               }
 
                ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
                                             &last_hpage);
@@ -8451,12 +8470,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                err = io_buffer_validate(&iov);
                if (err)
                        break;
+               if (!iov.iov_base && tag) {
+                       err = -EINVAL;
+                       break;
+               }
                err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
                if (err)
                        break;
 
                i = array_index_nospec(offset, ctx->nr_user_bufs);
-               if (ctx->user_bufs[i]) {
+               if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
                        err = io_queue_rsrc_removal(ctx->buf_data, offset,
                                                    ctx->rsrc_node, ctx->user_bufs[i]);
                        if (unlikely(err)) {
@@ -8604,6 +8627,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        if (ctx->hash_map)
                io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
+       kfree(ctx->dummy_ubuf);
        kfree(ctx);
 }
 
@@ -9607,7 +9631,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
        if (ret)
                goto err;
        /* always set a rsrc node */
-       io_rsrc_node_switch_start(ctx);
+       ret = io_rsrc_node_switch_start(ctx);
+       if (ret)
+               goto err;
        io_rsrc_node_switch(ctx, NULL);
 
        memset(&p->sq_off, 0, sizeof(p->sq_off));
@@ -10136,6 +10162,13 @@ static int __init io_uring_init(void)
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 
+       BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
+                    sizeof(struct io_uring_rsrc_update));
+       BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
+                    sizeof(struct io_uring_rsrc_update2));
+       /* should fit into one byte */
+       BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
index 0129e6b..f2cd203 100644 (file)
@@ -1134,9 +1134,7 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 }
 
 void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
-               void (*merge_private)(struct iomap_ioend *ioend,
-                               struct iomap_ioend *next))
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
 {
        struct iomap_ioend *next;
 
@@ -1148,8 +1146,6 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
                        break;
                list_move_tail(&next->io_list, &ioend->io_list);
                ioend->io_size += next->io_size;
-               if (next->io_private && merge_private)
-                       merge_private(ioend, next);
        }
 }
 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
@@ -1236,7 +1232,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
-       ioend->io_private = NULL;
        ioend->io_bio = bio;
        return ioend;
 }
index 94ef92f..4880146 100644 (file)
@@ -767,6 +767,7 @@ repeat:
                        rs.cont_extent = isonum_733(rr->u.CE.extent);
                        rs.cont_offset = isonum_733(rr->u.CE.offset);
                        rs.cont_size = isonum_733(rr->u.CE.size);
+                       break;
                default:
                        break;
                }
index f8fb89b..4fc8cd6 100644 (file)
@@ -57,6 +57,7 @@ const struct file_operations jffs2_file_operations =
        .mmap =         generic_file_readonly_mmap,
        .fsync =        jffs2_fsync,
        .splice_read =  generic_file_splice_read,
+       .splice_write = iter_file_splice_write,
 };
 
 /* jffs2_file_inode_operations */
index db72a9d..b676056 100644 (file)
@@ -1079,7 +1079,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
        memcpy(&fd->name, rd->name, checkedlen);
        fd->name[checkedlen] = 0;
 
-       crc = crc32(0, fd->name, rd->nsize);
+       crc = crc32(0, fd->name, checkedlen);
        if (crc != je32_to_cpu(rd->name_crc)) {
                pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
                          __func__, ofs, je32_to_cpu(rd->name_crc), crc);
index e4131cb..36d9a12 100644 (file)
@@ -194,18 +194,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 
 #define jffs2_sum_active() (0)
 #define jffs2_sum_init(a) (0)
-#define jffs2_sum_exit(a)
+#define jffs2_sum_exit(a) do { } while (0)
 #define jffs2_sum_disable_collecting(a)
 #define jffs2_sum_is_disabled(a) (0)
-#define jffs2_sum_reset_collected(a)
+#define jffs2_sum_reset_collected(a) do { } while (0)
 #define jffs2_sum_add_kvec(a,b,c,d) (0)
-#define jffs2_sum_move_collected(a,b)
+#define jffs2_sum_move_collected(a,b) do { } while (0)
 #define jffs2_sum_write_sumnode(a) (0)
-#define jffs2_sum_add_padding_mem(a,b)
-#define jffs2_sum_add_inode_mem(a,b,c)
-#define jffs2_sum_add_dirent_mem(a,b,c)
-#define jffs2_sum_add_xattr_mem(a,b,c)
-#define jffs2_sum_add_xref_mem(a,b,c)
+#define jffs2_sum_add_padding_mem(a,b) do { } while (0)
+#define jffs2_sum_add_inode_mem(a,b,c) do { } while (0)
+#define jffs2_sum_add_dirent_mem(a,b,c) do { } while (0)
+#define jffs2_sum_add_xattr_mem(a,b,c) do { } while (0)
+#define jffs2_sum_add_xref_mem(a,b,c) do { } while (0)
 #define jffs2_sum_scan_sumnode(a,b,c,d,e) (0)
 
 #endif /* CONFIG_JFFS2_SUMMARY */
index 5c42363..74b2a1d 100644 (file)
@@ -1808,6 +1808,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
 
        if (flags & FL_LAYOUT)
                return 0;
+       if (flags & FL_DELEG)
+               /* We leave these checks to the caller */
+               return 0;
 
        if (arg == F_RDLCK)
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
index f7786e0..ed9d580 100644 (file)
@@ -137,12 +137,12 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
                list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
                        if (!pnfs_layout_is_valid(lo))
                                continue;
-                       if (stateid != NULL &&
-                           !nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+                       if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
                                continue;
-                       if (!nfs_sb_active(server->super))
-                               continue;
-                       inode = igrab(lo->plh_inode);
+                       if (nfs_sb_active(server->super))
+                               inode = igrab(lo->plh_inode);
+                       else
+                               inode = ERR_PTR(-EAGAIN);
                        rcu_read_unlock();
                        if (inode)
                                return inode;
@@ -176,9 +176,10 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
                                continue;
                        if (nfsi->layout != lo)
                                continue;
-                       if (!nfs_sb_active(server->super))
-                               continue;
-                       inode = igrab(lo->plh_inode);
+                       if (nfs_sb_active(server->super))
+                               inode = igrab(lo->plh_inode);
+                       else
+                               inode = ERR_PTR(-EAGAIN);
                        rcu_read_unlock();
                        if (inode)
                                return inode;
index ff5c4d0..cfeaadf 100644 (file)
@@ -476,7 +476,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                        to->to_maxval = to->to_initval;
                to->to_exponential = 0;
                break;
-#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
        case XPRT_TRANSPORT_UDP:
                if (retrans == NFS_UNSPEC_RETRANS)
                        to->to_retries = NFS_DEF_UDP_RETRANS;
@@ -487,7 +486,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
                to->to_maxval = NFS_MAX_UDP_TIMEOUT;
                to->to_exponential = 1;
                break;
-#endif
        default:
                BUG();
        }
@@ -698,9 +696,18 @@ static int nfs_init_server(struct nfs_server *server,
        /* Initialise the client representation from the mount data */
        server->flags = ctx->flags;
        server->options = ctx->options;
-       server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
-               NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-               NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+       server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+
+       switch (clp->rpc_ops->version) {
+       case 2:
+               server->fattr_valid = NFS_ATTR_FATTR_V2;
+               break;
+       case 3:
+               server->fattr_valid = NFS_ATTR_FATTR_V3;
+               break;
+       default:
+               server->fattr_valid = NFS_ATTR_FATTR_V4;
+       }
 
        if (ctx->rsize)
                server->rsize = nfs_block_size(ctx->rsize, NULL);
@@ -794,6 +801,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
        server->maxfilesize = fsinfo->maxfilesize;
 
        server->time_delta = fsinfo->time_delta;
+       server->change_attr_type = fsinfo->change_attr_type;
 
        server->clone_blksize = fsinfo->clone_blksize;
        /* We're airborne Set socket buffersize */
@@ -935,6 +943,8 @@ struct nfs_server *nfs_alloc_server(void)
                return NULL;
        }
 
+       server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
        ida_init(&server->openowner_id);
        ida_init(&server->lockowner_id);
        pnfs_init_server(server);
index 04bf806..e6ec6f0 100644 (file)
@@ -114,7 +114,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
        return ret;
 }
 /**
- * nfs_have_delegation - check if inode has a delegation, mark it
+ * nfs4_have_delegation - check if inode has a delegation, mark it
  * NFS_DELEGATION_REFERENCED if there is one.
  * @inode: inode to check
  * @flags: delegation types to check for
@@ -481,6 +481,22 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
        if (freeme == NULL)
                goto out;
 add_new:
+       /*
+        * If we didn't revalidate the change attribute before setting
+        * the delegation, then pre-emptively ask for a full attribute
+        * cache revalidation.
+        */
+       spin_lock(&inode->i_lock);
+       if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE)
+               nfs_set_cache_invalid(inode,
+                       NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+                       NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+                       NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
+                       NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA |
+                       NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+                       NFS_INO_INVALID_XATTR);
+       spin_unlock(&inode->i_lock);
+
        list_add_tail_rcu(&delegation->super_list, &server->delegations);
        rcu_assign_pointer(nfsi->delegation, delegation);
        delegation = NULL;
@@ -488,11 +504,6 @@ add_new:
        atomic_long_inc(&nfs_active_delegations);
 
        trace_nfs4_set_delegation(inode, type);
-
-       spin_lock(&inode->i_lock);
-       if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME))
-               NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED;
-       spin_unlock(&inode->i_lock);
 out:
        spin_unlock(&clp->cl_lock);
        if (delegation != NULL)
@@ -674,7 +685,7 @@ void nfs_inode_evict_delegation(struct inode *inode)
 }
 
 /**
- * nfs_inode_return_delegation - synchronously return a delegation
+ * nfs4_inode_return_delegation - synchronously return a delegation
  * @inode: inode to process
  *
  * This routine will always flush any dirty data to disk on the
@@ -697,7 +708,7 @@ int nfs4_inode_return_delegation(struct inode *inode)
 }
 
 /**
- * nfs_inode_return_delegation_on_close - asynchronously return a delegation
+ * nfs4_inode_return_delegation_on_close - asynchronously return a delegation
  * @inode: inode to process
  *
  * This routine is called on file close in order to determine if the
@@ -811,7 +822,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
 }
 
 /**
- * nfs_super_return_all_delegations - return delegations for one superblock
+ * nfs_server_return_all_delegations - return delegations for one superblock
  * @server: pointer to nfs_server to process
  *
  */
index 9b00a0b..c19b4fd 100644 (file)
@@ -84,8 +84,7 @@ int nfs4_inode_make_writeable(struct inode *inode);
 
 static inline int nfs_have_delegated_attributes(struct inode *inode)
 {
-       return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
-               !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+       return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
 }
 
 #endif
index fc4f490..1a6d286 100644 (file)
@@ -866,6 +866,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
                        break;
                }
 
+               verf_arg = verf_res;
+
                status = nfs_readdir_page_filler(desc, entry, pages, pglen,
                                                 arrays, narrays);
        } while (!status && nfs_readdir_page_needs_filling(page));
@@ -927,7 +929,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
                        }
                        return res;
                }
-               memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf));
+               /*
+                * Set the cookie verifier if the page cache was empty
+                */
+               if (desc->page_index == 0)
+                       memcpy(nfsi->cookieverf, verf,
+                              sizeof(nfsi->cookieverf));
        }
        res = nfs_readdir_search_array(desc);
        if (res == 0) {
@@ -974,10 +981,10 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
-static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
+static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
+                          const __be32 *verf)
 {
        struct file     *file = desc->file;
-       struct nfs_inode *nfsi = NFS_I(file_inode(file));
        struct nfs_cache_array *array;
        unsigned int i = 0;
 
@@ -991,7 +998,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
                        desc->eof = true;
                        break;
                }
-               memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf));
+               memcpy(desc->verf, verf, sizeof(desc->verf));
                if (i < (array->size-1))
                        desc->dir_cookie = array->array[i+1].cookie;
                else
@@ -1048,7 +1055,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 
        for (i = 0; !desc->eof && i < sz && arrays[i]; i++) {
                desc->page = arrays[i];
-               nfs_do_filldir(desc);
+               nfs_do_filldir(desc, verf);
        }
        desc->page = NULL;
 
@@ -1069,6 +1076,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 {
        struct dentry   *dentry = file_dentry(file);
        struct inode    *inode = d_inode(dentry);
+       struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_open_dir_context *dir_ctx = file->private_data;
        struct nfs_readdir_descriptor *desc;
        int res;
@@ -1122,7 +1130,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
                        break;
                }
                if (res == -ETOOSMALL && desc->plus) {
-                       clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+                       clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
                        nfs_zap_caches(inode);
                        desc->page_index = 0;
                        desc->plus = false;
@@ -1132,7 +1140,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
                if (res < 0)
                        break;
 
-               nfs_do_filldir(desc);
+               nfs_do_filldir(desc, nfsi->cookieverf);
                nfs_readdir_page_unlock_and_put_cached(desc);
        } while (!desc->eof);
 
@@ -1703,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode)
        NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
        nfs_set_cache_invalid(
                inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
-                              NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED);
+                              NFS_INO_INVALID_NLINK);
        spin_unlock(&inode->i_lock);
 }
 
@@ -2940,7 +2948,7 @@ static int nfs_execute_ok(struct inode *inode, int mask)
 
        if (S_ISDIR(inode->i_mode))
                return 0;
-       if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) {
+       if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) {
                if (mask & MAY_NOT_BLOCK)
                        return -ECHILD;
                ret = __nfs_revalidate_inode(server, inode);
@@ -2998,16 +3006,10 @@ out_notsup:
        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
 
-       res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE |
+                                                 NFS_INO_INVALID_OTHER);
        if (res == 0)
                res = generic_permission(&init_user_ns, inode, mask);
        goto out;
 }
 EXPORT_SYMBOL_GPL(nfs_permission);
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- * End:
- */
index f2b34cf..37a1a88 100644 (file)
@@ -169,19 +169,8 @@ out:
 
 static u64 nfs_fetch_iversion(struct inode *inode)
 {
-       struct nfs_server *server = NFS_SERVER(inode);
-
-       /* Is this the right call?: */
-       nfs_revalidate_inode(server, inode);
-       /*
-        * Also, note we're ignoring any returned error.  That seems to be
-        * the practice for cache consistency information elsewhere in
-        * the server, but I'm not sure why.
-        */
-       if (server->nfs_client->rpc_ops->version >= 4)
-               return inode_peek_iversion_raw(inode);
-       else
-               return time_to_chattr(&inode->i_ctime);
+       nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+       return inode_peek_iversion_raw(inode);
 }
 
 const struct export_operations nfs_export_ops = {
index 16ad505..1fef107 100644 (file)
@@ -105,7 +105,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
 
        if (filp->f_flags & O_DIRECT)
                goto force_reval;
-       if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE))
+       if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE))
                goto force_reval;
        return 0;
 force_reval:
index 872112b..d383de0 100644 (file)
@@ -106,7 +106,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
        if (unlikely(!p))
                return -ENOBUFS;
        fh->size = be32_to_cpup(p++);
-       if (fh->size > sizeof(struct nfs_fh)) {
+       if (fh->size > NFS_MAXFHSIZE) {
                printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
                       fh->size);
                return -EOVERFLOW;
index a06d213..d95c9a3 100644 (file)
@@ -283,20 +283,40 @@ static int nfs_verify_server_address(struct sockaddr *addr)
        return 0;
 }
 
+#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+       return true;
+}
+#else
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+       if (ctx->version == 4)
+               return true;
+       return false;
+}
+#endif
+
 /*
  * Sanity check the NFS transport protocol.
- *
  */
-static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
+static int nfs_validate_transport_protocol(struct fs_context *fc,
+                                          struct nfs_fs_context *ctx)
 {
        switch (ctx->nfs_server.protocol) {
        case XPRT_TRANSPORT_UDP:
+               if (nfs_server_transport_udp_invalid(ctx))
+                       goto out_invalid_transport_udp;
+               break;
        case XPRT_TRANSPORT_TCP:
        case XPRT_TRANSPORT_RDMA:
                break;
        default:
                ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
        }
+       return 0;
+out_invalid_transport_udp:
+       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 }
 
 /*
@@ -305,8 +325,6 @@ static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
  */
 static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx)
 {
-       nfs_validate_transport_protocol(ctx);
-
        if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP ||
            ctx->mount_server.protocol == XPRT_TRANSPORT_TCP)
                        return;
@@ -932,6 +950,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
        struct nfs_fh *mntfh = ctx->mntfh;
        struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
        int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
+       int ret;
 
        if (data == NULL)
                goto out_no_data;
@@ -976,6 +995,15 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
                        memset(mntfh->data + mntfh->size, 0,
                               sizeof(mntfh->data) - mntfh->size);
 
+               /*
+                * for proto == XPRT_TRANSPORT_UDP, which is what uses
+                * to_exponential, implying shift: limit the shift value
+                * to BITS_PER_LONG (majortimeo is unsigned long)
+                */
+               if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */
+                       if (data->retrans >= 64) /* shift value is too large */
+                               goto out_invalid_data;
+
                /*
                 * Translate to nfs_fs_context, which nfs_fill_super
                 * can deal with.
@@ -1048,6 +1076,10 @@ static int nfs23_parse_monolithic(struct fs_context *fc,
                goto generic;
        }
 
+       ret = nfs_validate_transport_protocol(fc, ctx);
+       if (ret)
+               return ret;
+
        ctx->skip_reconfig_option_check = true;
        return 0;
 
@@ -1076,6 +1108,9 @@ out_no_address:
 
 out_invalid_fh:
        return nfs_invalf(fc, "NFS: invalid root filehandle");
+
+out_invalid_data:
+       return nfs_invalf(fc, "NFS: invalid binary mount data");
 }
 
 #if IS_ENABLED(CONFIG_NFS_V4)
@@ -1146,6 +1181,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
 {
        struct nfs_fs_context *ctx = nfs_fc2context(fc);
        struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+       int ret;
        char *c;
 
        if (!data) {
@@ -1218,9 +1254,9 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
        ctx->acdirmin   = data->acdirmin;
        ctx->acdirmax   = data->acdirmax;
        ctx->nfs_server.protocol = data->proto;
-       nfs_validate_transport_protocol(ctx);
-       if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
-               goto out_invalid_transport_udp;
+       ret = nfs_validate_transport_protocol(fc, ctx);
+       if (ret)
+               return ret;
 done:
        ctx->skip_reconfig_option_check = true;
        return 0;
@@ -1231,9 +1267,6 @@ out_inval_auth:
 
 out_no_address:
        return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
-
-out_invalid_transport_udp:
-       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 }
 #endif
 
@@ -1298,6 +1331,10 @@ static int nfs_fs_context_validate(struct fs_context *fc)
        if (!nfs_verify_server_address(sap))
                goto out_no_address;
 
+       ret = nfs_validate_transport_protocol(fc, ctx);
+       if (ret)
+               return ret;
+
        if (ctx->version == 4) {
                if (IS_ENABLED(CONFIG_NFS_V4)) {
                        if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
@@ -1306,9 +1343,6 @@ static int nfs_fs_context_validate(struct fs_context *fc)
                                port = NFS_PORT;
                        max_namelen = NFS4_MAXNAMLEN;
                        max_pathlen = NFS4_MAXPATHLEN;
-                       nfs_validate_transport_protocol(ctx);
-                       if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
-                               goto out_invalid_transport_udp;
                        ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL |
                                        NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK |
                                        NFS_MOUNT_LOCAL_FCNTL);
@@ -1317,10 +1351,6 @@ static int nfs_fs_context_validate(struct fs_context *fc)
                }
        } else {
                nfs_set_mount_transport_protocol(ctx);
-#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
-              if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
-                      goto out_invalid_transport_udp;
-#endif
                if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
                        port = NFS_RDMA_PORT;
        }
@@ -1354,8 +1384,6 @@ out_no_device_name:
 out_v4_not_compiled:
        nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
        return -EPROTONOSUPPORT;
-out_invalid_transport_udp:
-       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 out_no_address:
        return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
 out_mountproto_mismatch:
index 5a8854d..529c409 100644 (file)
@@ -164,34 +164,19 @@ static int nfs_attribute_timeout(struct inode *inode)
        return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
-static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags)
+static bool nfs_check_cache_flags_invalid(struct inode *inode,
+                                         unsigned long flags)
 {
        unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
 
-       /* Special case for the pagecache or access cache */
-       if (flags == NFS_INO_REVAL_PAGECACHE &&
-           !(cache_validity & NFS_INO_REVAL_FORCED))
-               return false;
        return (cache_validity & flags) != 0;
 }
 
-static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags)
-{
-       unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-
-       if ((cache_validity & flags) != 0)
-               return true;
-       if (nfs_attribute_timeout(inode))
-               return true;
-       return false;
-}
-
 bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
 {
-       if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
-               return nfs_check_cache_invalid_delegated(inode, flags);
-
-       return nfs_check_cache_invalid_not_delegated(inode, flags);
+       if (nfs_check_cache_flags_invalid(inode, flags))
+               return true;
+       return nfs_attribute_cache_expired(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
 
@@ -214,20 +199,21 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 
        if (have_delegation) {
                if (!(flags & NFS_INO_REVAL_FORCED))
-                       flags &= ~NFS_INO_INVALID_OTHER;
-               flags &= ~(NFS_INO_INVALID_CHANGE
-                               | NFS_INO_INVALID_SIZE
-                               | NFS_INO_REVAL_PAGECACHE
-                               | NFS_INO_INVALID_XATTR);
-       }
+                       flags &= ~(NFS_INO_INVALID_MODE |
+                                  NFS_INO_INVALID_OTHER |
+                                  NFS_INO_INVALID_XATTR);
+               flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
+       } else if (flags & NFS_INO_REVAL_PAGECACHE)
+               flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
 
        if (!nfs_has_xattr_cache(nfsi))
                flags &= ~NFS_INO_INVALID_XATTR;
+       if (flags & NFS_INO_INVALID_DATA)
+               nfs_fscache_invalidate(inode);
        if (inode->i_mapping->nrpages == 0)
                flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
+       flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
        nfsi->cache_validity |= flags;
-       if (flags & NFS_INO_INVALID_DATA)
-               nfs_fscache_invalidate(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
 
@@ -452,6 +438,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                .fattr  = fattr
        };
        struct inode *inode = ERR_PTR(-ENOENT);
+       u64 fattr_supported = NFS_SB(sb)->fattr_valid;
        unsigned long hash;
 
        nfs_attr_check_mountpoint(sb, fattr);
@@ -484,8 +471,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                inode->i_mode = fattr->mode;
                nfsi->cache_validity = 0;
                if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
-                               && nfs_server_capable(inode, NFS_CAP_MODE))
-                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+                               && (fattr_supported & NFS_ATTR_FATTR_MODE))
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
                /* Why so? Because we want revalidate for devices/FIFOs, and
                 * that's precisely what we have in nfs_file_inode_operations.
                 */
@@ -530,15 +517,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                nfsi->attr_gencount = fattr->gencount;
                if (fattr->valid & NFS_ATTR_FATTR_ATIME)
                        inode->i_atime = fattr->atime;
-               else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+               else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
                if (fattr->valid & NFS_ATTR_FATTR_MTIME)
                        inode->i_mtime = fattr->mtime;
-               else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+               else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
                if (fattr->valid & NFS_ATTR_FATTR_CTIME)
                        inode->i_ctime = fattr->ctime;
-               else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+               else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
                if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
                        inode_set_iversion_raw(inode, fattr->change_attr);
@@ -550,29 +537,31 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
                if (fattr->valid & NFS_ATTR_FATTR_NLINK)
                        set_nlink(inode, fattr->nlink);
-               else if (nfs_server_capable(inode, NFS_CAP_NLINK))
-                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+               else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
                if (fattr->valid & NFS_ATTR_FATTR_OWNER)
                        inode->i_uid = fattr->uid;
-               else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+               else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
                if (fattr->valid & NFS_ATTR_FATTR_GROUP)
                        inode->i_gid = fattr->gid;
-               else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+               else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
                if (nfs_server_capable(inode, NFS_CAP_XATTR))
                        nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
                if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
                        inode->i_blocks = fattr->du.nfs2.blocks;
+               else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
+                        fattr->size != 0)
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
                if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
                        /*
                         * report the blocks in 512byte units
                         */
                        inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-               }
-
-               if (nfsi->cache_validity != 0)
-                       nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+               } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED &&
+                          fattr->size != 0)
+                       nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
 
                nfs_setsecurity(inode, fattr, label);
 
@@ -634,8 +623,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
        }
 
        /* Optimization: if the end result is no change, don't RPC */
-       attr->ia_valid &= NFS_VALID_ATTRS;
-       if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+       if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0)
                return 0;
 
        trace_nfs_setattr_enter(inode);
@@ -710,12 +698,20 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
        spin_lock(&inode->i_lock);
        NFS_I(inode)->attr_gencount = fattr->gencount;
        if ((attr->ia_valid & ATTR_SIZE) != 0) {
-               nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+               nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME |
+                                                    NFS_INO_INVALID_BLOCKS);
                nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
                nfs_vmtruncate(inode, attr->ia_size);
        }
        if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
                NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME;
+               if ((attr->ia_valid & ATTR_KILL_SUID) != 0 &&
+                   inode->i_mode & S_ISUID)
+                       inode->i_mode &= ~S_ISUID;
+               if ((attr->ia_valid & ATTR_KILL_SGID) != 0 &&
+                   (inode->i_mode & (S_ISGID | S_IXGRP)) ==
+                    (S_ISGID | S_IXGRP))
+                       inode->i_mode &= ~S_ISGID;
                if ((attr->ia_valid & ATTR_MODE) != 0) {
                        int mode = attr->ia_mode & S_IALLUGO;
                        mode |= inode->i_mode & ~S_IALLUGO;
@@ -793,14 +789,28 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
        dput(parent);
 }
 
-static bool nfs_need_revalidate_inode(struct inode *inode)
+static u32 nfs_get_valid_attrmask(struct inode *inode)
 {
-       if (NFS_I(inode)->cache_validity &
-                       (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
-               return true;
-       if (nfs_attribute_cache_expired(inode))
-               return true;
-       return false;
+       unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+       u32 reply_mask = STATX_INO | STATX_TYPE;
+
+       if (!(cache_validity & NFS_INO_INVALID_ATIME))
+               reply_mask |= STATX_ATIME;
+       if (!(cache_validity & NFS_INO_INVALID_CTIME))
+               reply_mask |= STATX_CTIME;
+       if (!(cache_validity & NFS_INO_INVALID_MTIME))
+               reply_mask |= STATX_MTIME;
+       if (!(cache_validity & NFS_INO_INVALID_SIZE))
+               reply_mask |= STATX_SIZE;
+       if (!(cache_validity & NFS_INO_INVALID_NLINK))
+               reply_mask |= STATX_NLINK;
+       if (!(cache_validity & NFS_INO_INVALID_MODE))
+               reply_mask |= STATX_MODE;
+       if (!(cache_validity & NFS_INO_INVALID_OTHER))
+               reply_mask |= STATX_UID | STATX_GID;
+       if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
+               reply_mask |= STATX_BLOCKS;
+       return reply_mask;
 }
 
 int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
@@ -815,9 +825,13 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 
        trace_nfs_getattr_enter(inode);
 
+       request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID |
+                       STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME |
+                       STATX_INO | STATX_SIZE | STATX_BLOCKS;
+
        if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
                nfs_readdirplus_parent_cache_hit(path->dentry);
-               goto out_no_update;
+               goto out_no_revalidate;
        }
 
        /* Flush out writes to the server in order to update c/mtime.  */
@@ -850,14 +864,24 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
        /* Check whether the cached attributes are stale */
        do_update |= force_sync || nfs_attribute_cache_expired(inode);
        cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-       do_update |= cache_validity &
-               (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL);
+       do_update |= cache_validity & NFS_INO_INVALID_CHANGE;
        if (request_mask & STATX_ATIME)
                do_update |= cache_validity & NFS_INO_INVALID_ATIME;
-       if (request_mask & (STATX_CTIME|STATX_MTIME))
-               do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+       if (request_mask & STATX_CTIME)
+               do_update |= cache_validity & NFS_INO_INVALID_CTIME;
+       if (request_mask & STATX_MTIME)
+               do_update |= cache_validity & NFS_INO_INVALID_MTIME;
+       if (request_mask & STATX_SIZE)
+               do_update |= cache_validity & NFS_INO_INVALID_SIZE;
+       if (request_mask & STATX_NLINK)
+               do_update |= cache_validity & NFS_INO_INVALID_NLINK;
+       if (request_mask & STATX_MODE)
+               do_update |= cache_validity & NFS_INO_INVALID_MODE;
+       if (request_mask & (STATX_UID | STATX_GID))
+               do_update |= cache_validity & NFS_INO_INVALID_OTHER;
        if (request_mask & STATX_BLOCKS)
                do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
+
        if (do_update) {
                /* Update the attribute cache */
                if (!(server->flags & NFS_MOUNT_NOAC))
@@ -871,8 +895,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
                nfs_readdirplus_parent_cache_hit(path->dentry);
 out_no_revalidate:
        /* Only return attributes that were revalidated. */
-       stat->result_mask &= request_mask;
-out_no_update:
+       stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask;
+
        generic_fillattr(&init_user_ns, inode, stat);
        stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
        if (S_ISDIR(inode->i_mode))
@@ -963,7 +987,6 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 {
        struct nfs_inode *nfsi;
        struct inode *inode;
-       struct nfs_server *server;
 
        if (!(ctx->mode & FMODE_WRITE))
                return;
@@ -979,10 +1002,10 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
                return;
        if (!list_empty(&nfsi->open_files))
                return;
-       server = NFS_SERVER(inode);
-       if (server->flags & NFS_MOUNT_NOCTO)
+       if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)
                return;
-       nfs_revalidate_inode(server, inode);
+       nfs_revalidate_inode(inode,
+                            NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
 }
 EXPORT_SYMBOL_GPL(nfs_close_context);
 
@@ -1237,16 +1260,16 @@ int nfs_attribute_cache_expired(struct inode *inode)
 
 /**
  * nfs_revalidate_inode - Revalidate the inode attributes
- * @server: pointer to nfs_server struct
  * @inode: pointer to inode struct
+ * @flags: cache flags to check
  *
  * Updates inode attribute information by retrieving the data from the server.
  */
-int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+int nfs_revalidate_inode(struct inode *inode, unsigned long flags)
 {
-       if (!nfs_need_revalidate_inode(inode))
+       if (!nfs_check_cache_invalid(inode, flags))
                return NFS_STALE(inode) ? -ESTALE : 0;
-       return __nfs_revalidate_inode(server, inode);
+       return __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 }
 EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
@@ -1332,7 +1355,7 @@ out:
 
 bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 {
-       return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
+       return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) ||
                NFS_STALE(inode);
 }
 
@@ -1468,8 +1491,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        if (!nfs_file_has_buffered_writers(nfsi)) {
                /* Verify a few of the more important attributes */
                if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
-                       invalid |= NFS_INO_INVALID_CHANGE
-                               | NFS_INO_REVAL_PAGECACHE;
+                       invalid |= NFS_INO_INVALID_CHANGE;
 
                ts = inode->i_mtime;
                if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
@@ -1483,28 +1505,21 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
                        cur_size = i_size_read(inode);
                        new_isize = nfs_size_to_loff_t(fattr->size);
                        if (cur_size != new_isize)
-                               invalid |= NFS_INO_INVALID_SIZE
-                                       | NFS_INO_REVAL_PAGECACHE;
+                               invalid |= NFS_INO_INVALID_SIZE;
                }
        }
 
        /* Have any file permissions changed? */
        if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
-               invalid |= NFS_INO_INVALID_ACCESS
-                       | NFS_INO_INVALID_ACL
-                       | NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_MODE;
        if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
-               invalid |= NFS_INO_INVALID_ACCESS
-                       | NFS_INO_INVALID_ACL
-                       | NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_OTHER;
        if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
-               invalid |= NFS_INO_INVALID_ACCESS
-                       | NFS_INO_INVALID_ACL
-                       | NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_OTHER;
 
        /* Has the link count changed? */
        if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
-               invalid |= NFS_INO_INVALID_OTHER;
+               invalid |= NFS_INO_INVALID_NLINK;
 
        ts = inode->i_atime;
        if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
@@ -1642,41 +1657,142 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
 #endif
 
 /**
- * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * nfs_inode_attrs_cmp_generic - compare attributes
+ * @fattr: attributes
  * @inode: pointer to inode
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ * Note also the check for wraparound of 'attr_gencount'.
+ *
+ * The function returns '1' if it thinks the attributes in @fattr are
+ * more recent than the ones cached in @inode. Otherwise it returns
+ * the value '0'.
+ */
+static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr,
+                                      const struct inode *inode)
+{
+       unsigned long attr_gencount = NFS_I(inode)->attr_gencount;
+
+       return (long)(fattr->gencount - attr_gencount) > 0 ||
+              (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0;
+}
+
+/**
+ * nfs_inode_attrs_cmp_monotonic - compare attributes
  * @fattr: attributes
+ * @inode: pointer to inode
  *
  * Attempt to divine whether or not an RPC call reply carrying stale
  * attributes got scheduled after another call carrying updated ones.
  *
- * To do so, the function first assumes that a more recent ctime means
- * that the attributes in fattr are newer, however it also attempt to
- * catch the case where ctime either didn't change, or went backwards
- * (if someone reset the clock on the server) by looking at whether
- * or not this RPC call was started after the inode was last updated.
- * Note also the check for wraparound of 'attr_gencount'
+ * We assume that the server observes monotonic semantics for
+ * the change attribute, so a larger value means that the attributes in
+ * @fattr are more recent, in which case the function returns the
+ * value '1'.
+ * A return value of '0' indicates no measurable change.
+ * A return value of '-1' means that the attributes in @inode are
+ * more recent.
+ */
+static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr,
+                                        const struct inode *inode)
+{
+       s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode);
+       if (diff > 0)
+               return 1;
+       return diff == 0 ? 0 : -1;
+}
+
+/**
+ * nfs_inode_attrs_cmp_strict_monotonic - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
  *
- * The function returns 'true' if it thinks the attributes in 'fattr' are
- * more recent than the ones cached in the inode.
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
  *
+ * We assume that the server observes strictly monotonic semantics for
+ * the change attribute, so a larger value means that the attributes in
+ * @fattr are more recent, in which case the function returns the
+ * value '1'.
+ * A return value of '-1' means that the attributes in @inode are
+ * more recent or unchanged.
  */
-static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr,
+                                               const struct inode *inode)
 {
-       const struct nfs_inode *nfsi = NFS_I(inode);
+       return  nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1;
+}
 
-       return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
-               ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+/**
+ * nfs_inode_attrs_cmp - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * This function returns '1' if it thinks the attributes in @fattr are
+ * more recent than the ones cached in @inode. It returns '-1' if
+ * the attributes in @inode are more recent than the ones in @fattr,
+ * and it returns 0 if not sure.
+ */
+static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr,
+                              const struct inode *inode)
+{
+       if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0)
+               return 1;
+       switch (NFS_SERVER(inode)->change_attr_type) {
+       case NFS4_CHANGE_TYPE_IS_UNDEFINED:
+               break;
+       case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+               if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE))
+                       break;
+               return nfs_inode_attrs_cmp_monotonic(fattr, inode);
+       default:
+               if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE))
+                       break;
+               return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode);
+       }
+       return 0;
+}
+
+/**
+ * nfs_inode_finish_partial_attr_update - complete a previous inode update
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * Returns '1' if the last attribute update left the inode cached
+ * attributes in a partially unrevalidated state, and @fattr
+ * matches the change attribute of that partial update.
+ * Otherwise returns '0'.
+ */
+static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
+                                               const struct inode *inode)
+{
+       const unsigned long check_valid =
+               NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+               NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+               NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
+               NFS_INO_INVALID_NLINK;
+       unsigned long cache_validity = NFS_I(inode)->cache_validity;
+
+       if (!(cache_validity & NFS_INO_INVALID_CHANGE) &&
+           (cache_validity & check_valid) != 0 &&
+           (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+           nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0)
+               return 1;
+       return 0;
 }
 
-static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+static int nfs_refresh_inode_locked(struct inode *inode,
+                                   struct nfs_fattr *fattr)
 {
-       int ret;
+       int attr_cmp = nfs_inode_attrs_cmp(fattr, inode);
+       int ret = 0;
 
        trace_nfs_refresh_inode_enter(inode);
 
-       if (nfs_inode_attrs_need_update(inode, fattr))
+       if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
                ret = nfs_update_inode(inode, fattr);
-       else
+       else if (attr_cmp == 0)
                ret = nfs_check_inode_attributes(inode, fattr);
 
        trace_nfs_refresh_inode_exit(inode, ret);
@@ -1761,11 +1877,13 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
  */
 int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
+       int attr_cmp = nfs_inode_attrs_cmp(fattr, inode);
        int status;
 
        /* Don't do a WCC update if these attributes are already stale */
-       if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
-                       !nfs_inode_attrs_need_update(inode, fattr)) {
+       if (attr_cmp < 0)
+               return 0;
+       if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
                fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
                                | NFS_ATTR_FATTR_PRESIZE
                                | NFS_ATTR_FATTR_PREMTIME
@@ -1839,9 +1957,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-       struct nfs_server *server;
+       struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        loff_t cur_isize, new_isize;
+       u64 fattr_supported = server->fattr_valid;
        unsigned long invalid = 0;
        unsigned long now = jiffies;
        unsigned long save_cache_validity;
@@ -1885,7 +2004,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                goto out_err;
        }
 
-       server = NFS_SERVER(inode);
        /* Update the fsid? */
        if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
                        !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
@@ -1904,14 +2022,17 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
                        | NFS_INO_INVALID_ATIME
                        | NFS_INO_REVAL_FORCED
-                       | NFS_INO_REVAL_PAGECACHE
                        | NFS_INO_INVALID_BLOCKS);
 
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
 
        if (pnfs_layoutcommit_outstanding(inode)) {
-               nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
+               nfsi->cache_validity |=
+                       save_cache_validity &
+                       (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+                        NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+                        NFS_INO_INVALID_BLOCKS);
                cache_revalidated = false;
        }
 
@@ -1928,6 +2049,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                save_cache_validity |= NFS_INO_INVALID_CTIME
                                        | NFS_INO_INVALID_MTIME
                                        | NFS_INO_INVALID_SIZE
+                                       | NFS_INO_INVALID_BLOCKS
+                                       | NFS_INO_INVALID_NLINK
+                                       | NFS_INO_INVALID_MODE
                                        | NFS_INO_INVALID_OTHER;
                                if (S_ISDIR(inode->i_mode))
                                        nfs_force_lookup_revalidate(inode);
@@ -1940,28 +2064,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        attr_changed = true;
                }
        } else {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_CHANGE
-                               | NFS_INO_REVAL_PAGECACHE
-                               | NFS_INO_REVAL_FORCED);
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_CHANGE;
                cache_revalidated = false;
        }
 
        if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
                inode->i_mtime = fattr->mtime;
-       } else if (server->caps & NFS_CAP_MTIME) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_MTIME
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_MTIME;
                cache_revalidated = false;
        }
 
        if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
                inode->i_ctime = fattr->ctime;
-       } else if (server->caps & NFS_CAP_CTIME) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_CTIME
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_CTIME;
                cache_revalidated = false;
        }
 
@@ -1985,21 +2105,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                        (long long)cur_isize,
                                        (long long)new_isize);
                }
+               if (new_isize == 0 &&
+                   !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED |
+                                     NFS_ATTR_FATTR_BLOCKS_USED))) {
+                       fattr->du.nfs3.used = 0;
+                       fattr->valid |= NFS_ATTR_FATTR_SPACE_USED;
+               }
        } else {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_SIZE
-                               | NFS_INO_REVAL_PAGECACHE
-                               | NFS_INO_REVAL_FORCED);
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_SIZE;
                cache_revalidated = false;
        }
 
-
        if (fattr->valid & NFS_ATTR_FATTR_ATIME)
                inode->i_atime = fattr->atime;
-       else if (server->caps & NFS_CAP_ATIME) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_ATIME
-                               | NFS_INO_REVAL_FORCED);
+       else if (fattr_supported & NFS_ATTR_FATTR_ATIME) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_ATIME;
                cache_revalidated = false;
        }
 
@@ -2012,10 +2134,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                | NFS_INO_INVALID_ACL;
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_MODE) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_MODE) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_MODE;
                cache_revalidated = false;
        }
 
@@ -2026,10 +2147,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        inode->i_uid = fattr->uid;
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_OWNER) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_OTHER;
                cache_revalidated = false;
        }
 
@@ -2040,10 +2160,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        inode->i_gid = fattr->gid;
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_OWNER_GROUP) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_OTHER;
                cache_revalidated = false;
        }
 
@@ -2054,10 +2173,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        set_nlink(inode, fattr->nlink);
                        attr_changed = true;
                }
-       } else if (server->caps & NFS_CAP_NLINK) {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_OTHER
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_NLINK;
                cache_revalidated = false;
        }
 
@@ -2066,18 +2184,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                 * report the blocks in 512byte units
                 */
                inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-       } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+       } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_BLOCKS;
+               cache_revalidated = false;
+       }
+
+       if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) {
                inode->i_blocks = fattr->du.nfs2.blocks;
-       else {
-               nfsi->cache_validity |= save_cache_validity &
-                               (NFS_INO_INVALID_BLOCKS
-                               | NFS_INO_REVAL_FORCED);
+       } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) {
+               nfsi->cache_validity |=
+                       save_cache_validity & NFS_INO_INVALID_BLOCKS;
                cache_revalidated = false;
        }
 
        /* Update attrtimeo value if we're out of the unstable period */
        if (attr_changed) {
-               invalid &= ~NFS_INO_INVALID_ATTR;
                nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
@@ -2094,7 +2216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        nfsi->attrtimeo_timestamp = now;
                }
                /* Set the barrier to be more recent than this fattr */
-               if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+               if ((long)(fattr->gencount - nfsi->attr_gencount) > 0)
                        nfsi->attr_gencount = fattr->gencount;
        }
 
index 7395d09..a36af04 100644 (file)
@@ -181,7 +181,7 @@ struct nfs_mount_request {
        struct net              *net;
 };
 
-extern int nfs_mount(struct nfs_mount_request *info);
+extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans);
 extern void nfs_umount(const struct nfs_mount_request *info);
 
 /* client.c */
index 5088fda..b5551ed 100644 (file)
@@ -104,7 +104,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
 }
 
 /**
- * nfs_end_io_direct - declare the file is being used for direct i/o
+ * nfs_start_io_direct - declare the file is being used for direct i/o
  * @inode: file inode
  *
  * Declare that a direct I/O operation is about to start, and ensure
index dda5c3e..c5e3b6b 100644 (file)
@@ -136,14 +136,16 @@ struct mnt_fhstatus {
 /**
  * nfs_mount - Obtain an NFS file handle for the given host and path
  * @info: pointer to mount request arguments
+ * @timeo: deciseconds the mount waits for a response before it retries
+ * @retrans: number of times the mount retries a request
  *
- * Uses default timeout parameters specified by underlying transport. On
- * successful return, the auth_flavs list and auth_flav_len will be populated
- * with the list from the server or a faked-up list if the server didn't
- * provide one.
+ * Uses timeout parameters specified by caller. On successful return, the
+ * auth_flavs list and auth_flav_len will be populated with the list from the
+ * server or a faked-up list if the server didn't provide one.
  */
-int nfs_mount(struct nfs_mount_request *info)
+int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans)
 {
+       struct rpc_timeout mnt_timeout;
        struct mountres result = {
                .fh             = info->fh,
                .auth_count     = info->auth_flav_len,
@@ -158,6 +160,7 @@ int nfs_mount(struct nfs_mount_request *info)
                .protocol       = info->protocol,
                .address        = info->sap,
                .addrsize       = info->salen,
+               .timeout        = &mnt_timeout,
                .servername     = info->hostname,
                .program        = &mnt_program,
                .version        = info->version,
@@ -177,6 +180,7 @@ int nfs_mount(struct nfs_mount_request *info)
        if (info->noresvport)
                args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
+       nfs_init_timeout_values(&mnt_timeout, info->protocol, timeo, retrans);
        mnt_clnt = rpc_create(&args);
        if (IS_ERR(mnt_clnt))
                goto out_clnt_err;
index bb386a6..9ec560a 100644 (file)
@@ -65,7 +65,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
        if (!nfs_server_capable(inode, NFS_CAP_ACLS))
                return ERR_PTR(-EOPNOTSUPP);
 
-       status = nfs_revalidate_inode(server, inode);
+       status = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (status < 0)
                return ERR_PTR(status);
 
index ed1c837..e6eca1d 100644 (file)
@@ -433,7 +433,7 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
        if (unlikely(!p))
                return -EIO;
        length = be32_to_cpup(p++);
-       if (unlikely(length > NFS3_FHSIZE))
+       if (unlikely(length > NFS3_FHSIZE || length == 0))
                goto out_toobig;
        p = xdr_inline_decode(xdr, length);
        if (unlikely(!p))
@@ -442,7 +442,7 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
        memcpy(fh->data, p, length);
        return 0;
 out_toobig:
-       dprintk("NFS: file handle size (%u) too big\n", length);
+       trace_nfs_xdr_bad_filehandle(xdr, NFSERR_BADHANDLE);
        return -E2BIG;
 }
 
@@ -2227,6 +2227,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
 
        /* ignore properties */
        result->lease_time = 0;
+       result->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
        return 0;
 }
 
index 094024b..a243495 100644 (file)
@@ -46,11 +46,12 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 {
        struct inode *inode = file_inode(filep);
        struct nfs_server *server = NFS_SERVER(inode);
+       u32 bitmask[3];
        struct nfs42_falloc_args args = {
                .falloc_fh      = NFS_FH(inode),
                .falloc_offset  = offset,
                .falloc_length  = len,
-               .falloc_bitmask = nfs4_fattr_bitmap,
+               .falloc_bitmask = bitmask,
        };
        struct nfs42_falloc_res res = {
                .falloc_server  = server,
@@ -68,6 +69,10 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                return status;
        }
 
+       memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask));
+       if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED)
+               bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
        res.falloc_fattr = nfs_alloc_fattr();
        if (!res.falloc_fattr)
                return -ENOMEM;
@@ -75,7 +80,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
        status = nfs4_call_sync(server->client, server, msg,
                                &args.seq_args, &res.seq_res, 0);
        if (status == 0)
-               status = nfs_post_op_update_inode(inode, res.falloc_fattr);
+               status = nfs_post_op_update_inode_force_wcc(inode,
+                                                           res.falloc_fattr);
 
        kfree(res.falloc_fattr);
        return status;
@@ -84,7 +90,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                                loff_t offset, loff_t len)
 {
-       struct nfs_server *server = NFS_SERVER(file_inode(filep));
+       struct inode *inode = file_inode(filep);
+       struct nfs_server *server = NFS_SERVER(inode);
        struct nfs4_exception exception = { };
        struct nfs_lock_context *lock;
        int err;
@@ -93,9 +100,13 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
        if (IS_ERR(lock))
                return PTR_ERR(lock);
 
-       exception.inode = file_inode(filep);
+       exception.inode = inode;
        exception.state = lock->open_context->state;
 
+       err = nfs_sync_inode(inode);
+       if (err)
+               goto out;
+
        do {
                err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
                if (err == -ENOTSUPP) {
@@ -104,7 +115,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
                }
                err = nfs4_handle_exception(server, err, &exception);
        } while (exception.retry);
-
+out:
        nfs_put_lock_context(lock);
        return err;
 }
@@ -142,16 +153,13 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
                return -EOPNOTSUPP;
 
        inode_lock(inode);
-       err = nfs_sync_inode(inode);
-       if (err)
-               goto out_unlock;
 
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
        if (err == 0)
                truncate_pagecache_range(inode, offset, (offset + len) -1);
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-out_unlock:
+
        inode_unlock(inode);
        return err;
 }
@@ -261,6 +269,33 @@ out:
        return status;
 }
 
+/**
+ * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
+ * @inode: pointer to destination inode
+ * @pos: destination offset
+ * @len: copy length
+ *
+ * Punch a hole in the inode page cache, so that the NFS client will
+ * know to retrieve new data.
+ * Update the file size if necessary, and then mark the inode as having
+ * invalid cached values for change attribute, ctime, mtime and space used.
+ */
+static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+{
+       loff_t newsize = pos + len;
+       loff_t end = newsize - 1;
+
+       truncate_pagecache_range(inode, pos, end);
+       spin_lock(&inode->i_lock);
+       if (newsize > i_size_read(inode))
+               i_size_write(inode, newsize);
+       nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+                                            NFS_INO_INVALID_CTIME |
+                                            NFS_INO_INVALID_MTIME |
+                                            NFS_INO_INVALID_BLOCKS);
+       spin_unlock(&inode->i_lock);
+}
+
 static ssize_t _nfs42_proc_copy(struct file *src,
                                struct nfs_lock_context *src_lock,
                                struct file *dst,
@@ -354,19 +389,8 @@ static ssize_t _nfs42_proc_copy(struct file *src,
                        goto out;
        }
 
-       truncate_pagecache_range(dst_inode, pos_dst,
-                                pos_dst + res->write_res.count);
-       spin_lock(&dst_inode->i_lock);
-       nfs_set_cache_invalid(
-               dst_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED |
-                                  NFS_INO_INVALID_SIZE | NFS_INO_INVALID_ATTR |
-                                  NFS_INO_INVALID_DATA);
-       spin_unlock(&dst_inode->i_lock);
-       spin_lock(&src_inode->i_lock);
-       nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE |
-                                                NFS_INO_REVAL_FORCED |
-                                                NFS_INO_INVALID_ATIME);
-       spin_unlock(&src_inode->i_lock);
+       nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+       nfs_invalidate_atime(src_inode);
        status = res->write_res.count;
 out:
        if (args->sync)
@@ -659,7 +683,10 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
        if (status)
                return status;
 
-       return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+       if (whence == SEEK_DATA && res.sr_eof)
+               return -NFS4ERR_NXIO;
+       else
+               return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
 
 loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
@@ -1044,8 +1071,10 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
 
        status = nfs4_call_sync(server->client, server, msg,
                                &args.seq_args, &res.seq_res, 0);
-       if (status == 0)
+       if (status == 0) {
+               nfs42_copy_dest_done(dst_inode, dst_offset, count);
                status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+       }
 
        kfree(res.dst_fattr);
        return status;
index 6c2ce79..1c4d2a0 100644 (file)
@@ -168,7 +168,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
  *        make it easier to copy the value after an RPC, even if
  *        the value will not be passed up to application (e.g.
  *        for a 'query' getxattr with NULL buffer).
- * @len:   Length of the value. Can be 0 for zero-length attribues.
+ * @len:   Length of the value. Can be 0 for zero-length attributes.
  *         @value and @pages will be NULL if @len is 0.
  */
 static struct nfs4_xattr_entry *
index 441a2fa..57b3821 100644 (file)
@@ -420,9 +420,7 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = {
  */
 void nfs42_ssc_register_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl);
-#endif
 }
 
 /**
@@ -433,9 +431,7 @@ void nfs42_ssc_register_ops(void)
  */
 void nfs42_ssc_unregister_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl);
-#endif
 }
 #endif /* CONFIG_NFS_V4_2 */
 
index c65c4b4..87d04f2 100644 (file)
@@ -108,9 +108,10 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
 static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
                const struct cred *, bool);
 #endif
-static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
-               struct nfs_server *server,
-               struct nfs4_label *label);
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ],
+                            const __u32 *src, struct inode *inode,
+                            struct nfs_server *server,
+                            struct nfs4_label *label);
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 static inline struct nfs4_label *
@@ -263,6 +264,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD1_FS_LAYOUT_TYPES,
                        FATTR4_WORD2_LAYOUT_BLKSIZE
                        | FATTR4_WORD2_CLONE_BLKSIZE
+                       | FATTR4_WORD2_CHANGE_ATTR_TYPE
                        | FATTR4_WORD2_XATTR_SUPPORT
 };
 
@@ -283,7 +285,7 @@ const u32 nfs4_fs_locations_bitmap[3] = {
 };
 
 static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
-               struct inode *inode)
+                                   struct inode *inode, unsigned long flags)
 {
        unsigned long cache_validity;
 
@@ -291,22 +293,20 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
        if (!inode || !nfs4_have_delegation(inode, FMODE_READ))
                return;
 
-       cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
-       if (!(cache_validity & NFS_INO_REVAL_FORCED))
-               cache_validity &= ~(NFS_INO_INVALID_CHANGE
-                               | NFS_INO_INVALID_SIZE);
+       cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags;
 
+       /* Remove the attributes over which we have full control */
+       dst[1] &= ~FATTR4_WORD1_RAWDEV;
        if (!(cache_validity & NFS_INO_INVALID_SIZE))
                dst[0] &= ~FATTR4_WORD0_SIZE;
 
        if (!(cache_validity & NFS_INO_INVALID_CHANGE))
                dst[0] &= ~FATTR4_WORD0_CHANGE;
-}
 
-static void nfs4_bitmap_copy_adjust_setattr(__u32 *dst,
-               const __u32 *src, struct inode *inode)
-{
-       nfs4_bitmap_copy_adjust(dst, src, inode);
+       if (!(cache_validity & NFS_INO_INVALID_MODE))
+               dst[1] &= ~FATTR4_WORD1_MODE;
+       if (!(cache_validity & NFS_INO_INVALID_OTHER))
+               dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
 }
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -1169,14 +1169,26 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
 static void
 nfs4_inc_nlink_locked(struct inode *inode)
 {
-       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+       nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+                                            NFS_INO_INVALID_CTIME |
+                                            NFS_INO_INVALID_NLINK);
        inc_nlink(inode);
 }
 
+static void
+nfs4_inc_nlink(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       nfs4_inc_nlink_locked(inode);
+       spin_unlock(&inode->i_lock);
+}
+
 static void
 nfs4_dec_nlink_locked(struct inode *inode)
 {
-       nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+       nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+                                            NFS_INO_INVALID_CTIME |
+                                            NFS_INO_INVALID_NLINK);
        drop_nlink(inode);
 }
 
@@ -1186,11 +1198,23 @@ nfs4_update_changeattr_locked(struct inode *inode,
                unsigned long timestamp, unsigned long cache_validity)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
+       u64 change_attr = inode_peek_iversion_raw(inode);
 
        cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
 
-       if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) {
-               nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
+       switch (NFS_SERVER(inode)->change_attr_type) {
+       case NFS4_CHANGE_TYPE_IS_UNDEFINED:
+               break;
+       case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+               if ((s64)(change_attr - cinfo->after) > 0)
+                       goto out;
+               break;
+       default:
+               if ((s64)(change_attr - cinfo->after) >= 0)
+                       goto out;
+       }
+
+       if (cinfo->atomic && cinfo->before == change_attr) {
                nfsi->attrtimeo_timestamp = jiffies;
        } else {
                if (S_ISDIR(inode->i_mode)) {
@@ -1202,7 +1226,7 @@ nfs4_update_changeattr_locked(struct inode *inode,
                                cache_validity |= NFS_INO_REVAL_PAGECACHE;
                }
 
-               if (cinfo->before != inode_peek_iversion_raw(inode))
+               if (cinfo->before != change_attr)
                        cache_validity |= NFS_INO_INVALID_ACCESS |
                                          NFS_INO_INVALID_ACL |
                                          NFS_INO_INVALID_XATTR;
@@ -1210,8 +1234,9 @@ nfs4_update_changeattr_locked(struct inode *inode,
        inode_set_iversion_raw(inode, cinfo->after);
        nfsi->read_cache_jiffies = timestamp;
        nfsi->attr_gencount = nfs_inc_attr_generation_counter();
-       nfs_set_cache_invalid(inode, cache_validity);
        nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
+out:
+       nfs_set_cache_invalid(inode, cache_validity);
 }
 
 void
@@ -3344,12 +3369,17 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
                .inode = inode,
                .stateid = &arg.stateid,
        };
+       unsigned long adjust_flags = NFS_INO_INVALID_CHANGE;
        int err;
 
+       if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID))
+               adjust_flags |= NFS_INO_INVALID_MODE;
+       if (sattr->ia_valid & (ATTR_UID | ATTR_GID))
+               adjust_flags |= NFS_INO_INVALID_OTHER;
+
        do {
-               nfs4_bitmap_copy_adjust_setattr(bitmask,
-                               nfs4_bitmask(server, olabel),
-                               inode);
+               nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, olabel),
+                                       inode, adjust_flags);
 
                err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
                switch (err) {
@@ -3591,6 +3621,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        struct nfs4_closedata *calldata = data;
        struct nfs4_state *state = calldata->state;
        struct inode *inode = calldata->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layout_hdr *lo;
        bool is_rdonly, is_wronly, is_rdwr;
        int call_close = 0;
@@ -3647,8 +3678,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
                /* Close-to-open cache consistency revalidation */
                if (!nfs4_have_delegation(inode, FMODE_READ)) {
-                       calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
-                       nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL);
+                       nfs4_bitmask_set(calldata->arg.bitmask_store,
+                                        server->cache_consistency_bitmask,
+                                        inode, server, NULL);
+                       calldata->arg.bitmask = calldata->arg.bitmask_store;
                } else
                        calldata->arg.bitmask = NULL;
        }
@@ -3835,12 +3868,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                        res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
                }
                memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
-               server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
-                               NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
-                               NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
-                               NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
-                               NFS_CAP_CTIME|NFS_CAP_MTIME|
-                               NFS_CAP_SECURITY_LABEL);
+               server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
+                                 NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+               server->fattr_valid = NFS_ATTR_FATTR_V4;
                if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
                                res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
                        server->caps |= NFS_CAP_ACLS;
@@ -3848,25 +3878,29 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                        server->caps |= NFS_CAP_HARDLINKS;
                if (res.has_symlinks != 0)
                        server->caps |= NFS_CAP_SYMLINKS;
-               if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
-                       server->caps |= NFS_CAP_FILEID;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
-                       server->caps |= NFS_CAP_MODE;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
-                       server->caps |= NFS_CAP_NLINK;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
-                       server->caps |= NFS_CAP_OWNER;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
-                       server->caps |= NFS_CAP_OWNER_GROUP;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
-                       server->caps |= NFS_CAP_ATIME;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
-                       server->caps |= NFS_CAP_CTIME;
-               if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
-                       server->caps |= NFS_CAP_MTIME;
+               if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_MODE;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_NLINK;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER))
+                       server->fattr_valid &= ~(NFS_ATTR_FATTR_OWNER |
+                               NFS_ATTR_FATTR_OWNER_NAME);
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP))
+                       server->fattr_valid &= ~(NFS_ATTR_FATTR_GROUP |
+                               NFS_ATTR_FATTR_GROUP_NAME);
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_SPACE_USED))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_SPACE_USED;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_ATIME;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
+               if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
-               if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
-                       server->caps |= NFS_CAP_SECURITY_LABEL;
+               if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL))
+                       server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL;
 #endif
                memcpy(server->attr_bitmask_nl, res.attr_bitmask,
                                sizeof(server->attr_bitmask));
@@ -4154,8 +4188,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
        if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
                task_flags |= RPC_TASK_TIMEOUT;
 
-       nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
-
+       nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode, 0);
        nfs_fattr_init(fattr);
        nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
        return nfs4_do_call_sync(server->client, server, &msg,
@@ -4582,11 +4615,11 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype)
        status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
        if (status == 0) {
                spin_lock(&dir->i_lock);
-               nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
-                                             NFS_INO_INVALID_DATA);
                /* Removing a directory decrements nlink in the parent */
                if (ftype == NF4DIR && dir->i_nlink > 2)
                        nfs4_dec_nlink_locked(dir);
+               nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
+                                             NFS_INO_INVALID_DATA);
                spin_unlock(&dir->i_lock);
        }
        return status;
@@ -4715,11 +4748,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
                        /* Note: If we moved a directory, nlink will change */
                        nfs4_update_changeattr(old_dir, &res->old_cinfo,
                                        res->old_fattr->time_start,
-                                       NFS_INO_INVALID_OTHER |
+                                       NFS_INO_INVALID_NLINK |
                                            NFS_INO_INVALID_DATA);
                        nfs4_update_changeattr(new_dir, &res->new_cinfo,
                                        res->new_fattr->time_start,
-                                       NFS_INO_INVALID_OTHER |
+                                       NFS_INO_INVALID_NLINK |
                                            NFS_INO_INVALID_DATA);
                } else
                        nfs4_update_changeattr(old_dir, &res->old_cinfo,
@@ -4761,12 +4794,13 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
        }
 
        nfs4_inode_make_writeable(inode);
-       nfs4_bitmap_copy_adjust_setattr(bitmask, nfs4_bitmask(server, res.label), inode);
-
+       nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.label), inode,
+                               NFS_INO_INVALID_CHANGE);
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
        if (!status) {
                nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
                                       NFS_INO_INVALID_DATA);
+               nfs4_inc_nlink(inode);
                status = nfs_post_op_update_inode(inode, res.fattr);
                if (!status)
                        nfs_setsecurity(inode, res.fattr, res.label);
@@ -4844,12 +4878,12 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
                                    &data->arg.seq_args, &data->res.seq_res, 1);
        if (status == 0) {
                spin_lock(&dir->i_lock);
-               nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
-                               data->res.fattr->time_start,
-                               NFS_INO_INVALID_DATA);
                /* Creating a directory bumps nlink in the parent */
                if (data->arg.ftype == NF4DIR)
                        nfs4_inc_nlink_locked(dir);
+               nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+                                             data->res.fattr->time_start,
+                                             NFS_INO_INVALID_DATA);
                spin_unlock(&dir->i_lock);
                status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
        }
@@ -5416,37 +5450,39 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
        return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
 
-static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
-                               struct nfs_server *server,
-                               struct nfs4_label *label)
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
+                            struct inode *inode, struct nfs_server *server,
+                            struct nfs4_label *label)
 {
-
        unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+       unsigned int i;
 
-       if ((cache_validity & NFS_INO_INVALID_DATA) ||
-               (cache_validity & NFS_INO_REVAL_PAGECACHE) ||
-               (cache_validity & NFS_INO_REVAL_FORCED) ||
-               (cache_validity & NFS_INO_INVALID_OTHER))
-               nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
+       memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ);
 
+       if (cache_validity & NFS_INO_INVALID_CHANGE)
+               bitmask[0] |= FATTR4_WORD0_CHANGE;
        if (cache_validity & NFS_INO_INVALID_ATIME)
                bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+       if (cache_validity & NFS_INO_INVALID_MODE)
+               bitmask[1] |= FATTR4_WORD1_MODE;
        if (cache_validity & NFS_INO_INVALID_OTHER)
-               bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
-                               FATTR4_WORD1_OWNER_GROUP |
-                               FATTR4_WORD1_NUMLINKS;
+               bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP;
+       if (cache_validity & NFS_INO_INVALID_NLINK)
+               bitmask[1] |= FATTR4_WORD1_NUMLINKS;
        if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
                bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
-       if (cache_validity & NFS_INO_INVALID_CHANGE)
-               bitmask[0] |= FATTR4_WORD0_CHANGE;
        if (cache_validity & NFS_INO_INVALID_CTIME)
                bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
        if (cache_validity & NFS_INO_INVALID_MTIME)
                bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
-       if (cache_validity & NFS_INO_INVALID_SIZE)
-               bitmask[0] |= FATTR4_WORD0_SIZE;
        if (cache_validity & NFS_INO_INVALID_BLOCKS)
                bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
+       if (cache_validity & NFS_INO_INVALID_SIZE)
+               bitmask[0] |= FATTR4_WORD0_SIZE;
+
+       for (i = 0; i < NFS4_BITMASK_SZ; i++)
+               bitmask[i] &= server->attr_bitmask[i];
 }
 
 static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
@@ -5459,8 +5495,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
                hdr->args.bitmask = NULL;
                hdr->res.fattr = NULL;
        } else {
-               hdr->args.bitmask = server->cache_consistency_bitmask;
-               nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL);
+               nfs4_bitmask_set(hdr->args.bitmask_store,
+                                server->cache_consistency_bitmask,
+                                hdr->inode, server, NULL);
+               hdr->args.bitmask = hdr->args.bitmask_store;
        }
 
        if (!hdr->pgio_done_cb)
@@ -5858,7 +5896,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
 
        if (!nfs4_server_supports_acls(server))
                return -EOPNOTSUPP;
-       ret = nfs_revalidate_inode(server, inode);
+       ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (ret < 0)
                return ret;
        if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
@@ -6502,8 +6540,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
 
        data->args.fhandle = &data->fh;
        data->args.stateid = &data->stateid;
-       data->args.bitmask = server->cache_consistency_bitmask;
-       nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL);
+       nfs4_bitmask_set(data->args.bitmask_store,
+                        server->cache_consistency_bitmask, inode, server,
+                        NULL);
+       data->args.bitmask = data->args.bitmask_store;
        nfs_copy_fh(&data->fh, NFS_FH(inode));
        nfs4_stateid_copy(&data->stateid, stateid);
        data->res.fattr = &data->fattr;
@@ -7250,22 +7290,22 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
 
 #ifdef CONFIG_NFS_V4_1
 struct nfs4_lock_waiter {
-       struct task_struct      *task;
        struct inode            *inode;
-       struct nfs_lowner       *owner;
+       struct nfs_lowner       owner;
+       wait_queue_entry_t      wait;
 };
 
 static int
 nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
 {
-       int ret;
-       struct nfs4_lock_waiter *waiter = wait->private;
+       struct nfs4_lock_waiter *waiter =
+               container_of(wait, struct nfs4_lock_waiter, wait);
 
        /* NULL key means to wake up everyone */
        if (key) {
                struct cb_notify_lock_args      *cbnl = key;
                struct nfs_lowner               *lowner = &cbnl->cbnl_owner,
-                                               *wowner = waiter->owner;
+                                               *wowner = &waiter->owner;
 
                /* Only wake if the callback was for the same owner. */
                if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev)
@@ -7276,53 +7316,45 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo
                        return 0;
        }
 
-       /* override "private" so we can use default_wake_function */
-       wait->private = waiter->task;
-       ret = woken_wake_function(wait, mode, flags, key);
-       if (ret)
-               list_del_init(&wait->entry);
-       wait->private = waiter;
-       return ret;
+       return woken_wake_function(wait, mode, flags, key);
 }
 
 static int
 nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-       int status = -ERESTARTSYS;
        struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
        struct nfs_server *server = NFS_SERVER(state->inode);
        struct nfs_client *clp = server->nfs_client;
        wait_queue_head_t *q = &clp->cl_lock_waitq;
-       struct nfs_lowner owner = { .clientid = clp->cl_clientid,
-                                   .id = lsp->ls_seqid.owner_id,
-                                   .s_dev = server->s_dev };
-       struct nfs4_lock_waiter waiter = { .task  = current,
-                                          .inode = state->inode,
-                                          .owner = &owner};
-       wait_queue_entry_t wait;
+       struct nfs4_lock_waiter waiter = {
+               .inode = state->inode,
+               .owner = { .clientid = clp->cl_clientid,
+                          .id = lsp->ls_seqid.owner_id,
+                          .s_dev = server->s_dev },
+       };
+       int status;
 
        /* Don't bother with waitqueue if we don't expect a callback */
        if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
                return nfs4_retry_setlk_simple(state, cmd, request);
 
-       init_wait(&wait);
-       wait.private = &waiter;
-       wait.func = nfs4_wake_lock_waiter;
+       init_wait(&waiter.wait);
+       waiter.wait.func = nfs4_wake_lock_waiter;
+       add_wait_queue(q, &waiter.wait);
 
-       while(!signalled()) {
-               add_wait_queue(q, &wait);
+       do {
                status = nfs4_proc_setlk(state, cmd, request);
-               if ((status != -EAGAIN) || IS_SETLK(cmd)) {
-                       finish_wait(q, &wait);
+               if (status != -EAGAIN || IS_SETLK(cmd))
                        break;
-               }
 
                status = -ERESTARTSYS;
                freezer_do_not_count();
-               wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
+               wait_woken(&waiter.wait, TASK_INTERRUPTIBLE,
+                          NFS4_LOCK_MAXTIMEOUT);
                freezer_count();
-               finish_wait(q, &wait);
-       }
+       } while (!signalled());
+
+       remove_wait_queue(q, &waiter.wait);
 
        return status;
 }
@@ -7615,7 +7647,7 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
                        return -EACCES;
        }
 
-       ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (ret)
                return ret;
 
@@ -7646,7 +7678,7 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
                        return 0;
        }
 
-       ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
        if (ret)
                return ret;
 
@@ -10427,9 +10459,3 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
 #endif
        NULL
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index ff876dd..db3811a 100644 (file)
@@ -149,9 +149,3 @@ void nfs4_set_lease_period(struct nfs_client *clp,
        /* Cap maximum reconnect timeout at 1/2 lease period */
        rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1);
 }
-
-/*
- * Local variables:
- *   c-basic-offset: 8
- * End:
- */
index 3a51351..f22818a 100644 (file)
@@ -645,7 +645,7 @@ void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
 }
 
 /**
- * nfs4_purge_state_owners - Release all cached state owners
+ * nfs4_free_state_owners - Release all cached state owners
  * @head: resulting list of state owners
  *
  * Frees a list of state owners that was generated by
@@ -2695,9 +2695,3 @@ static int nfs4_run_state_manager(void *ptr)
        module_put_and_exit(0);
        return 0;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 48d761e..2ef75ca 100644 (file)
@@ -666,7 +666,42 @@ TRACE_EVENT(nfs4_state_mgr_failed,
                )
 )
 
-TRACE_EVENT(nfs4_xdr_status,
+TRACE_EVENT(nfs4_xdr_bad_operation,
+               TP_PROTO(
+                       const struct xdr_stream *xdr,
+                       u32 op,
+                       u32 expected
+               ),
+
+               TP_ARGS(xdr, op, expected),
+
+               TP_STRUCT__entry(
+                       __field(unsigned int, task_id)
+                       __field(unsigned int, client_id)
+                       __field(u32, xid)
+                       __field(u32, op)
+                       __field(u32, expected)
+               ),
+
+               TP_fast_assign(
+                       const struct rpc_rqst *rqstp = xdr->rqst;
+                       const struct rpc_task *task = rqstp->rq_task;
+
+                       __entry->task_id = task->tk_pid;
+                       __entry->client_id = task->tk_client->cl_clid;
+                       __entry->xid = be32_to_cpu(rqstp->rq_xid);
+                       __entry->op = op;
+                       __entry->expected = expected;
+               ),
+
+               TP_printk(
+                       "task:%u@%d xid=0x%08x operation=%u, expected=%u",
+                       __entry->task_id, __entry->client_id, __entry->xid,
+                       __entry->op, __entry->expected
+               )
+);
+
+DECLARE_EVENT_CLASS(nfs4_xdr_event,
                TP_PROTO(
                        const struct xdr_stream *xdr,
                        u32 op,
@@ -701,6 +736,16 @@ TRACE_EVENT(nfs4_xdr_status,
                        __entry->op
                )
 );
+#define DEFINE_NFS4_XDR_EVENT(name) \
+       DEFINE_EVENT(nfs4_xdr_event, name, \
+                       TP_PROTO( \
+                               const struct xdr_stream *xdr, \
+                               u32 op, \
+                               u32 error \
+                       ), \
+                       TP_ARGS(xdr, op, error))
+DEFINE_NFS4_XDR_EVENT(nfs4_xdr_status);
+DEFINE_NFS4_XDR_EVENT(nfs4_xdr_bad_filehandle);
 
 DECLARE_EVENT_CLASS(nfs4_cb_error_class,
                TP_PROTO(
index ac6b79e..a8cff19 100644 (file)
@@ -144,7 +144,17 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
  * layout types will be returned.
  */
 #define decode_fsinfo_maxsz    (op_decode_hdr_maxsz + \
-                                nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
+                                nfs4_fattr_bitmap_maxsz + 1 + \
+                                1 /* lease time */ + \
+                                2 /* max filesize */ + \
+                                2 /* max read */ + \
+                                2 /* max write */ + \
+                                nfstime4_maxsz /* time delta */ + \
+                                5 /* fs layout types */ + \
+                                1 /* layout blksize */ + \
+                                1 /* clone blksize */ + \
+                                1 /* change attr type */ + \
+                                1 /* xattr support */)
 #define encode_renew_maxsz     (op_encode_hdr_maxsz + 3)
 #define decode_renew_maxsz     (op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
@@ -3200,9 +3210,7 @@ out_status:
        *nfs_retval = nfs4_stat_to_errno(nfserr);
        return true;
 out_bad_operation:
-       dprintk("nfs: Server returned operation"
-               " %d but we issued a request for %d\n",
-                       opnum, expected);
+       trace_nfs4_xdr_bad_operation(xdr, opnum, expected);
        *nfs_retval = -EREMOTEIO;
        return false;
 out_overflow:
@@ -3487,8 +3495,11 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru
                if (unlikely(!p))
                        return -EIO;
                len = be32_to_cpup(p);
-               if (len > NFS4_FHSIZE)
-                       return -EIO;
+               if (len > NFS4_FHSIZE || len == 0) {
+                       trace_nfs4_xdr_bad_filehandle(xdr, OP_READDIR,
+                                                     NFS4ERR_BADHANDLE);
+                       return -EREMOTEIO;
+               }
                p = xdr_inline_decode(xdr, len);
                if (unlikely(!p))
                        return -EIO;
@@ -4837,6 +4848,32 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
        return 0;
 }
 
+static int decode_attr_change_attr_type(struct xdr_stream *xdr,
+                                       uint32_t *bitmap,
+                                       enum nfs4_change_attr_type *res)
+{
+       u32 tmp = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
+       dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+       if (bitmap[2] & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
+               if (xdr_stream_decode_u32(xdr, &tmp))
+                       return -EIO;
+               bitmap[2] &= ~FATTR4_WORD2_CHANGE_ATTR_TYPE;
+       }
+
+       switch(tmp) {
+       case NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR:
+       case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER:
+       case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS:
+       case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+               *res = tmp;
+               break;
+       default:
+               *res = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+       }
+       return 0;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
        unsigned int savep;
@@ -4885,6 +4922,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
        if (status)
                goto xdr_error;
 
+       status = decode_attr_change_attr_type(xdr, bitmap,
+                                             &fsinfo->change_attr_type);
+       if (status)
+               goto xdr_error;
+
        status = decode_attr_xattrsupport(xdr, bitmap,
                                          &fsinfo->xattr_support);
        if (status)
@@ -4913,8 +4955,10 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
        if (unlikely(!p))
                return -EIO;
        len = be32_to_cpup(p);
-       if (len > NFS4_FHSIZE)
-               return -EIO;
+       if (len > NFS4_FHSIZE || len == 0) {
+               trace_nfs4_xdr_bad_filehandle(xdr, OP_GETFH, NFS4ERR_BADHANDLE);
+               return -EREMOTEIO;
+       }
        fh->size = len;
        p = xdr_inline_decode(xdr, len);
        if (unlikely(!p))
@@ -7629,9 +7673,3 @@ const struct rpc_version nfs_version4 = {
        .procs                  = nfs4_procedures,
        .counts                 = nfs_version4_counts,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index a90b363..5d1bfcc 100644 (file)
@@ -12,3 +12,4 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_bad_filehandle);
index 5a59dcd..eb1ef34 100644 (file)
@@ -45,6 +45,11 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
+TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
 
 #define nfs_show_cache_validity(v) \
        __print_flags(v, "|", \
@@ -60,7 +65,11 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
                        { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
                        { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
                        { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
-                       { NFS_INO_INVALID_XATTR, "INVALID_XATTR" })
+                       { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \
+                       { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
+                       { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
+                       { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
+                       { NFS_INO_INVALID_MODE, "INVALID_MODE" })
 
 TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
 TRACE_DEFINE_ENUM(NFS_INO_STALE);
@@ -1392,7 +1401,7 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
                        { NFSERR_BADTYPE, "BADTYPE" }, \
                        { NFSERR_JUKEBOX, "JUKEBOX" })
 
-TRACE_EVENT(nfs_xdr_status,
+DECLARE_EVENT_CLASS(nfs_xdr_event,
                TP_PROTO(
                        const struct xdr_stream *xdr,
                        int error
@@ -1434,6 +1443,15 @@ TRACE_EVENT(nfs_xdr_status,
                        nfs_show_status(__entry->error)
                )
 );
+#define DEFINE_NFS_XDR_EVENT(name) \
+       DEFINE_EVENT(nfs_xdr_event, name, \
+                       TP_PROTO( \
+                               const struct xdr_stream *xdr, \
+                               int error \
+                       ), \
+                       TP_ARGS(xdr, error))
+DEFINE_NFS_XDR_EVENT(nfs_xdr_status);
+DEFINE_NFS_XDR_EVENT(nfs_xdr_bad_filehandle);
 
 #endif /* _TRACE_NFS_H */
 
index 78c9c4b..6c20b28 100644 (file)
@@ -577,7 +577,7 @@ static void nfs_clear_request(struct nfs_page *req)
 }
 
 /**
- * nfs_release_request - Release the count on an NFS read/write request
+ * nfs_free_request - Release the count on an NFS read/write request
  * @req: request to release
  *
  * Note: Should never be called with the spinlock held!
@@ -1152,7 +1152,7 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
 }
 
 /**
- * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * __nfs_pageio_add_request - Attempt to coalesce a request into a page list.
  * @desc: destination io descriptor
  * @req: request
  *
index 102b66e..03e0b34 100644 (file)
@@ -1344,7 +1344,7 @@ _pnfs_return_layout(struct inode *ino)
        }
        valid_layout = pnfs_layout_is_valid(lo);
        pnfs_clear_layoutcommit(ino, &tmp_list);
-       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
+       pnfs_mark_matching_lsegs_return(lo, &tmp_list, NULL, 0);
 
        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
@@ -2410,9 +2410,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                        .iomode = IOMODE_ANY,
                        .length = NFS4_MAX_UINT64,
                };
-               pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
-               pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
-                                               &range, 0);
+               pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0);
                goto out_forget;
        } else {
                /* We have a completely new layout */
@@ -2468,6 +2466,9 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 
        assert_spin_locked(&lo->plh_inode->i_lock);
 
+       if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+               tmp_list = &lo->plh_return_segs;
+
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
                        dprintk("%s: marking lseg %p iomode %d "
@@ -2475,6 +2476,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                lseg, lseg->pls_range.iomode,
                                lseg->pls_range.offset,
                                lseg->pls_range.length);
+                       if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+                               tmp_list = &lo->plh_return_segs;
                        if (mark_lseg_invalid(lseg, tmp_list))
                                continue;
                        remaining++;
index 73ab7c5..ea19dbf 100644 (file)
@@ -91,6 +91,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
        info->dtpref = fsinfo.tsize;
        info->maxfilesize = 0x7FFFFFFF;
        info->lease_time = 0;
+       info->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
        return 0;
 }
 
index 4aaa1f5..19a212f 100644 (file)
@@ -116,16 +116,12 @@ static void unregister_nfs4_fs(void)
 #ifdef CONFIG_NFS_V4_2
 static void nfs_ssc_register_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
-#endif
 }
 
 static void nfs_ssc_unregister_ops(void)
 {
-#ifdef CONFIG_NFSD_V4
        nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
-#endif
 }
 #endif /* CONFIG_NFS_V4_2 */
 
@@ -867,7 +863,7 @@ static int nfs_request_mount(struct fs_context *fc,
         * Now ask the mount server to map our export path
         * to a file handle.
         */
-       status = nfs_mount(&request);
+       status = nfs_mount(&request, ctx->timeo, ctx->retrans);
        if (status != 0) {
                dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
                                request.hostname, status);
index f05a903..3bf8217 100644 (file)
@@ -764,9 +764,6 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
         * with invalidate/truncate.
         */
        spin_lock(&mapping->private_lock);
-       if (!nfs_have_writebacks(inode) &&
-           NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
-               inode_inc_iversion_raw(inode);
        if (likely(!PageSwapCache(req->wb_page))) {
                set_bit(PG_MAPPED, &req->wb_flags);
                SetPagePrivate(req->wb_page);
@@ -1293,7 +1290,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode,
        if (nfs_have_delegated_attributes(inode))
                goto out;
        if (nfsi->cache_validity &
-           (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE))
+           (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE))
                return false;
        smp_rmb();
        if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0)
@@ -1604,7 +1601,7 @@ static int nfs_writeback_done(struct rpc_task *task,
        /* Deal with the suid/sgid bit corner case */
        if (nfs_should_remove_suid(inode)) {
                spin_lock(&inode->i_lock);
-               nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+               nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
                spin_unlock(&inode->i_lock);
        }
        return 0;
index 5fa38ad..f229172 100644 (file)
@@ -138,7 +138,7 @@ config NFSD_FLEXFILELAYOUT
 
 config NFSD_V4_2_INTER_SSC
        bool "NFSv4.2 inter server to server COPY"
-       depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2
+       depends on NFSD_V4 && NFS_V4_2
        help
          This option enables support for NFSv4.2 inter server to
          server copy where the destination server calls the NFSv4.2
index daf43b9..f4ce93d 100644 (file)
@@ -3317,9 +3317,3 @@ const struct svc_version nfsd_version4 = {
        .vs_rpcb_optnl          = true,
        .vs_need_cong_ctrl      = true,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 7698172..b517a87 100644 (file)
@@ -354,6 +354,124 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
        .release        = nfsd4_cb_notify_lock_release,
 };
 
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempt to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *     OPEN allow read, deny write
+ *     OPEN allow both, deny none
+ *     DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
+static unsigned int
+bmap_to_share_mode(unsigned long bmap)
+{
+       int i;
+       unsigned int access = 0;
+
+       for (i = 1; i < 4; i++) {
+               if (test_bit(i, &bmap))
+                       access |= i;
+       }
+       return access;
+}
+
+/* set share access for a given stateid */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << access;
+
+       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+       stp->st_access_bmap |= mask;
+}
+
+/* clear share access for a given stateid */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << access;
+
+       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+       stp->st_access_bmap &= ~mask;
+}
+
+/* test whether a given stateid has access */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << access;
+
+       return (bool)(stp->st_access_bmap & mask);
+}
+
+/* set share deny for a given stateid */
+static inline void
+set_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << deny;
+
+       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+       stp->st_deny_bmap |= mask;
+}
+
+/* clear share deny for a given stateid */
+static inline void
+clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << deny;
+
+       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+       stp->st_deny_bmap &= ~mask;
+}
+
+/* test whether a given stateid is denying specific access */
+static inline bool
+test_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+       unsigned char mask = 1 << deny;
+
+       return (bool)(stp->st_deny_bmap & mask);
+}
+
+static int nfs4_access_to_omode(u32 access)
+{
+       switch (access & NFS4_SHARE_ACCESS_BOTH) {
+       case NFS4_SHARE_ACCESS_READ:
+               return O_RDONLY;
+       case NFS4_SHARE_ACCESS_WRITE:
+               return O_WRONLY;
+       case NFS4_SHARE_ACCESS_BOTH:
+               return O_RDWR;
+       }
+       WARN_ON_ONCE(1);
+       return O_RDONLY;
+}
+
+static inline int
+access_permit_read(struct nfs4_ol_stateid *stp)
+{
+       return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+               test_access(NFS4_SHARE_ACCESS_WRITE, stp);
+}
+
+static inline int
+access_permit_write(struct nfs4_ol_stateid *stp)
+{
+       return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp);
+}
+
 static inline struct nfs4_stateowner *
 nfs4_get_stateowner(struct nfs4_stateowner *sop)
 {
@@ -543,14 +661,12 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
 #define FILE_HASH_BITS                   8
 #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
 
-static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
+static unsigned int file_hashval(struct svc_fh *fh)
 {
-       return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
-}
+       struct inode *inode = d_inode(fh->fh_dentry);
 
-static unsigned int file_hashval(struct knfsd_fh *fh)
-{
-       return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
+       /* XXX: why not (here & in file cache) use inode? */
+       return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
 }
 
 static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
@@ -1152,108 +1268,6 @@ static unsigned int clientstr_hashval(struct xdr_netobj name)
        return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK;
 }
 
-/*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *     OPEN allow read, deny write
- *     OPEN allow both, deny none
- *     DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static unsigned int
-bmap_to_share_mode(unsigned long bmap) {
-       int i;
-       unsigned int access = 0;
-
-       for (i = 1; i < 4; i++) {
-               if (test_bit(i, &bmap))
-                       access |= i;
-       }
-       return access;
-}
-
-/* set share access for a given stateid */
-static inline void
-set_access(u32 access, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << access;
-
-       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
-       stp->st_access_bmap |= mask;
-}
-
-/* clear share access for a given stateid */
-static inline void
-clear_access(u32 access, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << access;
-
-       WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
-       stp->st_access_bmap &= ~mask;
-}
-
-/* test whether a given stateid has access */
-static inline bool
-test_access(u32 access, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << access;
-
-       return (bool)(stp->st_access_bmap & mask);
-}
-
-/* set share deny for a given stateid */
-static inline void
-set_deny(u32 deny, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << deny;
-
-       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
-       stp->st_deny_bmap |= mask;
-}
-
-/* clear share deny for a given stateid */
-static inline void
-clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << deny;
-
-       WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
-       stp->st_deny_bmap &= ~mask;
-}
-
-/* test whether a given stateid is denying specific access */
-static inline bool
-test_deny(u32 deny, struct nfs4_ol_stateid *stp)
-{
-       unsigned char mask = 1 << deny;
-
-       return (bool)(stp->st_deny_bmap & mask);
-}
-
-static int nfs4_access_to_omode(u32 access)
-{
-       switch (access & NFS4_SHARE_ACCESS_BOTH) {
-       case NFS4_SHARE_ACCESS_READ:
-               return O_RDONLY;
-       case NFS4_SHARE_ACCESS_WRITE:
-               return O_WRONLY;
-       case NFS4_SHARE_ACCESS_BOTH:
-               return O_RDWR;
-       }
-       WARN_ON_ONCE(1);
-       return O_RDONLY;
-}
-
 /*
  * A stateid that had a deny mode associated with it is being released
  * or downgraded. Recalculate the deny mode on the file.
@@ -3125,6 +3139,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out_nolock;
                }
                new->cl_mach_cred = true;
+               break;
        case SP4_NONE:
                break;
        default:                                /* checked by xdr code */
@@ -4072,7 +4087,7 @@ static struct nfs4_file *nfsd4_alloc_file(void)
 }
 
 /* OPEN Share state helper functions */
-static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
+static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
                                struct nfs4_file *fp)
 {
        lockdep_assert_held(&state_lock);
@@ -4082,12 +4097,14 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
        INIT_LIST_HEAD(&fp->fi_stateids);
        INIT_LIST_HEAD(&fp->fi_delegations);
        INIT_LIST_HEAD(&fp->fi_clnt_odstate);
-       fh_copy_shallow(&fp->fi_fhandle, fh);
+       fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle);
        fp->fi_deleg_file = NULL;
        fp->fi_had_conflict = false;
        fp->fi_share_deny = 0;
        memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
        memset(fp->fi_access, 0, sizeof(fp->fi_access));
+       fp->fi_aliased = false;
+       fp->fi_inode = d_inode(fh->fh_dentry);
 #ifdef CONFIG_NFSD_PNFS
        INIT_LIST_HEAD(&fp->fi_lo_states);
        atomic_set(&fp->fi_lo_recalls, 0);
@@ -4426,13 +4443,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
 
 /* search file_hashtbl[] for file */
 static struct nfs4_file *
-find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
+find_file_locked(struct svc_fh *fh, unsigned int hashval)
 {
        struct nfs4_file *fp;
 
        hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
                                lockdep_is_held(&state_lock)) {
-               if (fh_match(&fp->fi_fhandle, fh)) {
+               if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
                        if (refcount_inc_not_zero(&fp->fi_ref))
                                return fp;
                }
@@ -4440,8 +4457,32 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
        return NULL;
 }
 
-struct nfs4_file *
-find_file(struct knfsd_fh *fh)
+static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
+                                    unsigned int hashval)
+{
+       struct nfs4_file *fp;
+       struct nfs4_file *ret = NULL;
+       bool alias_found = false;
+
+       spin_lock(&state_lock);
+       hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
+                                lockdep_is_held(&state_lock)) {
+               if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
+                       if (refcount_inc_not_zero(&fp->fi_ref))
+                               ret = fp;
+               } else if (d_inode(fh->fh_dentry) == fp->fi_inode)
+                       fp->fi_aliased = alias_found = true;
+       }
+       if (likely(ret == NULL)) {
+               nfsd4_init_file(fh, hashval, new);
+               new->fi_aliased = alias_found;
+               ret = new;
+       }
+       spin_unlock(&state_lock);
+       return ret;
+}
+
+static struct nfs4_file * find_file(struct svc_fh *fh)
 {
        struct nfs4_file *fp;
        unsigned int hashval = file_hashval(fh);
@@ -4453,7 +4494,7 @@ find_file(struct knfsd_fh *fh)
 }
 
 static struct nfs4_file *
-find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
+find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
 {
        struct nfs4_file *fp;
        unsigned int hashval = file_hashval(fh);
@@ -4464,15 +4505,7 @@ find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
        if (fp)
                return fp;
 
-       spin_lock(&state_lock);
-       fp = find_file_locked(fh, hashval);
-       if (likely(fp == NULL)) {
-               nfsd4_init_file(fh, hashval, new);
-               fp = new;
-       }
-       spin_unlock(&state_lock);
-
-       return fp;
+       return insert_file(new, fh, hashval);
 }
 
 /*
@@ -4485,7 +4518,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
        struct nfs4_file *fp;
        __be32 ret = nfs_ok;
 
-       fp = find_file(&current_fh->fh_handle);
+       fp = find_file(current_fh);
        if (!fp)
                return ret;
        /* Check for conflicting share reservations */
@@ -4880,6 +4913,11 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
        if (nf)
                nfsd_file_put(nf);
 
+       status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode,
+                                                               access));
+       if (status)
+               goto out_put_access;
+
        status = nfsd4_truncate(rqstp, cur_fh, open);
        if (status)
                goto out_put_access;
@@ -4951,6 +4989,65 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
        return fl;
 }
 
+static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
+                                        struct nfs4_file *fp)
+{
+       struct nfs4_ol_stateid *st;
+       struct file *f = fp->fi_deleg_file->nf_file;
+       struct inode *ino = locks_inode(f);
+       int writes;
+
+       writes = atomic_read(&ino->i_writecount);
+       if (!writes)
+               return 0;
+       /*
+        * There could be multiple filehandles (hence multiple
+        * nfs4_files) referencing this file, but that's not too
+        * common; let's just give up in that case rather than
+        * trying to go look up all the clients using that other
+        * nfs4_file as well:
+        */
+       if (fp->fi_aliased)
+               return -EAGAIN;
+       /*
+        * If there's a close in progress, make sure that we see it
+        * clear any fi_fds[] entries before we see it decrement
+        * i_writecount:
+        */
+       smp_mb__after_atomic();
+
+       if (fp->fi_fds[O_WRONLY])
+               writes--;
+       if (fp->fi_fds[O_RDWR])
+               writes--;
+       if (writes > 0)
+               return -EAGAIN; /* There may be non-NFSv4 writers */
+       /*
+        * It's possible there are non-NFSv4 write opens in progress,
+        * but if they haven't incremented i_writecount yet then they
+        * also haven't called break lease yet; so, they'll break this
+        * lease soon enough.  So, all that's left to check for is NFSv4
+        * opens:
+        */
+       spin_lock(&fp->fi_lock);
+       list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
+               if (st->st_openstp == NULL /* it's an open */ &&
+                   access_permit_write(st) &&
+                   st->st_stid.sc_client != clp) {
+                       spin_unlock(&fp->fi_lock);
+                       return -EAGAIN;
+               }
+       }
+       spin_unlock(&fp->fi_lock);
+       /*
+        * There's a small chance that we could be racing with another
+        * NFSv4 open.  However, any open that hasn't added itself to
+        * the fi_stateids list also hasn't called break_lease yet; so,
+        * they'll break this lease soon enough.
+        */
+       return 0;
+}
+
 static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
                    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
@@ -4970,9 +5067,12 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 
        nf = find_readable_file(fp);
        if (!nf) {
-               /* We should always have a readable file here */
-               WARN_ON_ONCE(1);
-               return ERR_PTR(-EBADF);
+               /*
+                * We probably could attempt another open and get a read
+                * delegation, but for now, don't bother until the
+                * client actually sends us one.
+                */
+               return ERR_PTR(-EAGAIN);
        }
        spin_lock(&state_lock);
        spin_lock(&fp->fi_lock);
@@ -5007,6 +5107,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
                locks_free_lock(fl);
        if (status)
                goto out_clnt_odstate;
+       status = nfsd4_check_conflicting_opens(clp, fp);
+       if (status)
+               goto out_unlock;
 
        spin_lock(&state_lock);
        spin_lock(&fp->fi_lock);
@@ -5088,17 +5191,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
                                goto out_no_deleg;
                        if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
                                goto out_no_deleg;
-                       /*
-                        * Also, if the file was opened for write or
-                        * create, there's a good chance the client's
-                        * about to write to it, resulting in an
-                        * immediate recall (since we don't support
-                        * write delegations):
-                        */
-                       if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-                               goto out_no_deleg;
-                       if (open->op_create == NFS4_OPEN_CREATE)
-                               goto out_no_deleg;
                        break;
                default:
                        goto out_no_deleg;
@@ -5161,7 +5253,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
         * and check for delegations in the process of being recalled.
         * If not found, create the nfs4_file struct
         */
-       fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
+       fp = find_or_add_file(open->op_file, current_fh);
        if (fp != open->op_file) {
                status = nfs4_check_deleg(cl, open, &dp);
                if (status)
@@ -5502,21 +5594,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
        return nfs_ok;
 }
 
-static inline int
-access_permit_read(struct nfs4_ol_stateid *stp)
-{
-       return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
-               test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
-               test_access(NFS4_SHARE_ACCESS_WRITE, stp);
-}
-
-static inline int
-access_permit_write(struct nfs4_ol_stateid *stp)
-{
-       return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
-               test_access(NFS4_SHARE_ACCESS_BOTH, stp);
-}
-
 static
 __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
 {
@@ -6288,15 +6365,6 @@ out:
        return status;
 }
 
-static inline u64
-end_offset(u64 start, u64 len)
-{
-       u64 end;
-
-       end = start + len;
-       return end >= start ? end: NFS4_MAX_UINT64;
-}
-
 /* last octet in a range */
 static inline u64
 last_byte_offset(u64 start, u64 len)
@@ -6865,11 +6933,20 @@ out:
 static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
 {
        struct nfsd_file *nf;
-       __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
-       if (!err) {
-               err = nfserrno(vfs_test_lock(nf->nf_file, lock));
-               nfsd_file_put(nf);
-       }
+       __be32 err;
+
+       err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+       if (err)
+               return err;
+       fh_lock(fhp); /* to block new leases till after test_lock: */
+       err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode,
+                                                       NFSD_MAY_READ));
+       if (err)
+               goto out;
+       err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+out:
+       fh_unlock(fhp);
+       nfsd_file_put(nf);
        return err;
 }
 
index e0f06d3..7abeccb 100644 (file)
@@ -5448,9 +5448,3 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
        nfsd4_sequence_done(resp);
        return 1;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 853bf50..c2c3d90 100644 (file)
@@ -1166,6 +1166,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
                inode->i_fop = &simple_dir_operations;
                inode->i_op = &simple_dir_inode_operations;
                inc_nlink(inode);
+               break;
        default:
                break;
        }
index 82ba034..dd5d699 100644 (file)
@@ -308,7 +308,7 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
 
 static int nfsd_users = 0;
 
-static int nfsd_startup_generic(int nrservs)
+static int nfsd_startup_generic(void)
 {
        int ret;
 
@@ -374,7 +374,7 @@ void nfsd_reset_boot_verifier(struct nfsd_net *nn)
        write_sequnlock(&nn->boot_lock);
 }
 
-static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
+static int nfsd_startup_net(struct net *net, const struct cred *cred)
 {
        struct nfsd_net *nn = net_generic(net, nfsd_net_id);
        int ret;
@@ -382,7 +382,7 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre
        if (nn->nfsd_net_up)
                return 0;
 
-       ret = nfsd_startup_generic(nrservs);
+       ret = nfsd_startup_generic();
        if (ret)
                return ret;
        ret = nfsd_init_socks(net, cred);
@@ -790,7 +790,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 
        nfsd_up_before = nn->nfsd_net_up;
 
-       error = nfsd_startup_net(nrservs, net, cred);
+       error = nfsd_startup_net(net, cred);
        if (error)
                goto out_destroy;
        error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
index 54cab65..e73bdbb 100644 (file)
@@ -516,6 +516,8 @@ struct nfs4_clnt_odstate {
  */
 struct nfs4_file {
        refcount_t              fi_ref;
+       struct inode *          fi_inode;
+       bool                    fi_aliased;
        spinlock_t              fi_lock;
        struct hlist_node       fi_hash;        /* hash on fi_fhandle */
        struct list_head        fi_stateids;
@@ -669,7 +671,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name
                                struct xdr_netobj princhash, struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
 
-struct nfs4_file *find_file(struct knfsd_fh *fh);
 void put_nfs4_file(struct nfs4_file *fi);
 extern void nfs4_put_copy(struct nfsd4_copy *copy);
 extern struct nfsd4_copy *
index fe540a3..a7c4252 100644 (file)
@@ -866,9 +866,3 @@ struct nfsd4_operation {
 
 
 #endif
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index 025fb08..ce14477 100644 (file)
@@ -293,7 +293,7 @@ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
  * nilfs_cpfile_delete_checkpoints - delete checkpoints
  * @cpfile: inode of checkpoint file
  * @start: start checkpoint number
- * @end: end checkpoint numer
+ * @end: end checkpoint number
  *
  * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
  * the period from @start to @end, excluding @end itself. The checkpoints
index 3fcb935..640ac8f 100644 (file)
@@ -1043,7 +1043,7 @@ out:
  * @inode: inode object
  * @argp: pointer on argument from userspace
  *
- * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
  * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
  * performs the actual trim operation.
  *
@@ -1085,7 +1085,7 @@ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
  * @inode: inode object
  * @argp: pointer on argument from userspace
  *
- * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit
+ * Description: nilfs_ioctl_set_alloc_range() function defines lower limit
  * of segments in bytes and upper limit of segments in bytes.
  * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility.
  *
index 189bd10..91eebeb 100644 (file)
@@ -440,10 +440,9 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
 {
        unsigned long ino;
        struct inode *inode;
-       struct qstr dotdot = QSTR_INIT("..", 2);
        struct nilfs_root *root;
 
-       ino = nilfs_inode_by_name(d_inode(child), &dotdot);
+       ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
        if (!ino)
                return ERR_PTR(-ENOENT);
 
index cd4da95..686c8ee 100644 (file)
@@ -2214,7 +2214,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
  * nilfs_construct_segment - construct a logical segment
  * @sb: super block
  *
- * Return Value: On success, 0 is retured. On errors, one of the following
+ * Return Value: On success, 0 is returned. On errors, one of the following
  * negative error code is returned.
  *
  * %-EROFS - Read only filesystem.
@@ -2251,7 +2251,7 @@ int nilfs_construct_segment(struct super_block *sb)
  * @start: start byte offset
  * @end: end byte offset (inclusive)
  *
- * Return Value: On success, 0 is retured. On errors, one of the following
+ * Return Value: On success, 0 is returned. On errors, one of the following
  * negative error code is returned.
  *
  * %-EROFS - Read only filesystem.
index 221a1cc..8b7b01a 100644 (file)
@@ -195,7 +195,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
 /**
  * load_nilfs - load and recover the nilfs
  * @nilfs: the_nilfs structure to be released
- * @sb: super block isntance used to recover past segment
+ * @sb: super block instance used to recover past segment
  *
  * load_nilfs() searches and load the latest super root,
  * attaches the last segment, and does recovery if needed.
index 5259bad..5c72a7e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index 4e86450..f59d8d0 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index 7871078..e032f2e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.c
  *
  * Extent allocs and frees
index 7f973dd..4af7aba 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.h
  *
  * Function prototypes
index ad20403..1294925 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  */
 
index 70ed438..3a52011 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
  */
 
index dabfef9..863a531 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.c
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
index 8f17d2c..d0578e9 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.h
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
index f0b104e..e775877 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * io.c
  *
  * Buffer cache handling
index 1c5e533..2d51649 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_buffer_head.h
  *
  * Buffer cache handling functions defined
index 12a7590..e829c25 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
index beed31e..1d4100a 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
index 1d696c9..810d328 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
index 446e452..b73fc42 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
index 667a5c5..7524994 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * netdebug.c
  *
  * debug functionality for o2net
index 7a7640c..bb82e6b 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
index 3e00066..3490e77 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * nodemanager.h
  *
  * Function prototypes
index 760d850..6088c9f 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_heartbeat.h
  *
  * On-disk structures for ocfs2_heartbeat
index 21ad307..c9a0b77 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_nodemanager.h
  *
  * Header describing the interface between userspace and the kernel
index cea739b..189c111 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
index 6d45ce8..d64bf44 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
index d6067c3..022f716 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.c
  *
  * OCFS2 cluster sysfs interface
index ce38051..70aaba6 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.h
  *
  * Function prototypes for o2cb sysfs interface
index 3bd8119..f660c0d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
index 736338f..a75b551 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * tcp.h
  *
  * Function prototypes
index e6a2b9d..601c99b 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
index 42a61ee..04fc834 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.c
  *
  * dentry cache handling code
index 3686a52..7f246c5 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.h
  *
  * Function prototypes
index bdfba9d..bd8d534 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c
  *
  * Creates, reads, walks and deletes directory-nodes
index e3e7d5d..4b9f5a1 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.h
  *
  * Function prototypes
index 6456c0f..bae60ca 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmapi.h
  *
  * externally exported dlm interfaces
index 70a1076..c681ba9 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmast.c
  *
  * AST and BAST functionality for local and remote nodes
index 58d57e2..fd20227 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmcommon.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
index 6051edc..450d46e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.c
  *
  * underlying calls for lock conversion
index 12d9c28..1f37171 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
index 4b8b41d..d442cf5 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.c
  *
  * debug functionality for the dlm
index f8fd868..e08f735 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.h
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
index 357cfc7..9f90fc9 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.c
  *
  * defines domain join / leave apis
index 7c21664..815abe3 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
index 83f0760..041fd17 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmlock.c
  *
  * underlying calls for lock creation
index f105746..4960a6d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmmod.c
  *
  * standalone DLM module
index afc5173..0e7aad1 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmrecovery.c
  *
  * recovery stuff
index 5ccc4ff..c350bd4 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmthread.c
  *
  * standalone DLM module
index dcb17ca..61103b2 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmunlock.c
  *
  * underlying calls for unlocking locks
index b2870f1..fa0a14f 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmfs.c
  *
  * Code which implements the kernel side of a minimal userspace
index 339f098..29f183a 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.c
  *
  * Code which implements the kernel side of a minimal userspace
index 0558ae7..47ba18e 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.h
  *
  * Userspace dlm defines
index 0fbe8bf..48fd369 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.c
  *
  * Code which implements an OCFS2 specific interface to our DLM.
index b8fbed2..e5da580 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.h
  *
  * description here
index 69ed278..eaa8c80 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.c
  *
  * Functions to facilitate NFS exporting
index d485da0..6363574 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.h
  *
  * Function prototypes
index 7b93e9c..70a768b 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.c
  *
  * Block/Cluster mapping functions
index e5464f6..bc4ed59 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.h
  *
  * In-memory file extent mappings for OCFS2.
index db8a626..f17c3d3 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c
  *
  * File open, close, extend, truncate
index 8536cec..71db8f3 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.h
  *
  * Function prototypes
index 50f11bf..90b8d30 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.c
  *
  * Code which implements online file check.
index 4d00677..d3bcb8b 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.h
  *
  * Online file check.
index 60c5f99..9099d8f 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.c
  *
  * Register ourselves with the heartbaet service, keep our node maps
index 5fedb2d..f1f8b18 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
index 7c9dfd5..bc8f32f 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c
  *
  * vfs' aops, fops, dops and iops
index 51a4f71..82b28fd 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.h
  *
  * Function prototypes
index db52e84..4e589ce 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.c
  *
  * Defines functions of journalling api
index bfe611e..d158acb 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.h
  *
  * Defines journalling api and structures.
index fc8252a..5f6bacb 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.c
  *
  * Node local data allocation
index e8a5cea..08f925b 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.h
  *
  * Function prototypes
index 7edc4e5..fab7c6a 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.c
  *
  * Userspace file locking support
index 389fe1f..b52de39 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.h
  *
  * Function prototypes for Userspace file locking support
index 25cabbf..1834f26 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mmap.c
  *
  * Code to deal with the mess that is clustered mmap.
index 758d966..192cad0 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.c
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
index 28cac43..987f9e5 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.h
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
index 05ced86..2c46ff6 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.c
  *
  * Create and rename file, directory, symlinks
index cc091ed..9cc891e 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.h
  *
  * Function prototypes
index 01ae48c..6dbcf3d 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs1_fs_compat.h
  *
  * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
index 7993d52..bb62cc2 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2.h
  *
  * Defines macros and structures used in OCFS2
index 19137c6..638d875 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_fs.h
  *
  * On-disk structures for OCFS2.
index 273616b..9680797 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_ioctl.h
  *
  * Defines OCFS2 ioctls.
index b4be849..8ac357c 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockid.h
  *
  * Defines OCFS2 lockid bits.
index 5c9c105..31a5e16 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockingver.h
  *
  * Defines OCFS2 Locking version values.
index c19a463..7f6355c 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.c
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
index 0b90144..8197a94 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.h
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
index bf3842e..769e466 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.c
  *
  * Allocation reservations implementation
index 6ac8812..677c506 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.h
  *
  * Allocation reservations function prototypes and structures.
index 24eb52f..d65d43c 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.c
  *
  * volume resize.
index 0af0c02..4990637 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.h
  *
  * Function prototypes
index 4da0e4b..0b0ae3e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slot_map.c
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
index 93b53e7..a436445 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slotmap.h
  *
  * description here
index f700120..88f75f7 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_o2cb.c
  *
  * Code which interfaces ocfs2 with the o2cb stack.
index 7397064..85a4762 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_user.c
  *
  * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
index 8d33ebc..d50e8b8 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.c
  *
  * Code which implements an OCFS2 specific interface to underlying
index e9d26cb..3636847 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.h
  *
  * Glue to the underlying cluster stack.
index 8c8cf7f..8521942 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.c
  *
  * metadata alloc and free
index 50b3625..5805a03 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.h
  *
  * Defines sub allocator api
index 079f882..c86bd4e 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.c
  *
  * load/unload driver, mount/dismount volumes
index 76facaf..8312651 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.h
  *
  * Function prototypes
index 94cfacc..f755a49 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  *  linux/cluster/ssi/cfs/symlink.c
  *
  *     This program is free software; you can redistribute it and/or
index 167094d..ffcf021 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.h
  *
  * Function prototypes
index bb701c4..53a945d 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.c
  *
  * Initialize, read, write, etc. system files.
index a83dd96..2b38c75 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.h
  *
  * Function prototypes
index 580852b..0985492 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.c
  *
  * Tracking the up-to-date-ness of a local buffer_head with respect to
index 77a30ca..85d9413 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.h
  *
  * Cluster uptodate tracking
index 36ae47a..dd784eb 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index 9c80382..00308b5 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
index ccef8c9..86810e5 100644 (file)
@@ -248,21 +248,7 @@ populate_shared_memory:
                 *       or it can pointers to struct page's
                 */
 
-               /*
-                * When reading, readahead_size will only be zero when
-                * we're doing O_DIRECT, otherwise we got here from
-                * orangefs_readpage.
-                *
-                * If we got here from orangefs_readpage we want to
-                * copy either a page or the whole file into the io
-                * vector, whichever is smaller.
-                */
-               if (readahead_size)
-                       copy_amount =
-                               min(new_op->downcall.resp.io.amt_complete,
-                                       (__s64)PAGE_SIZE);
-               else
-                       copy_amount = new_op->downcall.resp.io.amt_complete;
+               copy_amount = new_op->downcall.resp.io.amt_complete;
 
                ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
                        copy_amount);
@@ -283,19 +269,11 @@ populate_shared_memory:
 
 out:
        if (buffer_index >= 0) {
-               if ((readahead_size) && (type == ORANGEFS_IO_READ)) {
-                       /* readpage */
-                       *index_return = buffer_index;
-                       gossip_debug(GOSSIP_FILE_DEBUG,
-                               "%s: hold on to buffer_index :%d:\n",
-                               __func__, buffer_index);
-               } else {
-                       /* O_DIRECT */
-                       orangefs_bufmap_put(buffer_index);
-                       gossip_debug(GOSSIP_FILE_DEBUG,
-                               "%s(%pU): PUT buffer_index %d\n",
-                               __func__, handle, buffer_index);
-               }
+               orangefs_bufmap_put(buffer_index);
+               gossip_debug(GOSSIP_FILE_DEBUG,
+                       "%s(%pU): PUT buffer_index %d\n",
+                       __func__, handle, buffer_index);
+               buffer_index = -1;
        }
        op_release(new_op);
        return ret;
index 85b3dd2..6bf35a0 100644 (file)
@@ -245,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping,
 
 static int orangefs_launder_page(struct page *);
 
+static void orangefs_readahead(struct readahead_control *rac)
+{
+       loff_t offset;
+       struct iov_iter iter;
+       struct file *file = rac->file;
+       struct inode *inode = file->f_mapping->host;
+       struct xarray *i_pages;
+       struct page *page;
+       loff_t new_start = readahead_pos(rac);
+       int ret;
+       size_t new_len = 0;
+       /* Bytes, and whole pages, from the start of this request up to i_size. */
+       loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
+       loff_t pages_remaining = bytes_remaining / PAGE_SIZE;
+       /* Grow the request: 4194304 bytes if >= 1024 pages remain, else the remainder when it exceeds the current page count. */
+       if (pages_remaining >= 1024)
+               new_len = 4194304;
+       else if (pages_remaining > readahead_count(rac))
+               new_len = bytes_remaining;
+
+       if (new_len)
+               readahead_expand(rac, new_start, new_len);
+
+       offset = readahead_pos(rac);
+       i_pages = &file->f_mapping->i_pages;
+
+       iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
+
+       /* Read in the pages; ret stays negative on failure and is normalized to 0 otherwise. */
+       if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
+                       &offset, &iter, readahead_length(rac),
+                       inode->i_size, NULL, NULL, file)) < 0)
+               gossip_debug(GOSSIP_FILE_DEBUG,
+                       "%s: wait_for_direct_io failed. \n", __func__);
+       else
+               ret = 0;
+
+       /* Clean up: pass ret to page_endio() for every page in the request, then put_page() it. */
+       while ((page = readahead_page(rac))) {
+               page_endio(page, false, ret);
+               put_page(page);
+       }
+}
+
 static int orangefs_readpage(struct file *file, struct page *page)
 {
        struct inode *inode = page->mapping->host;
@@ -252,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page)
        struct bio_vec bv;
        ssize_t ret;
        loff_t off; /* offset into this page */
-       pgoff_t index; /* which page */
-       struct page *next_page;
-       char *kaddr;
-       loff_t read_size;
-       int buffer_index = -1; /* orangefs shared memory slot */
-       int slot_index;   /* index into slot */
-       int remaining;
-
-       /*
-        * Get up to this many bytes from Orangefs at a time and try
-        * to fill them into the page cache at once. Tests with dd made
-        * this seem like a reasonable static number, if there was
-        * interest perhaps this number could be made setable through
-        * sysfs...
-        */
-       read_size = 524288;
 
        if (PageDirty(page))
                orangefs_launder_page(page);
 
        off = page_offset(page);
-       index = off >> PAGE_SHIFT;
        bv.bv_page = page;
        bv.bv_len = PAGE_SIZE;
        bv.bv_offset = 0;
        iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
 
        ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
-           read_size, inode->i_size, NULL, &buffer_index, file);
-       remaining = ret;
+           PAGE_SIZE, inode->i_size, NULL, NULL, file);
        /* this will only zero remaining unread portions of the page data */
        iov_iter_zero(~0U, &iter);
        /* takes care of potential aliasing */
        flush_dcache_page(page);
        if (ret < 0) {
                SetPageError(page);
-               unlock_page(page);
-               goto out;
        } else {
                SetPageUptodate(page);
                if (PageError(page))
@@ -298,60 +322,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
        }
        /* unlock the page after the ->readpage() routine completes */
        unlock_page(page);
-
-       if (remaining > PAGE_SIZE) {
-               slot_index = 0;
-               while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
-                       remaining -= PAGE_SIZE;
-                       /*
-                        * It is an optimization to try and fill more than one
-                        * page... by now we've already gotten the single
-                        * page we were after, if stuff doesn't seem to
-                        * be going our way at this point just return
-                        * and hope for the best.
-                        *
-                        * If we look for pages and they're already there is
-                        * one reason to give up, and if they're not there
-                        * and we can't create them is another reason.
-                        */
-
-                       index++;
-                       slot_index++;
-                       next_page = find_get_page(inode->i_mapping, index);
-                       if (next_page) {
-                               gossip_debug(GOSSIP_FILE_DEBUG,
-                                       "%s: found next page, quitting\n",
-                                       __func__);
-                               put_page(next_page);
-                               goto out;
-                       }
-                       next_page = find_or_create_page(inode->i_mapping,
-                                                       index,
-                                                       GFP_KERNEL);
-                       /*
-                        * I've never hit this, leave it as a printk for
-                        * now so it will be obvious.
-                        */
-                       if (!next_page) {
-                               printk("%s: can't create next page, quitting\n",
-                                       __func__);
-                               goto out;
-                       }
-                       kaddr = kmap_atomic(next_page);
-                       orangefs_bufmap_page_fill(kaddr,
-                                               buffer_index,
-                                               slot_index);
-                       kunmap_atomic(kaddr);
-                       SetPageUptodate(next_page);
-                       unlock_page(next_page);
-                       put_page(next_page);
-               }
-       }
-
-out:
-       if (buffer_index != -1)
-               orangefs_bufmap_put(buffer_index);
-       return ret;
+        return ret;
 }
 
 static int orangefs_write_begin(struct file *file,
@@ -660,6 +631,7 @@ out:
 /** ORANGEFS2 implementation of address space operations */
 static const struct address_space_operations orangefs_address_operations = {
        .writepage = orangefs_writepage,
+       .readahead = orangefs_readahead,
        .readpage = orangefs_readpage,
        .writepages = orangefs_writepages,
        .set_page_dirty = __set_page_dirty_nobuffers,
index 74a3d63..cd72978 100644 (file)
@@ -31,7 +31,7 @@ static ulong module_parm_debug_mask;
 __u64 orangefs_gossip_debug_mask;
 int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
 int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
-int orangefs_cache_timeout_msecs = 50;
+int orangefs_cache_timeout_msecs = 500;
 int orangefs_dcache_timeout_msecs = 50;
 int orangefs_getattr_timeout_msecs = 50;
 
index bc86aa8..5b78739 100644 (file)
@@ -166,15 +166,8 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
        const char              *cp = name, *next;
        struct proc_dir_entry   *de;
 
-       de = *ret;
-       if (!de)
-               de = &proc_root;
-
-       while (1) {
-               next = strchr(cp, '/');
-               if (!next)
-                       break;
-
+       de = *ret ?: &proc_root;
+       while ((next = strchr(cp, '/')) != NULL) {
                de = pde_subdir_find(de, cp, next - cp);
                if (!de) {
                        WARN(1, "name '%s'\n", name);
@@ -756,7 +749,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
        while (1) {
                next = pde_subdir_first(de);
                if (next) {
-                       if (unlikely(pde_is_permanent(root))) {
+                       if (unlikely(pde_is_permanent(next))) {
                                write_unlock(&proc_subdir_lock);
                                WARN(1, "removing permanent /proc entry '%s/%s'",
                                        next->parent->name, next->name);
index bde6b6f..599eb72 100644 (file)
@@ -273,25 +273,15 @@ void proc_entry_rundown(struct proc_dir_entry *de)
        spin_unlock(&de->pde_unload_lock);
 }
 
-static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
-{
-       typeof_member(struct proc_ops, proc_lseek) lseek;
-
-       lseek = pde->proc_ops->proc_lseek;
-       if (!lseek)
-               lseek = default_llseek;
-       return lseek(file, offset, whence);
-}
-
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        loff_t rv = -EINVAL;
 
        if (pde_is_permanent(pde)) {
-               return pde_lseek(pde, file, offset, whence);
+               return pde->proc_ops->proc_lseek(file, offset, whence);
        } else if (use_pde(pde)) {
-               rv = pde_lseek(pde, file, offset, whence);
+               rv = pde->proc_ops->proc_lseek(file, offset, whence);
                unuse_pde(pde);
        }
        return rv;
@@ -493,7 +483,6 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
 
 static int proc_reg_open(struct inode *inode, struct file *file)
 {
-       struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct proc_dir_entry *pde = PDE(inode);
        int rv = 0;
        typeof_member(struct proc_ops, proc_open) open;
@@ -507,9 +496,6 @@ static int proc_reg_open(struct inode *inode, struct file *file)
                return rv;
        }
 
-       if (fs_info->pidonly == PROC_PIDONLY_ON)
-               return -ENOENT;
-
        /*
         * Ensure that
         * 1) PDE's ->release hook will be called no matter what
index 66c7dd1..dea0f5e 100644 (file)
@@ -1563,7 +1563,7 @@ err_register_leaves:
 }
 
 /**
- * register_sysctl_table_path - register a sysctl table hierarchy
+ * register_sysctl_paths - register a sysctl table hierarchy
  * @path: The path to the directory the sysctl table is in.
  * @table: the top-level table structure
  *
index e862cab..fc97845 100644 (file)
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_PKEY_BIT4)]   = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+               [ilog2(VM_UFFD_MINOR)]  = "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
        };
        size_t i;
 
index 155b828..4a7cb16 100644 (file)
@@ -488,13 +488,3 @@ int reiserfs_proc_info_global_done(void)
  * (available at http://www.namesys.com/legalese.html)
  *
  */
-
-/*
- * Make Linus happy.
- * Local variables:
- * c-indentation-style: "K&R"
- * mode-name: "LC"
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
index 8c1baca..11b7e72 100644 (file)
@@ -454,6 +454,7 @@ void generic_shutdown_super(struct super_block *sb)
                evict_inodes(sb);
                /* only nonzero refcount inodes can have marks */
                fsnotify_sb_delete(sb);
+               security_sb_delete(sb);
 
                if (sb->s_dio_done_wq) {
                        destroy_workqueue(sb->s_dio_done_wq);
index 4b83cbd..1261e8b 100644 (file)
@@ -477,7 +477,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
  *
  * The instances directory is special as it allows for mkdir and rmdir to
  * to be done by userspace. When a mkdir or rmdir is performed, the inode
- * locks are released and the methhods passed in (@mkdir and @rmdir) are
+ * locks are released and the methods passed in (@mkdir and @rmdir) are
  * called without locks and with the name of the directory being created
  * within the instances directory.
  *
index 4d17e53..382a54c 100644 (file)
@@ -223,7 +223,8 @@ static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino)
         */
        list_for_each_entry_reverse(r, &c->replay_list, list) {
                ubifs_assert(c, r->sqnum >= rino->sqnum);
-               if (key_inum(c, &r->key) == key_inum(c, &rino->key))
+               if (key_inum(c, &r->key) == key_inum(c, &rino->key) &&
+                   key_type(c, &r->key) == UBIFS_INO_KEY)
                        return r->deletion == 0;
 
        }
index c160f71..e7693b9 100644 (file)
@@ -53,6 +53,9 @@
 
 static int get_default_compressor(struct ubifs_info *c)
 {
+       if (ubifs_compr_present(c, UBIFS_COMPR_ZSTD))
+               return UBIFS_COMPR_ZSTD;
+
        if (ubifs_compr_present(c, UBIFS_COMPR_LZO))
                return UBIFS_COMPR_LZO;
 
index ddb2ca6..7b572e1 100644 (file)
@@ -1552,8 +1552,8 @@ static int mount_ubifs(struct ubifs_info *c)
        ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
                  c->leb_size, c->leb_size >> 10, c->min_io_size,
                  c->max_write_size);
-       ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
-                 x, x >> 20, c->main_lebs,
+       ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), max %d LEBs, journal size %lld bytes (%lld MiB, %d LEBs)",
+                 x, x >> 20, c->main_lebs, c->max_leb_cnt,
                  y, y >> 20, c->log_lebs + c->max_bud_cnt);
        ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)",
                  c->report_rp_size, c->report_rp_size >> 10);
@@ -2232,6 +2232,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
                goto out_umount;
        }
 
+       import_uuid(&sb->s_uuid, c->uuid);
+
        mutex_unlock(&c->umount_mutex);
        return 0;
 
index f146b30..3ae9f1e 100644 (file)
@@ -1215,11 +1215,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
 {
        struct kernel_lb_addr tloc;
        struct inode *inode = NULL;
-       struct qstr dotdot = QSTR_INIT("..", 2);
        struct fileIdentDesc cfi;
        struct udf_fileident_bh fibh;
 
-       if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi))
+       if (!udf_find_entry(d_inode(child), &dotdot_name, &fibh, &cfi))
                return ERR_PTR(-EACCES);
 
        if (fibh.sbh != fibh.ebh)
index 983558b..74028b5 100644 (file)
@@ -128,10 +128,9 @@ static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
 
 static struct dentry *ufs_get_parent(struct dentry *child)
 {
-       struct qstr dot_dot = QSTR_INIT("..", 2);
        ino_t ino;
 
-       ino = ufs_inode_by_name(d_inode(child), &dot_dot);
+       ino = ufs_inode_by_name(d_inode(child), &dotdot_name);
        if (!ino)
                return ERR_PTR(-ENOENT);
        return d_obtain_alias(ufs_iget(child->d_sb, ino));
index 0be8cdd..14f9228 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -196,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;
        msg.arg.pagefault.address = address;
+       /*
+        * These flags indicate why the userfault occurred:
+        * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
+        * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
+        * - Neither of these flags being set indicates a MISSING fault.
+        *
+        * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
+        * fault. Otherwise, it was a read fault.
+        */
        if (flags & FAULT_FLAG_WRITE)
-               /*
-                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
-                * was not set in a UFFD_EVENT_PAGEFAULT, it means it
-                * was a read fault, otherwise if set it means it's
-                * a write fault.
-                */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
        if (reason & VM_UFFD_WP)
-               /*
-                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
-                * not set in a UFFD_EVENT_PAGEFAULT, it means it was
-                * a missing fault, otherwise if set it means it's a
-                * write protect fault.
-                */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+       if (reason & VM_UFFD_MINOR)
+               msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
        if (features & UFFD_FEATURE_THREAD_ID)
                msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
        return msg;
@@ -400,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
        BUG_ON(ctx->mm != mm);
 
-       VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
-       VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+       /* Any unrecognized flag is a bug. */
+       VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+       /* 0 or > 1 flags set is a bug; we expect exactly 1. */
+       VM_BUG_ON(!reason || (reason & (reason - 1)));
 
        if (ctx->features & UFFD_FEATURE_SIGBUS)
                goto out;
@@ -611,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                for (vma = mm->mmap; vma; vma = vma->vm_next)
                        if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
                                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-                               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+                               vma->vm_flags &= ~__VM_UFFD_FLAGS;
                        }
                mmap_write_unlock(mm);
 
@@ -643,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
        octx = vma->vm_userfaultfd_ctx.ctx;
        if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+               vma->vm_flags &= ~__VM_UFFD_FLAGS;
                return 0;
        }
 
@@ -725,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
        } else {
                /* Drop uffd context if remap feature not enabled */
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+               vma->vm_flags &= ~__VM_UFFD_FLAGS;
        }
 }
 
@@ -866,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                cond_resched();
                BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
-                      !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+                      !!(vma->vm_flags & __VM_UFFD_FLAGS));
                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
                        prev = vma;
                        continue;
                }
-               new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+               new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
                                 new_flags, vma->anon_vma,
                                 vma->vm_file, vma->vm_pgoff,
@@ -1261,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
                                     unsigned long vm_flags)
 {
        /* FIXME: add WP support to hugetlbfs and shmem */
-       return vma_is_anonymous(vma) ||
-               ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
-                !(vm_flags & VM_UFFD_WP));
+       if (vm_flags & VM_UFFD_WP) {
+               if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
+                       return false;
+       }
+
+       if (vm_flags & VM_UFFD_MINOR) {
+               /* FIXME: Add minor fault interception for shmem. */
+               if (!is_vm_hugetlb_page(vma))
+                       return false;
+       }
+
+       return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+              vma_is_shmem(vma);
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1289,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        ret = -EINVAL;
        if (!uffdio_register.mode)
                goto out;
-       if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
-                                    UFFDIO_REGISTER_MODE_WP))
+       if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
                goto out;
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
                vm_flags |= VM_UFFD_WP;
+       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+               goto out;
+#endif
+               vm_flags |= VM_UFFD_MINOR;
+       }
 
        ret = validate_range(mm, &uffdio_register.range.start,
                             uffdio_register.range.len);
@@ -1340,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                cond_resched();
 
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-                      !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+                      !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
                /* check not compatible vmas */
                ret = -EINVAL;
@@ -1420,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);
 
-               new_flags = (vma->vm_flags &
-                            ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
+               new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
@@ -1449,6 +1463,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                vma->vm_flags = new_flags;
                vma->vm_userfaultfd_ctx.ctx = ctx;
 
+               if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+                       hugetlb_unshare_all_pmds(vma);
+
        skip:
                prev = vma;
                start = vma->vm_end;
@@ -1470,6 +1487,10 @@ out_unlock:
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
 
+               /* CONTINUE ioctl is only supported for MINOR ranges. */
+               if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+                       ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctls methods are guaranteed to
@@ -1540,7 +1561,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                cond_resched();
 
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-                      !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+                      !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
                /*
                 * Check not compatible vmas, not strictly required
@@ -1591,7 +1612,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                        wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
                }
 
-               new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+               new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
@@ -1823,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
        return ret;
 }
 
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+       __s64 ret;
+       struct uffdio_continue uffdio_continue;
+       struct uffdio_continue __user *user_uffdio_continue;
+       struct userfaultfd_wake_range range;
+
+       user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+       ret = -EAGAIN;
+       if (READ_ONCE(ctx->mmap_changing))
+               goto out;
+
+       ret = -EFAULT;
+       if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+                          /* don't copy the output fields */
+                          sizeof(uffdio_continue) - (sizeof(__s64))))
+               goto out;
+
+       ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+                            uffdio_continue.range.len);
+       if (ret)
+               goto out;
+
+       ret = -EINVAL;
+       /* double check for wraparound just in case. */
+       if (uffdio_continue.range.start + uffdio_continue.range.len <=
+           uffdio_continue.range.start) {
+               goto out;
+       }
+       if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+               goto out;
+
+       if (mmget_not_zero(ctx->mm)) {
+               ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
+                                    uffdio_continue.range.len,
+                                    &ctx->mmap_changing);
+               mmput(ctx->mm);
+       } else {
+               return -ESRCH;
+       }
+
+       if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+               return -EFAULT;
+       if (ret < 0)
+               goto out;
+
+       /* len == 0 would wake all */
+       BUG_ON(!ret);
+       range.len = ret;
+       if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+               range.start = uffdio_continue.range.start;
+               wake_userfault(ctx, &range);
+       }
+       ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
+out:
+       return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
        /*
@@ -1859,6 +1940,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                goto err_out;
        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+       uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+#endif
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
@@ -1907,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
        case UFFDIO_WRITEPROTECT:
                ret = userfaultfd_writeprotect(ctx, arg);
                break;
+       case UFFDIO_CONTINUE:
+               ret = userfaultfd_continue(ctx, arg);
+               break;
        }
        return ret;
 }
index 6c5f8d1..e32a183 100644 (file)
@@ -253,7 +253,8 @@ xfs_ag_resv_init(
        xfs_agnumber_t                  agno = pag->pag_agno;
        xfs_extlen_t                    ask;
        xfs_extlen_t                    used;
-       int                             error = 0;
+       int                             error = 0, error2;
+       bool                            has_resv = false;
 
        /* Create the metadata reservation. */
        if (pag->pag_meta_resv.ar_asked == 0) {
@@ -291,6 +292,8 @@ xfs_ag_resv_init(
                        if (error)
                                goto out;
                }
+               if (ask)
+                       has_resv = true;
        }
 
        /* Create the RMAPBT metadata reservation */
@@ -304,19 +307,28 @@ xfs_ag_resv_init(
                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
                if (error)
                        goto out;
+               if (ask)
+                       has_resv = true;
        }
 
-#ifdef DEBUG
-       /* need to read in the AGF for the ASSERT below to work */
-       error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0);
-       if (error)
-               return error;
-
-       ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
-              xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
-              pag->pagf_freeblks + pag->pagf_flcount);
-#endif
 out:
+       /*
+        * Initialize the pagf if we have at least one active reservation on the
+        * AG. This may have occurred already via reservation calculation, but
+        * fall back to an explicit init to ensure the in-core allocbt usage
+        * counters are initialized as soon as possible. This is important
+        * because filesystems with large perag reservations are susceptible to
+        * free space reservation problems that the allocbt counter is used to
+        * address.
+        */
+       if (has_resv) {
+               error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
+               if (error2)
+                       return error2;
+               ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+                      xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
+                      pag->pagf_freeblks + pag->pagf_flcount);
+       }
        return error;
 }
 
index aaa1910..82b7cbb 100644 (file)
@@ -718,7 +718,6 @@ xfs_alloc_update_counters(
        agbp->b_pag->pagf_freeblks += len;
        be32_add_cpu(&agf->agf_freeblks, len);
 
-       xfs_trans_agblocks_delta(tp, len);
        if (unlikely(be32_to_cpu(agf->agf_freeblks) >
                     be32_to_cpu(agf->agf_length))) {
                xfs_buf_mark_corrupt(agbp);
@@ -2739,7 +2738,6 @@ xfs_alloc_get_freelist(
        pag = agbp->b_pag;
        ASSERT(!pag->pagf_agflreset);
        be32_add_cpu(&agf->agf_flcount, -1);
-       xfs_trans_agflist_delta(tp, -1);
        pag->pagf_flcount--;
 
        logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
@@ -2846,7 +2844,6 @@ xfs_alloc_put_freelist(
        pag = agbp->b_pag;
        ASSERT(!pag->pagf_agflreset);
        be32_add_cpu(&agf->agf_flcount, 1);
-       xfs_trans_agflist_delta(tp, 1);
        pag->pagf_flcount++;
 
        logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
@@ -3036,6 +3033,7 @@ xfs_alloc_read_agf(
        struct xfs_agf          *agf;           /* ag freelist header */
        struct xfs_perag        *pag;           /* per allocation group data */
        int                     error;
+       int                     allocbt_blks;
 
        trace_xfs_alloc_read_agf(mp, agno);
 
@@ -3066,6 +3064,19 @@ xfs_alloc_read_agf(
                pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
                pag->pagf_init = 1;
                pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+
+               /*
+                * Update the in-core allocbt counter. Filter out the rmapbt
+                * subset of the btreeblks counter because the rmapbt is managed
+                * by perag reservation. Subtract one for the rmapbt root block
+                * because the rmap counter includes it while the btreeblks
+                * counter only tracks non-root blocks.
+                */
+               allocbt_blks = pag->pagf_btreeblks;
+               if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+                       allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
+               if (allocbt_blks > 0)
+                       atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
        }
 #ifdef DEBUG
        else if (!XFS_FORCED_SHUTDOWN(mp)) {
index 8e01231..a43e4c5 100644 (file)
@@ -71,9 +71,9 @@ xfs_allocbt_alloc_block(
                return 0;
        }
 
+       atomic64_inc(&cur->bc_mp->m_allocbt_blks);
        xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
 
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
 
        *stat = 1;
@@ -95,9 +95,9 @@ xfs_allocbt_free_block(
        if (error)
                return error;
 
+       atomic64_dec(&cur->bc_mp->m_allocbt_blks);
        xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
                              XFS_EXTENT_BUSY_SKIP_DISCARD);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
 }
 
index 8bd00da..3e15ea2 100644 (file)
@@ -368,10 +368,10 @@ static inline int xfs_ilog_fdata(int w)
  * directly mirrors the xfs_dinode structure as it must contain all the same
  * information.
  */
-typedef uint64_t xfs_ictimestamp_t;
+typedef uint64_t xfs_log_timestamp_t;
 
 /* Legacy timestamp encoding format. */
-struct xfs_legacy_ictimestamp {
+struct xfs_log_legacy_timestamp {
        int32_t         t_sec;          /* timestamp seconds */
        int32_t         t_nsec;         /* timestamp nanoseconds */
 };
@@ -393,9 +393,9 @@ struct xfs_log_dinode {
        uint16_t        di_projid_hi;   /* higher part of owner's project id */
        uint8_t         di_pad[6];      /* unused, zeroed space */
        uint16_t        di_flushiter;   /* incremented on flush */
-       xfs_ictimestamp_t di_atime;     /* time last accessed */
-       xfs_ictimestamp_t di_mtime;     /* time last modified */
-       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
+       xfs_log_timestamp_t di_atime;   /* time last accessed */
+       xfs_log_timestamp_t di_mtime;   /* time last modified */
+       xfs_log_timestamp_t di_ctime;   /* time created/inode modified */
        xfs_fsize_t     di_size;        /* number of bytes in file */
        xfs_rfsblock_t  di_nblocks;     /* # of direct & btree blocks used */
        xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
@@ -420,7 +420,7 @@ struct xfs_log_dinode {
        uint8_t         di_pad2[12];    /* more padding for future expansion */
 
        /* fields only written to during inode creation */
-       xfs_ictimestamp_t di_crtime;    /* time created */
+       xfs_log_timestamp_t di_crtime;  /* time created */
        xfs_ino_t       di_ino;         /* inode number */
        uuid_t          di_uuid;        /* UUID of the filesystem */
 
index beb81c8..9f5bcbd 100644 (file)
@@ -103,7 +103,6 @@ xfs_rmapbt_alloc_block(
        xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1,
                        false);
 
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
        new->s = cpu_to_be32(bno);
        be32_add_cpu(&agf->agf_rmap_blocks, 1);
        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
@@ -136,7 +135,6 @@ xfs_rmapbt_free_block(
 
        xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
                              XFS_EXTENT_BUSY_SKIP_DISCARD);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
 
        pag = cur->bc_ag.agbp->b_pag;
        xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
index 60e6d25..dfbbcbd 100644 (file)
@@ -926,9 +926,19 @@ xfs_log_sb(
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_buf          *bp = xfs_trans_getsb(tp);
 
-       mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
-       mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
-       mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+       /*
+        * Lazy sb counters don't update the in-core superblock so do that now.
+        * If this is at unmount, the counters will be exactly correct, but at
+        * any other time they will only be ballpark correct because of
+        * reservations that have been taken out of percpu counters. If we
+        * have an unclean shutdown, this will be corrected by log recovery
+        * rebuilding the counters from the AGF block counts.
+        */
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb)) {
+               mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+               mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+               mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+       }
 
        xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
index 749faa1..7a2f9b5 100644 (file)
@@ -416,6 +416,10 @@ xchk_agf_xref_btreeblks(
        xfs_agblock_t           btreeblks;
        int                     error;
 
+       /* agf_btreeblks didn't exist before lazysbcount */
+       if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb))
+               return;
+
        /* Check agf_rmap_blocks; set up for agf_btreeblks check */
        if (sc->sa.rmap_cur) {
                error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks);
@@ -581,7 +585,8 @@ xchk_agf(
                xchk_block_set_corrupt(sc, sc->sa.agf_bp);
        if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
                xchk_block_set_corrupt(sc, sc->sa.agf_bp);
-       if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+       if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) &&
+           pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
                xchk_block_set_corrupt(sc, sc->sa.agf_bp);
        xfs_perag_put(pag);
 
index 7b4386c..f1d1a8c 100644 (file)
@@ -13,6 +13,7 @@
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_health.h"
+#include "xfs_btree.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -143,6 +144,35 @@ xchk_setup_fscounters(
        return xchk_trans_alloc(sc, 0);
 }
 
+/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
+static int
+xchk_fscount_btreeblks(
+       struct xfs_scrub        *sc,
+       struct xchk_fscounters  *fsc,
+       xfs_agnumber_t          agno)
+{
+       xfs_extlen_t            blocks;
+       int                     error;
+
+       error = xchk_ag_init(sc, agno, &sc->sa);
+       if (error)
+               return error;
+
+       error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+       if (error)
+               goto out_free;
+       fsc->fdblocks += blocks - 1;
+
+       error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+       if (error)
+               goto out_free;
+       fsc->fdblocks += blocks - 1;
+
+out_free:
+       xchk_ag_free(sc, &sc->sa);
+       return error;
+}
+
 /*
  * Calculate what the global in-core counters ought to be from the incore
  * per-AG structure.  Callers can compare this to the actual in-core counters
@@ -182,7 +212,15 @@ retry:
                /* Add up the free/freelist/bnobt/cntbt blocks */
                fsc->fdblocks += pag->pagf_freeblks;
                fsc->fdblocks += pag->pagf_flcount;
-               fsc->fdblocks += pag->pagf_btreeblks;
+               if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) {
+                       fsc->fdblocks += pag->pagf_btreeblks;
+               } else {
+                       error = xchk_fscount_btreeblks(sc, fsc, agno);
+                       if (error) {
+                               xfs_perag_put(pag);
+                               break;
+                       }
+               }
 
                /*
                 * Per-AG reservations are taken out of the incore counters,
index 9b08db4..826caa6 100644 (file)
@@ -146,7 +146,7 @@ xfs_end_io(
        while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                        io_list))) {
                list_del_init(&ioend->io_list);
-               iomap_ioend_try_merge(ioend, &tmp, NULL);
+               iomap_ioend_try_merge(ioend, &tmp);
                xfs_end_ioend(ioend);
        }
 }
index b33c894..be9cf88 100644 (file)
@@ -69,8 +69,6 @@ xfs_resizefs_init_new_ags(
        if (error)
                return error;
 
-       xfs_trans_agblocks_delta(tp, id->nfree);
-
        if (delta) {
                *lastag_extended = true;
                error = xfs_ag_extend_space(mp, tp, id, delta);
index c1b3268..6764d12 100644 (file)
@@ -299,18 +299,18 @@ xfs_inode_item_format_attr_fork(
  * Convert an incore timestamp to a log timestamp.  Note that the log format
  * specifies host endian format!
  */
-static inline xfs_ictimestamp_t
+static inline xfs_log_timestamp_t
 xfs_inode_to_log_dinode_ts(
        struct xfs_inode                *ip,
        const struct timespec64         tv)
 {
-       struct xfs_legacy_ictimestamp   *lits;
-       xfs_ictimestamp_t               its;
+       struct xfs_log_legacy_timestamp *lits;
+       xfs_log_timestamp_t             its;
 
        if (xfs_inode_has_bigtime(ip))
                return xfs_inode_encode_bigtime(tv);
 
-       lits = (struct xfs_legacy_ictimestamp *)&its;
+       lits = (struct xfs_log_legacy_timestamp *)&its;
        lits->t_sec = tv.tv_sec;
        lits->t_nsec = tv.tv_nsec;
 
index cb44f76..7b79518 100644 (file)
@@ -125,17 +125,17 @@ static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
 static inline xfs_timestamp_t
 xfs_log_dinode_to_disk_ts(
        struct xfs_log_dinode           *from,
-       const xfs_ictimestamp_t         its)
+       const xfs_log_timestamp_t       its)
 {
        struct xfs_legacy_timestamp     *lts;
-       struct xfs_legacy_ictimestamp   *lits;
+       struct xfs_log_legacy_timestamp *lits;
        xfs_timestamp_t                 ts;
 
        if (xfs_log_dinode_has_bigtime(from))
                return cpu_to_be64(its);
 
        lts = (struct xfs_legacy_timestamp *)&ts;
-       lits = (struct xfs_legacy_ictimestamp *)&its;
+       lits = (struct xfs_log_legacy_timestamp *)&its;
        lts->t_sec = cpu_to_be32(lits->t_sec);
        lts->t_nsec = cpu_to_be32(lits->t_nsec);
 
index 0604183..c19a82a 100644 (file)
@@ -355,13 +355,15 @@ xfs_log_writable(
        struct xfs_mount        *mp)
 {
        /*
-        * Never write to the log on norecovery mounts, if the block device is
-        * read-only, or if the filesystem is shutdown. Read-only mounts still
-        * allow internal writes for log recovery and unmount purposes, so don't
-        * restrict that case here.
+        * Do not write to the log on norecovery mounts, if the data or log
+        * devices are read-only, or if the filesystem is shutdown. Read-only
+        * mounts allow internal writes for log recovery and unmount purposes,
+        * so don't restrict that case.
         */
        if (mp->m_flags & XFS_MOUNT_NORECOVERY)
                return false;
+       if (xfs_readonly_buftarg(mp->m_ddev_targp))
+               return false;
        if (xfs_readonly_buftarg(mp->m_log->l_targ))
                return false;
        if (XFS_FORCED_SHUTDOWN(mp))
index cb1e2c4..bdfee19 100644 (file)
@@ -1188,6 +1188,7 @@ xfs_mod_fdblocks(
        int64_t                 lcounter;
        long long               res_used;
        s32                     batch;
+       uint64_t                set_aside;
 
        if (delta > 0) {
                /*
@@ -1227,8 +1228,20 @@ xfs_mod_fdblocks(
        else
                batch = XFS_FDBLOCKS_BATCH;
 
+       /*
+        * Set aside allocbt blocks because these blocks are tracked as free
+        * space but not available for allocation. Technically this means that a
+        * single reservation cannot consume all remaining free space, but the
+        * ratio of allocbt blocks to usable free blocks should be rather small.
+        * The tradeoff without this is that filesystems that maintain high
+        * perag block reservations can over reserve physical block availability
+        * and fail physical allocation, which leads to much more serious
+        * problems (i.e. transaction abort, pagecache discards, etc.) than
+        * slightly premature -ENOSPC.
+        */
+       set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
        percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
-       if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
+       if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
                                     XFS_FDBLOCKS_BATCH) >= 0) {
                /* we had space! */
                return 0;
index 81829d1..bb67274 100644 (file)
@@ -170,6 +170,12 @@ typedef struct xfs_mount {
         * extents or anything related to the rt device.
         */
        struct percpu_counter   m_delalloc_blks;
+       /*
+        * Global count of allocation btree blocks in use across all AGs. Only
+        * used when perag reservation is enabled. Helps prevent block
+        * reservation from attempting to reserve allocation btree blocks.
+        */
+       atomic64_t              m_allocbt_blks;
 
        struct radix_tree_root  m_perag_tree;   /* per-ag accounting info */
        spinlock_t              m_perag_lock;   /* lock for m_perag_tree */
index 0aa87c2..2599192 100644 (file)
@@ -126,8 +126,8 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64,             16);
        XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode,            176);
        XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log,           28);
-       XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t,                8);
-       XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp,    8);
+       XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t,              8);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_log_legacy_timestamp,  8);
        XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32,   52);
        XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format,      56);
        XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat,        20);
index 4dd4af6..060695d 100644 (file)
@@ -1522,7 +1522,8 @@ xfs_reflink_unshare(
        if (error)
                goto out;
 
-       error = filemap_write_and_wait_range(inode->i_mapping, offset, len);
+       error = filemap_write_and_wait_range(inode->i_mapping, offset,
+                       offset + len - 1);
        if (error)
                goto out;
 
index bcc9780..586f299 100644 (file)
@@ -487,13 +487,6 @@ xfs_trans_apply_sb_deltas(
        bp = xfs_trans_getsb(tp);
        sbp = bp->b_addr;
 
-       /*
-        * Check that superblock mods match the mods made to AGF counters.
-        */
-       ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
-              (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
-               tp->t_ag_btree_delta));
-
        /*
         * Only update the superblock counters if we are logging them
         */
@@ -629,6 +622,9 @@ xfs_trans_unreserve_and_mod_sb(
 
        /* apply remaining deltas */
        spin_lock(&mp->m_sb_lock);
+       mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
+       mp->m_sb.sb_icount += idelta;
+       mp->m_sb.sb_ifree += ifreedelta;
        mp->m_sb.sb_frextents += rtxdelta;
        mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
        mp->m_sb.sb_agcount += tp->t_agcount_delta;
index 9dd745c..ee42d98 100644 (file)
@@ -140,11 +140,6 @@ typedef struct xfs_trans {
        int64_t                 t_res_fdblocks_delta; /* on-disk only chg */
        int64_t                 t_frextents_delta;/* superblock freextents chg*/
        int64_t                 t_res_frextents_delta; /* on-disk only chg */
-#if defined(DEBUG) || defined(XFS_WARN)
-       int64_t                 t_ag_freeblks_delta; /* debugging counter */
-       int64_t                 t_ag_flist_delta; /* debugging counter */
-       int64_t                 t_ag_btree_delta; /* debugging counter */
-#endif
        int64_t                 t_dblocks_delta;/* superblock dblocks change */
        int64_t                 t_agcount_delta;/* superblock agcount change */
        int64_t                 t_imaxpct_delta;/* superblock imaxpct change */
@@ -165,16 +160,6 @@ typedef struct xfs_trans {
  */
 #define        xfs_trans_set_sync(tp)          ((tp)->t_flags |= XFS_TRANS_SYNC)
 
-#if defined(DEBUG) || defined(XFS_WARN)
-#define        xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d)
-#define        xfs_trans_agflist_delta(tp, d)  ((tp)->t_ag_flist_delta += (int64_t)d)
-#define        xfs_trans_agbtree_delta(tp, d)  ((tp)->t_ag_btree_delta += (int64_t)d)
-#else
-#define        xfs_trans_agblocks_delta(tp, d)
-#define        xfs_trans_agflist_delta(tp, d)
-#define        xfs_trans_agbtree_delta(tp, d)
-#endif
-
 /*
  * XFS transaction mechanism exported interfaces.
  */
index 9fdf213..0d132ee 100644 (file)
@@ -2,6 +2,13 @@
 #ifndef _ASM_GENERIC_BITOPS_FIND_H_
 #define _ASM_GENERIC_BITOPS_FIND_H_
 
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long nbits,
+               unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
 #ifndef find_next_bit
 /**
  * find_next_bit - find the next set bit in a memory region
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
-               size, unsigned long offset);
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+                           unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_and_bit
@@ -27,9 +48,23 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_and_bit(const unsigned long *addr1,
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
-               unsigned long offset);
+               unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_zero_bit
@@ -42,8 +77,22 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1,
  * Returns the bit number of the next zero bit
  * If no bits are zero, returns @size.
  */
-extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
-               long size, unsigned long offset);
+static inline
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+                                unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr | ~GENMASK(size - 1, offset);
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
 #endif
 
 #ifdef CONFIG_GENERIC_FIND_FIRST_BIT
@@ -56,8 +105,17 @@ extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned
  * Returns the bit number of the first set bit.
  * If no bits are set, returns @size.
  */
-extern unsigned long find_first_bit(const unsigned long *addr,
-                                   unsigned long size);
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr & GENMASK(size - 1, 0);
+
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_first_bit(addr, size);
+}
 
 /**
  * find_first_zero_bit - find the first cleared bit in a memory region
@@ -67,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  * Returns the bit number of the first cleared bit.
  * If no bits are zero, returns @size.
  */
-extern unsigned long find_first_zero_bit(const unsigned long *addr,
-                                        unsigned long size);
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_first_zero_bit(addr, size);
+}
 #else /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
 #ifndef find_first_bit
@@ -80,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr,
 
 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+static inline
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr & GENMASK(size - 1, 0);
+
+               return val ? __fls(val) : size;
+       }
+
+       return _find_last_bit(addr, size);
+}
+#endif
+
 /**
  * find_next_clump8 - find next 8-bit clump with set bits in a memory region
  * @clump: location to store copy of found clump
index 188d3eb..5a28629 100644 (file)
@@ -2,8 +2,10 @@
 #ifndef _ASM_GENERIC_BITOPS_LE_H_
 #define _ASM_GENERIC_BITOPS_LE_H_
 
+#include <asm-generic/bitops/find.h>
 #include <asm/types.h>
 #include <asm/byteorder.h>
+#include <linux/swab.h>
 
 #if defined(__LITTLE_ENDIAN)
 
@@ -32,13 +34,41 @@ static inline unsigned long find_first_zero_bit_le(const void *addr,
 #define BITOP_LE_SWIZZLE       ((BITS_PER_LONG-1) & ~0x7)
 
 #ifndef find_next_zero_bit_le
-extern unsigned long find_next_zero_bit_le(const void *addr,
-               unsigned long size, unsigned long offset);
+static inline
+unsigned long find_next_zero_bit_le(const void *addr, unsigned
+               long size, unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *(const unsigned long *)addr;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = swab(val) | ~GENMASK(size - 1, offset);
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+}
 #endif
 
 #ifndef find_next_bit_le
-extern unsigned long find_next_bit_le(const void *addr,
-               unsigned long size, unsigned long offset);
+static inline
+unsigned long find_next_bit_le(const void *addr, unsigned
+               long size, unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *(const unsigned long *)addr;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = swab(val) & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+}
 #endif
 
 #ifndef find_first_zero_bit_le
index 3905c1c..1023e2a 100644 (file)
 #define BITS_PER_LONG_LONG 64
 #endif
 
+/*
+ * small_const_nbits(n) is true precisely when it is known at compile-time
+ * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows
+ * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps
+ * of size 0 are very rare, and a compile-time-known-size 0 is most likely
+ * a sign of error. They will be handled correctly by the bit/bitmap APIs,
+ * but using the out-of-line functions, so that the inline implementations
+ * can unconditionally dereference the pointer(s).
+ */
+#define small_const_nbits(nbits) \
+       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
index 76d456c..e93375c 100644 (file)
@@ -1064,17 +1064,6 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 #endif
 #endif /* CONFIG_GENERIC_IOMAP */
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#ifndef xlate_dev_kmem_ptr
-#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
-static inline void *xlate_dev_kmem_ptr(void *addr)
-{
-       return addr;
-}
-#endif
-
 #ifndef xlate_dev_mem_ptr
 #define xlate_dev_mem_ptr xlate_dev_mem_ptr
 static inline void *xlate_dev_mem_ptr(phys_addr_t addr)
index cd7706e..7899b7f 100644 (file)
@@ -19,5 +19,6 @@
 #define PRCI_CLK_CLTXPLL              5
 #define PRCI_CLK_TLCLK                6
 #define PRCI_CLK_PCLK                 7
+#define PRCI_CLK_PCIE_AUX             8
 
 #endif /* __DT_BINDINGS_CLOCK_SIFIVE_FU740_PRCI_H */
diff --git a/include/dt-bindings/input/atmel-maxtouch.h b/include/dt-bindings/input/atmel-maxtouch.h
new file mode 100644 (file)
index 0000000..7345ab3
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef _DT_BINDINGS_ATMEL_MAXTOUCH_H
+#define _DT_BINDINGS_ATMEL_MAXTOUCH_H
+
+#define ATMEL_MXT_WAKEUP_NONE          0
+#define ATMEL_MXT_WAKEUP_I2C_SCL       1
+#define ATMEL_MXT_WAKEUP_GPIO          2
+
+#endif /* _DT_BINDINGS_ATMEL_MAXTOUCH_H */
index 875e002..6acd3cf 100644 (file)
@@ -16,9 +16,16 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring,
                                            const struct key_type *type,
                                            const union key_payload *payload,
                                            struct key *restriction_key);
+extern __init int load_module_cert(struct key *keyring);
 
 #else
 #define restrict_link_by_builtin_trusted restrict_link_reject
+
+static inline __init int load_module_cert(struct key *keyring)
+{
+       return 0;
+}
+
 #endif
 
 #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
index 6fd3cda..864b999 100644 (file)
@@ -61,6 +61,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
+int kvm_pmu_probe_pmuver(void);
 #else
 struct kvm_pmu {
 };
@@ -116,6 +117,9 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
 {
        return 0;
 }
+
+static inline int kvm_pmu_probe_pmuver(void) { return 0xf; }
+
 #endif
 
 #endif
index 3d74f10..ec62118 100644 (file)
@@ -322,6 +322,7 @@ struct vgic_cpu {
         */
        struct vgic_io_device   rd_iodev;
        struct vgic_redist_region *rdreg;
+       u32 rdreg_index;
 
        /* Contains the attributes and gpa of the LPI pending tables. */
        u64 pendbaser;
diff --git a/include/linux/align.h b/include/linux/align.h
new file mode 100644 (file)
index 0000000..2b4acec
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ALIGN_H
+#define _LINUX_ALIGN_H
+
+#include <linux/const.h>
+
+/* @a is a power of 2 value */
+#define ALIGN(x, a)            __ALIGN_KERNEL((x), (a))
+#define ALIGN_DOWN(x, a)       __ALIGN_KERNEL((x) - ((a) - 1), (a))
+#define __ALIGN_MASK(x, mask)  __ALIGN_KERNEL_MASK((x), (mask))
+#define PTR_ALIGN(p, a)                ((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define PTR_ALIGN_DOWN(p, a)   ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a)               (((x) & ((typeof(x))(a) - 1)) == 0)
+
+#endif /* _LINUX_ALIGN_H */
index 4507172..58e6c38 100644 (file)
@@ -10,6 +10,8 @@
 
 #include <linux/types.h>
 
+struct amd_iommu;
+
 /*
  * This is mainly used to communicate information back-and-forth
  * between SVM and IOMMU for setting up and tearing down posted
@@ -32,24 +34,6 @@ struct pci_dev;
 extern int amd_iommu_detect(void);
 extern int amd_iommu_init_hardware(void);
 
-/**
- * amd_iommu_enable_device_erratum() - Enable erratum workaround for device
- *                                    in the IOMMUv2 driver
- * @pdev: The PCI device the workaround is necessary for
- * @erratum: The erratum workaround to enable
- *
- * The function needs to be called before amd_iommu_init_device().
- * Possible values for the erratum number are for now:
- * - AMD_PRI_DEV_ERRATUM_ENABLE_RESET - Reset PRI capability when PRI
- *                                     is enabled
- * - AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE - Limit number of outstanding PRI
- *                                      requests to one
- */
-#define AMD_PRI_DEV_ERRATUM_ENABLE_RESET               0
-#define AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE              1
-
-extern void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum);
-
 /**
  * amd_iommu_init_device() - Init device for use with IOMMUv2 driver
  * @pdev: The PCI device to initialize
@@ -212,4 +196,14 @@ static inline int amd_iommu_deactivate_guest_mode(void *data)
 }
 #endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
 
+int amd_iommu_get_num_iommus(void);
+bool amd_iommu_pc_supported(void);
+u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
+               u64 *value);
+int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
+               u64 *value);
+struct amd_iommu *get_amd_iommu(unsigned int idx);
+
 #endif /* _ASM_X86_AMD_IOMMU_H */
index 62c5423..6861489 100644 (file)
@@ -55,6 +55,8 @@
 #define ARM_SMCCC_OWNER_TRUSTED_OS     50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
 
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
 #define ARM_SMCCC_QUIRK_NONE           0
 #define ARM_SMCCC_QUIRK_QCOM_A6                1 /* Save/restore register a6 */
 
                           ARM_SMCCC_SMC_32,                            \
                           0, 0x7fff)
 
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID                          \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0     0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1     0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2     0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3     0x743a004dU
+
+/* KVM "vendor specific" services */
+#define ARM_SMCCC_KVM_FUNC_FEATURES            0
+#define ARM_SMCCC_KVM_FUNC_PTP                 1
+#define ARM_SMCCC_KVM_FUNC_FEATURES_2          127
+#define ARM_SMCCC_KVM_NUM_FUNCS                        128
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID                      \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_FEATURES)
+
 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED   1
 
+/*
+ * ptp_kvm is a feature used for time sync between vm and host.
+ * ptp_kvm module in guest kernel will get service from host using
+ * this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID                           \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_PTP)
+
+/* ptp_kvm counter type ID */
+#define KVM_PTP_VIRT_COUNTER                   0
+#define KVM_PTP_PHYS_COUNTER                   1
+
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES                          \
        ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
index 0a17cd2..cce4ad3 100644 (file)
@@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev,
        return async_schedule_node_domain(func, dev, dev_to_node(dev), domain);
 }
 
-void async_unregister_domain(struct async_domain *domain);
 extern void async_synchronize_full(void);
 extern void async_synchronize_full_domain(struct async_domain *domain);
 extern void async_synchronize_cookie(async_cookie_t cookie);
index a0b4cfd..f1a99f0 100644 (file)
@@ -106,6 +106,8 @@ static inline void *bio_data(struct bio *bio)
        return NULL;
 }
 
+extern unsigned int bio_max_size(struct bio *bio);
+
 /**
  * bio_full - check if the bio is full
  * @bio:       bio to check
@@ -119,7 +121,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
        if (bio->bi_vcnt >= bio->bi_max_vecs)
                return true;
 
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
+       if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
                return true;
 
        return false;
index 70a9324..a36cfce 100644 (file)
@@ -4,10 +4,13 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/types.h>
+#include <linux/align.h>
 #include <linux/bitops.h>
+#include <linux/limits.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
+#include <linux/types.h>
+
+struct device;
 
 /*
  * bitmaps provide bit arrays that consume one or more unsigned
  * Allocation and deallocation of bitmap.
  * Provided in lib/bitmap.c to avoid circular dependency.
  */
-extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
-extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
-extern void bitmap_free(const unsigned long *bitmap);
+unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
+unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
+void bitmap_free(const unsigned long *bitmap);
+
+/* Managed variants of the above. */
+unsigned long *devm_bitmap_alloc(struct device *dev,
+                                unsigned int nbits, gfp_t flags);
+unsigned long *devm_bitmap_zalloc(struct device *dev,
+                                 unsigned int nbits, gfp_t flags);
 
 /*
  * lib/bitmap.c provides these functions:
  */
 
-extern int __bitmap_equal(const unsigned long *bitmap1,
-                         const unsigned long *bitmap2, unsigned int nbits);
-extern bool __pure __bitmap_or_equal(const unsigned long *src1,
-                                    const unsigned long *src2,
-                                    const unsigned long *src3,
-                                    unsigned int nbits);
-extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
-                       unsigned int nbits);
-extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
-                               unsigned int shift, unsigned int nbits);
-extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
-                               unsigned int shift, unsigned int nbits);
-extern void bitmap_cut(unsigned long *dst, const unsigned long *src,
-                      unsigned int first, unsigned int cut,
-                      unsigned int nbits);
-extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
-                       const unsigned long *bitmap2, unsigned int nbits);
-extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
-                       const unsigned long *bitmap2, unsigned int nbits);
-extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
-                       const unsigned long *bitmap2, unsigned int nbits);
-extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+int __bitmap_equal(const unsigned long *bitmap1,
+                  const unsigned long *bitmap2, unsigned int nbits);
+bool __pure __bitmap_or_equal(const unsigned long *src1,
+                             const unsigned long *src2,
+                             const unsigned long *src3,
+                             unsigned int nbits);
+void __bitmap_complement(unsigned long *dst, const unsigned long *src,
+                        unsigned int nbits);
+void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
+                         unsigned int shift, unsigned int nbits);
+void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
+                        unsigned int shift, unsigned int nbits);
+void bitmap_cut(unsigned long *dst, const unsigned long *src,
+               unsigned int first, unsigned int cut, unsigned int nbits);
+int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+                const unsigned long *bitmap2, unsigned int nbits);
+void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
+                const unsigned long *bitmap2, unsigned int nbits);
+void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
+                 const unsigned long *bitmap2, unsigned int nbits);
+int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+                   const unsigned long *bitmap2, unsigned int nbits);
+void __bitmap_replace(unsigned long *dst,
+                     const unsigned long *old, const unsigned long *new,
+                     const unsigned long *mask, unsigned int nbits);
+int __bitmap_intersects(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
-extern void __bitmap_replace(unsigned long *dst,
-                       const unsigned long *old, const unsigned long *new,
-                       const unsigned long *mask, unsigned int nbits);
-extern int __bitmap_intersects(const unsigned long *bitmap1,
-                       const unsigned long *bitmap2, unsigned int nbits);
-extern int __bitmap_subset(const unsigned long *bitmap1,
-                       const unsigned long *bitmap2, unsigned int nbits);
-extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
-extern void __bitmap_set(unsigned long *map, unsigned int start, int len);
-extern void __bitmap_clear(unsigned long *map, unsigned int start, int len);
-
-extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
-                                                   unsigned long size,
-                                                   unsigned long start,
-                                                   unsigned int nr,
-                                                   unsigned long align_mask,
-                                                   unsigned long align_offset);
+int __bitmap_subset(const unsigned long *bitmap1,
+                   const unsigned long *bitmap2, unsigned int nbits);
+int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+void __bitmap_set(unsigned long *map, unsigned int start, int len);
+void __bitmap_clear(unsigned long *map, unsigned int start, int len);
+
+unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
+                                            unsigned long size,
+                                            unsigned long start,
+                                            unsigned int nr,
+                                            unsigned long align_mask,
+                                            unsigned long align_offset);
 
 /**
  * bitmap_find_next_zero_area - find a contiguous aligned zero area
@@ -190,46 +198,38 @@ bitmap_find_next_zero_area(unsigned long *map,
                                              align_mask, 0);
 }
 
-extern int bitmap_parse(const char *buf, unsigned int buflen,
+int bitmap_parse(const char *buf, unsigned int buflen,
                        unsigned long *dst, int nbits);
-extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
+int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
                        unsigned long *dst, int nbits);
-extern int bitmap_parselist(const char *buf, unsigned long *maskp,
+int bitmap_parselist(const char *buf, unsigned long *maskp,
                        int nmaskbits);
-extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
+int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
                        unsigned long *dst, int nbits);
-extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
+void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new, unsigned int nbits);
-extern int bitmap_bitremap(int oldbit,
+int bitmap_bitremap(int oldbit,
                const unsigned long *old, const unsigned long *new, int bits);
-extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                const unsigned long *relmap, unsigned int bits);
-extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                unsigned int sz, unsigned int nbits);
-extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
-extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
-extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
+int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
+void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
+int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
 #ifdef __BIG_ENDIAN
-extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
+void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
 #else
 #define bitmap_copy_le bitmap_copy
 #endif
-extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
-extern int bitmap_print_to_pagebuf(bool list, char *buf,
+unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
+int bitmap_print_to_pagebuf(bool list, char *buf,
                                   const unsigned long *maskp, int nmaskbits);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-/*
- * The static inlines below do not handle constant nbits==0 correctly,
- * so make such users (should any ever turn up) call the out-of-line
- * versions.
- */
-#define small_const_nbits(nbits) \
-       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
-
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
        unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
@@ -265,9 +265,9 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst,
  * therefore conversion is not needed when copying data from/to arrays of u32.
  */
 #if BITS_PER_LONG == 64
-extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
+void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
                                                        unsigned int nbits);
-extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
+void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
                                                        unsigned int nbits);
 #else
 #define bitmap_from_arr32(bitmap, buf, nbits)                  \
index a5a4830..26bf15e 100644 (file)
@@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
 })
 #endif
 
-#ifndef find_last_bit
-/**
- * find_last_bit - find the last set bit in a memory region
- * @addr: The address to start the search at
- * @size: The number of bits to search
- *
- * Returns the bit number of the last set bit, or size.
- */
-extern unsigned long find_last_bit(const unsigned long *addr,
-                                  unsigned long size);
-#endif
-
 #endif /* __KERNEL__ */
 #endif
index b91ba62..9fb255b 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/minmax.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
-#include <linux/pagemap.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
@@ -327,6 +326,8 @@ enum blk_bounce {
 };
 
 struct queue_limits {
+       unsigned int            bio_max_bytes;
+
        enum blk_bounce         bounce;
        unsigned long           seg_boundary_mask;
        unsigned long           virt_boundary_mask;
index 6023a13..0684151 100644 (file)
@@ -302,10 +302,11 @@ struct bpf_verifier_state_list {
 };
 
 /* Possible states for alu_state member. */
-#define BPF_ALU_SANITIZE_SRC           1U
-#define BPF_ALU_SANITIZE_DST           2U
+#define BPF_ALU_SANITIZE_SRC           (1U << 0)
+#define BPF_ALU_SANITIZE_DST           (1U << 1)
 #define BPF_ALU_NEG_VALUE              (1U << 2)
 #define BPF_ALU_NON_POINTER            (1U << 3)
+#define BPF_ALU_IMMEDIATE              (1U << 4)
 #define BPF_ALU_SANITIZE               (BPF_ALU_SANITIZE_SRC | \
                                         BPF_ALU_SANITIZE_DST)
 
index 6b47f94..e7e99da 100644 (file)
@@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
 struct buffer_head *__bread_gfp(struct block_device *,
                                sector_t block, unsigned size, gfp_t gfp);
 void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(int cpu);
+bool has_bh_in_lru(int cpu, void *dummy);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
@@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
 static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus_cpu(int cpu) {}
+static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
 #define buffer_heads_over_limit 0
 
 #endif /* CONFIG_BLOCK */
index f639bd0..348acf2 100644 (file)
@@ -36,6 +36,9 @@ static inline int is_warning_bug(const struct bug_entry *bug)
        return bug->flags & BUGFLAG_WARNING;
 }
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+                      unsigned int *line);
+
 struct bug_entry *find_bug(unsigned long bugaddr);
 
 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
@@ -58,6 +61,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
        return BUG_TRAP_TYPE_BUG;
 }
 
+struct bug_entry;
+static inline void bug_get_file_line(struct bug_entry *bug, const char **file,
+                                    unsigned int *line)
+{
+       *file = NULL;
+       *line = 0;
+}
 
 static inline void generic_bug_clear_once(void) {}
 
index a247b08..d6ab416 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/of.h>
+#include <linux/clocksource_ids.h>
 #include <asm/div64.h>
 #include <asm/io.h>
 
@@ -62,6 +63,10 @@ struct module;
  *                     400-499: Perfect
  *                             The ideal clocksource. A must-use where
  *                             available.
+ * @id:                        Defaults to CSID_GENERIC. The id value is captured
+ *                     in certain snapshot functions to allow callers to
+ *                     validate the clocksource from which the snapshot was
+ *                     taken.
  * @flags:             Flags describing special properties
  * @enable:            Optional function to enable the clocksource
  * @disable:           Optional function to disable the clocksource
@@ -100,6 +105,7 @@ struct clocksource {
        const char              *name;
        struct list_head        list;
        int                     rating;
+       enum clocksource_ids    id;
        enum vdso_clock_mode    vdso_clock_mode;
        unsigned long           flags;
 
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
new file mode 100644 (file)
index 0000000..16775d7
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOCKSOURCE_IDS_H
+#define _LINUX_CLOCKSOURCE_IDS_H
+
+/* Enum to give clocksources a unique identifier */
+enum clocksource_ids {
+       CSID_GENERIC            = 0,
+       CSID_ARM_ARCH_COUNTER,
+       CSID_MAX,
+};
+
+#endif
index 217999c..53fd8c3 100644 (file)
@@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
                                        unsigned int order_per_bit,
                                        const char *name,
                                        struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
                              bool no_warn);
-extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
+extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
 #endif
index ed4070e..4221888 100644 (file)
@@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order)
 }
 
 #ifdef CONFIG_COMPACTION
-extern int sysctl_compact_memory;
 extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
                        void *buffer, size_t *length, loff_t *ppos);
index acac0b5..98dd7b3 100644 (file)
@@ -75,7 +75,6 @@
        __diag_push();                                                          \
        __diag_ignore(GCC, 8, "-Wattribute-alias",                              \
                      "Type aliasing is used to sanitize syscall arguments");\
-       asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));       \
        asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))        \
                __attribute__((alias(__stringify(__se_compat_sys##name))));     \
        ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO);                         \
index 2e8c69b..97cfd13 100644 (file)
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs.h - definitions for the device driver filesystem
  *
  * Based on sysfs:
index 976ec26..85008a6 100644 (file)
@@ -50,6 +50,7 @@ enum coresight_dev_subtype_sink {
        CORESIGHT_DEV_SUBTYPE_SINK_PORT,
        CORESIGHT_DEV_SUBTYPE_SINK_BUFFER,
        CORESIGHT_DEV_SUBTYPE_SINK_SYSMEM,
+       CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM,
 };
 
 enum coresight_dev_subtype_link {
@@ -455,6 +456,18 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o
 }
 #endif /* CONFIG_64BIT */
 
+static inline bool coresight_is_percpu_source(struct coresight_device *csdev)
+{
+       return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) &&
+              (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC);
+}
+
+static inline bool coresight_is_percpu_sink(struct coresight_device *csdev)
+{
+       return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SINK) &&
+              (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM);
+}
+
 extern struct coresight_device *
 coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
index db82ce5..4a62b39 100644 (file)
@@ -57,7 +57,7 @@ enum cpuhp_state {
        CPUHP_PAGE_ALLOC_DEAD,
        CPUHP_NET_DEV_DEAD,
        CPUHP_PCI_XGENE_DEAD,
-       CPUHP_IOMMU_INTEL_DEAD,
+       CPUHP_IOMMU_IOVA_DEAD,
        CPUHP_LUSTRE_CFS_DEAD,
        CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
        CPUHP_PADATA_DEAD,
@@ -169,6 +169,7 @@ enum cpuhp_state {
        CPUHP_AP_PERF_X86_RAPL_ONLINE,
        CPUHP_AP_PERF_X86_CQM_ONLINE,
        CPUHP_AP_PERF_X86_CSTATE_ONLINE,
+       CPUHP_AP_PERF_X86_IDXD_ONLINE,
        CPUHP_AP_PERF_S390_CF_ONLINE,
        CPUHP_AP_PERF_S390_CFD_ONLINE,
        CPUHP_AP_PERF_S390_SF_ONLINE,
index 13c8dab..674045c 100644 (file)
@@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial);
  * Williams, Ross N., ross<at>ross.net
  * (see URL http://www.ross.net/crc/download/crc_v3.txt).
  */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc);
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc);
 
 #endif /* __CRC8_H_ */
index ac0e5f9..1497132 100644 (file)
@@ -53,7 +53,6 @@ do {                                                  \
                groups_free(group_info);                \
 } while (0)
 
-extern struct group_info init_groups;
 #ifdef CONFIG_MULTIUSER
 extern struct group_info *groups_alloc(int);
 extern void groups_free(struct group_info *);
index c1e4801..9e23d33 100644 (file)
@@ -59,6 +59,7 @@ struct qstr {
 
 extern const struct qstr empty_name;
 extern const struct qstr slash_name;
+extern const struct qstr dotdot_name;
 
 struct dentry_stat_t {
        long nr_dentry;
@@ -300,8 +301,8 @@ char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 extern char *__d_path(const struct path *, const struct path *, char *, int);
 extern char *d_absolute_path(const struct path *, char *, int);
 extern char *d_path(const struct path *, char *, int);
-extern char *dentry_path_raw(struct dentry *, char *, int);
-extern char *dentry_path(struct dentry *, char *, int);
+extern char *dentry_path_raw(const struct dentry *, char *, int);
+extern char *dentry_path(const struct dentry *, char *, int);
 
 /* Allocation counts.. */
 
index 2d3bdcc..21651f9 100644 (file)
@@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
                return 0;
 }
 
-static inline void delayacct_set_flag(int flag)
+static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {
-       if (current->delays)
-               current->delays->flags |= flag;
+       if (p->delays)
+               p->delays->flags |= flag;
 }
 
-static inline void delayacct_clear_flag(int flag)
+static inline void delayacct_clear_flag(struct task_struct *p, int flag)
 {
-       if (current->delays)
-               current->delays->flags &= ~flag;
+       if (p->delays)
+               p->delays->flags &= ~flag;
 }
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
@@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
-       delayacct_set_flag(DELAYACCT_PF_BLKIO);
+       delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
        if (current->delays)
                __delayacct_blkio_start();
 }
@@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p)
 {
        if (p->delays)
                __delayacct_blkio_end(p);
-       delayacct_clear_flag(DELAYACCT_PF_BLKIO);
+       delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
 }
 
 static inline int delayacct_add_tsk(struct taskstats *d,
@@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void)
 }
 
 #else
-static inline void delayacct_set_flag(int flag)
+static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {}
-static inline void delayacct_clear_flag(int flag)
+static inline void delayacct_clear_flag(struct task_struct *p, int flag)
 {}
 static inline void delayacct_init(void)
 {}
index 5c641f9..ff700fb 100644 (file)
@@ -574,11 +574,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md,
  */
 void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
 
-/*
- * A wrapper around vmalloc.
- */
-void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
-
 /*-----------------------------------------------------------------
  * Macros.
  *---------------------------------------------------------------*/
index 706b68d..6e75a2d 100644 (file)
@@ -40,6 +40,8 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
 void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
                struct iommu_domain *domain);
 
+extern bool iommu_dma_forcedac;
+
 #else /* CONFIG_IOMMU_DMA */
 
 struct iommu_domain;
@@ -81,10 +83,5 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he
 {
 }
 
-static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
-               struct iommu_domain *domain)
-{
-}
-
 #endif /* CONFIG_IOMMU_DMA */
 #endif /* __DMA_IOMMU_H */
index 51872e7..0d53a96 100644 (file)
@@ -22,6 +22,11 @@ struct dma_map_ops {
                        gfp_t gfp);
        void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
                        dma_addr_t dma_handle, enum dma_data_direction dir);
+       struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size,
+                       enum dma_data_direction dir, gfp_t gfp,
+                       unsigned long attrs);
+       void (*free_noncontiguous)(struct device *dev, size_t size,
+                       struct sg_table *sgt, enum dma_data_direction dir);
        int (*mmap)(struct device *, struct vm_area_struct *,
                        void *, dma_addr_t, size_t, unsigned long attrs);
 
@@ -198,6 +203,20 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_DMA_DECLARE_COHERENT */
 
+/*
+ * This is the actual return value from the ->alloc_noncontiguous method.
+ * The users of the DMA API should only care about the sg_table, but to make
+ * the DMA-API internal vmapping and freeing easier we stash away the page
+ * array as well (except for the fallback case).  This can go away any time,
+ * e.g. when a vmap-variant that takes a scatterlist comes along.
+ */
+struct dma_sgt_handle {
+       struct sg_table sgt;
+       struct page **pages;
+};
+#define sgt_handle(sgt) \
+       container_of((sgt), struct dma_sgt_handle, sgt)
+
 int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
                void *cpu_addr, dma_addr_t dma_addr, size_t size,
                unsigned long attrs);
index 2a984cb..183e710 100644 (file)
@@ -95,7 +95,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
        debug_dma_mapping_error(dev, dma_addr);
 
-       if (dma_addr == DMA_MAPPING_ERROR)
+       if (unlikely(dma_addr == DMA_MAPPING_ERROR))
                return -ENOMEM;
        return 0;
 }
@@ -144,6 +144,15 @@ u64 dma_get_required_mask(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 unsigned long dma_get_merge_boundary(struct device *dev);
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+               enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
+void dma_free_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt, enum dma_data_direction dir);
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt);
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr);
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+               size_t size, struct sg_table *sgt);
 #else /* CONFIG_HAS_DMA */
 static inline dma_addr_t dma_map_page_attrs(struct device *dev,
                struct page *page, size_t offset, size_t size,
@@ -257,12 +266,37 @@ static inline unsigned long dma_get_merge_boundary(struct device *dev)
 {
        return 0;
 }
+static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev,
+               size_t size, enum dma_data_direction dir, gfp_t gfp,
+               unsigned long attrs)
+{
+       return NULL;
+}
+static inline void dma_free_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt, enum dma_data_direction dir)
+{
+}
+static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt)
+{
+       return NULL;
+}
+static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+}
+static inline int dma_mmap_noncontiguous(struct device *dev,
+               struct vm_area_struct *vma, size_t size, struct sg_table *sgt)
+{
+       return -EINVAL;
+}
 #endif /* CONFIG_HAS_DMA */
 
 struct page *dma_alloc_pages(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
 void dma_free_pages(struct device *dev, size_t size, struct page *page,
                dma_addr_t dma_handle, enum dma_data_direction dir);
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+               size_t size, struct page *page);
 
 static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
@@ -401,7 +435,6 @@ static inline void dma_sync_sgtable_for_device(struct device *dev,
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
                dma_addr_t *dma_handle, gfp_t gfp)
 {
-
        return dma_alloc_attrs(dev, size, dma_handle, gfp,
                        (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0);
 }
index c6cc0a5..5487a80 100644 (file)
@@ -168,7 +168,7 @@ struct f2fs_checkpoint {
        unsigned char alloc_type[MAX_ACTIVE_LOGS];
 
        /* SIT and NAT version bitmap */
-       unsigned char sit_nat_version_bitmap[1];
+       unsigned char sit_nat_version_bitmap[];
 } __packed;
 
 #define CP_CHKSUM_OFFSET       4092    /* default chksum offset in checkpoint */
index 2259827..2de2e46 100644 (file)
@@ -92,23 +92,20 @@ extern void put_unused_fd(unsigned int fd);
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-extern int __receive_fd(int fd, struct file *file, int __user *ufd,
+extern int __receive_fd(struct file *file, int __user *ufd,
                        unsigned int o_flags);
 static inline int receive_fd_user(struct file *file, int __user *ufd,
                                  unsigned int o_flags)
 {
        if (ufd == NULL)
                return -EFAULT;
-       return __receive_fd(-1, file, ufd, o_flags);
+       return __receive_fd(file, ufd, o_flags);
 }
 static inline int receive_fd(struct file *file, unsigned int o_flags)
 {
-       return __receive_fd(-1, file, NULL, o_flags);
-}
-static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags)
-{
-       return __receive_fd(fd, file, NULL, o_flags);
+       return __receive_fd(file, NULL, o_flags);
 }
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
 extern void flush_delayed_fput(void);
 extern void __fput_sync(struct file *);
index 12766ed..c3c88fd 100644 (file)
@@ -145,7 +145,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* Expect random access pattern */
 #define FMODE_RANDOM           ((__force fmode_t)0x1000)
 
-/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
 #define FMODE_UNSIGNED_OFFSET  ((__force fmode_t)0x2000)
 
 /* File is opened with O_PATH; almost nothing can be done with it */
@@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
  * @nrpages: Number of page entries, protected by the i_pages lock.
- * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
  * @writeback_index: Writeback starts here.
  * @a_ops: Methods.
  * @flags: Error bits and flags (AS_*).
@@ -463,7 +462,6 @@ struct address_space {
        struct rb_root_cached   i_mmap;
        struct rw_semaphore     i_mmap_rwsem;
        unsigned long           nrpages;
-       unsigned long           nrexceptional;
        pgoff_t                 writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long           flags;
index 86e5028..a69f363 100644 (file)
@@ -33,7 +33,7 @@
 /*
  * If the arch's mcount caller does not support all of ftrace's
  * features, then it must call an indirect function that
- * does. Or at least does enough to prevent any unwelcomed side effects.
+ * does. Or at least does enough to prevent any unwelcome side effects.
  */
 #if !ARCH_SUPPORTS_FTRACE_OPS
 # define FTRACE_FORCE_LIST_FUNC 1
@@ -389,7 +389,7 @@ DECLARE_PER_CPU(int, disable_stack_tracer);
  */
 static inline void stack_tracer_disable(void)
 {
-       /* Preemption or interupts must be disabled */
+       /* Preemption or interrupts must be disabled */
        if (IS_ENABLED(CONFIG_DEBUG_PREEMPT))
                WARN_ON_ONCE(!preempt_count() || !irqs_disabled());
        this_cpu_inc(disable_stack_tracer);
index 6cb8230..939b1a8 100644 (file)
@@ -404,4 +404,3 @@ s_fields                                                            \
 
 /* }}}1 */
 #endif /* GENL_MAGIC_FUNC_H */
-/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */
index 35d21fd..f81d489 100644 (file)
@@ -283,4 +283,3 @@ enum {                                                                      \
 
 /* }}}1 */
 #endif /* GENL_MAGIC_STRUCT_H */
-/* vim: set foldmethod=marker nofoldenable : */
index 26f4d90..11da8af 100644 (file)
@@ -490,7 +490,7 @@ static inline int gfp_zonelist(gfp_t flags)
 
 /*
  * We get the zone list from the current node and the gfp_mask.
- * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
+ * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
  * There are two zonelists per node, one for all zones with memory and
  * one containing just zones from the node the zonelist belongs to.
  *
@@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end,
 extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
                                       int nid, nodemask_t *nodemask);
 #endif
-void free_contig_range(unsigned long pfn, unsigned int nr_pages);
+void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
 #ifdef CONFIG_CMA
 /* CMA stuff */
index ecf0032..3a26878 100644 (file)
@@ -227,7 +227,7 @@ struct gpio_irq_chip {
        /**
         * @valid_mask:
         *
-        * If not %NULL holds bitmask of GPIOs which are valid to be included
+        * If not %NULL, holds bitmask of GPIOs which are valid to be included
         * in IRQ domain of the chip.
         */
        unsigned long *valid_mask;
@@ -346,7 +346,7 @@ struct gpio_irq_chip {
  *     output.
  *
  * A gpio_chip can help platforms abstract various sources of GPIOs so
- * they can all be accessed through a common programing interface.
+ * they can all be accessed through a common programming interface.
  * Example sources would be SOC controllers, FPGAs, multifunction
  * chips, dedicated GPIO expanders, and so on.
  *
@@ -435,15 +435,15 @@ struct gpio_chip {
        /**
         * @valid_mask:
         *
-        * If not %NULL holds bitmask of GPIOs which are valid to be used
+        * If not %NULL, holds bitmask of GPIOs which are valid to be used
         * from the chip.
         */
        unsigned long *valid_mask;
 
 #if defined(CONFIG_OF_GPIO)
        /*
-        * If CONFIG_OF is enabled, then all GPIO controllers described in the
-        * device tree automatically may have an OF translation
+        * If CONFIG_OF_GPIO is enabled, then all GPIO controllers described in
+        * the device tree automatically may have an OF translation
         */
 
        /**
@@ -508,7 +508,7 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
  * for GPIOs will fail rudely.
  *
  * gpiochip_add_data() must only be called after gpiolib initialization,
- * ie after core_initcall().
+ * i.e. after core_initcall().
  *
  * If gc->base is negative, this requests dynamic assignment of
  * a range of valid GPIOs.
index 44170f3..832b49b 100644 (file)
@@ -332,4 +332,22 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
        kunmap_local(to);
 }
 
+/**
+ * memzero_page - zero a byte range inside a single page
+ * @page: page to clear
+ * @offset: byte offset into @page at which zeroing starts
+ * @len: number of bytes to zero
+ *
+ * Maps @page with the kmap_local interface, matching memcpy_to_page()
+ * above, instead of the kmap_atomic() interface that highmem.h marks as
+ * deprecated.  The mapping is dropped again before returning.
+ */
+static inline void memzero_page(struct page *page, size_t offset, size_t len)
+{
+       char *addr = kmap_local_page(page);
+
+       memset(addr + offset, 0, len);
+       kunmap_local(addr);
+}
+
 #endif /* _LINUX_HIGHMEM_H */
index ba973ef..9626fda 100644 (file)
@@ -87,9 +87,6 @@ enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
-#ifdef CONFIG_DEBUG_VM
-       TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
-#endif
 };
 
 struct kobject;
index cccd1aa..b92f25c 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kref.h>
 #include <linux/pgtable.h>
 #include <linux/gfp.h>
+#include <linux/userfaultfd_k.h>
 
 struct ctl_table;
 struct user_struct;
@@ -134,11 +135,14 @@ void hugetlb_show_meminfo(void);
 unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
+#ifdef CONFIG_USERFAULTFD
 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
                                struct vm_area_struct *dst_vma,
                                unsigned long dst_addr,
                                unsigned long src_addr,
+                               enum mcopy_atomic_mode mode,
                                struct page **pagep);
+#endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
@@ -152,7 +156,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud);
 
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
@@ -161,7 +166,7 @@ extern struct list_head huge_boot_pages;
 
 /* arch callbacks */
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz);
 pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
@@ -187,6 +192,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -308,16 +314,19 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
        BUG();
 }
 
+#ifdef CONFIG_USERFAULTFD
 static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                                                pte_t *dst_pte,
                                                struct vm_area_struct *dst_vma,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
+                                               enum mcopy_atomic_mode mode,
                                                struct page **pagep)
 {
        BUG();
        return 0;
 }
+#endif /* CONFIG_USERFAULTFD */
 
 static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                        unsigned long sz)
@@ -368,6 +377,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
        return 0;
 }
 
+static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
@@ -555,6 +566,7 @@ HPAGEFLAG(Freed, freed)
 #define HSTATE_NAME_LEN 32
 /* Defines one hugetlb page size */
 struct hstate {
+       struct mutex resize_lock;
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;
@@ -583,6 +595,7 @@ struct huge_bootmem_page {
        struct hstate *hstate;
 };
 
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
@@ -865,6 +878,12 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 #else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
+static inline int isolate_or_dissolve_huge_page(struct page *page,
+                                               struct list_head *list)
+{
+       return -ENOMEM;
+}
+
 static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
@@ -1039,4 +1058,14 @@ static inline __init void hugetlb_cma_check(void)
 }
 #endif
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
+
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
+#endif
+
 #endif /* _LINUX_HUGETLB_H */
index b2412b4..40fc581 100644 (file)
@@ -25,7 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 extern struct nsproxy init_nsproxy;
-extern struct group_info init_groups;
 extern struct cred init_cred;
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
index 85c1571..1bbe9af 100644 (file)
@@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long);
 
 #ifdef CONFIG_BLK_DEV_INITRD
 extern void __init reserve_initrd_mem(void);
+extern void wait_for_initramfs(void);
 #else
 static inline void __init reserve_initrd_mem(void) {}
+static inline void wait_for_initramfs(void) {}
 #endif
 
 extern phys_addr_t phys_initrd_start;
index 1bc46b8..03faf20 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/dmar.h>
 #include <linux/ioasid.h>
+#include <linux/bitfield.h>
 
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
@@ -80,6 +81,7 @@
 #define DMAR_IQ_SHIFT  4       /* Invalidation queue head/tail shift */
 #define DMAR_IQA_REG   0x90    /* Invalidation queue addr register */
 #define DMAR_ICS_REG   0x9c    /* Invalidation complete status register */
+#define DMAR_IQER_REG  0xb0    /* Invalidation queue error record register */
 #define DMAR_IRTA_REG  0xb8    /* Interrupt remapping table addr register */
 #define DMAR_PQH_REG   0xc0    /* Page request queue head register */
 #define DMAR_PQT_REG   0xc8    /* Page request queue tail register */
 #define DMAR_VCMD_REG          0xe10 /* Virtual command register */
 #define DMAR_VCRSP_REG         0xe20 /* Virtual command response register */
 
+#define DMAR_IQER_REG_IQEI(reg)                FIELD_GET(GENMASK_ULL(3, 0), reg)
+#define DMAR_IQER_REG_ITESID(reg)      FIELD_GET(GENMASK_ULL(47, 32), reg)
+#define DMAR_IQER_REG_ICESID(reg)      FIELD_GET(GENMASK_ULL(63, 48), reg)
+
 #define OFFSET_STRIDE          (9)
 
 #define dmar_readq(a) readq(a)
@@ -372,6 +378,7 @@ enum {
 /* PASID cache invalidation granu */
 #define QI_PC_ALL_PASIDS       0
 #define QI_PC_PASID_SEL                1
+#define QI_PC_GLOBAL           3
 
 #define QI_EIOTLB_ADDR(addr)   ((u64)(addr) & VTD_PAGE_MASK)
 #define QI_EIOTLB_IH(ih)       (((u64)ih) << 6)
@@ -763,14 +770,11 @@ u32 intel_svm_get_pasid(struct iommu_sva *handle);
 int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt,
                            struct iommu_page_response *msg);
 
-struct svm_dev_ops;
-
 struct intel_svm_dev {
        struct list_head list;
        struct rcu_head rcu;
        struct device *dev;
        struct intel_iommu *iommu;
-       struct svm_dev_ops *ops;
        struct iommu_sva sva;
        u32 pasid;
        int users;
index 39d368a..10fa80e 100644 (file)
@@ -8,29 +8,12 @@
 #ifndef __INTEL_SVM_H__
 #define __INTEL_SVM_H__
 
-struct device;
-
-struct svm_dev_ops {
-       void (*fault_cb)(struct device *dev, u32 pasid, u64 address,
-                        void *private, int rwxp, int response);
-};
-
 /* Values for rxwp in fault_cb callback */
 #define SVM_REQ_READ   (1<<3)
 #define SVM_REQ_WRITE  (1<<2)
 #define SVM_REQ_EXEC   (1<<1)
 #define SVM_REQ_PRIV   (1<<0)
 
-/*
- * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main"
- * PASID for the current process. Even if a PASID already exists, a new one
- * will be allocated. And the PASID allocated with SVM_FLAG_PRIVATE_PASID
- * will not be given to subsequent callers. This facility allows a driver to
- * disambiguate between multiple device contexts which access the same MM,
- * if there is no other way to do so. It should be used sparingly, if at all.
- */
-#define SVM_FLAG_PRIVATE_PASID         (1<<0)
-
 /*
  * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only
  * for access to kernel addresses. No IOTLB flushes are automatically done
@@ -42,18 +25,18 @@ struct svm_dev_ops {
  * It is unlikely that we will ever hook into flush_tlb_kernel_range() to
  * do such IOTLB flushes automatically.
  */
-#define SVM_FLAG_SUPERVISOR_MODE       (1<<1)
+#define SVM_FLAG_SUPERVISOR_MODE       BIT(0)
 /*
  * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest
  * processes. Compared to the host bind, the primary differences are:
  * 1. mm life cycle management
  * 2. fault reporting
  */
-#define SVM_FLAG_GUEST_MODE            (1<<2)
+#define SVM_FLAG_GUEST_MODE            BIT(1)
 /*
  * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space,
  * which requires guest and host PASID translation at both directions.
  */
-#define SVM_FLAG_GUEST_PASID           (1<<3)
+#define SVM_FLAG_GUEST_PASID           BIT(2)
 
 #endif /* __INTEL_SVM_H__ */
index a4c9ca2..4d40dfa 100644 (file)
@@ -204,10 +204,6 @@ struct io_pgtable {
 
 #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
 
-struct io_pgtable_domain_attr {
-       unsigned long quirks;
-};
-
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
        if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_all)
index d202fd2..c87d0cb 100644 (file)
@@ -198,7 +198,6 @@ struct iomap_ioend {
        struct inode            *io_inode;      /* file being written to */
        size_t                  io_size;        /* size of the extent */
        loff_t                  io_offset;      /* offset in the file */
-       void                    *io_private;    /* file system private data */
        struct bio              *io_bio;        /* bio being built */
        struct bio              io_inline_bio;  /* MUST BE LAST! */
 };
@@ -234,9 +233,7 @@ struct iomap_writepage_ctx {
 
 void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
-               struct list_head *more_ioends,
-               void (*merge_private)(struct iomap_ioend *ioend,
-                               struct iomap_ioend *next));
+               struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
 int iomap_writepage(struct page *page, struct writeback_control *wbc,
                struct iomap_writepage_ctx *wpc,
index 5e7fe51..32d4480 100644 (file)
@@ -96,32 +96,6 @@ enum iommu_cap {
        IOMMU_CAP_NOEXEC,               /* IOMMU_NOEXEC flag */
 };
 
-/*
- * Following constraints are specifc to FSL_PAMUV1:
- *  -aperture must be power of 2, and naturally aligned
- *  -number of windows must be power of 2, and address space size
- *   of each window is determined by aperture size / # of windows
- *  -the actual size of the mapped region of a window must be power
- *   of 2 starting with 4KB and physical address must be naturally
- *   aligned.
- * DOMAIN_ATTR_FSL_PAMUV1 corresponds to the above mentioned contraints.
- * The caller can invoke iommu_domain_get_attr to check if the underlying
- * iommu implementation supports these constraints.
- */
-
-enum iommu_attr {
-       DOMAIN_ATTR_GEOMETRY,
-       DOMAIN_ATTR_PAGING,
-       DOMAIN_ATTR_WINDOWS,
-       DOMAIN_ATTR_FSL_PAMU_STASH,
-       DOMAIN_ATTR_FSL_PAMU_ENABLE,
-       DOMAIN_ATTR_FSL_PAMUV1,
-       DOMAIN_ATTR_NESTING,    /* two stages of translation */
-       DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
-       DOMAIN_ATTR_IO_PGTABLE_CFG,
-       DOMAIN_ATTR_MAX,
-};
-
 /* These are the possible reserved region types */
 enum iommu_resv_type {
        /* Memory regions which must be mapped 1:1 at all times */
@@ -156,10 +130,24 @@ struct iommu_resv_region {
        enum iommu_resv_type    type;
 };
 
-/* Per device IOMMU features */
+/**
+ * enum iommu_dev_features - Per device IOMMU features
+ * @IOMMU_DEV_FEAT_AUX: Auxiliary domain feature
+ * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses
+ * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. Generally
+ *                      enabling %IOMMU_DEV_FEAT_SVA requires
+ *                      %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page
+ *                      Faults themselves instead of relying on the IOMMU. When
+ *                      supported, this feature must be enabled before and
+ *                      disabled after %IOMMU_DEV_FEAT_SVA.
+ *
+ * Device drivers query whether a feature is supported using
+ * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature().
+ */
 enum iommu_dev_features {
-       IOMMU_DEV_FEAT_AUX,     /* Aux-domain feature */
-       IOMMU_DEV_FEAT_SVA,     /* Shared Virtual Addresses */
+       IOMMU_DEV_FEAT_AUX,
+       IOMMU_DEV_FEAT_SVA,
+       IOMMU_DEV_FEAT_IOPF,
 };
 
 #define IOMMU_PASID_INVALID    (-1U)
@@ -203,13 +191,11 @@ struct iommu_iotlb_gather {
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
  *                  group and attached to the groups domain
  * @device_group: find iommu group for a particular device
- * @domain_get_attr: Query domain attributes
- * @domain_set_attr: Change domain attributes
+ * @enable_nesting: Enable nesting
+ * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @get_resv_regions: Request list of reserved regions for a device
  * @put_resv_regions: Free list of reserved regions for a device
  * @apply_resv_region: Temporary helper call-back for iova reserved ranges
- * @domain_window_enable: Configure and enable a particular window for a domain
- * @domain_window_disable: Disable a particular window for a domain
  * @of_xlate: add OF master IDs to iommu grouping
  * @is_attach_deferred: Check if domain attach should be deferred from iommu
  *                      driver init to device driver init (default no)
@@ -255,10 +241,9 @@ struct iommu_ops {
        void (*release_device)(struct device *dev);
        void (*probe_finalize)(struct device *dev);
        struct iommu_group *(*device_group)(struct device *dev);
-       int (*domain_get_attr)(struct iommu_domain *domain,
-                              enum iommu_attr attr, void *data);
-       int (*domain_set_attr)(struct iommu_domain *domain,
-                              enum iommu_attr attr, void *data);
+       int (*enable_nesting)(struct iommu_domain *domain);
+       int (*set_pgtable_quirks)(struct iommu_domain *domain,
+                                 unsigned long quirks);
 
        /* Request/Free a list of reserved regions for a device */
        void (*get_resv_regions)(struct device *dev, struct list_head *list);
@@ -267,11 +252,6 @@ struct iommu_ops {
                                  struct iommu_domain *domain,
                                  struct iommu_resv_region *region);
 
-       /* Window handling functions */
-       int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
-                                   phys_addr_t paddr, u64 size, int prot);
-       void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr);
-
        int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
        bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
 
@@ -353,6 +333,7 @@ struct iommu_fault_param {
  * struct dev_iommu - Collection of per-device IOMMU data
  *
  * @fault_param: IOMMU detected device fault reporting data
+ * @iopf_param:         I/O Page Fault queue and data
  * @fwspec:     IOMMU fwspec data
  * @iommu_dev:  IOMMU device this device is linked to
  * @priv:       IOMMU Driver private data
@@ -363,12 +344,15 @@ struct iommu_fault_param {
 struct dev_iommu {
        struct mutex lock;
        struct iommu_fault_param        *fault_param;
+       struct iopf_device_param        *iopf_param;
        struct iommu_fwspec             *fwspec;
        struct iommu_device             *iommu_dev;
        void                            *priv;
 };
 
-int  iommu_device_register(struct iommu_device *iommu);
+int iommu_device_register(struct iommu_device *iommu,
+                         const struct iommu_ops *ops,
+                         struct device *hwdev);
 void iommu_device_unregister(struct iommu_device *iommu);
 int  iommu_device_sysfs_add(struct iommu_device *iommu,
                            struct device *parent,
@@ -379,25 +363,6 @@ int  iommu_device_link(struct iommu_device   *iommu, struct device *link);
 void iommu_device_unlink(struct iommu_device *iommu, struct device *link);
 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain);
 
-static inline void __iommu_device_set_ops(struct iommu_device *iommu,
-                                         const struct iommu_ops *ops)
-{
-       iommu->ops = ops;
-}
-
-#define iommu_device_set_ops(iommu, ops)                               \
-do {                                                                   \
-       struct iommu_ops *__ops = (struct iommu_ops *)(ops);            \
-       __ops->owner = THIS_MODULE;                                     \
-       __iommu_device_set_ops(iommu, __ops);                           \
-} while (0)
-
-static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
-                                          struct fwnode_handle *fwnode)
-{
-       iommu->fwnode = fwnode;
-}
-
 static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
 {
        return (struct iommu_device *)dev_get_drvdata(dev);
@@ -507,15 +472,12 @@ extern int iommu_page_response(struct device *dev,
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
-extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
-                                void *data);
-extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
-                                void *data);
+int iommu_enable_nesting(struct iommu_domain *domain);
+int iommu_set_pgtable_quirks(struct iommu_domain *domain,
+               unsigned long quirks);
 
-/* Window handling function prototypes */
-extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-                                     phys_addr_t offset, u64 size,
-                                     int prot);
+void iommu_set_dma_strict(bool val);
+bool iommu_get_dma_strict(struct iommu_domain *domain);
 
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
                              unsigned long iova, int flags);
@@ -547,7 +509,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
         * structure can be rewritten.
         */
        if (gather->pgsize != size ||
-           end < gather->start || start > gather->end) {
+           end + 1 < gather->start || start > gather->end + 1) {
                if (gather->pgsize)
                        iommu_iotlb_sync(domain, gather);
                gather->pgsize = size;
@@ -571,8 +533,7 @@ struct iommu_group *fsl_mc_device_group(struct device *dev);
  * struct iommu_fwspec - per-device IOMMU instance data
  * @ops: ops for this device's IOMMU
  * @iommu_fwnode: firmware handle for this device's IOMMU
- * @iommu_priv: IOMMU driver private data for this device
- * @num_pasid_bits: number of PASID bits supported by this device
+ * @flags: IOMMU_FWSPEC_* flags
  * @num_ids: number of associated device IDs
  * @ids: IDs which this device may present to the IOMMU
  */
@@ -580,7 +541,6 @@ struct iommu_fwspec {
        const struct iommu_ops  *ops;
        struct fwnode_handle    *iommu_fwnode;
        u32                     flags;
-       u32                     num_pasid_bits;
        unsigned int            num_ids;
        u32                     ids[];
 };
@@ -742,13 +702,6 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain,
 {
 }
 
-static inline int iommu_domain_window_enable(struct iommu_domain *domain,
-                                            u32 wnd_nr, phys_addr_t paddr,
-                                            u64 size, int prot)
-{
-       return -ENODEV;
-}
-
 static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 {
        return 0;
@@ -889,33 +842,19 @@ static inline int iommu_group_id(struct iommu_group *group)
        return -ENODEV;
 }
 
-static inline int iommu_domain_get_attr(struct iommu_domain *domain,
-                                       enum iommu_attr attr, void *data)
-{
-       return -EINVAL;
-}
-
-static inline int iommu_domain_set_attr(struct iommu_domain *domain,
-                                       enum iommu_attr attr, void *data)
+static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain,
+               unsigned long quirks)
 {
-       return -EINVAL;
+       return 0;
 }
 
-static inline int  iommu_device_register(struct iommu_device *iommu)
+static inline int iommu_device_register(struct iommu_device *iommu,
+                                       const struct iommu_ops *ops,
+                                       struct device *hwdev)
 {
        return -ENODEV;
 }
 
-static inline void iommu_device_set_ops(struct iommu_device *iommu,
-                                       const struct iommu_ops *ops)
-{
-}
-
-static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
-                                          struct fwnode_handle *fwnode)
-{
-}
-
 static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
 {
        return NULL;
index c834c01..71d8a2d 100644 (file)
@@ -95,6 +95,7 @@ struct iova_domain {
                                                   flush-queues */
        atomic_t fq_timer_on;                   /* 1 when timer is active, 0
                                                   when not */
+       struct hlist_node       cpuhp_dead;
 };
 
 static inline unsigned long iova_size(struct iova *iova)
@@ -156,7 +157,6 @@ int init_iova_flush_queue(struct iova_domain *iovad,
                          iova_flush_cb flush_cb, iova_entry_dtor entry_dtor);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
-void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 #else
 static inline int iova_cache_get(void)
 {
@@ -233,10 +233,6 @@ static inline void put_iova_domain(struct iova_domain *iovad)
 {
 }
 
-static inline void free_cpu_cached_iovas(unsigned int cpu,
-                                        struct iova_domain *iovad)
-{
-}
 #endif
 
 #endif
index 7a1dd7b..62a8e3d 100644 (file)
@@ -256,11 +256,11 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
                                    irq_hw_number_t hwirq_max, int direct_max,
                                    const struct irq_domain_ops *ops,
                                    void *host_data);
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
-                                        unsigned int size,
-                                        unsigned int first_irq,
-                                        const struct irq_domain_ops *ops,
-                                        void *host_data);
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+                                           unsigned int size,
+                                           unsigned int first_irq,
+                                           const struct irq_domain_ops *ops,
+                                           void *host_data);
 struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
                                         unsigned int size,
                                         unsigned int first_irq,
@@ -325,6 +325,15 @@ static inline struct irq_domain *irq_find_host(struct device_node *node)
        return d;
 }
 
+static inline struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
+                                                      unsigned int size,
+                                                      unsigned int first_irq,
+                                                      const struct irq_domain_ops *ops,
+                                                      void *host_data)
+{
+       return irq_domain_create_simple(of_node_to_fwnode(of_node), size, first_irq, ops, host_data);
+}
+
 /**
  * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
  * @of_node: pointer to interrupt controller's device tree node.
index 2f9d154..15d8bad 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_KERNEL_H
 
 #include <stdarg.h>
+#include <linux/align.h>
 #include <linux/limits.h>
 #include <linux/linkage.h>
 #include <linux/stddef.h>
  */
 #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
 
-/* @a is a power of 2 value */
-#define ALIGN(x, a)            __ALIGN_KERNEL((x), (a))
-#define ALIGN_DOWN(x, a)       __ALIGN_KERNEL((x) - ((a) - 1), (a))
-#define __ALIGN_MASK(x, mask)  __ALIGN_KERNEL_MASK((x), (mask))
-#define PTR_ALIGN(p, a)                ((typeof(p))ALIGN((unsigned long)(p), (a)))
-#define PTR_ALIGN_DOWN(p, a)   ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
-#define IS_ALIGNED(x, a)               (((x) & ((typeof(x))(a) - 1)) == 0)
-
 /* generic data direction definitions */
 #define READ                   0
 #define WRITE                  1
index 1b65e72..8895b95 100644 (file)
@@ -192,8 +192,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr);
 
@@ -218,6 +218,20 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+struct kvm_gfn_range {
+       struct kvm_memory_slot *slot;
+       gfn_t start;
+       gfn_t end;
+       pte_t pte;
+       bool may_block;
+};
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+#endif
+
 enum {
        OUTSIDE_GUEST_MODE,
        IN_GUEST_MODE,
@@ -640,6 +654,7 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
+bool file_is_kvm(struct file *file);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
@@ -886,7 +901,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
+                                       const struct kvm_memory_slot *memslot);
 #else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
@@ -945,6 +960,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 
@@ -1116,7 +1132,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 }
 
 static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
        return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 }
index 61f04f7..04c0179 100644 (file)
@@ -59,6 +59,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
         struct fs_parameter *param)
 LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
+LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
 LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
index ba2ccd9..5c4c5c0 100644 (file)
  *     allocated.
  *     @sb contains the super_block structure to be modified.
  *     Return 0 if operation was successful.
+ * @sb_delete:
+ *     Release objects tied to a superblock (e.g. inodes).
+ *     @sb contains the super_block structure being released.
  * @sb_free_security:
  *     Deallocate and clear the sb->s_security field.
  *     @sb contains the super_block structure to be modified.
@@ -1585,6 +1588,7 @@ struct lsm_blob_sizes {
        int     lbs_cred;
        int     lbs_file;
        int     lbs_inode;
+       int     lbs_superblock;
        int     lbs_ipc;
        int     lbs_msg_msg;
        int     lbs_task;
index 5904716..c193be7 100644 (file)
@@ -114,12 +114,13 @@ struct batched_lruvec_stat {
 };
 
 /*
- * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
- * which have elements charged to this memcg.
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to this memcg.
  */
-struct memcg_shrinker_map {
+struct shrinker_info {
        struct rcu_head rcu;
-       unsigned long map[];
+       atomic_long_t *nr_deferred;
+       unsigned long *map;
 };
 
 /*
@@ -145,7 +146,7 @@ struct mem_cgroup_per_node {
 
        struct mem_cgroup_reclaim_iter  iter;
 
-       struct memcg_shrinker_map __rcu *shrinker_map;
+       struct shrinker_info __rcu      *shrinker_info;
 
        struct rb_node          tree_node;      /* RB tree node */
        unsigned long           usage_in_excess;/* Set to the value by which */
@@ -1610,10 +1611,10 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
        return false;
 }
 
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-                                  int nid, int shrinker_id);
+int alloc_shrinker_info(struct mem_cgroup *memcg);
+void free_shrinker_info(struct mem_cgroup *memcg);
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
+void reparent_shrinker_deferred(struct mem_cgroup *memcg);
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1623,8 +1624,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
        return false;
 }
 
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-                                         int nid, int shrinker_id)
+static inline void set_shrinker_bit(struct mem_cgroup *memcg,
+                                   int nid, int shrinker_id)
 {
 }
 #endif
index 4da95e6..97e92e8 100644 (file)
@@ -29,6 +29,11 @@ struct memory_block {
        int online_type;                /* for passing data to online routine */
        int nid;                        /* NID for this memory block */
        struct device dev;
+       /*
+        * Number of vmemmap pages. These pages
+        * lie at the beginning of the memory block.
+        */
+       unsigned long nr_vmemmap_pages;
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
-int create_memory_block_devices(unsigned long start, unsigned long size);
+int create_memory_block_devices(unsigned long start, unsigned long size,
+                               unsigned long vmemmap_pages);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
index 7288aa5..28f32fd 100644 (file)
@@ -55,6 +55,14 @@ typedef int __bitwise mhp_t;
  */
 #define MHP_MERGE_RESOURCE     ((__force mhp_t)BIT(0))
 
+/*
+ * We want memmap (struct page array) to be self contained.
+ * To do so, we will use the beginning of the hot-added range to build
+ * the page tables for the memmap array that describes the entire range.
+ * Only selected architectures support it with SPARSE_VMEMMAP.
+ */
+#define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
+
 /*
  * Extended parameters for memory hotplug:
  * altmap: alternative allocator for memmap array (optional)
@@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone)
 extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+extern void adjust_present_page_count(struct zone *zone, long nr_pages);
 /* VM interface that may be used by firmware interface */
+extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+                                    struct zone *zone);
+extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-                       int online_type, int nid);
+                       struct zone *zone);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
                                         unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
                                      struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
index f5b464d..45a79da 100644 (file)
@@ -17,7 +17,7 @@ struct device;
  * @alloc: track pages consumed, private to vmemmap_populate()
  */
 struct vmem_altmap {
-       const unsigned long base_pfn;
+       unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
index 3a38963..4bb4e51 100644 (file)
@@ -27,6 +27,7 @@ enum migrate_reason {
        MR_MEMPOLICY_MBIND,
        MR_NUMA_MISPLACED,
        MR_CONTIG_RANGE,
+       MR_LONGTERM_PIN,
        MR_TYPES
 };
 
@@ -43,10 +44,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
                unsigned long private, enum migrate_mode mode, int reason);
 extern struct page *alloc_migration_target(struct page *page, unsigned long private);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
-extern void putback_movable_page(struct page *page);
 
-extern void migrate_prep(void);
-extern void migrate_prep_local(void);
 extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
@@ -66,9 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page,
 static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
        { return -EBUSY; }
 
-static inline int migrate_prep(void) { return -ENOSYS; }
-static inline int migrate_prep_local(void) { return -ENOSYS; }
-
 static inline void migrate_page_states(struct page *newpage, struct page *page)
 {
 }
index 4e531c2..f8e8d7e 100644 (file)
@@ -1236,7 +1236,7 @@ enum {
        MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
 };
 
-static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev)
+static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev)
 {
        struct devlink *devlink = priv_to_devlink(dev);
        union devlink_param_value val;
index 52b7cab..6d16eed 100644 (file)
@@ -133,6 +133,7 @@ enum {
        MLX5_CMD_OP_PAGE_FAULT_RESUME             = 0x204,
        MLX5_CMD_OP_ALLOC_MEMIC                   = 0x205,
        MLX5_CMD_OP_DEALLOC_MEMIC                 = 0x206,
+       MLX5_CMD_OP_MODIFY_MEMIC                  = 0x207,
        MLX5_CMD_OP_CREATE_EQ                     = 0x301,
        MLX5_CMD_OP_DESTROY_EQ                    = 0x302,
        MLX5_CMD_OP_QUERY_EQ                      = 0x303,
@@ -1031,7 +1032,11 @@ struct mlx5_ifc_device_mem_cap_bits {
 
        u8         header_modify_sw_icm_start_address[0x40];
 
-       u8         reserved_at_180[0x680];
+       u8         reserved_at_180[0x80];
+
+       u8         memic_operations[0x20];
+
+       u8         reserved_at_220[0x5e0];
 };
 
 struct mlx5_ifc_device_event_cap_bits {
@@ -10498,6 +10503,41 @@ struct mlx5_ifc_destroy_vport_lag_in_bits {
        u8         reserved_at_40[0x40];
 };
 
+enum {
+       MLX5_MODIFY_MEMIC_OP_MOD_ALLOC,
+       MLX5_MODIFY_MEMIC_OP_MOD_DEALLOC,
+};
+
+struct mlx5_ifc_modify_memic_in_bits {
+       u8         opcode[0x10];
+       u8         uid[0x10];
+
+       u8         reserved_at_20[0x10];
+       u8         op_mod[0x10];
+
+       u8         reserved_at_40[0x20];
+
+       u8         reserved_at_60[0x18];
+       u8         memic_operation_type[0x8];
+
+       u8         memic_start_addr[0x40];
+
+       u8         reserved_at_c0[0x140];
+};
+
+struct mlx5_ifc_modify_memic_out_bits {
+       u8         status[0x8];
+       u8         reserved_at_8[0x18];
+
+       u8         syndrome[0x20];
+
+       u8         reserved_at_40[0x40];
+
+       u8         memic_operation_addr[0x40];
+
+       u8         reserved_at_c0[0x140];
+};
+
 struct mlx5_ifc_alloc_memic_in_bits {
        u8         opcode[0x10];
        u8         reserved_at_10[0x10];
index 011f436..322ec61 100644 (file)
@@ -106,7 +106,7 @@ extern int mmap_rnd_compat_bits __read_mostly;
  * embedding these tags into addresses that point to these memory regions, and
  * checking that the memory and the pointer tags match on memory accesses)
  * redefine this macro to strip tags from pointers.
- * It's defined as noop for arcitectures that don't support memory tagging.
+ * It's defined as noop for architectures that don't support memory tagging.
  */
 #ifndef untagged_addr
 #define untagged_addr(addr) (addr)
@@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_GROWSUP    VM_NONE
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define VM_UFFD_MINOR_BIT     37
+# define VM_UFFD_MINOR         BIT(VM_UFFD_MINOR_BIT)  /* UFFD minor faults */
+#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+# define VM_UFFD_MINOR         VM_NONE
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP      (VM_RAND_READ | VM_SEQ_READ)
 
@@ -1134,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page)
 }
 #endif
 
+static inline bool is_zone_movable_page(const struct page *page)
+{
+       return page_zonenum(page) == ZONE_MOVABLE;
+}
+
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 void free_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
@@ -1543,6 +1555,20 @@ static inline unsigned long page_to_section(const struct page *page)
 }
 #endif
 
+/* MIGRATE_CMA and ZONE_MOVABLE do not allow pinning pages */
+#ifdef CONFIG_MIGRATION
+static inline bool is_pinnable_page(struct page *page)
+{
+       return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
+               is_zero_pfn(page_to_pfn(page));
+}
+#else
+static inline bool is_pinnable_page(struct page *page)
+{
+       return true;
+}
+#endif
+
 static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
index 3b22057..0d53eba 100644 (file)
@@ -55,7 +55,7 @@ enum migratetype {
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.  What is important though
         * is that a range of pageblocks must be aligned to
-        * MAX_ORDER_NR_PAGES should biggest page be bigger then
+        * MAX_ORDER_NR_PAGES should biggest page be bigger than
         * a single pageblock.
         */
        MIGRATE_CMA,
@@ -407,8 +407,13 @@ enum zone_type {
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
-        *    essentially turn such pages unmovable. Memory offlining might
-        *    retry a long time.
+        *    essentially turn such pages unmovable. Therefore, we do not allow
+        *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
+        *    faulted, they come from the right zone right away. However, it is
+        *    still possible that address space already has pages in
+        *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
+        *    touched that memory before pinning). In such case we migrate them
+        *    to a different zone. When migration fails - pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
@@ -427,6 +432,15 @@ enum zone_type {
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
+        * 6. ZERO_PAGE(0): kernelcore/movablecore setups might create
+        *    situations where ZERO_PAGE(0) which is allocated differently
+        *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
+        *    cannot be migrated.
+        * 7. Memory-hotplug: when using memmap_on_memory and onlining the
+        *    memory to the MOVABLE zone, the vmemmap pages are also placed in
+        *    such zone. Such pages cannot be really moved around as they are
+        *    self-stored in the range, but they are treated as movable when
+        *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
@@ -1383,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
 #endif
-#endif
 
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
index aef35fd..6aff469 100644 (file)
@@ -240,8 +240,7 @@ void pci_msi_unmask_irq(struct irq_data *data);
 /*
  * The arch hooks to setup up msi irqs. Default functions are implemented
  * as weak symbols so that they /can/ be overriden by architecture specific
- * code if needed. These hooks must be enabled by the architecture or by
- * drivers which depend on them via msi_controller based MSI handling.
+ * code if needed. These hooks can only be enabled by the architecture.
  *
  * If CONFIG_PCI_MSI_ARCH_FALLBACKS is not selected they are replaced by
  * stubs with warnings.
@@ -251,7 +250,6 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
 void arch_teardown_msi_irq(unsigned int irq);
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void arch_teardown_msi_irqs(struct pci_dev *dev);
-void default_teardown_msi_irqs(struct pci_dev *dev);
 #else
 static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
@@ -272,19 +270,6 @@ static inline void arch_teardown_msi_irqs(struct pci_dev *dev)
 void arch_restore_msi_irqs(struct pci_dev *dev);
 void default_restore_msi_irqs(struct pci_dev *dev);
 
-struct msi_controller {
-       struct module *owner;
-       struct device *dev;
-       struct device_node *of_node;
-       struct list_head list;
-
-       int (*setup_irq)(struct msi_controller *chip, struct pci_dev *dev,
-                        struct msi_desc *desc);
-       int (*setup_irqs)(struct msi_controller *chip, struct pci_dev *dev,
-                         int nvec, int type);
-       void (*teardown_irq)(struct msi_controller *chip, unsigned int irq);
-};
-
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 
 #include <linux/irqhandler.h>
index 2aab961..4f9a4b3 100644 (file)
@@ -53,8 +53,7 @@ int arpt_register_table(struct net *net, const struct xt_table *table,
                        const struct arpt_replace *repl,
                        const struct nf_hook_ops *ops);
 void arpt_unregister_table(struct net *net, const char *name);
-void arpt_unregister_table_pre_exit(struct net *net, const char *name,
-                                   const struct nf_hook_ops *ops);
+void arpt_unregister_table_pre_exit(struct net *net, const char *name);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
                                  const struct nf_hook_state *state,
                                  struct xt_table *table);
index 5b4c67c..15004c4 100644 (file)
@@ -452,6 +452,7 @@ enum lock_type4 {
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_CLONE_BLKSIZE     (1UL << 13)
+#define FATTR4_WORD2_CHANGE_ATTR_TYPE  (1UL << 15)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
 #define FATTR4_WORD2_MODE_UMASK                (1UL << 17)
 #define FATTR4_WORD2_XATTR_SUPPORT     (1UL << 18)
@@ -709,6 +710,14 @@ struct nl4_server {
        } u;
 };
 
+enum nfs4_change_attr_type {
+       NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0,
+       NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1,
+       NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2,
+       NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3,
+       NFS4_CHANGE_TYPE_IS_UNDEFINED = 4,
+};
+
 /*
  * Options for setxattr. These match the flags for setxattr(2).
  */
index eadaabd..ffba254 100644 (file)
@@ -246,11 +246,15 @@ struct nfs4_copy_state {
                                BIT(13)         /* Deferred cache invalidation */
 #define NFS_INO_INVALID_BLOCKS BIT(14)         /* cached blocks are invalid */
 #define NFS_INO_INVALID_XATTR  BIT(15)         /* xattrs are invalid */
+#define NFS_INO_INVALID_NLINK  BIT(16)         /* cached nlink is invalid */
+#define NFS_INO_INVALID_MODE   BIT(17)         /* cached mode is invalid */
 
 #define NFS_INO_INVALID_ATTR   (NFS_INO_INVALID_CHANGE \
                | NFS_INO_INVALID_CTIME \
                | NFS_INO_INVALID_MTIME \
                | NFS_INO_INVALID_SIZE \
+               | NFS_INO_INVALID_NLINK \
+               | NFS_INO_INVALID_MODE \
                | NFS_INO_INVALID_OTHER)        /* inode metadata is invalid */
 
 /*
@@ -386,7 +390,7 @@ extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct user_namespace *, struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
-extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_clear_invalid_mapping(struct address_space *mapping);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
index a28d71b..d71a0e9 100644 (file)
@@ -156,6 +156,7 @@ struct nfs_server {
 #define NFS_MOUNT_WRITE_EAGER          0x01000000
 #define NFS_MOUNT_WRITE_WAIT           0x02000000
 
+       unsigned int            fattr_valid;    /* Valid attributes */
        unsigned int            caps;           /* server capabilities */
        unsigned int            rsize;          /* read size */
        unsigned int            rpages;         /* read size (in pages) */
@@ -180,6 +181,9 @@ struct nfs_server {
 #define NFS_OPTION_FSCACHE     0x00000001      /* - local caching enabled */
 #define NFS_OPTION_MIGRATION   0x00000002      /* - NFSv4 migration enabled */
 
+       enum nfs4_change_attr_type
+                               change_attr_type;/* Description of change attribute */
+
        struct nfs_fsid         fsid;
        __u64                   maxfilesize;    /* maximum file size */
        struct timespec64       time_delta;     /* smallest time granularity */
@@ -265,16 +269,7 @@ struct nfs_server {
 #define NFS_CAP_SYMLINKS       (1U << 2)
 #define NFS_CAP_ACLS           (1U << 3)
 #define NFS_CAP_ATOMIC_OPEN    (1U << 4)
-/* #define NFS_CAP_CHANGE_ATTR (1U << 5) */
 #define NFS_CAP_LGOPEN         (1U << 5)
-#define NFS_CAP_FILEID         (1U << 6)
-#define NFS_CAP_MODE           (1U << 7)
-#define NFS_CAP_NLINK          (1U << 8)
-#define NFS_CAP_OWNER          (1U << 9)
-#define NFS_CAP_OWNER_GROUP    (1U << 10)
-#define NFS_CAP_ATIME          (1U << 11)
-#define NFS_CAP_CTIME          (1U << 12)
-#define NFS_CAP_MTIME          (1U << 13)
 #define NFS_CAP_POSIX_LOCK     (1U << 14)
 #define NFS_CAP_UIDGID_NOMAP   (1U << 15)
 #define NFS_CAP_STATEID_NFSV41 (1U << 16)
index 3327239..717ecc8 100644 (file)
@@ -15,6 +15,8 @@
 #define NFS_DEF_FILE_IO_SIZE   (4096U)
 #define NFS_MIN_FILE_IO_SIZE   (1024U)
 
+#define NFS_BITMASK_SZ         3
+
 struct nfs4_string {
        unsigned int len;
        char *data;
@@ -150,6 +152,8 @@ struct nfs_fsinfo {
        __u32                   layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */
        __u32                   blksize; /* preferred pnfs io block size */
        __u32                   clone_blksize; /* granularity of a CLONE operation */
+       enum nfs4_change_attr_type
+                               change_attr_type; /* Info about change attr */
        __u32                   xattr_support; /* User xattrs supported */
 };
 
@@ -525,7 +529,8 @@ struct nfs_closeargs {
        struct nfs_seqid *      seqid;
        fmode_t                 fmode;
        u32                     share_access;
-       u32 *                   bitmask;
+       const u32 *             bitmask;
+       u32                     bitmask_store[NFS_BITMASK_SZ];
        struct nfs4_layoutreturn_args *lr_args;
 };
 
@@ -608,7 +613,8 @@ struct nfs4_delegreturnargs {
        struct nfs4_sequence_args       seq_args;
        const struct nfs_fh *fhandle;
        const nfs4_stateid *stateid;
-       u32 * bitmask;
+       const u32 *bitmask;
+       u32 bitmask_store[NFS_BITMASK_SZ];
        struct nfs4_layoutreturn_args *lr_args;
 };
 
@@ -648,7 +654,8 @@ struct nfs_pgio_args {
        union {
                unsigned int            replen;                 /* used by read */
                struct {
-                       u32 *                   bitmask;        /* used by write */
+                       const u32 *             bitmask;        /* used by write */
+                       u32 bitmask_store[NFS_BITMASK_SZ];      /* used by write */
                        enum nfs3_stable_how    stable;         /* used by write */
                };
        };
index 469fa7f..a4bd411 100644 (file)
 
 struct pagevec;
 
+static inline bool mapping_empty(struct address_space *mapping)
+{
+       return xa_empty(&mapping->i_pages);
+}
+
 /*
  * Bits in mapping->flags.
  */
index 65d3d83..fbdadd4 100644 (file)
@@ -85,6 +85,7 @@ extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */
 extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */
 extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */
 extern const struct pci_ecam_ops al_pcie_ops;  /* Amazon Annapurna Labs PCIe */
+extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */
 #endif
 
 #if IS_ENABLED(CONFIG_PCI_HOST_COMMON)
index 0fa104e..c20211e 100644 (file)
@@ -458,7 +458,6 @@ struct pci_dev {
 
        u32             saved_config_space[16]; /* Config space saved at suspend time */
        struct hlist_head saved_cap_space;
-       struct bin_attribute *rom_attr;         /* Attribute descriptor for sysfs ROM entry */
        int             rom_attr_enabled;       /* Display of ROM attribute enabled? */
        struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
        struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
@@ -540,7 +539,6 @@ struct pci_host_bridge {
        int (*map_irq)(const struct pci_dev *, u8, u8);
        void (*release_fn)(struct pci_host_bridge *);
        void            *release_data;
-       struct msi_controller *msi;
        unsigned int    ignore_reset_delay:1;   /* For entire hierarchy */
        unsigned int    no_ext_tags:1;          /* No Extended Tags */
        unsigned int    native_aer:1;           /* OS may use PCIe AER */
@@ -551,6 +549,7 @@ struct pci_host_bridge {
        unsigned int    native_dpc:1;           /* OS may use PCIe DPC */
        unsigned int    preserve_config:1;      /* Preserve FW resource setup */
        unsigned int    size_windows:1;         /* Enable root bus sizing */
+       unsigned int    msi_domain:1;           /* Bridge wants MSI domain */
 
        /* Resource alignment requirements */
        resource_size_t (*align_resource)(struct pci_dev *dev,
@@ -621,7 +620,6 @@ struct pci_bus {
        struct resource busn_res;       /* Bus numbers routed to this bus */
 
        struct pci_ops  *ops;           /* Configuration access functions */
-       struct msi_controller *msi;     /* MSI controller */
        void            *sysdata;       /* Hook for sys-specific extension */
        struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */
 
@@ -1085,6 +1083,7 @@ u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap);
 u16 pci_find_ext_capability(struct pci_dev *dev, int cap);
 u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
+u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 
@@ -1209,6 +1208,7 @@ int __must_check pci_set_mwi(struct pci_dev *dev);
 int __must_check pcim_set_mwi(struct pci_dev *dev);
 int pci_try_set_mwi(struct pci_dev *dev);
 void pci_clear_mwi(struct pci_dev *dev);
+void pci_disable_parity(struct pci_dev *dev);
 void pci_intx(struct pci_dev *dev, int enable);
 bool pci_check_and_mask_intx(struct pci_dev *dev);
 bool pci_check_and_unmask_intx(struct pci_dev *dev);
@@ -1310,7 +1310,6 @@ void pci_unlock_rescan_remove(void);
 /* Vital Product Data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
-int pci_set_vpd_size(struct pci_dev *dev, size_t len);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
 resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
@@ -2319,14 +2318,13 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field)
 /**
  * pci_vpd_find_tag - Locates the Resource Data Type tag provided
  * @buf: Pointer to buffered vpd data
- * @off: The offset into the buffer at which to begin the search
  * @len: The length of the vpd buffer
  * @rdt: The Resource Data Type to search for
  *
  * Returns the index where the Resource Data Type was found or
  * -ENOENT otherwise.
  */
-int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt);
+int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt);
 
 /**
  * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD
index a763928..f5a6a2f 100644 (file)
@@ -954,8 +954,6 @@ extern void perf_event_itrace_started(struct perf_event *event);
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
-extern int perf_num_counters(void);
-extern const char *perf_pmu_name(void);
 extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
index 5e77239..46b1378 100644 (file)
@@ -426,7 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 
 /*
  * On some architectures hardware does not set page access bit when accessing
- * memory page, it is responsibilty of software setting this bit. It brings
+ * memory page, it is responsibility of software setting this bit. It brings
  * out extra page fault penalty to track page access bit. For optimization page
  * access bit can be set during all page fault flow on these arches.
  * To be differentiate with macro pte_mkyoung, this macro is used on platforms
@@ -519,7 +519,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 /*
  * This is an implementation of pmdp_establish() that is only suitable for an
  * architecture that doesn't have hardware dirty/accessed bits. In this case we
- * can't race with CPU which sets these bits and non-atomic aproach is fine.
+ * can't race with CPU which sets these bits and non-atomic approach is fine.
  */
 static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
@@ -852,7 +852,7 @@ static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
  * updates, but to prevent any updates it may make from being lost.
  *
  * This does not protect against other software modifications of the
- * pte; the appropriate pte lock must be held over the transation.
+ * pte; the appropriate pte lock must be held over the transaction.
  *
  * Note that this interface is intended to be batchable, meaning that
  * ptep_modify_prot_commit may not actually update the pte, but merely
@@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
+#ifdef CONFIG_MMU
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
@@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
        return zero_pfn;
 }
 #endif
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+       return 0;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+       return 0;
+}
+#endif /* CONFIG_MMU */
 
 #ifdef CONFIG_MMU
 
@@ -1269,13 +1281,13 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
         *
         * The complete check uses is_pmd_migration_entry() in linux/swapops.h
         * But using that requires moving current function and pmd_trans_unstable()
-        * to linux/swapops.h to resovle dependency, which is too much code move.
+        * to linux/swapops.h to resolve dependency, which is too much code move.
         *
         * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
         * because !pmd_present() pages can only be under migration not swapped
         * out.
         *
-        * pmd_none() is preseved for future condition checks on pmd migration
+        * pmd_none() is preserved for future condition checks on pmd migration
         * entries and not confusing with this function name, although it is
         * redundant with !pmd_present().
         */
index 6035d9a..45f53af 100644 (file)
@@ -5679,6 +5679,7 @@ enum tcpc_cc_polarity {
 
 #define PD_STATUS_EVENT_SOP_DISC_DONE          BIT(0)
 #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE    BIT(1)
+#define PD_STATUS_EVENT_HARD_RESET             BIT(2)
 
 struct ec_params_typec_status {
        uint8_t port;
index 000cc05..069c7fd 100644 (file)
@@ -32,6 +32,7 @@ struct proc_ops {
        ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *);
        ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *);
        ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
+       /* mandatory unless nonseekable_open() or equivalent is used */
        loff_t  (*proc_lseek)(struct file *, loff_t, int);
        int     (*proc_release)(struct inode *, struct file *);
        __poll_t (*proc_poll)(struct file *, struct poll_table_struct *);
index bad18ca..fd18ca9 100644 (file)
@@ -15,7 +15,6 @@
 #define KVM_PROFILING  4
 
 struct proc_dir_entry;
-struct pt_regs;
 struct notifier_block;
 
 #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS)
@@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n);
 int profile_event_register(enum profile_type, struct notifier_block * n);
 int profile_event_unregister(enum profile_type, struct notifier_block * n);
 
-struct pt_regs;
-
 #else
 
 #define prof_on 0
index b801ead..d48a719 100644 (file)
@@ -73,6 +73,7 @@ enum sev_cmd {
        SEV_CMD_SEND_UPDATE_DATA        = 0x041,
        SEV_CMD_SEND_UPDATE_VMSA        = 0x042,
        SEV_CMD_SEND_FINISH             = 0x043,
+       SEV_CMD_SEND_CANCEL             = 0x044,
 
        /* Guest migration commands (incoming) */
        SEV_CMD_RECEIVE_START           = 0x050,
@@ -326,11 +327,11 @@ struct sev_data_send_start {
        u64 pdh_cert_address;                   /* In */
        u32 pdh_cert_len;                       /* In */
        u32 reserved1;
-       u64 plat_cert_address;                  /* In */
-       u32 plat_cert_len;                      /* In */
+       u64 plat_certs_address;                 /* In */
+       u32 plat_certs_len;                     /* In */
        u32 reserved2;
-       u64 amd_cert_address;                   /* In */
-       u32 amd_cert_len;                       /* In */
+       u64 amd_certs_address;                  /* In */
+       u32 amd_certs_len;                      /* In */
        u32 reserved3;
        u64 session_address;                    /* In */
        u32 session_len;                        /* In/Out */
@@ -392,6 +393,15 @@ struct sev_data_send_finish {
        u32 handle;                             /* In */
 } __packed;
 
+/**
+ * struct sev_data_send_cancel - SEND_CANCEL command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_cancel {
+       u32 handle;                             /* In */
+} __packed;
+
 /**
  * struct sev_data_receive_start - RECEIVE_START command parameters
  *
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
new file mode 100644 (file)
index 0000000..f960a71
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#ifndef _PTP_KVM_H_
+#define _PTP_KVM_H_
+
+struct timespec64;
+struct clocksource;
+
+int kvm_arch_ptp_init(void);
+int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+int kvm_arch_ptp_get_crosststamp(u64 *cycle,
+               struct timespec64 *tspec, struct clocksource **cs);
+
+#endif /* _PTP_KVM_H_ */
index e4d84d4..5bb90af 100644 (file)
@@ -91,6 +91,11 @@ struct pwm_device {
  * pwm_get_state() - retrieve the current PWM state
  * @pwm: PWM device
  * @state: state to fill with the current PWM state
+ *
+ * The returned PWM state represents the state that was applied by a previous call to
+ * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to
+ * hardware. If pwm_apply_state() was never called, this returns either the current hardware
+ * state (if supported) or the default settings.
  */
 static inline void pwm_get_state(const struct pwm_device *pwm,
                                 struct pwm_state *state)
@@ -392,8 +397,6 @@ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
-int pwmchip_add_with_polarity(struct pwm_chip *chip,
-                             enum pwm_polarity polarity);
 int pwmchip_add(struct pwm_chip *chip);
 int pwmchip_remove(struct pwm_chip *chip);
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
index f28ee75..8b795b5 100644 (file)
@@ -315,6 +315,7 @@ struct rproc;
 /**
  * struct rproc_mem_entry - memory entry descriptor
  * @va:        virtual address
+ * @is_iomem: io memory
  * @dma: dma address
  * @len: length, in bytes
  * @da: device address
@@ -329,6 +330,7 @@ struct rproc;
  */
 struct rproc_mem_entry {
        void *va;
+       bool is_iomem;
        dma_addr_t dma;
        size_t len;
        u32 da;
@@ -361,6 +363,7 @@ enum rsc_handling_status {
  * @start:     power on the device and boot it
  * @stop:      power off the device
  * @attach:    attach to a device that his already powered up
+ * @detach:    detach from a device, leaving it powered up
  * @kick:      kick a virtqueue (virtqueue id given as a parameter)
  * @da_to_va:  optional platform hook to perform address translations
  * @parse_fw:  parse firmware to extract information (e.g. resource table)
@@ -368,7 +371,9 @@ enum rsc_handling_status {
  * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a
  * negative value on error
  * @load_rsc_table:    load resource table from firmware image
- * @find_loaded_rsc_table: find the loaded resouce table
+ * @find_loaded_rsc_table: find the loaded resource table from firmware image
+ * @get_loaded_rsc_table: get resource table installed in memory
+ *                       by external entity
  * @load:              load firmware to memory, where the remote processor
  *                     expects to find it
  * @sanity_check:      sanity check the fw image
@@ -383,13 +388,16 @@ struct rproc_ops {
        int (*start)(struct rproc *rproc);
        int (*stop)(struct rproc *rproc);
        int (*attach)(struct rproc *rproc);
+       int (*detach)(struct rproc *rproc);
        void (*kick)(struct rproc *rproc, int vqid);
-       void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len);
+       void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
        int (*parse_fw)(struct rproc *rproc, const struct firmware *fw);
        int (*handle_rsc)(struct rproc *rproc, u32 rsc_type, void *rsc,
                          int offset, int avail);
        struct resource_table *(*find_loaded_rsc_table)(
                                struct rproc *rproc, const struct firmware *fw);
+       struct resource_table *(*get_loaded_rsc_table)(
+                               struct rproc *rproc, size_t *size);
        int (*load)(struct rproc *rproc, const struct firmware *fw);
        int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
        u64 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
@@ -405,6 +413,8 @@ struct rproc_ops {
  * @RPROC_RUNNING:     device is up and running
  * @RPROC_CRASHED:     device has crashed; need to start recovery
  * @RPROC_DELETED:     device is deleted
+ * @RPROC_ATTACHED:    device has been booted by another entity and the core
+ *                     has attached to it
  * @RPROC_DETACHED:    device has been booted by another entity and waiting
  *                     for the core to attach to it
  * @RPROC_LAST:                just keep this one at the end
@@ -421,8 +431,9 @@ enum rproc_state {
        RPROC_RUNNING   = 2,
        RPROC_CRASHED   = 3,
        RPROC_DELETED   = 4,
-       RPROC_DETACHED  = 5,
-       RPROC_LAST      = 6,
+       RPROC_ATTACHED  = 5,
+       RPROC_DETACHED  = 6,
+       RPROC_LAST      = 7,
 };
 
 /**
@@ -505,11 +516,12 @@ struct rproc_dump_segment {
  * @recovery_disabled: flag that state if recovery was disabled
  * @max_notifyid: largest allocated notify id.
  * @table_ptr: pointer to the resource table in effect
+ * @clean_table: copy of the resource table without modifications.  Used
+ *              when a remote processor is attached or detached from the core
  * @cached_table: copy of the resource table
  * @table_sz: size of @cached_table
  * @has_iommu: flag to indicate if remote processor is behind an MMU
  * @auto_boot: flag to indicate if remote processor should be auto-started
- * @autonomous: true if an external entity has booted the remote processor
  * @dump_segments: list of segments in the firmware
  * @nb_vdev: number of vdev currently handled by rproc
  * @char_dev: character device of the rproc
@@ -542,11 +554,11 @@ struct rproc {
        bool recovery_disabled;
        int max_notifyid;
        struct resource_table *table_ptr;
+       struct resource_table *clean_table;
        struct resource_table *cached_table;
        size_t table_sz;
        bool has_iommu;
        bool auto_boot;
-       bool autonomous;
        struct list_head dump_segments;
        int nb_vdev;
        u8 elf_class;
@@ -655,6 +667,7 @@ rproc_of_resm_mem_entry_init(struct device *dev, u32 of_resm_idx, size_t len,
 
 int rproc_boot(struct rproc *rproc);
 void rproc_shutdown(struct rproc *rproc);
+int rproc_detach(struct rproc *rproc);
 int rproc_set_firmware(struct rproc *rproc, const char *fw_name);
 void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type);
 void rproc_coredump_using_sections(struct rproc *rproc);
index 46e6372..db0e611 100644 (file)
@@ -76,6 +76,11 @@ static inline int reset_control_reset(struct reset_control *rstc)
        return 0;
 }
 
+static inline int reset_control_rearm(struct reset_control *rstc)
+{
+       return 0;
+}
+
 static inline int reset_control_assert(struct reset_control *rstc)
 {
        return 0;
index 136ea09..dac53fd 100644 (file)
@@ -61,7 +61,8 @@ enum ring_buffer_type {
 
 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
 void *ring_buffer_event_data(struct ring_buffer_event *event);
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+                                struct ring_buffer_event *event);
 
 /*
  * ring_buffer_discard_commit will remove an event that has not
@@ -180,7 +181,7 @@ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cp
 unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu);
 unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu);
 
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu);
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer);
 void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
                                      int cpu, u64 *ts);
 void ring_buffer_set_clock(struct trace_buffer *buffer,
index a5db828..d97dcd0 100644 (file)
@@ -18,8 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/rpmsg/byteorder.h>
-
-#define RPMSG_ADDR_ANY         0xFFFFFFFF
+#include <uapi/linux/rpmsg.h>
 
 struct rpmsg_device;
 struct rpmsg_endpoint;
index 9c25c8e..d2c8813 100644 (file)
@@ -1583,7 +1583,7 @@ extern struct pid *cad_pid;
 #define PF_SWAPWRITE           0x00800000      /* Allowed to write to swap */
 #define PF_NO_SETAFFINITY      0x04000000      /* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY           0x08000000      /* Early kill for mce process policy */
-#define PF_MEMALLOC_NOCMA      0x10000000      /* All allocation request will have _GFP_MOVABLE cleared */
+#define PF_MEMALLOC_PIN                0x10000000      /* Allocation context constrained to zones which allow long term pinning. */
 #define PF_FREEZER_SKIP                0x40000000      /* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
index 90b2a0b..e24b1fe 100644 (file)
@@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk)
  * Applies per-task gfp context to the given allocation flags.
  * PF_MEMALLOC_NOIO implies GFP_NOIO
  * PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
  */
 static inline gfp_t current_gfp_context(gfp_t flags)
 {
        unsigned int pflags = READ_ONCE(current->flags);
 
-       if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
+       if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
                /*
                 * NOIO implies both NOIO and NOFS and it is a weaker context
                 * so always make sure it makes precedence
@@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags)
                        flags &= ~(__GFP_IO | __GFP_FS);
                else if (pflags & PF_MEMALLOC_NOFS)
                        flags &= ~__GFP_FS;
+
+               if (pflags & PF_MEMALLOC_PIN)
+                       flags &= ~__GFP_MOVABLE;
        }
        return flags;
 }
@@ -271,29 +275,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
        current->flags = (current->flags & ~PF_MEMALLOC) | flags;
 }
 
-#ifdef CONFIG_CMA
-static inline unsigned int memalloc_nocma_save(void)
+static inline unsigned int memalloc_pin_save(void)
 {
-       unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
+       unsigned int flags = current->flags & PF_MEMALLOC_PIN;
 
-       current->flags |= PF_MEMALLOC_NOCMA;
+       current->flags |= PF_MEMALLOC_PIN;
        return flags;
 }
 
-static inline void memalloc_nocma_restore(unsigned int flags)
-{
-       current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
-}
-#else
-static inline unsigned int memalloc_nocma_save(void)
-{
-       return 0;
-}
-
-static inline void memalloc_nocma_restore(unsigned int flags)
+static inline void memalloc_pin_restore(unsigned int flags)
 {
+       current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
 }
-#endif
 
 #ifdef CONFIG_MEMCG
 DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
index 9aeda3f..06f7c50 100644 (file)
@@ -291,6 +291,7 @@ void security_bprm_committed_creds(struct linux_binprm *bprm);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
 int security_sb_alloc(struct super_block *sb);
+void security_sb_delete(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
 void security_free_mnt_opts(void **mnt_opts);
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
@@ -633,6 +634,9 @@ static inline int security_sb_alloc(struct super_block *sb)
        return 0;
 }
 
+static inline void security_sb_delete(struct super_block *sb)
+{ }
+
 static inline void security_sb_free(struct super_block *sb)
 { }
 
index 9d6c28c..5b31c51 100644 (file)
@@ -71,6 +71,31 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
        return min(s->len, s->size);
 }
 
+/**
+ * seq_buf_terminate - Make sure buffer is nul terminated
+ * @s: the seq_buf descriptor to terminate.
+ *
+ * This makes sure that the buffer in @s is nul terminated and
+ * safe to read as a string.
+ *
+ * Note, if this is called when the buffer has overflowed, then
+ * the last byte of the buffer is zeroed, and the len will still
+ * point passed it.
+ *
+ * After this function is called, s->buffer is safe to use
+ * in string operations.
+ */
+static inline void seq_buf_terminate(struct seq_buf *s)
+{
+       if (WARN_ON(s->size == 0))
+               return;
+
+       if (seq_buf_buffer_left(s))
+               s->buffer[s->len] = 0;
+       else
+               s->buffer[s->size - 1] = 0;
+}
+
 /**
  * seq_buf_get_buf - get buffer to write arbitrary data to
  * @s: the seq_buf handle
index 0f80123..1eac79c 100644 (file)
@@ -79,13 +79,14 @@ struct shrinker {
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
 /* Flags */
-#define SHRINKER_NUMA_AWARE    (1 << 0)
-#define SHRINKER_MEMCG_AWARE   (1 << 1)
+#define SHRINKER_REGISTERED    (1 << 0)
+#define SHRINKER_NUMA_AWARE    (1 << 1)
+#define SHRINKER_MEMCG_AWARE   (1 << 2)
 /*
  * It just makes sense when the shrinker is also MEMCG_AWARE for now,
  * non-MEMCG_AWARE shrinker should not have this flag set.
  */
-#define SHRINKER_NONSLAB       (1 << 2)
+#define SHRINKER_NONSLAB       (1 << 3)
 
 extern int prealloc_shrinker(struct shrinker *shrinker);
 extern void register_shrinker_prepared(struct shrinker *shrinker);
index 84a0b48..669e35c 100644 (file)
@@ -55,6 +55,14 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 
 int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
+/*
+ * CPU-stopping functions used during panic. All have default weak definitions.
+ * Architecture-dependent code may override them.
+ */
+void panic_smp_self_stop(void);
+void nmi_panic_self_stop(struct pt_regs *regs);
+void crash_smp_send_stop(void);
+
 /*
  * Call a function on all processors
  */
index d2e97ee..d81fe8b 100644 (file)
@@ -247,6 +247,7 @@ struct rpc_xprt {
        struct rpc_task *       snd_task;       /* Task blocked in send */
 
        struct list_head        xmit_queue;     /* Send queue */
+       atomic_long_t           xmit_queuelen;
 
        struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
index 4cc6ec3..1447270 100644 (file)
 #include <linux/sched.h>
 #include <linux/node.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <uapi/linux/mempolicy.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -339,6 +341,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
 extern void lru_note_cost_page(struct page *);
 extern void lru_cache_add(struct page *);
 extern void mark_page_accessed(struct page *);
+
+extern atomic_t lru_disable_count;
+
+static inline bool lru_cache_disabled(void)
+{
+       return atomic_read(&lru_disable_count);
+}
+
+static inline void lru_cache_enable(void)
+{
+       atomic_dec(&lru_disable_count);
+}
+
+extern void lru_cache_disable(void);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
@@ -378,6 +394,12 @@ extern int sysctl_min_slab_ratio;
 #define node_reclaim_mode 0
 #endif
 
+static inline bool node_reclaim_enabled(void)
+{
+       /* Is any node_reclaim_mode bit set? */
+       return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
 extern void check_move_unevictable_pages(struct pagevec *pvec);
 
 extern int kswapd_run(int nid);
index 5857a93..216854a 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/limits.h>
+#include <linux/spinlock.h>
 
 struct device;
 struct page;
@@ -36,20 +37,11 @@ enum swiotlb_force {
 
 extern void swiotlb_init(int verbose);
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
-extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 extern int swiotlb_late_init_with_default_size(size_t default_size);
 extern void __init swiotlb_update_mem_attributes(void);
 
-/*
- * Enumeration for sync targets
- */
-enum dma_sync_target {
-       SYNC_FOR_CPU = 0,
-       SYNC_FOR_DEVICE = 1,
-};
-
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
                size_t mapping_size, size_t alloc_size,
                enum dma_data_direction dir, unsigned long attrs);
@@ -57,32 +49,70 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
 extern void swiotlb_tbl_unmap_single(struct device *hwdev,
                                     phys_addr_t tlb_addr,
                                     size_t mapping_size,
-                                    size_t alloc_size,
                                     enum dma_data_direction dir,
                                     unsigned long attrs);
 
-extern void swiotlb_tbl_sync_single(struct device *hwdev,
-                                   phys_addr_t tlb_addr,
-                                   size_t size, enum dma_data_direction dir,
-                                   enum dma_sync_target target);
-
+void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+               size_t size, enum dma_data_direction dir);
+void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+               size_t size, enum dma_data_direction dir);
 dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
                size_t size, enum dma_data_direction dir, unsigned long attrs);
 
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
-extern phys_addr_t io_tlb_start, io_tlb_end;
+
+/**
+ * struct io_tlb_mem - IO TLB Memory Pool Descriptor
+ *
+ * @start:     The start address of the swiotlb memory pool. Used to do a quick
+ *             range check to see if the memory was in fact allocated by this
+ *             API.
+ * @end:       The end address of the swiotlb memory pool. Used to do a quick
+ *             range check to see if the memory was in fact allocated by this
+ *             API.
+ * @nslabs:    The number of IO TLB blocks (in groups of 64) between @start and
+ *             @end. This is command line adjustable via setup_io_tlb_npages.
+ * @used:      The number of used IO TLB block.
+ * @list:      The free list describing the number of free entries available
+ *             from each index.
+ * @index:     The index to start searching in the next round.
+ * @orig_addr: The original address corresponding to a mapped entry.
+ * @alloc_size:        Size of the allocated buffer.
+ * @lock:      The lock to protect the above data structures in the map and
+ *             unmap calls.
+ * @debugfs:   The dentry to debugfs.
+ * @late_alloc:        %true if allocated using the page allocator
+ */
+struct io_tlb_mem {
+       phys_addr_t start;
+       phys_addr_t end;
+       unsigned long nslabs;
+       unsigned long used;
+       unsigned int index;
+       spinlock_t lock;
+       struct dentry *debugfs;
+       bool late_alloc;
+       struct io_tlb_slot {
+               phys_addr_t orig_addr;
+               size_t alloc_size;
+               unsigned int list;
+       } slots[];
+};
+extern struct io_tlb_mem *io_tlb_default_mem;
 
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
 {
-       return paddr >= io_tlb_start && paddr < io_tlb_end;
+       struct io_tlb_mem *mem = io_tlb_default_mem;
+
+       return mem && paddr >= mem->start && paddr < mem->end;
 }
 
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
 bool is_swiotlb_active(void);
-void __init swiotlb_adjust_size(unsigned long new_size);
+void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
@@ -106,7 +136,7 @@ static inline bool is_swiotlb_active(void)
        return false;
 }
 
-static inline void swiotlb_adjust_size(unsigned long new_size)
+static inline void swiotlb_adjust_size(unsigned long size)
 {
 }
 #endif /* CONFIG_SWIOTLB */
index a672bbe..050511e 100644 (file)
@@ -69,6 +69,8 @@ struct io_uring_params;
 struct clone_args;
 struct open_how;
 struct mount_attr;
+struct landlock_ruleset_attr;
+enum landlock_rule_type;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1043,6 +1045,11 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
                                       siginfo_t __user *info,
                                       unsigned int flags);
 asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
+asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr __user *attr,
+               size_t size, __u32 flags);
+asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
+               const void __user *rule_attr, __u32 flags);
+asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
 
 /*
  * Architecture-specific system calls
index 6ac7bb1..d296f3b 100644 (file)
@@ -91,7 +91,7 @@ struct thermal_cooling_device_ops {
 
 struct thermal_cooling_device {
        int id;
-       char type[THERMAL_NAME_LENGTH];
+       char *type;
        struct device device;
        struct device_node *np;
        void *devdata;
@@ -390,7 +390,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 int thermal_zone_get_slope(struct thermal_zone_device *tz);
 int thermal_zone_get_offset(struct thermal_zone_device *tz);
 
-void thermal_notify_framework(struct thermal_zone_device *, int);
 int thermal_zone_device_enable(struct thermal_zone_device *tz);
 int thermal_zone_device_disable(struct thermal_zone_device *tz);
 void thermal_zone_device_critical(struct thermal_zone_device *tz);
@@ -436,10 +435,6 @@ static inline int thermal_zone_get_offset(
                struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-static inline void thermal_notify_framework(struct thermal_zone_device *tz,
-       int trip)
-{ }
-
 static inline int thermal_zone_device_enable(struct thermal_zone_device *tz)
 { return -ENODEV; }
 
index c6792cf..78a98bd 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_TIMEKEEPING_H
 
 #include <linux/errno.h>
+#include <linux/clocksource_ids.h>
 
 /* Included from linux/ktime.h */
 
@@ -243,11 +244,12 @@ struct ktime_timestamps {
  * @cs_was_changed_seq:        The sequence number of clocksource change events
  */
 struct system_time_snapshot {
-       u64             cycles;
-       ktime_t         real;
-       ktime_t         raw;
-       unsigned int    clock_was_set_seq;
-       u8              cs_was_changed_seq;
+       u64                     cycles;
+       ktime_t                 real;
+       ktime_t                 raw;
+       enum clocksource_ids    cs_id;
+       unsigned int            clock_was_set_seq;
+       u8                      cs_was_changed_seq;
 };
 
 /**
index 28e7af1..ad413b3 100644 (file)
@@ -206,7 +206,7 @@ static inline unsigned int tracing_gen_ctx_dec(void)
 
        trace_ctx = tracing_gen_ctx();
        /*
-        * Subtract one from the preeption counter if preemption is enabled,
+        * Subtract one from the preemption counter if preemption is enabled,
         * see trace_event_buffer_reserve()for details.
         */
        if (IS_ENABLED(CONFIG_PREEMPTION))
@@ -404,7 +404,6 @@ trace_get_fields(struct trace_event_call *event_call)
        return event_call->class->get_fields(event_call);
 }
 
-struct trace_array;
 struct trace_subsystem_dir;
 
 enum {
@@ -640,7 +639,8 @@ enum event_trigger_type {
 extern int filter_match_preds(struct event_filter *filter, void *rec);
 
 extern enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec,
+event_triggers_call(struct trace_event_file *file,
+                   struct trace_buffer *buffer, void *rec,
                    struct ring_buffer_event *event);
 extern void
 event_triggers_post_call(struct trace_event_file *file,
@@ -664,7 +664,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 
        if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
                if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
-                       event_triggers_call(file, NULL, NULL);
+                       event_triggers_call(file, NULL, NULL, NULL);
                if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
                        return true;
                if (eflags & EVENT_FILE_FL_PID_FILTER)
index 9cfb099..13f6542 100644 (file)
@@ -465,7 +465,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  *     *
  *     * The declared 'local variable' is called '__entry'
  *     *
- *     * __field(pid_t, prev_prid) is equivalent to a standard declariton:
+ *     * __field(pid_t, prev_prid) is equivalent to a standard declaration:
  *     *
  *     *       pid_t   prev_pid;
  *     *
index a8e5f3e..794d153 100644 (file)
@@ -17,6 +17,9 @@
 #include <linux/mm.h>
 #include <asm-generic/pgtable_uffd.h>
 
+/* The set of all possible UFFD-related VM flags. */
+#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
+
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
  * new flags, since they might collide with O_* ones. We want
@@ -34,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
+/*
+ * The mode of operation for __mcopy_atomic and its helpers.
+ *
+ * This is almost an implementation detail (mcopy_atomic below doesn't take this
+ * as a parameter), but it's exposed here because memory-kind-specific
+ * implementations (e.g. hugetlbfs) need to know the mode of operation.
+ */
+enum mcopy_atomic_mode {
+       /* A normal copy_from_user into the destination range. */
+       MCOPY_ATOMIC_NORMAL,
+       /* Don't copy; map the destination range to the zero page. */
+       MCOPY_ATOMIC_ZEROPAGE,
+       /* Just install pte(s) with the existing page(s) in the page cache. */
+       MCOPY_ATOMIC_CONTINUE,
+};
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                            unsigned long src_start, unsigned long len,
                            bool *mmap_changing, __u64 mode);
@@ -41,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
                              unsigned long dst_start,
                              unsigned long len,
                              bool *mmap_changing);
+extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+                             unsigned long len, bool *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
                               unsigned long start, unsigned long len,
                               bool enable_wp, bool *mmap_changing);
@@ -52,6 +73,22 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
        return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
 }
 
+/*
+ * Never enable huge pmd sharing on some uffd registered vmas:
+ *
+ * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
+ *
+ * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
+ *   VMAs which share huge pmds. (If you have two mappings to the same
+ *   underlying pages, and fault in the non-UFFD-registered one with a write,
+ *   with huge pmd sharing this would *also* setup the second UFFD-registered
+ *   mapping, and we'd not get minor faults.)
+ */
+static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
+}
+
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
 {
        return vma->vm_flags & VM_UFFD_MISSING;
@@ -62,6 +99,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
        return vma->vm_flags & VM_UFFD_WP;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_UFFD_MINOR;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
                                      pte_t pte)
 {
@@ -76,7 +118,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
 
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
-       return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+       return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
@@ -123,6 +165,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
        return false;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+       return false;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
                                      pte_t pte)
 {
index 15fa085..f311d22 100644 (file)
@@ -8,7 +8,7 @@
 #include <linux/vhost_iotlb.h>
 
 /**
- * vDPA callback definition.
+ * struct vdpa_calllback - vDPA callback definition.
  * @callback: interrupt callback function
  * @private: the data passed to the callback function
  */
@@ -18,7 +18,7 @@ struct vdpa_callback {
 };
 
 /**
- * vDPA notification area
+ * struct vdpa_notification_area - vDPA notification area
  * @addr: base address of the notification area
  * @size: size of the notification area
  */
@@ -28,7 +28,7 @@ struct vdpa_notification_area {
 };
 
 /**
- * vDPA vq_state definition
+ * struct vdpa_vq_state - vDPA vq_state definition
  * @avail_index: available index
  */
 struct vdpa_vq_state {
@@ -38,7 +38,7 @@ struct vdpa_vq_state {
 struct vdpa_mgmt_dev;
 
 /**
- * vDPA device - representation of a vDPA device
+ * struct vdpa_device - representation of a vDPA device
  * @dev: underlying device
  * @dma_dev: the actual device that is performing DMA
  * @config: the configuration ops for this device.
@@ -59,7 +59,7 @@ struct vdpa_device {
 };
 
 /**
- * vDPA IOVA range - the IOVA range support by the device
+ * struct vdpa_iova_range - the IOVA range support by the device
  * @first: start of the IOVA range
  * @last: end of the IOVA range
  */
@@ -69,7 +69,7 @@ struct vdpa_iova_range {
 };
 
 /**
- * vDPA_config_ops - operations for configuring a vDPA device.
+ * struct vdpa_config_ops - operations for configuring a vDPA device.
  * Note: vDPA device drivers are required to implement all of the
  * operations unless it is mentioned to be optional in the following
  * list.
@@ -150,6 +150,9 @@ struct vdpa_iova_range {
  * @set_status:                        Set the device status
  *                             @vdev: vdpa device
  *                             @status: virtio device status
+ * @get_config_size:           Get the size of the configuration space
+ *                             @vdev: vdpa device
+ *                             Returns size_t: configuration size
  * @get_config:                        Read from device specific configuration space
  *                             @vdev: vdpa device
  *                             @offset: offset from the beginning of
@@ -231,6 +234,7 @@ struct vdpa_config_ops {
        u32 (*get_vendor_id)(struct vdpa_device *vdev);
        u8 (*get_status)(struct vdpa_device *vdev);
        void (*set_status)(struct vdpa_device *vdev, u8 status);
+       size_t (*get_config_size)(struct vdpa_device *vdev);
        void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
                           void *buf, unsigned int len);
        void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
@@ -267,7 +271,7 @@ int _vdpa_register_device(struct vdpa_device *vdev, int nvqs);
 void _vdpa_unregister_device(struct vdpa_device *vdev);
 
 /**
- * vdpa_driver - operations for a vDPA driver
+ * struct vdpa_driver - operations for a vDPA driver
  * @driver: underlying device driver
  * @probe: the function to call when a device is found.  Returns 0 or -errno.
  * @remove: the function to call when a device is removed.
@@ -344,18 +348,18 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
 }
 
 /**
- * vdpa_mgmtdev_ops - vdpa device ops
- * @dev_add:   Add a vdpa device using alloc and register
- *             @mdev: parent device to use for device addition
- *             @name: name of the new vdpa device
- *             Driver need to add a new device using _vdpa_register_device()
- *             after fully initializing the vdpa device. Driver must return 0
- *             on success or appropriate error code.
- * @dev_del:   Remove a vdpa device using unregister
- *             @mdev: parent device to use for device removal
- *             @dev: vdpa device to remove
- *             Driver need to remove the specified device by calling
- *             _vdpa_unregister_device().
+ * struct vdpa_mgmtdev_ops - vdpa device ops
+ * @dev_add: Add a vdpa device using alloc and register
+ *          @mdev: parent device to use for device addition
+ *          @name: name of the new vdpa device
+ *          Driver need to add a new device using _vdpa_register_device()
+ *          after fully initializing the vdpa device. Driver must return 0
+ *          on success or appropriate error code.
+ * @dev_del: Remove a vdpa device using unregister
+ *          @mdev: parent device to use for device removal
+ *          @dev: vdpa device to remove
+ *          Driver need to remove the specified device by calling
+ *          _vdpa_unregister_device().
  */
 struct vdpa_mgmtdev_ops {
        int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
index f26acbe..6a95b58 100644 (file)
@@ -13,6 +13,8 @@ struct virtio_pci_modern_device {
        void __iomem *device;
        /* Base of vq notifications (non-legacy mode). */
        void __iomem *notify_base;
+       /* Physical base of vq notifications */
+       resource_size_t notify_pa;
        /* Where to read and clear interrupt */
        u8 __iomem *isr;
 
@@ -99,13 +101,8 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
 u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
                             u16 idx);
 u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
-u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
-                                  u16 idx);
-void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
-                                      size_t minlen,
-                                      u32 align,
-                                      u32 start, u32 size,
-                                      size_t *len);
+void __iomem * vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+                                      u16 index, resource_size_t *pa);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
 #endif
index 18e7597..ae0dd19 100644 (file)
@@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
                HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
+#endif
+#ifdef CONFIG_CMA
+               CMA_ALLOC_SUCCESS,
+               CMA_ALLOC_FAIL,
 #endif
                UNEVICTABLE_PGCULLED,   /* culled to noreclaim list */
                UNEVICTABLE_PGSCANNED,  /* scanned for reclaimability */
@@ -120,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
                SWAP_RA,
                SWAP_RA_HIT,
+#endif
+#ifdef CONFIG_X86
+               DIRECT_MAP_LEVEL2_SPLIT,
+               DIRECT_MAP_LEVEL3_SPLIT,
 #endif
                NR_VM_EVENT_ITEMS
 };
index 394d03c..4d668ab 100644 (file)
@@ -33,7 +33,7 @@ struct notifier_block;                /* in notifier.h */
  *
  * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
  * shadow memory has been mapped. It's used to handle allocation errors so that
- * we don't try to poision shadow on free if it was never allocated.
+ * we don't try to poison shadow on free if it was never allocated.
  *
  * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
  * determine which allocations need the module shadow freed.
@@ -43,7 +43,7 @@ struct notifier_block;                /* in notifier.h */
 
 /*
  * Maximum alignment for ioremap() regions.
- * Can be overriden by arch-specific value.
+ * Can be overridden by arch-specific value.
  */
 #ifndef IOREMAP_MAX_ORDER
 #define IOREMAP_MAX_ORDER      (7 + PAGE_SHIFT)        /* 128 pages */
@@ -227,9 +227,8 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* for /dev/kmem */
+/* for /proc/kcore */
 extern long vread(char *buf, char *addr, unsigned long count);
-extern long vwrite(char *buf, char *addr, unsigned long count);
 
 /*
  *     Internals.  Dont't use..
index 59bd50f..84db7b8 100644 (file)
@@ -46,6 +46,9 @@ struct vringh {
        /* IOTLB for this vring */
        struct vhost_iotlb *iotlb;
 
+       /* spinlock to synchronize IOTLB accesses */
+       spinlock_t *iotlb_lock;
+
        /* The function to call to notify the guest about added buffers */
        void (*notify)(struct vringh *);
 };
@@ -196,6 +199,19 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
        kiov->iov = NULL;
 }
 
+static inline size_t vringh_kiov_length(struct vringh_kiov *kiov)
+{
+       size_t len = 0;
+       int i;
+
+       for (i = kiov->i; i < kiov->used; i++)
+               len += kiov->iov[i].iov_len;
+
+       return len;
+}
+
+void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len);
+
 int vringh_getdesc_kern(struct vringh *vrh,
                        struct vringh_kiov *riov,
                        struct vringh_kiov *wiov,
@@ -258,7 +274,8 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
 
 #if IS_REACHABLE(CONFIG_VHOST_IOTLB)
 
-void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb);
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb,
+                     spinlock_t *iotlb_lock);
 
 int vringh_init_iotlb(struct vringh *vrh, u64 features,
                      unsigned int num, bool weak_barriers,
index e8df72e..5e84888 100644 (file)
@@ -68,7 +68,6 @@ enum sctp_verb {
        SCTP_CMD_ASSOC_FAILED,   /* Handle association failure. */
        SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */
        SCTP_CMD_GEN_SHUTDOWN,   /* Generate a SHUTDOWN chunk. */
-       SCTP_CMD_UPDATE_ASSOC,   /* Update association information. */
        SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */
        SCTP_CMD_SETUP_T2,       /* Hi-level, setup T2-shutdown parms.  */
        SCTP_CMD_RTO_PENDING,    /* Set transport's rto_pending. */
index bae29f5..226ae37 100644 (file)
@@ -10,7 +10,7 @@
 
 #include <rdma/ib_verbs.h>
 
-int rdma_query_gid(struct ib_device *device, u8 port_num, int index,
+int rdma_query_gid(struct ib_device *device, u32 port_num, int index,
                   union ib_gid *gid);
 void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr);
 const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
@@ -20,10 +20,10 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
 const struct ib_gid_attr *rdma_find_gid_by_port(struct ib_device *ib_dev,
                                                const union ib_gid *gid,
                                                enum ib_gid_type gid_type,
-                                               u8 port,
+                                               u32 port,
                                                struct net_device *ndev);
 const struct ib_gid_attr *rdma_find_gid_by_filter(
-       struct ib_device *device, const union ib_gid *gid, u8 port_num,
+       struct ib_device *device, const union ib_gid *gid, u32 port_num,
        bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
                       void *),
        void *context);
@@ -43,7 +43,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
  * the local software cache.
  */
 int ib_get_cached_pkey(struct ib_device    *device_handle,
-                      u8                   port_num,
+                      u32                  port_num,
                       int                  index,
                       u16                 *pkey);
 
@@ -59,7 +59,7 @@ int ib_get_cached_pkey(struct ib_device    *device_handle,
  * the local software cache.
  */
 int ib_find_cached_pkey(struct ib_device    *device,
-                       u8                   port_num,
+                       u32                  port_num,
                        u16                  pkey,
                        u16                 *index);
 
@@ -75,7 +75,7 @@ int ib_find_cached_pkey(struct ib_device    *device,
  * the local software cache.
  */
 int ib_find_exact_cached_pkey(struct ib_device    *device,
-                             u8                   port_num,
+                             u32                  port_num,
                              u16                  pkey,
                              u16                 *index);
 
@@ -89,7 +89,7 @@ int ib_find_exact_cached_pkey(struct ib_device    *device,
  * the local software cache.
  */
 int ib_get_cached_lmc(struct ib_device *device,
-                     u8                port_num,
+                     u32               port_num,
                      u8                *lmc);
 
 /**
@@ -102,12 +102,12 @@ int ib_get_cached_lmc(struct ib_device *device,
  * the local software cache.
  */
 int ib_get_cached_port_state(struct ib_device *device,
-                             u8                port_num,
+                            u32               port_num,
                              enum ib_port_state *port_active);
 
 bool rdma_is_zero_gid(const union ib_gid *gid);
 const struct ib_gid_attr *rdma_get_gid_attr(struct ib_device *device,
-                                           u8 port_num, int index);
+                                           u32 port_num, int index);
 void rdma_put_gid_attr(const struct ib_gid_attr *attr);
 void rdma_hold_gid_attr(const struct ib_gid_attr *attr);
 ssize_t rdma_query_gid_table(struct ib_device *device,
index 8dfb1dd..f1d34f0 100644 (file)
@@ -668,7 +668,7 @@ struct ib_mad_reg_req {
  * @registration_flags: Registration flags to set for this agent
  */
 struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
-                                          u8 port_num,
+                                          u32 port_num,
                                           enum ib_qp_type qp_type,
                                           struct ib_mad_reg_req *mad_reg_req,
                                           u8 rmpp_version,
index 4c52c2f..ba3c808 100644 (file)
@@ -423,7 +423,7 @@ struct ib_sa_query;
 void ib_sa_cancel_query(int id, struct ib_sa_query *query);
 
 int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device,
-                      u8 port_num, struct sa_path_rec *rec,
+                      u32 port_num, struct sa_path_rec *rec,
                       ib_sa_comp_mask comp_mask, unsigned long timeout_ms,
                       gfp_t gfp_mask,
                       void (*callback)(int status, struct sa_path_rec *resp,
@@ -431,7 +431,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device,
                       void *context, struct ib_sa_query **query);
 
 int ib_sa_service_rec_query(struct ib_sa_client *client,
-                           struct ib_device *device, u8 port_num, u8 method,
+                           struct ib_device *device, u32 port_num, u8 method,
                            struct ib_sa_service_rec *rec,
                            ib_sa_comp_mask comp_mask, unsigned long timeout_ms,
                            gfp_t gfp_mask,
@@ -477,7 +477,8 @@ struct ib_sa_multicast {
  *   group, and the user must rejoin the group to continue using it.
  */
 struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
-                                            struct ib_device *device, u8 port_num,
+                                            struct ib_device *device,
+                                            u32 port_num,
                                             struct ib_sa_mcmember_rec *rec,
                                             ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
                                             int (*callback)(int status,
@@ -506,20 +507,20 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
  * @mgid: MGID of multicast group.
  * @rec: Location to copy SA multicast member record.
  */
-int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num,
                           union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
 
 /**
  * ib_init_ah_from_mcmember - Initialize address handle attributes based on
  * an SA multicast member record.
  */
-int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num,
                             struct ib_sa_mcmember_rec *rec,
                             struct net_device *ndev,
                             enum ib_gid_type gid_type,
                             struct rdma_ah_attr *ah_attr);
 
-int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num,
+int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num,
                              struct sa_path_rec *rec,
                              struct rdma_ah_attr *ah_attr,
                              const struct ib_gid_attr *sgid_attr);
@@ -538,7 +539,7 @@ void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec);
 
 /* Support GuidInfoRecord */
 int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
-                             struct ib_device *device, u8 port_num,
+                             struct ib_device *device, u32 port_num,
                              struct ib_sa_guidinfo_rec *rec,
                              ib_sa_comp_mask comp_mask, u8 method,
                              unsigned long timeout_ms, gfp_t gfp_mask,
index ca28fca..7e2f369 100644 (file)
@@ -152,7 +152,7 @@ struct ib_gid_attr {
        union ib_gid            gid;
        enum ib_gid_type        gid_type;
        u16                     index;
-       u8                      port_num;
+       u32                     port_num;
 };
 
 enum {
@@ -736,7 +736,7 @@ struct ib_event {
                struct ib_qp    *qp;
                struct ib_srq   *srq;
                struct ib_wq    *wq;
-               u8              port_num;
+               u32             port_num;
        } element;
        enum ib_event_type      event;
 };
@@ -919,7 +919,7 @@ struct rdma_ah_attr {
        struct ib_global_route  grh;
        u8                      sl;
        u8                      static_rate;
-       u8                      port_num;
+       u32                     port_num;
        u8                      ah_flags;
        enum rdma_ah_attr_type type;
        union {
@@ -1006,7 +1006,7 @@ struct ib_wc {
        u16                     pkey_index;
        u8                      sl;
        u8                      dlid_path_bits;
-       u8                      port_num;       /* valid only for DR SMPs on switches */
+       u32 port_num; /* valid only for DR SMPs on switches */
        u8                      smac[ETH_ALEN];
        u16                     vlan_id;
        u8                      network_hdr_type;
@@ -1161,7 +1161,7 @@ struct ib_qp_init_attr {
        /*
         * Only needed for special QP types, or when using the RW API.
         */
-       u8                      port_num;
+       u32                     port_num;
        struct ib_rwq_ind_table *rwq_ind_tbl;
        u32                     source_qpn;
 };
@@ -1280,11 +1280,11 @@ struct ib_qp_attr {
        u8                      max_rd_atomic;
        u8                      max_dest_rd_atomic;
        u8                      min_rnr_timer;
-       u8                      port_num;
+       u32                     port_num;
        u8                      timeout;
        u8                      retry_cnt;
        u8                      rnr_retry;
-       u8                      alt_port_num;
+       u32                     alt_port_num;
        u8                      alt_timeout;
        u32                     rate_limit;
        struct net_device       *xmit_slave;
@@ -1401,7 +1401,7 @@ struct ib_ud_wr {
        u32                     remote_qpn;
        u32                     remote_qkey;
        u16                     pkey_index; /* valid for GSI only */
-       u8                      port_num;   /* valid for DR SMPs on switch only */
+       u32                     port_num; /* valid for DR SMPs on switch only */
 };
 
 static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr)
@@ -1610,6 +1610,11 @@ struct ib_srq {
                        } xrc;
                };
        } ext;
+
+       /*
+        * Implementation details of the RDMA core, don't use in drivers:
+        */
+       struct rdma_restrack_entry res;
 };
 
 enum ib_raw_packet_caps {
@@ -1708,7 +1713,7 @@ struct ib_qp_security;
 struct ib_port_pkey {
        enum port_pkey_state    state;
        u16                     pkey_index;
-       u8                      port_num;
+       u32                     port_num;
        struct list_head        qp_list;
        struct list_head        to_error_list;
        struct ib_qp_security  *sec;
@@ -1769,7 +1774,7 @@ struct ib_qp {
        enum ib_qp_type         qp_type;
        struct ib_rwq_ind_table *rwq_ind_tbl;
        struct ib_qp_security  *qp_sec;
-       u8                      port;
+       u32                     port;
 
        bool                    integrity_en;
        /*
@@ -2065,7 +2070,7 @@ struct ib_flow_attr {
        u16          priority;
        u32          flags;
        u8           num_of_specs;
-       u8           port;
+       u32          port;
        union ib_flow_spec flows[];
 };
 
@@ -2194,7 +2199,7 @@ enum rdma_netdev_t {
 struct rdma_netdev {
        void              *clnt_priv;
        struct ib_device  *hca;
-       u8                 port_num;
+       u32                port_num;
        int                mtu;
 
        /*
@@ -2215,6 +2220,8 @@ struct rdma_netdev {
                            int set_qkey, u32 qkey);
        int (*detach_mcast)(struct net_device *dev, struct ib_device *hca,
                            union ib_gid *gid, u16 mlid);
+       /* timeout */
+       void (*tx_timeout)(struct net_device *dev, unsigned int txqueue);
 };
 
 struct rdma_netdev_alloc_params {
@@ -2223,7 +2230,7 @@ struct rdma_netdev_alloc_params {
        unsigned int rxqs;
        void *param;
 
-       int (*initialize_rdma_netdev)(struct ib_device *device, u8 port_num,
+       int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num,
                                      struct net_device *netdev, void *param);
 };
 
@@ -2301,12 +2308,11 @@ struct ib_device_ops {
        int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
        int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
        int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags);
-       int (*req_ncomp_notif)(struct ib_cq *cq, int wc_cnt);
        int (*post_srq_recv)(struct ib_srq *srq,
                             const struct ib_recv_wr *recv_wr,
                             const struct ib_recv_wr **bad_recv_wr);
        int (*process_mad)(struct ib_device *device, int process_mad_flags,
-                          u8 port_num, const struct ib_wc *in_wc,
+                          u32 port_num, const struct ib_wc *in_wc,
                           const struct ib_grh *in_grh,
                           const struct ib_mad *in_mad, struct ib_mad *out_mad,
                           size_t *out_mad_size, u16 *out_mad_pkey_index);
@@ -2318,9 +2324,9 @@ struct ib_device_ops {
        void (*get_dev_fw_str)(struct ib_device *device, char *str);
        const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev,
                                                     int comp_vector);
-       int (*query_port)(struct ib_device *device, u8 port_num,
+       int (*query_port)(struct ib_device *device, u32 port_num,
                          struct ib_port_attr *port_attr);
-       int (*modify_port)(struct ib_device *device, u8 port_num,
+       int (*modify_port)(struct ib_device *device, u32 port_num,
                           int port_modify_mask,
                           struct ib_port_modify *port_modify);
        /**
@@ -2329,10 +2335,10 @@ struct ib_device_ops {
         * structure to avoid cache line misses when accessing struct ib_device
         * in fast paths.
         */
-       int (*get_port_immutable)(struct ib_device *device, u8 port_num,
+       int (*get_port_immutable)(struct ib_device *device, u32 port_num,
                                  struct ib_port_immutable *immutable);
        enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
-                                              u8 port_num);
+                                              u32 port_num);
        /**
         * When calling get_netdev, the HW vendor's driver should return the
         * net device of device @device at port @port_num or NULL if such
@@ -2341,7 +2347,8 @@ struct ib_device_ops {
         * that this function returns NULL before the net device has finished
         * NETDEV_UNREGISTER state.
         */
-       struct net_device *(*get_netdev)(struct ib_device *device, u8 port_num);
+       struct net_device *(*get_netdev)(struct ib_device *device,
+                                        u32 port_num);
        /**
         * rdma netdev operation
         *
@@ -2349,11 +2356,11 @@ struct ib_device_ops {
         * must return -EOPNOTSUPP if it doesn't support the specified type.
         */
        struct net_device *(*alloc_rdma_netdev)(
-               struct ib_device *device, u8 port_num, enum rdma_netdev_t type,
+               struct ib_device *device, u32 port_num, enum rdma_netdev_t type,
                const char *name, unsigned char name_assign_type,
                void (*setup)(struct net_device *));
 
-       int (*rdma_netdev_get_params)(struct ib_device *device, u8 port_num,
+       int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num,
                                      enum rdma_netdev_t type,
                                      struct rdma_netdev_alloc_params *params);
        /**
@@ -2361,7 +2368,7 @@ struct ib_device_ops {
         * link layer is either IB or iWarp. It is no-op if @port_num port
         * is RoCE link layer.
         */
-       int (*query_gid)(struct ib_device *device, u8 port_num, int index,
+       int (*query_gid)(struct ib_device *device, u32 port_num, int index,
                         union ib_gid *gid);
        /**
         * When calling add_gid, the HW vendor's driver should add the gid
@@ -2386,7 +2393,7 @@ struct ib_device_ops {
         * This function is only called when roce_gid_table is used.
         */
        int (*del_gid)(const struct ib_gid_attr *attr, void **context);
-       int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index,
+       int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index,
                          u16 *pkey);
        int (*alloc_ucontext)(struct ib_ucontext *context,
                              struct ib_udata *udata);
@@ -2475,16 +2482,16 @@ struct ib_device_ops {
                struct ib_flow_action *action,
                const struct ib_flow_action_attrs_esp *attr,
                struct uverbs_attr_bundle *attrs);
-       int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port,
+       int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port,
                                 int state);
-       int (*get_vf_config)(struct ib_device *device, int vf, u8 port,
+       int (*get_vf_config)(struct ib_device *device, int vf, u32 port,
                             struct ifla_vf_info *ivf);
-       int (*get_vf_stats)(struct ib_device *device, int vf, u8 port,
+       int (*get_vf_stats)(struct ib_device *device, int vf, u32 port,
                            struct ifla_vf_stats *stats);
-       int (*get_vf_guid)(struct ib_device *device, int vf, u8 port,
+       int (*get_vf_guid)(struct ib_device *device, int vf, u32 port,
                            struct ifla_vf_guid *node_guid,
                            struct ifla_vf_guid *port_guid);
-       int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid,
+       int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid,
                           int type);
        struct ib_wq *(*create_wq)(struct ib_pd *pd,
                                   struct ib_wq_init_attr *init_attr,
@@ -2522,7 +2529,7 @@ struct ib_device_ops {
         *   struct tells the core to set a default lifespan.
         */
        struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device,
-                                               u8 port_num);
+                                               u32 port_num);
        /**
         * get_hw_stats - Fill in the counter value(s) in the stats struct.
         * @index - The index in the value array we wish to have updated, or
@@ -2536,12 +2543,12 @@ struct ib_device_ops {
         *   one given in index at their option
         */
        int (*get_hw_stats)(struct ib_device *device,
-                           struct rdma_hw_stats *stats, u8 port, int index);
+                           struct rdma_hw_stats *stats, u32 port, int index);
        /*
         * This function is called once for each port when a ib device is
         * registered.
         */
-       int (*init_port)(struct ib_device *device, u8 port_num,
+       int (*init_port)(struct ib_device *device, u32 port_num,
                         struct kobject *port_sysfs);
        /**
         * Allows rdma drivers to add their own restrack attributes.
@@ -2685,7 +2692,7 @@ struct ib_device {
        /* CQ adaptive moderation (RDMA DIM) */
        u16                          use_cq_dim:1;
        u8                           node_type;
-       u8                           phys_port_cnt;
+       u32                          phys_port_cnt;
        struct ib_device_attr        attrs;
        struct attribute_group       *hw_stats_ag;
        struct rdma_hw_stats         *hw_stats;
@@ -2751,7 +2758,7 @@ struct ib_client {
         * netdev. */
        struct net_device *(*get_net_dev_by_params)(
                        struct ib_device *dev,
-                       u8 port,
+                       u32 port,
                        u16 pkey,
                        const union ib_gid *gid,
                        const struct sockaddr *addr,
@@ -2932,10 +2939,10 @@ void ib_unregister_event_handler(struct ib_event_handler *event_handler);
 void ib_dispatch_event(const struct ib_event *event);
 
 int ib_query_port(struct ib_device *device,
-                 u8 port_num, struct ib_port_attr *port_attr);
+                 u32 port_num, struct ib_port_attr *port_attr);
 
 enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
-                                              u8 port_num);
+                                              u32 port_num);
 
 /**
  * rdma_cap_ib_switch - Check if the device is IB switch
@@ -2959,7 +2966,7 @@ static inline bool rdma_cap_ib_switch(const struct ib_device *device)
  *
  * Return start port number
  */
-static inline u8 rdma_start_port(const struct ib_device *device)
+static inline u32 rdma_start_port(const struct ib_device *device)
 {
        return rdma_cap_ib_switch(device) ? 0 : 1;
 }
@@ -2970,9 +2977,10 @@ static inline u8 rdma_start_port(const struct ib_device *device)
  * @iter - The unsigned int to store the port number
  */
 #define rdma_for_each_port(device, iter)                                       \
-       for (iter = rdma_start_port(device + BUILD_BUG_ON_ZERO(!__same_type(   \
-                                                    unsigned int, iter)));    \
-            iter <= rdma_end_port(device); (iter)++)
+       for (iter = rdma_start_port(device +                                   \
+                                   BUILD_BUG_ON_ZERO(!__same_type(u32,        \
+                                                                  iter)));    \
+            iter <= rdma_end_port(device); iter++)
 
 /**
  * rdma_end_port - Return the last valid port number for the device
@@ -2982,7 +2990,7 @@ static inline u8 rdma_start_port(const struct ib_device *device)
  *
  * Return last port number
  */
-static inline u8 rdma_end_port(const struct ib_device *device)
+static inline u32 rdma_end_port(const struct ib_device *device)
 {
        return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt;
 }
@@ -2995,55 +3003,63 @@ static inline int rdma_is_port_valid(const struct ib_device *device,
 }
 
 static inline bool rdma_is_grh_required(const struct ib_device *device,
-                                       u8 port_num)
+                                       u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_PORT_IB_GRH_REQUIRED;
 }
 
-static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_ib(const struct ib_device *device,
+                                   u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_PROT_IB;
 }
 
-static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_roce(const struct ib_device *device,
+                                     u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
 }
 
-static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device,
+                                               u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
 }
 
-static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device,
+                                               u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_PROT_ROCE;
 }
 
-static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_iwarp(const struct ib_device *device,
+                                      u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_PROT_IWARP;
 }
 
-static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
+static inline bool rdma_ib_or_roce(const struct ib_device *device,
+                                  u32 port_num)
 {
        return rdma_protocol_ib(device, port_num) ||
                rdma_protocol_roce(device, port_num);
 }
 
-static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_raw_packet(const struct ib_device *device,
+                                           u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_PROT_RAW_PACKET;
 }
 
-static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num)
+static inline bool rdma_protocol_usnic(const struct ib_device *device,
+                                      u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_PROT_USNIC;
@@ -3061,7 +3077,7 @@ static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_n
  *
  * Return: true if the port supports sending/receiving of MAD packets.
  */
-static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_IB_MAD;
@@ -3086,7 +3102,7 @@ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
  *
  * Return: true if the port supports OPA MAD packet formats.
  */
-static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
                RDMA_CORE_CAP_OPA_MAD;
@@ -3112,7 +3128,7 @@ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
  *
  * Return: true if the port provides an SMI.
  */
-static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_IB_SMI;
@@ -3133,7 +3149,7 @@ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
  * Return: true if the port supports an IB CM (this does not guarantee that
  * a CM is actually running however).
  */
-static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_IB_CM;
@@ -3151,7 +3167,7 @@ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
  * Return: true if the port supports an iWARP CM (this does not guarantee that
  * a CM is actually running however).
  */
-static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_IW_CM;
@@ -3172,7 +3188,7 @@ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
  * Administration interface.  This does not imply that the SA service is
  * running locally.
  */
-static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_IB_SA;
@@ -3195,7 +3211,8 @@ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num)
  * overhead of registering/unregistering with the SM and tracking of the
  * total number of queue pairs attached to the multicast group.
  */
-static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_ib_mcast(const struct ib_device *device,
+                                    u32 port_num)
 {
        return rdma_cap_ib_sa(device, port_num);
 }
@@ -3213,7 +3230,7 @@ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num
  * Return: true if the port uses a GID address to identify devices on the
  * network.
  */
-static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_AF_IB;
@@ -3235,7 +3252,7 @@ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
  * addition of a Global Route Header built from our Ethernet Address
  * Handle into our header list for connectionless packets.
  */
-static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num)
 {
        return device->port_data[port_num].immutable.core_cap_flags &
               RDMA_CORE_CAP_ETH_AH;
@@ -3250,7 +3267,7 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
  * Return: true if we are running on an OPA device which supports
  * the extended OPA addressing.
  */
-static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num)
+static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num)
 {
        return (device->port_data[port_num].immutable.core_cap_flags &
                RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH;
@@ -3268,7 +3285,8 @@ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num)
  * Return the max MAD size required by the Port.  Will return 0 if the port
  * does not support MADs
  */
-static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num)
+static inline size_t rdma_max_mad_size(const struct ib_device *device,
+                                      u32 port_num)
 {
        return device->port_data[port_num].immutable.max_mad_size;
 }
@@ -3287,7 +3305,7 @@ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_n
  * its GIDs.
  */
 static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
-                                          u8 port_num)
+                                          u32 port_num)
 {
        return rdma_protocol_roce(device, port_num) &&
                device->ops.add_gid && device->ops.del_gid;
@@ -3328,7 +3346,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device,
  * Return the MTU size supported by the port as an integer value. Will return
  * -1 if enum value of mtu is not supported.
  */
-static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port,
+static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port,
                                       int mtu)
 {
        if (rdma_core_cap_opa_port(device, port))
@@ -3345,7 +3363,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port,
  *
  * Return the MTU size supported by the port as an integer value.
  */
-static inline int rdma_mtu_from_attr(struct ib_device *device, u8 port,
+static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port,
                                     struct ib_port_attr *attr)
 {
        if (rdma_core_cap_opa_port(device, port))
@@ -3354,34 +3372,34 @@ static inline int rdma_mtu_from_attr(struct ib_device *device, u8 port,
                return ib_mtu_enum_to_int(attr->max_mtu);
 }
 
-int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
+int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port,
                         int state);
-int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+int ib_get_vf_config(struct ib_device *device, int vf, u32 port,
                     struct ifla_vf_info *info);
-int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
+int ib_get_vf_stats(struct ib_device *device, int vf, u32 port,
                    struct ifla_vf_stats *stats);
-int ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+int ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
                    struct ifla_vf_guid *node_guid,
                    struct ifla_vf_guid *port_guid);
-int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
+int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid,
                   int type);
 
 int ib_query_pkey(struct ib_device *device,
-                 u8 port_num, u16 index, u16 *pkey);
+                 u32 port_num, u16 index, u16 *pkey);
 
 int ib_modify_device(struct ib_device *device,
                     int device_modify_mask,
                     struct ib_device_modify *device_modify);
 
 int ib_modify_port(struct ib_device *device,
-                  u8 port_num, int port_modify_mask,
+                  u32 port_num, int port_modify_mask,
                   struct ib_port_modify *port_modify);
 
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-               u8 *port_num, u16 *index);
+               u32 *port_num, u16 *index);
 
 int ib_find_pkey(struct ib_device *device,
-                u8 port_num, u16 pkey, u16 *index);
+                u32 port_num, u16 pkey, u16 *index);
 
 enum ib_pd_flags {
        /*
@@ -3496,7 +3514,7 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr);
  * attributes which are initialized using ib_init_ah_attr_from_wc().
  *
  */
-int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num,
+int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num,
                            const struct ib_wc *wc, const struct ib_grh *grh,
                            struct rdma_ah_attr *ah_attr);
 
@@ -3513,7 +3531,7 @@ int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num,
  * in all UD QP post sends.
  */
 struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
-                                  const struct ib_grh *grh, u8 port_num);
+                                  const struct ib_grh *grh, u32 port_num);
 
 /**
  * rdma_modify_ah - Modifies the address vector associated with an address
@@ -3915,20 +3933,6 @@ struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
 
 void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe);
 
-/**
- * ib_req_ncomp_notif - Request completion notification when there are
- *   at least the specified number of unreaped completions on the CQ.
- * @cq: The CQ to generate an event for.
- * @wc_cnt: The number of unreaped completions that should be on the
- *   CQ before an event is generated.
- */
-static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
-{
-       return cq->device->ops.req_ncomp_notif ?
-               cq->device->ops.req_ncomp_notif(cq, wc_cnt) :
-               -ENOSYS;
-}
-
 /*
  * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to
  * NULL. This causes the ib_dma* helpers to just stash the kernel virtual
@@ -4272,12 +4276,12 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
                                          enum rdma_driver_id driver_id);
 struct ib_device *ib_device_get_by_name(const char *name,
                                        enum rdma_driver_id driver_id);
-struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port,
                                            u16 pkey, const union ib_gid *gid,
                                            const struct sockaddr *addr);
 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
                         unsigned int port);
-struct net_device *ib_device_netdev(struct ib_device *dev, u8 port);
+struct net_device *ib_device_netdev(struct ib_device *dev, u32 port);
 
 struct ib_wq *ib_create_wq(struct ib_pd *pd,
                           struct ib_wq_init_attr *init_attr);
@@ -4311,7 +4315,8 @@ void ib_drain_rq(struct ib_qp *qp);
 void ib_drain_sq(struct ib_qp *qp);
 void ib_drain_qp(struct ib_qp *qp);
 
-int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u16 *speed, u8 *width);
+int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed,
+                    u8 *width);
 
 static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr)
 {
@@ -4379,12 +4384,12 @@ static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr)
        return false;
 }
 
-static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num)
+static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num)
 {
        attr->port_num = port_num;
 }
 
-static inline u8 rdma_ah_get_port_num(const struct rdma_ah_attr *attr)
+static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr)
 {
        return attr->port_num;
 }
@@ -4482,7 +4487,7 @@ void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src);
  * @port_num: Port number
  */
 static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev,
-                                                      u8 port_num)
+                                                      u32 port_num)
 {
        if (rdma_protocol_roce(dev, port_num))
                return RDMA_AH_ATTR_TYPE_ROCE;
@@ -4554,12 +4559,12 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);
 
 int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
 
-struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num,
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
                                     enum rdma_netdev_t type, const char *name,
                                     unsigned char name_assign_type,
                                     void (*setup)(struct net_device *));
 
-int rdma_init_netdev(struct ib_device *device, u8 port_num,
+int rdma_init_netdev(struct ib_device *device, u32 port_num,
                     enum rdma_netdev_t type, const char *name,
                     unsigned char name_assign_type,
                     void (*setup)(struct net_device *),
index 9197540..03abd30 100644 (file)
@@ -70,6 +70,7 @@ struct iw_cm_id {
        u8  tos;
        bool tos_set:1;
        bool mapped:1;
+       bool afonly:1;
 };
 
 struct iw_cm_conn_param {
index 32a67af..d989f03 100644 (file)
@@ -107,7 +107,7 @@ struct rdma_cm_id {
        struct rdma_route        route;
        enum rdma_ucm_port_space ps;
        enum ib_qp_type          qp_type;
-       u8                       port_num;
+       u32                      port_num;
 };
 
 struct rdma_cm_id *
@@ -331,6 +331,8 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
 int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
 
 int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout);
+
+int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer);
  /**
  * rdma_get_service_id - Return the IB service ID for a specified address.
  * @id: Communication identifier associated with the address.
index e75cf97..0295b22 100644 (file)
@@ -40,26 +40,26 @@ struct rdma_counter {
        struct rdma_counter_mode        mode;
        struct mutex                    lock;
        struct rdma_hw_stats            *stats;
-       u8                             port;
+       u32                             port;
 };
 
 void rdma_counter_init(struct ib_device *dev);
 void rdma_counter_release(struct ib_device *dev);
-int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
                               enum rdma_nl_counter_mask mask,
                               struct netlink_ext_ack *extack);
-int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port);
 int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
 
 int rdma_counter_query_stats(struct rdma_counter *counter);
-u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index);
-int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index);
+int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
                          u32 qp_num, u32 counter_id);
-int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
                                u32 qp_num, u32 *counter_id);
-int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
                            u32 qp_num, u32 counter_id);
-int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+int rdma_counter_get_mode(struct ib_device *dev, u32 port,
                          enum rdma_nl_counter_mode *mode,
                          enum rdma_nl_counter_mask *mask);
 
index 9fd217b..2dafd7d 100644 (file)
@@ -92,7 +92,7 @@ struct rvt_ibport {
        /*
         * The pkey table is allocated and maintained by the driver. Drivers
         * need to have access to this before registering with rdmav. However
-        * rdmavt will need access to it so drivers need to proviee this during
+        * rdmavt will need access to it so drivers need to provide this during
         * the attach port API call.
         */
        u16 *pkey_table;
@@ -230,7 +230,7 @@ struct rvt_driver_provided {
        void (*do_send)(struct rvt_qp *qp);
 
        /*
-        * Returns a pointer to the undelying hardware's PCI device. This is
+        * Returns a pointer to the underlying hardware's PCI device. This is
         * used to display information as to what hardware is being referenced
         * in an output message
         */
@@ -245,7 +245,7 @@ struct rvt_driver_provided {
        void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp);
 
        /*
-        * Init a struture allocated with qp_priv_alloc(). This should be
+        * Init a structure allocated with qp_priv_alloc(). This should be
         * called after all qp fields have been initialized in rdmavt.
         */
        int (*qp_priv_init)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
@@ -257,7 +257,7 @@ struct rvt_driver_provided {
        void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp);
 
        /*
-        * Inform the driver the particular qp in quesiton has been reset so
+        * Inform the driver the particular qp in question has been reset so
         * that it can clean up anything it needs to.
         */
        void (*notify_qp_reset)(struct rvt_qp *qp);
@@ -281,7 +281,7 @@ struct rvt_driver_provided {
        void (*stop_send_queue)(struct rvt_qp *qp);
 
        /*
-        * Have the drivr drain any in progress operations
+        * Have the driver drain any in progress operations
         */
        void (*quiesce_qp)(struct rvt_qp *qp);
 
@@ -309,16 +309,16 @@ struct rvt_driver_provided {
        /*
         * Query driver for the state of the port.
         */
-       int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num,
+       int (*query_port_state)(struct rvt_dev_info *rdi, u32 port_num,
                                struct ib_port_attr *props);
 
        /*
         * Tell driver to shutdown a port
         */
-       int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num);
+       int (*shut_down_port)(struct rvt_dev_info *rdi, u32 port_num);
 
        /* Tell driver to send a trap for changed  port capabilities */
-       void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num);
+       void (*cap_mask_chg)(struct rvt_dev_info *rdi, u32 port_num);
 
        /*
         * The following functions can be safely ignored completely. Any use of
@@ -338,7 +338,7 @@ struct rvt_driver_provided {
 
        /* Let the driver pick the next queue pair number*/
        int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
-                        enum ib_qp_type type, u8 port_num);
+                        enum ib_qp_type type, u32 port_num);
 
        /* Determine if its safe or allowed to modify the qp */
        int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
index 05e1883..79d109c 100644 (file)
@@ -49,6 +49,10 @@ enum rdma_restrack_type {
         * @RDMA_RESTRACK_COUNTER: Statistic Counter
         */
        RDMA_RESTRACK_COUNTER,
+       /**
+        * @RDMA_RESTRACK_SRQ: Shared receive queue (SRQ)
+        */
+       RDMA_RESTRACK_SRQ,
        /**
         * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations
         */
index 6ad9dc8..d606cac 100644 (file)
@@ -42,29 +42,29 @@ struct rdma_rw_ctx {
        };
 };
 
-int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
                struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir);
-void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
-               struct scatterlist *sg, u32 sg_cnt,
-               enum dma_data_direction dir);
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+                        u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+                        enum dma_data_direction dir);
 
 int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               u32 port_num, struct scatterlist *sg, u32 sg_cnt,
                struct scatterlist *prot_sg, u32 prot_sg_cnt,
                struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey,
                enum dma_data_direction dir);
 void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+               u32 port_num, struct scatterlist *sg, u32 sg_cnt,
                struct scatterlist *prot_sg, u32 prot_sg_cnt,
                enum dma_data_direction dir);
 
 struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
-               u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
-int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+               u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
                struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
 
-unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
+unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
                unsigned int maxpages);
 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr);
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr);
index 39ef204..23bb404 100644 (file)
@@ -875,9 +875,14 @@ static inline __malloc void *uverbs_kcalloc(struct uverbs_attr_bundle *bundle,
                return ERR_PTR(-EOVERFLOW);
        return uverbs_zalloc(bundle, bytes);
 }
-int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
-                     size_t idx, s64 lower_bound, u64 upper_bound,
-                     s64 *def_val);
+
+int _uverbs_get_const_signed(s64 *to,
+                            const struct uverbs_attr_bundle *attrs_bundle,
+                            size_t idx, s64 lower_bound, u64 upper_bound,
+                            s64 *def_val);
+int _uverbs_get_const_unsigned(u64 *to,
+                              const struct uverbs_attr_bundle *attrs_bundle,
+                              size_t idx, u64 upper_bound, u64 *def_val);
 int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
                                  size_t idx, const void *from, size_t size);
 #else
@@ -921,27 +926,77 @@ uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
 {
        return -EINVAL;
 }
+static inline int
+_uverbs_get_const_signed(s64 *to,
+                        const struct uverbs_attr_bundle *attrs_bundle,
+                        size_t idx, s64 lower_bound, u64 upper_bound,
+                        s64 *def_val)
+{
+       return -EINVAL;
+}
+static inline int
+_uverbs_get_const_unsigned(u64 *to,
+                          const struct uverbs_attr_bundle *attrs_bundle,
+                          size_t idx, u64 upper_bound, u64 *def_val)
+{
+       return -EINVAL;
+}
 #endif
 
-#define uverbs_get_const(_to, _attrs_bundle, _idx)                             \
+#define uverbs_get_const_signed(_to, _attrs_bundle, _idx)                      \
        ({                                                                     \
                s64 _val;                                                      \
-               int _ret = _uverbs_get_const(&_val, _attrs_bundle, _idx,       \
-                                            type_min(typeof(*_to)),           \
-                                            type_max(typeof(*_to)), NULL);    \
-               (*_to) = _val;                                                 \
+               int _ret =                                                     \
+                       _uverbs_get_const_signed(&_val, _attrs_bundle, _idx,   \
+                                         type_min(typeof(*(_to))),            \
+                                         type_max(typeof(*(_to))), NULL);     \
+               (*(_to)) = _val;                                               \
                _ret;                                                          \
        })
 
-#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default)           \
+#define uverbs_get_const_unsigned(_to, _attrs_bundle, _idx)                    \
+       ({                                                                     \
+               u64 _val;                                                      \
+               int _ret =                                                     \
+                       _uverbs_get_const_unsigned(&_val, _attrs_bundle, _idx, \
+                                         type_max(typeof(*(_to))), NULL);     \
+               (*(_to)) = _val;                                               \
+               _ret;                                                          \
+       })
+
+#define uverbs_get_const_default_signed(_to, _attrs_bundle, _idx, _default)    \
        ({                                                                     \
                s64 _val;                                                      \
                s64 _def_val = _default;                                       \
                int _ret =                                                     \
-                       _uverbs_get_const(&_val, _attrs_bundle, _idx,          \
-                                         type_min(typeof(*_to)),              \
-                                         type_max(typeof(*_to)), &_def_val);  \
-               (*_to) = _val;                                                 \
+                       _uverbs_get_const_signed(&_val, _attrs_bundle, _idx,   \
+                               type_min(typeof(*(_to))),                      \
+                               type_max(typeof(*(_to))), &_def_val);          \
+               (*(_to)) = _val;                                               \
                _ret;                                                          \
        })
+
+#define uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, _default)  \
+       ({                                                                     \
+               u64 _val;                                                      \
+               u64 _def_val = _default;                                       \
+               int _ret =                                                     \
+                       _uverbs_get_const_unsigned(&_val, _attrs_bundle, _idx, \
+                               type_max(typeof(*(_to))), &_def_val);          \
+               (*(_to)) = _val;                                               \
+               _ret;                                                          \
+       })
+
+#define uverbs_get_const(_to, _attrs_bundle, _idx)                             \
+       (is_signed_type(typeof(*(_to))) ?                                      \
+                uverbs_get_const_signed(_to, _attrs_bundle, _idx) :           \
+                uverbs_get_const_unsigned(_to, _attrs_bundle, _idx))          \
+
+#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default)           \
+       (is_signed_type(typeof(*(_to))) ?                                      \
+                uverbs_get_const_default_signed(_to, _attrs_bundle, _idx,     \
+                                                 _default) :                  \
+                uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx,   \
+                                                   _default))
+
 #endif
index f04f512..ee7873f 100644 (file)
@@ -20,7 +20,7 @@
 
 /* These are static so they do not need to be qualified */
 #define UVERBS_METHOD_ATTRS(method_id) _method_attrs_##method_id
-#define UVERBS_OBJECT_METHODS(object_id) _object_methods_##object_id
+#define UVERBS_OBJECT_METHODS(object_id) _UVERBS_NAME(_object_methods_##object_id, __LINE__)
 
 #define DECLARE_UVERBS_NAMED_METHOD(_method_id, ...)                           \
        static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS(        \
index 5017a88..c3d3547 100644 (file)
@@ -8,28 +8,31 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
-TRACE_EVENT(cma_alloc,
+DECLARE_EVENT_CLASS(cma_alloc_class,
 
-       TP_PROTO(unsigned long pfn, const struct page *page,
-                unsigned int count, unsigned int align),
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count, unsigned int align),
 
-       TP_ARGS(pfn, page, count, align),
+       TP_ARGS(name, pfn, page, count, align),
 
        TP_STRUCT__entry(
+               __string(name, name)
                __field(unsigned long, pfn)
                __field(const struct page *, page)
-               __field(unsigned int, count)
+               __field(unsigned long, count)
                __field(unsigned int, align)
        ),
 
        TP_fast_assign(
+               __assign_str(name, name);
                __entry->pfn = pfn;
                __entry->page = page;
                __entry->count = count;
                __entry->align = align;
        ),
 
-       TP_printk("pfn=%lx page=%p count=%u align=%u",
+       TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u",
+                 __get_str(name),
                  __entry->pfn,
                  __entry->page,
                  __entry->count,
@@ -38,29 +41,72 @@ TRACE_EVENT(cma_alloc,
 
 TRACE_EVENT(cma_release,
 
-       TP_PROTO(unsigned long pfn, const struct page *page,
-                unsigned int count),
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count),
 
-       TP_ARGS(pfn, page, count),
+       TP_ARGS(name, pfn, page, count),
 
        TP_STRUCT__entry(
+               __string(name, name)
                __field(unsigned long, pfn)
                __field(const struct page *, page)
-               __field(unsigned int, count)
+               __field(unsigned long, count)
        ),
 
        TP_fast_assign(
+               __assign_str(name, name);
                __entry->pfn = pfn;
                __entry->page = page;
                __entry->count = count;
        ),
 
-       TP_printk("pfn=%lx page=%p count=%u",
+       TP_printk("name=%s pfn=%lx page=%p count=%lu",
+                 __get_str(name),
                  __entry->pfn,
                  __entry->page,
                  __entry->count)
 );
 
+TRACE_EVENT(cma_alloc_start,
+
+       TP_PROTO(const char *name, unsigned long count, unsigned int align),
+
+       TP_ARGS(name, count, align),
+
+       TP_STRUCT__entry(
+               __string(name, name)
+               __field(unsigned long, count)
+               __field(unsigned int, align)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __entry->count = count;
+               __entry->align = align;
+       ),
+
+       TP_printk("name=%s count=%lu align=%u",
+                 __get_str(name),
+                 __entry->count,
+                 __entry->align)
+);
+
+DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
+
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count, unsigned int align),
+
+       TP_ARGS(name, pfn, page, count, align)
+);
+
+DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
+
+       TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
+                unsigned long count, unsigned int align),
+
+       TP_ARGS(name, pfn, page, count, align)
+);
+
 #endif /* _TRACE_CMA_H */
 
 /* This part must be outside protection */
index e801f49..d233f29 100644 (file)
 #include <linux/tracepoint.h>
 #include <linux/intel-iommu.h>
 
-DECLARE_EVENT_CLASS(dma_map,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr,
-                size_t size),
-
-       TP_ARGS(dev, dev_addr, phys_addr, size),
-
-       TP_STRUCT__entry(
-               __string(dev_name, dev_name(dev))
-               __field(dma_addr_t, dev_addr)
-               __field(phys_addr_t, phys_addr)
-               __field(size_t, size)
-       ),
-
-       TP_fast_assign(
-               __assign_str(dev_name, dev_name(dev));
-               __entry->dev_addr = dev_addr;
-               __entry->phys_addr = phys_addr;
-               __entry->size = size;
-       ),
-
-       TP_printk("dev=%s dev_addr=0x%llx phys_addr=0x%llx size=%zu",
-                 __get_str(dev_name),
-                 (unsigned long long)__entry->dev_addr,
-                 (unsigned long long)__entry->phys_addr,
-                 __entry->size)
-);
-
-DEFINE_EVENT(dma_map, map_single,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr,
-                size_t size),
-       TP_ARGS(dev, dev_addr, phys_addr, size)
-);
-
-DEFINE_EVENT(dma_map, bounce_map_single,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, phys_addr_t phys_addr,
-                size_t size),
-       TP_ARGS(dev, dev_addr, phys_addr, size)
-);
-
-DECLARE_EVENT_CLASS(dma_unmap,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size),
-
-       TP_ARGS(dev, dev_addr, size),
-
-       TP_STRUCT__entry(
-               __string(dev_name, dev_name(dev))
-               __field(dma_addr_t, dev_addr)
-               __field(size_t, size)
-       ),
-
-       TP_fast_assign(
-               __assign_str(dev_name, dev_name(dev));
-               __entry->dev_addr = dev_addr;
-               __entry->size = size;
-       ),
-
-       TP_printk("dev=%s dev_addr=0x%llx size=%zu",
-                 __get_str(dev_name),
-                 (unsigned long long)__entry->dev_addr,
-                 __entry->size)
-);
-
-DEFINE_EVENT(dma_unmap, unmap_single,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size),
-       TP_ARGS(dev, dev_addr, size)
-);
-
-DEFINE_EVENT(dma_unmap, unmap_sg,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size),
-       TP_ARGS(dev, dev_addr, size)
-);
-
-DEFINE_EVENT(dma_unmap, bounce_unmap_single,
-       TP_PROTO(struct device *dev, dma_addr_t dev_addr, size_t size),
-       TP_ARGS(dev, dev_addr, size)
-);
-
-DECLARE_EVENT_CLASS(dma_map_sg,
-       TP_PROTO(struct device *dev, int index, int total,
-                struct scatterlist *sg),
-
-       TP_ARGS(dev, index, total, sg),
-
-       TP_STRUCT__entry(
-               __string(dev_name, dev_name(dev))
-               __field(dma_addr_t, dev_addr)
-               __field(phys_addr_t, phys_addr)
-               __field(size_t, size)
-               __field(int, index)
-               __field(int, total)
-       ),
-
-       TP_fast_assign(
-               __assign_str(dev_name, dev_name(dev));
-               __entry->dev_addr = sg->dma_address;
-               __entry->phys_addr = sg_phys(sg);
-               __entry->size = sg->dma_length;
-               __entry->index = index;
-               __entry->total = total;
-       ),
-
-       TP_printk("dev=%s [%d/%d] dev_addr=0x%llx phys_addr=0x%llx size=%zu",
-                 __get_str(dev_name), __entry->index, __entry->total,
-                 (unsigned long long)__entry->dev_addr,
-                 (unsigned long long)__entry->phys_addr,
-                 __entry->size)
-);
-
-DEFINE_EVENT(dma_map_sg, map_sg,
-       TP_PROTO(struct device *dev, int index, int total,
-                struct scatterlist *sg),
-       TP_ARGS(dev, index, total, sg)
-);
-
-DEFINE_EVENT(dma_map_sg, bounce_map_sg,
-       TP_PROTO(struct device *dev, int index, int total,
-                struct scatterlist *sg),
-       TP_ARGS(dev, index, total, sg)
-);
-
 TRACE_EVENT(qi_submit,
        TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3),
 
index bd52817..abb8b24 100644 (file)
@@ -49,7 +49,7 @@ TRACE_EVENT(io_uring_create,
 );
 
 /**
- * io_uring_register - called after a buffer/file/eventfd was succesfully
+ * io_uring_register - called after a buffer/file/eventfd was successfully
  *                                        registered for a ring
  *
  * @ctx:                       pointer to a ring context structure
index 49d7d0f..37e1e1a 100644 (file)
@@ -255,30 +255,6 @@ TRACE_EVENT(kvm_fpu,
        TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
 );
 
-TRACE_EVENT(kvm_age_page,
-       TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref),
-       TP_ARGS(gfn, level, slot, ref),
-
-       TP_STRUCT__entry(
-               __field(        u64,    hva             )
-               __field(        u64,    gfn             )
-               __field(        u8,     level           )
-               __field(        u8,     referenced      )
-       ),
-
-       TP_fast_assign(
-               __entry->gfn            = gfn;
-               __entry->level          = level;
-               __entry->hva            = ((gfn - slot->base_gfn) <<
-                                           PAGE_SHIFT) + slot->userspace_addr;
-               __entry->referenced     = ref;
-       ),
-
-       TP_printk("hva %llx gfn %llx level %u %s",
-                 __entry->hva, __entry->gfn, __entry->level,
-                 __entry->referenced ? "YOUNG" : "OLD")
-);
-
 #ifdef CONFIG_KVM_ASYNC_PF
 DECLARE_EVENT_CLASS(kvm_async_get_page_class,
 
@@ -462,6 +438,72 @@ TRACE_EVENT(kvm_dirty_ring_exit,
        TP_printk("vcpu %d", __entry->vcpu_id)
 );
 
+TRACE_EVENT(kvm_unmap_hva_range,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
index 4d43439..9fb2a3b 100644 (file)
@@ -20,7 +20,8 @@
        EM( MR_SYSCALL,         "syscall_or_cpuset")            \
        EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind")              \
        EM( MR_NUMA_MISPLACED,  "numa_misplaced")               \
-       EMe(MR_CONTIG_RANGE,    "contig_range")
+       EM( MR_CONTIG_RANGE,    "contig_range")                 \
+       EMe(MR_LONGTERM_PIN,    "longterm_pin")
 
 /*
  * First define the enums in the above macros to be exported to userspace
@@ -81,6 +82,28 @@ TRACE_EVENT(mm_migrate_pages,
                __print_symbolic(__entry->mode, MIGRATE_MODE),
                __print_symbolic(__entry->reason, MIGRATE_REASON))
 );
+
+TRACE_EVENT(mm_migrate_pages_start,
+
+       TP_PROTO(enum migrate_mode mode, int reason),
+
+       TP_ARGS(mode, reason),
+
+       TP_STRUCT__entry(
+               __field(enum migrate_mode, mode)
+               __field(int, reason)
+       ),
+
+       TP_fast_assign(
+               __entry->mode   = mode;
+               __entry->reason = reason;
+       ),
+
+       TP_printk("mode=%s reason=%s",
+                 __print_symbolic(__entry->mode, MIGRATE_MODE),
+                 __print_symbolic(__entry->reason, MIGRATE_REASON))
+);
+
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
index 67018d3..629c7a0 100644 (file)
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,               "arch_2"        )
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
+#else
+# define IF_HAVE_UFFD_MINOR(flag, name)
+#endif
+
 #define __def_vmaflag_names                                            \
        {VM_READ,                       "read"          },              \
        {VM_WRITE,                      "write"         },              \
@@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,                "arch_2"        )
        {VM_MAYSHARE,                   "mayshare"      },              \
        {VM_GROWSDOWN,                  "growsdown"     },              \
        {VM_UFFD_MISSING,               "uffd_missing"  },              \
+IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR,      "uffd_minor"    )               \
        {VM_PFNMAP,                     "pfnmap"        },              \
        {VM_DENYWRITE,                  "denywrite"     },              \
        {VM_UFFD_WP,                    "uffd_wp"       },              \
index c7711e9..6768b64 100644 (file)
@@ -48,7 +48,7 @@ TRACE_EVENT(rcu_utilization,
  * RCU flavor, the grace-period number, and a string identifying the
  * grace-period-related event as follows:
  *
- *     "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL.
+ *     "AccReadyCB": CPU accelerates new callbacks to RCU_NEXT_READY_TAIL.
  *     "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL.
  *     "newreq": Request a new grace period.
  *     "start": Start a grace period.
index c838e7a..bd55908 100644 (file)
@@ -60,6 +60,46 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
                                ),                                      \
                                TP_ARGS(wc, cid))
 
+DECLARE_EVENT_CLASS(rpcrdma_mr_completion_class,
+       TP_PROTO(
+               const struct ib_wc *wc,
+               const struct rpc_rdma_cid *cid
+       ),
+
+       TP_ARGS(wc, cid),
+
+       TP_STRUCT__entry(
+               __field(u32, cq_id)
+               __field(int, completion_id)
+               __field(unsigned long, status)
+               __field(unsigned int, vendor_err)
+       ),
+
+       TP_fast_assign(
+               __entry->cq_id = cid->ci_queue_id;
+               __entry->completion_id = cid->ci_completion_id;
+               __entry->status = wc->status;
+               if (wc->status)
+                       __entry->vendor_err = wc->vendor_err;
+               else
+                       __entry->vendor_err = 0;
+       ),
+
+       TP_printk("cq.id=%u mr.id=%d status=%s (%lu/0x%x)",
+               __entry->cq_id, __entry->completion_id,
+               rdma_show_wc_status(__entry->status),
+               __entry->status, __entry->vendor_err
+       )
+);
+
+#define DEFINE_MR_COMPLETION_EVENT(name)                               \
+               DEFINE_EVENT(rpcrdma_mr_completion_class, name,         \
+                               TP_PROTO(                               \
+                                       const struct ib_wc *wc,         \
+                                       const struct rpc_rdma_cid *cid  \
+                               ),                                      \
+                               TP_ARGS(wc, cid))
+
 DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class,
        TP_PROTO(
                const struct ib_wc *wc,
@@ -150,19 +190,17 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt,
        TP_ARGS(r_xprt),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p",
-               __get_str(addr), __get_str(port), __entry->r_xprt
+       TP_printk("peer=[%s]:%s",
+               __get_str(addr), __get_str(port)
        )
 );
 
@@ -182,7 +220,6 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
        TP_ARGS(r_xprt, rc),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(int, rc)
                __field(int, connect_status)
                __string(addr, rpcrdma_addrstr(r_xprt))
@@ -190,15 +227,14 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->rc = rc;
                __entry->connect_status = r_xprt->rx_ep->re_connect_status;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("peer=[%s]:%s rc=%d connection status=%d",
+               __get_str(addr), __get_str(port),
                __entry->rc, __entry->connect_status
        )
 );
@@ -343,7 +379,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class,
 
                __entry->task_id = task->tk_pid;
                __entry->client_id = task->tk_client->cl_clid;
-               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
@@ -384,7 +420,7 @@ DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class,
        ),
 
        TP_fast_assign(
-               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
@@ -495,22 +531,19 @@ TRACE_EVENT(xprtrdma_op_connect,
        TP_ARGS(r_xprt, delay),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned long, delay)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->delay = delay;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p delay=%lu",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
-               __entry->delay
+       TP_printk("peer=[%s]:%s delay=%lu",
+               __get_str(addr), __get_str(port), __entry->delay
        )
 );
 
@@ -525,7 +558,6 @@ TRACE_EVENT(xprtrdma_op_set_cto,
        TP_ARGS(r_xprt, connect, reconnect),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned long, connect)
                __field(unsigned long, reconnect)
                __string(addr, rpcrdma_addrstr(r_xprt))
@@ -533,51 +565,18 @@ TRACE_EVENT(xprtrdma_op_set_cto,
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->connect = connect;
                __entry->reconnect = reconnect;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("peer=[%s]:%s connect=%lu reconnect=%lu",
+               __get_str(addr), __get_str(port),
                __entry->connect / HZ, __entry->reconnect / HZ
        )
 );
 
-TRACE_EVENT(xprtrdma_qp_event,
-       TP_PROTO(
-               const struct rpcrdma_ep *ep,
-               const struct ib_event *event
-       ),
-
-       TP_ARGS(ep, event),
-
-       TP_STRUCT__entry(
-               __field(unsigned long, event)
-               __string(name, event->device->name)
-               __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
-               __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
-       ),
-
-       TP_fast_assign(
-               const struct rdma_cm_id *id = ep->re_id;
-
-               __entry->event = event->event;
-               __assign_str(name, event->device->name);
-               memcpy(__entry->srcaddr, &id->route.addr.src_addr,
-                      sizeof(struct sockaddr_in6));
-               memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
-                      sizeof(struct sockaddr_in6));
-       ),
-
-       TP_printk("%pISpc -> %pISpc device=%s %s (%lu)",
-               __entry->srcaddr, __entry->dstaddr, __get_str(name),
-               rdma_show_ib_event(__entry->event), __entry->event
-       )
-);
-
 /**
  ** Call events
  **/
@@ -591,22 +590,19 @@ TRACE_EVENT(xprtrdma_createmrs,
        TP_ARGS(r_xprt, count),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
                __field(unsigned int, count)
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->count = count;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
-               __entry->count
+       TP_printk("peer=[%s]:%s created %u MRs",
+               __get_str(addr), __get_str(port), __entry->count
        )
 );
 
@@ -829,7 +825,7 @@ TRACE_EVENT(xprtrdma_post_recvs,
        TP_ARGS(r_xprt, count, status),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
+               __field(u32, cq_id)
                __field(unsigned int, count)
                __field(int, status)
                __field(int, posted)
@@ -838,16 +834,18 @@ TRACE_EVENT(xprtrdma_post_recvs,
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
+               const struct rpcrdma_ep *ep = r_xprt->rx_ep;
+
+               __entry->cq_id = ep->re_attr.recv_cq->res.id;
                __entry->count = count;
                __entry->status = status;
-               __entry->posted = r_xprt->rx_ep->re_receive_count;
+               __entry->posted = ep->re_receive_count;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: %u new recvs, %d active (rc %d)",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active (rc %d)",
+               __get_str(addr), __get_str(port), __entry->cq_id,
                __entry->count, __entry->posted, __entry->status
        )
 );
@@ -886,10 +884,10 @@ TRACE_EVENT(xprtrdma_post_linv_err,
 DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive);
 
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake);
-DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_fastreg);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_wake);
+DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_done);
 
 TRACE_EVENT(xprtrdma_frwr_alloc,
        TP_PROTO(
@@ -905,7 +903,7 @@ TRACE_EVENT(xprtrdma_frwr_alloc,
        ),
 
        TP_fast_assign(
-               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->mr_id = mr->mr_ibmr->res.id;
                __entry->rc = rc;
        ),
 
@@ -933,7 +931,7 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
        ),
 
        TP_fast_assign(
-               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->mr_id  = mr->mr_ibmr->res.id;
                __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
@@ -966,7 +964,7 @@ TRACE_EVENT(xprtrdma_frwr_sgerr,
        ),
 
        TP_fast_assign(
-               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->mr_id = mr->mr_ibmr->res.id;
                __entry->addr = mr->mr_sg->dma_address;
                __entry->dir = mr->mr_dir;
                __entry->nents = sg_nents;
@@ -996,7 +994,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
        ),
 
        TP_fast_assign(
-               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->mr_id = mr->mr_ibmr->res.id;
                __entry->addr = mr->mr_sg->dma_address;
                __entry->dir = mr->mr_dir;
                __entry->num_mapped = num_mapped;
@@ -1010,11 +1008,12 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
        )
 );
 
+DEFINE_MR_EVENT(fastreg);
 DEFINE_MR_EVENT(localinv);
+DEFINE_MR_EVENT(reminv);
 DEFINE_MR_EVENT(map);
 
 DEFINE_ANON_MR_EVENT(unmap);
-DEFINE_ANON_MR_EVENT(recycle);
 
 TRACE_EVENT(xprtrdma_dma_maperr,
        TP_PROTO(
@@ -1248,22 +1247,19 @@ TRACE_EVENT(xprtrdma_cb_setup,
        TP_ARGS(r_xprt, reqs),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned int, reqs)
                __string(addr, rpcrdma_addrstr(r_xprt))
                __string(port, rpcrdma_portstr(r_xprt))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
                __entry->reqs = reqs;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs",
-               __get_str(addr), __get_str(port),
-               __entry->r_xprt, __entry->reqs
+       TP_printk("peer=[%s]:%s %u reqs",
+               __get_str(addr), __get_str(port), __entry->reqs
        )
 );
 
index cbe3e15..1eca230 100644 (file)
@@ -174,7 +174,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking,
             TP_ARGS(p));
 
 /*
- * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG.
+ * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
  * It is not always called from the waking context.
  */
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
index bda16e9..d02e01a 100644 (file)
@@ -1079,6 +1079,46 @@ TRACE_EVENT(xprt_transmit,
                __entry->seqno, __entry->status)
 );
 
+TRACE_EVENT(xprt_retransmit,
+       TP_PROTO(
+               const struct rpc_rqst *rqst
+       ),
+
+       TP_ARGS(rqst),
+
+       TP_STRUCT__entry(
+               __field(unsigned int, task_id)
+               __field(unsigned int, client_id)
+               __field(u32, xid)
+               __field(int, ntrans)
+               __field(int, version)
+               __string(progname,
+                        rqst->rq_task->tk_client->cl_program->name)
+               __string(procedure,
+                        rqst->rq_task->tk_msg.rpc_proc->p_name)
+       ),
+
+       TP_fast_assign(
+               struct rpc_task *task = rqst->rq_task;
+
+               __entry->task_id = task->tk_pid;
+               __entry->client_id = task->tk_client ?
+                       task->tk_client->cl_clid : -1;
+               __entry->xid = be32_to_cpu(rqst->rq_xid);
+               __entry->ntrans = rqst->rq_ntrans;
+               __assign_str(progname,
+                            task->tk_client->cl_program->name);
+               __entry->version = task->tk_client->cl_vers;
+               __assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+       ),
+
+       TP_printk(
+               "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d",
+               __entry->task_id, __entry->client_id, __entry->xid,
+               __get_str(progname), __entry->version, __get_str(procedure),
+               __entry->ntrans)
+);
+
 TRACE_EVENT(xprt_ping,
        TP_PROTO(const struct rpc_xprt *xprt, int status),
 
@@ -1141,7 +1181,6 @@ DECLARE_EVENT_CLASS(xprt_writelock_event,
 
 DEFINE_WRITELOCK_EVENT(reserve_xprt);
 DEFINE_WRITELOCK_EVENT(release_xprt);
-DEFINE_WRITELOCK_EVENT(transmit_queued);
 
 DECLARE_EVENT_CLASS(xprt_cong_event,
        TP_PROTO(
index 19abb6c..6ad031c 100644 (file)
@@ -119,7 +119,7 @@ TRACE_EVENT(timer_expire_entry,
  * When used in combination with the timer_expire_entry tracepoint we can
  * determine the runtime of the timer callback function.
  *
- * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might
+ * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
  * be invalid. We solely track the pointer.
  */
 DEFINE_EVENT(timer_class, timer_expire_exit,
index 739c839..6de5a7f 100644 (file)
@@ -866,8 +866,15 @@ __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 #define __NR_quotactl_path 443
 __SYSCALL(__NR_quotactl_path, sys_quotactl_path)
 
+#define __NR_landlock_create_ruleset 444
+__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
+#define __NR_landlock_add_rule 445
+__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
+#define __NR_landlock_restrict_self 446
+__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
+
 #undef __NR_syscalls
-#define __NR_syscalls 444
+#define __NR_syscalls 447
 
 /*
  * 32 bit systems traditionally used different
index fcff666..e5c6e45 100644 (file)
@@ -193,8 +193,22 @@ struct dm_name_list {
        __u32 next;             /* offset to the next record from
                                   the _start_ of this */
        char name[0];
+
+       /*
+        * The following members can be accessed by taking a pointer that
+        * points immediately after the terminating zero character in "name"
+        * and aligning this pointer to the next 8-byte boundary.
+        * Uuid is present if the flag DM_NAME_LIST_FLAG_HAS_UUID is set.
+        *
+        * __u32 event_nr;
+        * __u32 flags;
+        * char uuid[0];
+        */
 };
 
+#define DM_NAME_LIST_FLAG_HAS_UUID             1
+#define DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID     2
+
 /*
  * Used to retrieve the target versions
  */
@@ -272,9 +286,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY    _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR       4
-#define DM_VERSION_MINOR       44
+#define DM_VERSION_MINOR       45
 #define DM_VERSION_PATCHLEVEL  0
-#define DM_VERSION_EXTRA       "-ioctl (2021-02-01)"
+#define DM_VERSION_EXTRA       "-ioctl (2021-03-22)"
 
 /* Status bits */
 #define DM_READONLY_FLAG       (1 << 0) /* In/Out */
index e8eb4ad..d174914 100644 (file)
@@ -153,14 +153,3 @@ enum {
 #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1)
 
 #endif /* _LINUX_IF_BONDING_H */
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
-
index e1d9e75..59178fc 100644 (file)
@@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd {
 #define IOMMU_SVA_VTD_GPASID_PWT       (1 << 3) /* page-level write through */
 #define IOMMU_SVA_VTD_GPASID_EMTE      (1 << 4) /* extended mem type enable */
 #define IOMMU_SVA_VTD_GPASID_CD                (1 << 5) /* PASID-level cache disable */
-#define IOMMU_SVA_VTD_GPASID_LAST      (1 << 6)
+#define IOMMU_SVA_VTD_GPASID_WPE       (1 << 6) /* Write protect enable */
+#define IOMMU_SVA_VTD_GPASID_LAST      (1 << 7)
        __u64 flags;
        __u32 pat;
        __u32 emt;
index 05669c8..778dc19 100644 (file)
@@ -42,6 +42,7 @@
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
 #define KEXEC_ARCH_AARCH64 (183 << 16)
+#define KEXEC_ARCH_RISCV   (243 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16
index f6afee2..3fd9a7e 100644 (file)
@@ -1078,6 +1078,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_SET_GUEST_DEBUG2 195
+#define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
+#define KVM_CAP_PTP_KVM 198
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1671,6 +1675,8 @@ enum sev_cmd_id {
        KVM_SEV_CERT_EXPORT,
        /* Attestation report */
        KVM_SEV_GET_ATTESTATION_REPORT,
+       /* Guest Migration Extension */
+       KVM_SEV_SEND_CANCEL,
 
        KVM_SEV_NR_MAX,
 };
@@ -1729,6 +1735,45 @@ struct kvm_sev_attestation_report {
        __u32 len;
 };
 
+struct kvm_sev_send_start {
+       __u32 policy;
+       __u64 pdh_cert_uaddr;
+       __u32 pdh_cert_len;
+       __u64 plat_certs_uaddr;
+       __u32 plat_certs_len;
+       __u64 amd_certs_uaddr;
+       __u32 amd_certs_len;
+       __u64 session_uaddr;
+       __u32 session_len;
+};
+
+struct kvm_sev_send_update_data {
+       __u64 hdr_uaddr;
+       __u32 hdr_len;
+       __u64 guest_uaddr;
+       __u32 guest_len;
+       __u64 trans_uaddr;
+       __u32 trans_len;
+};
+
+struct kvm_sev_receive_start {
+       __u32 handle;
+       __u32 policy;
+       __u64 pdh_uaddr;
+       __u32 pdh_len;
+       __u64 session_uaddr;
+       __u32 session_len;
+};
+
+struct kvm_sev_receive_update_data {
+       __u64 hdr_uaddr;
+       __u32 hdr_len;
+       __u64 guest_uaddr;
+       __u32 guest_len;
+       __u64 trans_uaddr;
+       __u32 trans_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
new file mode 100644 (file)
index 0000000..b3d9520
--- /dev/null
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Landlock - User space API
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _UAPI_LINUX_LANDLOCK_H
+#define _UAPI_LINUX_LANDLOCK_H
+
+#include <linux/types.h>
+
+/**
+ * struct landlock_ruleset_attr - Ruleset definition
+ *
+ * Argument of sys_landlock_create_ruleset().  This structure can grow in
+ * future versions.
+ */
+struct landlock_ruleset_attr {
+       /**
+        * @handled_access_fs: Bitmask of actions (cf. `Filesystem flags`_)
+        * that is handled by this ruleset and should then be forbidden if no
+        * rule explicitly allows them.  This is needed for backward
+        * compatibility reasons.
+        */
+       __u64 handled_access_fs;
+};
+
+/*
+ * sys_landlock_create_ruleset() flags:
+ *
+ * - %LANDLOCK_CREATE_RULESET_VERSION: Get the highest supported Landlock ABI
+ *   version.
+ */
+#define LANDLOCK_CREATE_RULESET_VERSION                        (1U << 0)
+
+/**
+ * enum landlock_rule_type - Landlock rule type
+ *
+ * Argument of sys_landlock_add_rule().
+ */
+enum landlock_rule_type {
+       /**
+        * @LANDLOCK_RULE_PATH_BENEATH: Type of a &struct
+        * landlock_path_beneath_attr .
+        */
+       LANDLOCK_RULE_PATH_BENEATH = 1,
+};
+
+/**
+ * struct landlock_path_beneath_attr - Path hierarchy definition
+ *
+ * Argument of sys_landlock_add_rule().
+ */
+struct landlock_path_beneath_attr {
+       /**
+        * @allowed_access: Bitmask of allowed actions for this file hierarchy
+        * (cf. `Filesystem flags`_).
+        */
+       __u64 allowed_access;
+       /**
+        * @parent_fd: File descriptor, open with ``O_PATH``, which identifies
+        * the parent directory of a file hierarchy, or just a file.
+        */
+       __s32 parent_fd;
+       /*
+        * This struct is packed to avoid trailing reserved members.
+        * Cf. security/landlock/syscalls.c:build_check_abi()
+        */
+} __attribute__((packed));
+
+/**
+ * DOC: fs_access
+ *
+ * A set of actions on kernel objects may be defined by an attribute (e.g.
+ * &struct landlock_path_beneath_attr) including a bitmask of access.
+ *
+ * Filesystem flags
+ * ~~~~~~~~~~~~~~~~
+ *
+ * These flags make it possible to restrict a sandboxed process to a set of
+ * actions on files and directories.  Files or directories opened before the
+ * sandboxing are not subject to these restrictions.
+ *
+ * A file can only receive these access rights:
+ *
+ * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file.
+ * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access.
+ * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access.
+ *
+ * A directory can receive access rights related to files or directories.  The
+ * following access right is applied to the directory itself, and the
+ * directories beneath it:
+ *
+ * - %LANDLOCK_ACCESS_FS_READ_DIR: Open a directory or list its content.
+ *
+ * However, the following access rights only apply to the content of a
+ * directory, not the directory itself:
+ *
+ * - %LANDLOCK_ACCESS_FS_REMOVE_DIR: Remove an empty directory or rename one.
+ * - %LANDLOCK_ACCESS_FS_REMOVE_FILE: Unlink (or rename) a file.
+ * - %LANDLOCK_ACCESS_FS_MAKE_CHAR: Create (or rename or link) a character
+ *   device.
+ * - %LANDLOCK_ACCESS_FS_MAKE_DIR: Create (or rename) a directory.
+ * - %LANDLOCK_ACCESS_FS_MAKE_REG: Create (or rename or link) a regular file.
+ * - %LANDLOCK_ACCESS_FS_MAKE_SOCK: Create (or rename or link) a UNIX domain
+ *   socket.
+ * - %LANDLOCK_ACCESS_FS_MAKE_FIFO: Create (or rename or link) a named pipe.
+ * - %LANDLOCK_ACCESS_FS_MAKE_BLOCK: Create (or rename or link) a block device.
+ * - %LANDLOCK_ACCESS_FS_MAKE_SYM: Create (or rename or link) a symbolic link.
+ *
+ * .. warning::
+ *
+ *   It is currently not possible to restrict some file-related actions
+ *   accessible through these syscall families: :manpage:`chdir(2)`,
+ *   :manpage:`truncate(2)`, :manpage:`stat(2)`, :manpage:`flock(2)`,
+ *   :manpage:`chmod(2)`, :manpage:`chown(2)`, :manpage:`setxattr(2)`,
+ *   :manpage:`utime(2)`, :manpage:`ioctl(2)`, :manpage:`fcntl(2)`,
+ *   :manpage:`access(2)`.
+ *   Future Landlock evolutions will make it possible to restrict them.
+ */
+#define LANDLOCK_ACCESS_FS_EXECUTE                     (1ULL << 0)
+#define LANDLOCK_ACCESS_FS_WRITE_FILE                  (1ULL << 1)
+#define LANDLOCK_ACCESS_FS_READ_FILE                   (1ULL << 2)
+#define LANDLOCK_ACCESS_FS_READ_DIR                    (1ULL << 3)
+#define LANDLOCK_ACCESS_FS_REMOVE_DIR                  (1ULL << 4)
+#define LANDLOCK_ACCESS_FS_REMOVE_FILE                 (1ULL << 5)
+#define LANDLOCK_ACCESS_FS_MAKE_CHAR                   (1ULL << 6)
+#define LANDLOCK_ACCESS_FS_MAKE_DIR                    (1ULL << 7)
+#define LANDLOCK_ACCESS_FS_MAKE_REG                    (1ULL << 8)
+#define LANDLOCK_ACCESS_FS_MAKE_SOCK                   (1ULL << 9)
+#define LANDLOCK_ACCESS_FS_MAKE_FIFO                   (1ULL << 10)
+#define LANDLOCK_ACCESS_FS_MAKE_BLOCK                  (1ULL << 11)
+#define LANDLOCK_ACCESS_FS_MAKE_SYM                    (1ULL << 12)
+
+#endif /* _UAPI_LINUX_LANDLOCK_H */
index 8948467..4832fd0 100644 (file)
@@ -64,5 +64,12 @@ enum {
 #define MPOL_F_MOF     (1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON   (1 << 4) /* Migrate On protnone Reference On Node */
 
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI.  New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE   (1<<0)  /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE  (1<<1)  /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP  (1<<2)  /* Unmap pages during reclaim */
 
 #endif /* _UAPI_LINUX_MEMPOLICY_H */
index 1f2a708..beb2cad 100644 (file)
@@ -20,4 +20,10 @@ struct xt_secmark_target_info {
        char secctx[SECMARK_SECCTX_MAX];
 };
 
+struct xt_secmark_target_info_v1 {
+       __u8 mode;
+       char secctx[SECMARK_SECCTX_MAX];
+       __u32 secid;
+};
+
 #endif /*_XT_SECMARK_H_target */
index ed5415e..800bb0f 100644 (file)
 #define NFS4_MAX_BACK_CHANNEL_OPS 2
 
 #endif /* _UAPI_LINUX_NFS4_H */
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
index e54e639..bf81435 100644 (file)
@@ -1182,10 +1182,15 @@ enum perf_callchain_context {
 /**
  * PERF_RECORD_AUX::flags bits
  */
-#define PERF_AUX_FLAG_TRUNCATED                0x01    /* record was truncated to fit */
-#define PERF_AUX_FLAG_OVERWRITE                0x02    /* snapshot from overwrite mode */
-#define PERF_AUX_FLAG_PARTIAL          0x04    /* record contains gaps */
-#define PERF_AUX_FLAG_COLLISION                0x08    /* sample collided with another */
+#define PERF_AUX_FLAG_TRUNCATED                        0x01    /* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE                        0x02    /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL                  0x04    /* record contains gaps */
+#define PERF_AUX_FLAG_COLLISION                        0x08    /* sample collided with another */
+#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK     0xff00  /* PMU specific trace format type */
+
+/* CoreSight PMU AUX buffer formats */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT       0x0000 /* Default for backward compatibility */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW             0x0100 /* Raw format of the source */
 
 #define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
 #define PERF_FLAG_FD_OUTPUT            (1UL << 1)
index e14c6da..f5ca874 100644 (file)
@@ -9,11 +9,13 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
+#define RPMSG_ADDR_ANY         0xFFFFFFFF
+
 /**
  * struct rpmsg_endpoint_info - endpoint info representation
  * @name: name of service
- * @src: local address
- * @dst: destination address
+ * @src: local address. Set to RPMSG_ADDR_ANY if not used.
+ * @dst: destination address. Set to RPMSG_ADDR_ANY if not used.
  */
 struct rpmsg_endpoint_info {
        char name[32];
@@ -21,7 +23,14 @@ struct rpmsg_endpoint_info {
        __u32 dst;
 };
 
+/**
+ * Instantiate a new rpmsg char device endpoint.
+ */
 #define RPMSG_CREATE_EPT_IOCTL _IOW(0xb5, 0x1, struct rpmsg_endpoint_info)
+
+/**
+ * Destroy a rpmsg char device endpoint created by the RPMSG_CREATE_EPT_IOCTL.
+ */
 #define RPMSG_DESTROY_EPT_IOCTL        _IO(0xb5, 0x2)
 
 #endif
index 3b39ef1..5ae3ace 100644 (file)
@@ -27,6 +27,7 @@ enum {
        SEG6_LOCAL_OIF,
        SEG6_LOCAL_BPF,
        SEG6_LOCAL_VRFTABLE,
+       SEG6_LOCAL_COUNTERS,
        __SEG6_LOCAL_MAX,
 };
 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
@@ -78,4 +79,33 @@ enum {
 
 #define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
 
+/* SRv6 Behavior counters are encoded as netlink attributes guaranteeing the
+ * correct alignment.
+ * Each counter is identified by a different attribute type (i.e.
+ * SEG6_LOCAL_CNT_PACKETS).
+ *
+ * - SEG6_LOCAL_CNT_PACKETS: identifies a counter that counts the number of
+ *   packets that have been CORRECTLY processed by an SRv6 Behavior instance
+ *   (i.e., packets that generate errors or are dropped are NOT counted).
+ *
+ * - SEG6_LOCAL_CNT_BYTES: identifies a counter that counts the total amount
+ *   of traffic in bytes of all packets that have been CORRECTLY processed by
+ *   an SRv6 Behavior instance (i.e., packets that generate errors or are
+ *   dropped are NOT counted).
+ *
+ * - SEG6_LOCAL_CNT_ERRORS: identifies a counter that counts the number of
+ *   packets that have NOT been properly processed by an SRv6 Behavior instance
+ *   (i.e., packets that generate errors or are dropped).
+ */
+enum {
+       SEG6_LOCAL_CNT_UNSPEC,
+       SEG6_LOCAL_CNT_PAD,             /* pad for 64 bits values */
+       SEG6_LOCAL_CNT_PACKETS,
+       SEG6_LOCAL_CNT_BYTES,
+       SEG6_LOCAL_CNT_ERRORS,
+       __SEG6_LOCAL_CNT_MAX,
+};
+
+#define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1)
+
 #endif
index c105054..9aa2fed 100644 (file)
@@ -60,7 +60,7 @@ enum thermal_genl_event {
        THERMAL_GENL_EVENT_UNSPEC,
        THERMAL_GENL_EVENT_TZ_CREATE,           /* Thermal zone creation */
        THERMAL_GENL_EVENT_TZ_DELETE,           /* Thermal zone deletion */
-       THERMAL_GENL_EVENT_TZ_DISABLE,          /* Thermal zone disabed */
+       THERMAL_GENL_EVENT_TZ_DISABLE,          /* Thermal zone disabled */
        THERMAL_GENL_EVENT_TZ_ENABLE,           /* Thermal zone enabled */
        THERMAL_GENL_EVENT_TZ_TRIP_UP,          /* Trip point crossed the way up */
        THERMAL_GENL_EVENT_TZ_TRIP_DOWN,        /* Trip point crossed the way down */
index 5f2d882..bafbeb1 100644 (file)
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
+#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING |        \
+                                UFFDIO_REGISTER_MODE_WP |      \
+                                UFFDIO_REGISTER_MODE_MINOR)
 #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |    \
                           UFFD_FEATURE_EVENT_FORK |            \
                           UFFD_FEATURE_EVENT_REMAP |           \
-                          UFFD_FEATURE_EVENT_REMOVE |  \
+                          UFFD_FEATURE_EVENT_REMOVE |          \
                           UFFD_FEATURE_EVENT_UNMAP |           \
                           UFFD_FEATURE_MISSING_HUGETLBFS |     \
                           UFFD_FEATURE_MISSING_SHMEM |         \
                           UFFD_FEATURE_SIGBUS |                \
-                          UFFD_FEATURE_THREAD_ID)
+                          UFFD_FEATURE_THREAD_ID |             \
+                          UFFD_FEATURE_MINOR_HUGETLBFS)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
        ((__u64)1 << _UFFDIO_WAKE |             \
         (__u64)1 << _UFFDIO_COPY |             \
         (__u64)1 << _UFFDIO_ZEROPAGE |         \
-        (__u64)1 << _UFFDIO_WRITEPROTECT)
+        (__u64)1 << _UFFDIO_WRITEPROTECT |     \
+        (__u64)1 << _UFFDIO_CONTINUE)
 #define UFFD_API_RANGE_IOCTLS_BASIC            \
        ((__u64)1 << _UFFDIO_WAKE |             \
-        (__u64)1 << _UFFDIO_COPY)
+        (__u64)1 << _UFFDIO_COPY |             \
+        (__u64)1 << _UFFDIO_CONTINUE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -55,6 +61,7 @@
 #define _UFFDIO_COPY                   (0x03)
 #define _UFFDIO_ZEROPAGE               (0x04)
 #define _UFFDIO_WRITEPROTECT           (0x06)
+#define _UFFDIO_CONTINUE               (0x07)
 #define _UFFDIO_API                    (0x3F)
 
 /* userfaultfd ioctl ids */
@@ -73,6 +80,8 @@
                                      struct uffdio_zeropage)
 #define UFFDIO_WRITEPROTECT    _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
                                      struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE                _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
+                                     struct uffdio_continue) /* in: range, mode; out: mapped */
 
 /* read() structure */
 struct uffd_msg {
@@ -127,6 +136,7 @@ struct uffd_msg {
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE      (1<<0)  /* If this was a write fault */
 #define UFFD_PAGEFAULT_FLAG_WP         (1<<1)  /* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_MINOR      (1<<2)  /* If reason is VM_UFFD_MINOR */
 
 struct uffdio_api {
        /* userland asks for an API number and the features to enable */
@@ -171,6 +181,10 @@ struct uffdio_api {
         *
         * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
         * be returned, if feature is not requested 0 will be returned.
+        *
+        * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
+        * can be intercepted (via REGISTER_MODE_MINOR) for
+        * hugetlbfs-backed pages.
         */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
@@ -181,6 +195,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_EVENT_UNMAP               (1<<6)
 #define UFFD_FEATURE_SIGBUS                    (1<<7)
 #define UFFD_FEATURE_THREAD_ID                 (1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS           (1<<9)
        __u64 features;
 
        __u64 ioctls;
@@ -195,6 +210,7 @@ struct uffdio_register {
        struct uffdio_range range;
 #define UFFDIO_REGISTER_MODE_MISSING   ((__u64)1<<0)
 #define UFFDIO_REGISTER_MODE_WP                ((__u64)1<<1)
+#define UFFDIO_REGISTER_MODE_MINOR     ((__u64)1<<2)
        __u64 mode;
 
        /*
@@ -257,6 +273,18 @@ struct uffdio_writeprotect {
        __u64 mode;
 };
 
+struct uffdio_continue {
+       struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE          ((__u64)1<<0)
+       __u64 mode;
+
+       /*
+        * Fields below here are written by the ioctl and must be at the end:
+        * the copy_from_user will not read past here.
+        */
+       __s64 mapped;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
index 34b1f53..ef33ea0 100644 (file)
@@ -333,10 +333,21 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
 
 /* 10de vendor PCI sub-types */
-/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */
+/*
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
+ *
+ * Deprecated, region no longer provided
+ */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1)
 
 /* 1014 vendor PCI sub-types */
-/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */
+/*
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
+ * to do TLB invalidation on a GPU.
+ *
+ * Deprecated, region no longer provided
+ */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD   (1)
 
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
@@ -630,9 +641,36 @@ struct vfio_device_migration_info {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE     3
 
-/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ *
+ * Deprecated, capability no longer provided
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT    4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {   /* by its name, the layout for VFIO_REGION_INFO_CAP_NVLINK2_SSATGT - confirm */
+       struct vfio_info_cap_header header;     /* leading capability header; type declared outside this hunk */
+       __u64 tgt;      /* presumably the compressed real address (SSA) described above - TODO confirm */
+};
 
-/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */
+/*
+ * Capability with an NVLink link speed. The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree. The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ *
+ * Deprecated, capability no longer provided
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD    5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {   /* by its name, the layout for VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD - confirm */
+       struct vfio_info_cap_header header;     /* leading capability header; type declared outside this hunk */
+       __u32 link_speed;       /* presumably the ibm,nvlink-speed value described above - TODO confirm */
+       __u32 __pad;    /* second __u32; together with link_speed fills 8 bytes */
+};
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
index 90b739d..42b1776 100644 (file)
@@ -86,6 +86,8 @@ struct hns_roce_ib_create_qp_resp {
 struct hns_roce_ib_alloc_ucontext_resp {
        __u32   qp_tab_size;
        __u32   cqe_size;
+       __u32   srq_tab_size;   /* presumably the SRQ table size, by analogy with qp_tab_size - confirm */
+       __u32   reserved;       /* fourth __u32: brings the struct to 16 bytes */
 };
 
 struct hns_roce_ib_alloc_pd_resp {
index 3fd9b38..ca23728 100644 (file)
@@ -41,6 +41,25 @@ enum mlx5_ib_create_flow_action_attrs {
        MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS = (1U << UVERBS_ID_NS_SHIFT),
 };
 
+enum mlx5_ib_dm_methods {      /* method IDs; by the name, for the mlx5 DM object - confirm */
+       MLX5_IB_METHOD_DM_MAP_OP_ADDR  = (1U << UVERBS_ID_NS_SHIFT),   /* numbering starts at 1U << UVERBS_ID_NS_SHIFT */
+       MLX5_IB_METHOD_DM_QUERY,        /* previous value + 1 */
+};
+
+enum mlx5_ib_dm_map_op_addr_attrs {    /* attribute IDs; by the name, for MLX5_IB_METHOD_DM_MAP_OP_ADDR - confirm */
+       MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE = (1U << UVERBS_ID_NS_SHIFT),   /* numbering starts at 1U << UVERBS_ID_NS_SHIFT */
+       MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP,     /* REQ_* are presumably inputs - confirm */
+       MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET,  /* RESP_* are presumably outputs - confirm */
+       MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX,
+};
+
+enum mlx5_ib_query_dm_attrs {  /* attribute IDs; by the name, for MLX5_IB_METHOD_DM_QUERY - confirm */
+       MLX5_IB_ATTR_QUERY_DM_REQ_HANDLE = (1U << UVERBS_ID_NS_SHIFT), /* numbering starts at 1U << UVERBS_ID_NS_SHIFT */
+       MLX5_IB_ATTR_QUERY_DM_RESP_START_OFFSET,        /* RESP_* are presumably outputs - confirm */
+       MLX5_IB_ATTR_QUERY_DM_RESP_PAGE_INDEX,
+       MLX5_IB_ATTR_QUERY_DM_RESP_LENGTH,
+};
+
 enum mlx5_ib_alloc_dm_attrs {
        MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT),
        MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
@@ -154,6 +173,7 @@ enum mlx5_ib_devx_umem_reg_attrs {
        MLX5_IB_ATTR_DEVX_UMEM_REG_LEN,
        MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
        MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID,
+       MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
 };
 
 enum mlx5_ib_devx_umem_dereg_attrs {
@@ -300,4 +320,13 @@ enum mlx5_ib_pd_methods {
 
 };
 
+enum mlx5_ib_device_methods {  /* method IDs; only one is defined here */
+       MLX5_IB_METHOD_QUERY_PORT = (1U << UVERBS_ID_NS_SHIFT),        /* numbering starts at 1U << UVERBS_ID_NS_SHIFT */
+};
+
+enum mlx5_ib_query_port_attrs {        /* attribute IDs; by the name, for MLX5_IB_METHOD_QUERY_PORT - confirm */
+       MLX5_IB_ATTR_QUERY_PORT_PORT_NUM = (1U << UVERBS_ID_NS_SHIFT), /* numbering starts at 1U << UVERBS_ID_NS_SHIFT */
+       MLX5_IB_ATTR_QUERY_PORT,        /* previous value + 1; presumably carries struct mlx5_ib_uapi_query_port - confirm */
+};
+
 #endif
index 56b26ea..a21ca8e 100644 (file)
@@ -83,5 +83,30 @@ enum mlx5_ib_uapi_uar_alloc_type {
        MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC = 0x1,
 };
 
+enum mlx5_ib_uapi_query_port_flags {   /* single-bit flags occupying bits 0 through 5 */
+       MLX5_IB_UAPI_QUERY_PORT_VPORT                   = 1 << 0,      /* names mirror fields of struct mlx5_ib_uapi_query_port */
+       MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID           = 1 << 1,
+       MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_RX   = 1 << 2,
+       MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_TX   = 1 << 3,
+       MLX5_IB_UAPI_QUERY_PORT_VPORT_REG_C0            = 1 << 4,
+       MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID       = 1 << 5,      /* highest bit defined here */
+};
+
+struct mlx5_ib_uapi_reg {      /* two __u32 fields, 8 bytes in total */
+       __u32 value;    /* presumably the register value - confirm */
+       __u32 mask;     /* presumably the mask of meaningful bits in value - confirm */
+};
+
+struct mlx5_ib_uapi_query_port {       /* by the name, the result layout of the query-port method - confirm */
+       __aligned_u64 flags;    /* presumably a mask of enum mlx5_ib_uapi_query_port_flags - confirm */
+       __u16 vport;
+       __u16 vport_vhca_id;
+       __u16 esw_owner_vhca_id;
+       __u16 rsvd0;    /* fourth __u16: rounds the 16-bit fields up to 8 bytes */
+       __aligned_u64 vport_steering_icm_rx;
+       __aligned_u64 vport_steering_icm_tx;
+       struct mlx5_ib_uapi_reg reg_c0; /* value/mask pair, see struct mlx5_ib_uapi_reg */
+};
+
 #endif
 
index d2f5b83..75a1ae2 100644 (file)
@@ -293,6 +293,10 @@ enum rdma_nldev_command {
 
        RDMA_NLDEV_CMD_RES_MR_GET_RAW,
 
+       RDMA_NLDEV_CMD_RES_CTX_GET, /* can dump */
+
+       RDMA_NLDEV_CMD_RES_SRQ_GET, /* can dump */
+
        RDMA_NLDEV_NUM_OPS
 };
 
@@ -533,6 +537,18 @@ enum rdma_nldev_attr {
 
        RDMA_NLDEV_ATTR_RES_RAW,        /* binary */
 
+       RDMA_NLDEV_ATTR_RES_CTX,                /* nested table */
+       RDMA_NLDEV_ATTR_RES_CTX_ENTRY,          /* nested table */
+
+       RDMA_NLDEV_ATTR_RES_SRQ,                /* nested table */
+       RDMA_NLDEV_ATTR_RES_SRQ_ENTRY,          /* nested table */
+       RDMA_NLDEV_ATTR_RES_SRQN,               /* u32 */
+
+       RDMA_NLDEV_ATTR_MIN_RANGE,              /* u32 */
+       RDMA_NLDEV_ATTR_MAX_RANGE,              /* u32 */
+
+       RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK,       /* u8 */
+
        /*
         * Always the end
         */
index 9e9f9bf..449bd38 100644 (file)
 #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY
 
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
index 32ca83e..bfc2138 100644 (file)
@@ -131,13 +131,3 @@ struct vcpu_hvm_context {
 typedef struct vcpu_hvm_context vcpu_hvm_context_t;
 
 #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
index aaf2951..fb87161 100644 (file)
@@ -39,13 +39,3 @@ enum xenbus_state
 };
 
 #endif /* _XEN_PUBLIC_IO_XENBUS_H */
-
-/*
- * Local variables:
- *  c-file-style: "linux"
- *  indent-tabs-mode: t
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
index dbc4a4b..b3e647f 100644 (file)
@@ -10,7 +10,8 @@ void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle,
 void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle,
                             size_t size, enum dma_data_direction dir);
 
-extern int xen_swiotlb_init(int verbose, bool early);
+int xen_swiotlb_init(void);
+void __init xen_swiotlb_init_early(void);
 extern const struct dma_map_ops xen_swiotlb_dma_ops;
 
 #endif /* __LINUX_SWIOTLB_XEN_H */
index 9acb776..1ea12c6 100644 (file)
@@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP
        help
          Arch has userfaultfd write protection support
 
+config HAVE_ARCH_USERFAULTFD_MINOR
+       bool
+       help
+         Arch has userfaultfd minor fault support
+
 config MEMBARRIER
        bool "Enable membarrier() system call" if EXPERT
        default y
@@ -2182,7 +2187,7 @@ config MODULE_SIG_FORCE
 config MODULE_SIG_ALL
        bool "Automatically sign all modules"
        default y
-       depends on MODULE_SIG
+       depends on MODULE_SIG || IMA_APPRAISE_MODSIG
        help
          Sign all modules during make modules_install. Without this option,
          modules must be signed manually, using the scripts/sign-file tool.
@@ -2192,7 +2197,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
 
 choice
        prompt "Which hash algorithm should modules be signed with?"
-       depends on MODULE_SIG
+       depends on MODULE_SIG || IMA_APPRAISE_MODSIG
        help
          This determines which sort of hashing algorithm will be used during
          signature generation.  This algorithm _must_ be built into the kernel
@@ -2224,7 +2229,7 @@ endchoice
 
 config MODULE_SIG_HASH
        string
-       depends on MODULE_SIG
+       depends on MODULE_SIG || IMA_APPRAISE_MODSIG
        default "sha1" if MODULE_SIG_SHA1
        default "sha224" if MODULE_SIG_SHA224
        default "sha256" if MODULE_SIG_SHA256
@@ -2294,6 +2299,18 @@ config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
 
          If unsure, say N.
 
+config MODPROBE_PATH
+       string "Path to modprobe binary"
+       default "/sbin/modprobe"
+       help
+         When kernel code requests a module, it does so by calling
+         the "modprobe" userspace utility. This option allows you to
+         set the path where that binary is found. This can be changed
+         at runtime via the sysctl file
+         /proc/sys/kernel/modprobe. Setting this to the empty string
+         removes the kernel's ability to request modules (but
+         userspace can still load modules explicitly).
+
 config TRIM_UNUSED_KSYMS
        bool "Trim unused exported kernel symbols" if EXPERT
        depends on !COMPILE_TEST
index d677e8e..af27abc 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/init.h>
+#include <linux/async.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -541,6 +542,14 @@ static int __init keepinitrd_setup(char *__unused)
 __setup("keepinitrd", keepinitrd_setup);
 #endif
 
+static bool __initdata initramfs_async = true; /* defaults to true; tested by populate_rootfs() */
+static int __init initramfs_async_setup(char *str)     /* handler registered below for the initramfs_async= boot parameter */
+{
+       strtobool(str, &initramfs_async);       /* NOTE(review): return value is ignored, so a parse failure goes unreported */
+       return 1;       /* always 1, whatever strtobool() reported */
+}
+__setup("initramfs_async=", initramfs_async_setup);
+
 extern char __initramfs_start[];
 extern unsigned long __initramfs_size;
 #include <linux/initrd.h>
@@ -658,7 +667,7 @@ static void __init populate_initrd_image(char *err)
 }
 #endif /* CONFIG_BLK_DEV_RAM */
 
-static int __init populate_rootfs(void)
+static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
 {
        /* Load the built in initramfs */
        char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
@@ -693,6 +702,33 @@ done:
        initrd_end = 0;
 
        flush_delayed_fput();
+}
+
+static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain);       /* domain that populate_rootfs() schedules the unpack job in */
+static async_cookie_t initramfs_cookie;        /* assigned by populate_rootfs(); zero (static storage) before that */
+
+void wait_for_initramfs(void)  /* wait until the job queued by populate_rootfs() has finished */
+{
+       if (!initramfs_cookie) {        /* no cookie recorded yet, so there is nothing to wait for */
+               /*
+                * Something before rootfs_initcall wants to access
+                * the filesystem/initramfs. Probably a bug. Make a
+                * note, avoid deadlocking the machine, and let the
+                * caller's access fail as it used to.
+                */
+               pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n");
+               return;
+       }
+       async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain);       /* + 1 so the wait covers our own cookie */
+}
+EXPORT_SYMBOL_GPL(wait_for_initramfs);
+
+static int __init populate_rootfs(void)        /* rootfs initcall: queues do_populate_rootfs() instead of calling it inline */
+{
+       initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL,
+                                                &initramfs_domain);    /* cookie is what wait_for_initramfs() later waits on */
+       if (!initramfs_async)   /* async mode switched off: wait for the job right here */
+               wait_for_initramfs();
        return 0;
 }
 rootfs_initcall(populate_rootfs);
index dd11bfd..eb01e12 100644 (file)
@@ -405,7 +405,7 @@ static int __init bootconfig_params(char *param, char *val,
        return 0;
 }
 
-static void __init setup_boot_config(const char *cmdline)
+static void __init setup_boot_config(void)
 {
        static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
        const char *msg;
@@ -472,7 +472,7 @@ static void __init setup_boot_config(const char *cmdline)
 
 #else
 
-static void __init setup_boot_config(const char *cmdline)
+static void __init setup_boot_config(void)
 {
        /* Remove bootconfig data from initrd */
        get_boot_config_from_initrd(NULL, NULL);
@@ -895,7 +895,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
        pr_notice("%s", linux_banner);
        early_security_init();
        setup_arch(&command_line);
-       setup_boot_config(command_line);
+       setup_boot_config();
        setup_command_line(command_line);
        setup_nr_cpu_ids();
        setup_per_cpu_areas();
@@ -1561,6 +1561,7 @@ static noinline void __init kernel_init_freeable(void)
 
        kunit_run_all_tests();
 
+       wait_for_initramfs();
        console_on_rootfs();
 
        /*
index f6c30a8..e0ec239 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -36,7 +36,7 @@
  * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
  * - undo adjustments at process exit are limited to 0..SEMVMX.
  * - namespace are supported.
- * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtine by writing
+ * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
  *   to /proc/sys/kernel/sem.
  * - statistics about the usage are reported in /proc/sysvipc/sem.
  *
@@ -224,7 +224,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
  * Setting it to a result code is a RELEASE, this is ensured by both a
  * smp_store_release() (for case a) and while holding sem_lock()
  * (for case b).
- * The AQUIRE when reading the result code without holding sem_lock() is
+ * The ACQUIRE when reading the result code without holding sem_lock() is
  * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
  * (case a above).
  * Reading the result code while holding sem_lock() needs no further barriers,
@@ -786,7 +786,7 @@ static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
 {
        get_task_struct(q->sleeper);
 
-       /* see SEM_BARRIER_2 for purpuse/pairing */
+       /* see SEM_BARRIER_2 for purpose/pairing */
        smp_store_release(&q->status, error);
 
        wake_q_add_safe(wake_q, q->sleeper);
@@ -821,7 +821,7 @@ static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
 
        /* It is impossible that someone waits for the new value:
         * - complex operations always restart.
-        * - wait-for-zero are handled seperately.
+        * - wait-for-zero are handled separately.
         * - q is a previously sleeping simple operation that
         *   altered the array. It must be a decrement, because
         *   simple increments never sleep.
@@ -1046,7 +1046,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
                         * - No complex ops, thus all sleeping ops are
                         *   decrease.
                         * - if we decreased the value, then any sleeping
-                        *   semaphore ops wont be able to run: If the
+                        *   semaphore ops won't be able to run: If the
                         *   previous value was too small, then the new
                         *   value will be too small, too.
                         */
@@ -2108,7 +2108,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
        queue.dupsop = dupsop;
 
        error = perform_atomic_semop(sma, &queue);
-       if (error == 0) { /* non-blocking succesfull path */
+       if (error == 0) { /* non-blocking successful path */
                DEFINE_WAKE_Q(wake_q);
 
                /*
index 33258e6..b8d7a66 100644 (file)
@@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
 
 static atomic_t entry_count;
 
+static long long microseconds_since(ktime_t start)     /* elapsed time from @start until now, for the timing messages */
+{
+       ktime_t now = ktime_get();
+       return ktime_to_ns(ktime_sub(now, start)) >> 10;        /* ns >> 10 divides by 1024, not 1000: an approximation of usecs */
+}
+
 static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
        struct async_entry *first = NULL;
@@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work)
        struct async_entry *entry =
                container_of(work, struct async_entry, work);
        unsigned long flags;
-       ktime_t calltime, delta, rettime;
+       ktime_t calltime;
 
        /* 1) run (and print duration) */
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               pr_debug("calling  %lli_%pS @ %i\n",
-                       (long long)entry->cookie,
-                       entry->func, task_pid_nr(current));
-               calltime = ktime_get();
-       }
+       pr_debug("calling  %lli_%pS @ %i\n", (long long)entry->cookie,
+                entry->func, task_pid_nr(current));
+       calltime = ktime_get();
+
        entry->func(entry->data, entry->cookie);
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               rettime = ktime_get();
-               delta = ktime_sub(rettime, calltime);
-               pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
-                       (long long)entry->cookie,
-                       entry->func,
-                       (long long)ktime_to_ns(delta) >> 10);
-       }
+
+       pr_debug("initcall %lli_%pS returned after %lld usecs\n",
+                (long long)entry->cookie, entry->func,
+                microseconds_since(calltime));
 
        /* 2) remove self from the pending queues */
        spin_lock_irqsave(&async_lock, flags);
@@ -245,24 +245,6 @@ void async_synchronize_full(void)
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
-/**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
- *
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
- */
-void async_unregister_domain(struct async_domain *domain)
-{
-       spin_lock_irq(&async_lock);
-       WARN_ON(!domain->registered || !list_empty(&domain->pending));
-       domain->registered = 0;
-       spin_unlock_irq(&async_lock);
-}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
-
 /**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
  * @domain: the domain to synchronize
@@ -287,23 +269,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
  */
 void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
-       ktime_t starttime, delta, endtime;
+       ktime_t starttime;
 
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               pr_debug("async_waiting @ %i\n", task_pid_nr(current));
-               starttime = ktime_get();
-       }
+       pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+       starttime = ktime_get();
 
        wait_event(async_done, lowest_in_progress(domain) >= cookie);
 
-       if (initcall_debug && system_state < SYSTEM_RUNNING) {
-               endtime = ktime_get();
-               delta = ktime_sub(endtime, starttime);
-
-               pr_debug("async_continuing @ %i after %lli usec\n",
-                       task_pid_nr(current),
-                       (long long)ktime_to_ns(delta) >> 10);
-       }
+       pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
+                microseconds_since(starttime));
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
 
index 8fd552c..757476c 100644 (file)
@@ -6496,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 {
        struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
        struct bpf_verifier_state *vstate = env->cur_state;
+       bool off_is_imm = tnum_is_const(off_reg->var_off);
        bool off_is_neg = off_reg->smin_value < 0;
        bool ptr_is_dst_reg = ptr_reg == dst_reg;
        u8 opcode = BPF_OP(insn->code);
@@ -6526,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
                alu_limit = abs(tmp_aux->alu_limit - alu_limit);
        } else {
                alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+               alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
                alu_state |= ptr_is_dst_reg ?
                             BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
        }
@@ -12371,7 +12373,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
                        const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
                        struct bpf_insn *patch = &insn_buf[0];
-                       bool issrc, isneg;
+                       bool issrc, isneg, isimm;
                        u32 off_reg;
 
                        aux = &env->insn_aux_data[i + delta];
@@ -12382,28 +12384,29 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
                        issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
                                BPF_ALU_SANITIZE_SRC;
+                       isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
 
                        off_reg = issrc ? insn->src_reg : insn->dst_reg;
-                       if (isneg)
-                               *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
-                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
-                       *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
-                       *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
-                       *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
-                       *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
-                       if (issrc) {
-                               *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
-                                                        off_reg);
-                               insn->src_reg = BPF_REG_AX;
+                       if (isimm) {
+                               *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                        } else {
-                               *patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
-                                                        BPF_REG_AX);
+                               if (isneg)
+                                       *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+                               *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+                               *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+                               *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+                               *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+                               *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+                               *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
                        }
+                       if (!issrc)
+                               *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+                       insn->src_reg = BPF_REG_AX;
                        if (isneg)
                                insn->code = insn->code == code_add ?
                                             code_sub : code_add;
                        *patch++ = *insn;
-                       if (issrc && isneg)
+                       if (issrc && isneg && !isimm)
                                *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
                        cnt = patch - insn_buf;
 
index d3fd428..eb701b2 100644 (file)
@@ -1,5 +1,4 @@
 #  KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVKMEM is not set
 # CONFIG_DEVMEM is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_INET_LRO is not set
index 421b114..e1d274c 100644 (file)
@@ -33,7 +33,7 @@ do {                                                                  \
 static struct kmem_cache *cred_jar;
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
 
 /*
  * The initial credentials for the initial task
index 0022682..f737e33 100644 (file)
@@ -344,8 +344,8 @@ void dma_direct_sync_sg_for_device(struct device *dev,
                phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
                if (unlikely(is_swiotlb_buffer(paddr)))
-                       swiotlb_tbl_sync_single(dev, paddr, sg->length,
-                                       dir, SYNC_FOR_DEVICE);
+                       swiotlb_sync_single_for_device(dev, paddr, sg->length,
+                                                      dir);
 
                if (!dev_is_dma_coherent(dev))
                        arch_sync_dma_for_device(paddr, sg->length,
@@ -370,8 +370,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
                        arch_sync_dma_for_cpu(paddr, sg->length, dir);
 
                if (unlikely(is_swiotlb_buffer(paddr)))
-                       swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
-                                       SYNC_FOR_CPU);
+                       swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
+                                                   dir);
 
                if (dir == DMA_FROM_DEVICE)
                        arch_dma_mark_clean(paddr, sg->length);
index b986155..50afc05 100644 (file)
@@ -57,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
        phys_addr_t paddr = dma_to_phys(dev, addr);
 
        if (unlikely(is_swiotlb_buffer(paddr)))
-               swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+               swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
        if (!dev_is_dma_coherent(dev))
                arch_sync_dma_for_device(paddr, size, dir);
@@ -74,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
        }
 
        if (unlikely(is_swiotlb_buffer(paddr)))
-               swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
+               swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
 
        if (dir == DMA_FROM_DEVICE)
                arch_dma_mark_clean(paddr, size);
@@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
                dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
        if (unlikely(is_swiotlb_buffer(phys)))
-               swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+               swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
index e0e64f8..9b9af1b 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020 Hisilicon Limited.
+ * Copyright (C) 2020 HiSilicon Limited.
  */
 
 #define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
@@ -38,7 +38,8 @@ struct map_benchmark {
        __u32 dma_bits; /* DMA addressing capability */
        __u32 dma_dir; /* DMA data direction */
        __u32 dma_trans_ns; /* time for DMA transmission in ns */
-       __u8 expansion[80];     /* For future use */
+       __u32 granule;  /* how many PAGE_SIZE will do map/unmap once a time */
+       __u8 expansion[76];     /* For future use */
 };
 
 struct map_benchmark_data {
@@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data)
        void *buf;
        dma_addr_t dma_addr;
        struct map_benchmark_data *map = data;
+       int npages = map->bparam.granule;
+       u64 size = npages * PAGE_SIZE;
        int ret = 0;
 
-       buf = (void *)__get_free_page(GFP_KERNEL);
+       buf = alloc_pages_exact(size, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
 
@@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data)
                 * 66 means evertything goes well! 66 is lucky.
                 */
                if (map->dir != DMA_FROM_DEVICE)
-                       memset(buf, 0x66, PAGE_SIZE);
+                       memset(buf, 0x66, size);
 
                map_stime = ktime_get();
-               dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
+               dma_addr = dma_map_single(map->dev, buf, size, map->dir);
                if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
                        pr_err("dma_map_single failed on %s\n",
                                dev_name(map->dev));
@@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data)
                ndelay(map->bparam.dma_trans_ns);
 
                unmap_stime = ktime_get();
-               dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
+               dma_unmap_single(map->dev, dma_addr, size, map->dir);
                unmap_etime = ktime_get();
                unmap_delta = ktime_sub(unmap_etime, unmap_stime);
 
@@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data)
        }
 
 out:
-       free_page((unsigned long)buf);
+       free_pages_exact(buf, size);
        return ret;
 }
 
@@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
        struct map_benchmark_data *map = file->private_data;
        void __user *argp = (void __user *)arg;
        u64 old_dma_mask;
-
        int ret;
 
        if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
@@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
                        return -EINVAL;
                }
 
+               if (map->bparam.granule < 1 || map->bparam.granule > 1024) {
+                       pr_err("invalid granule size\n");
+                       return -EINVAL;
+               }
+
                switch (map->bparam.dma_dir) {
                case DMA_MAP_BIDIRECTIONAL:
                        map->dir = DMA_BIDIRECTIONAL;
index b6a6336..2b06a80 100644 (file)
@@ -477,11 +477,10 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 }
 EXPORT_SYMBOL(dma_free_attrs);
 
-struct page *dma_alloc_pages(struct device *dev, size_t size,
+static struct page *__dma_alloc_pages(struct device *dev, size_t size,
                dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
 {
        const struct dma_map_ops *ops = get_dma_ops(dev);
-       struct page *page;
 
        if (WARN_ON_ONCE(!dev->coherent_dma_mask))
                return NULL;
@@ -490,33 +489,162 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
 
        size = PAGE_ALIGN(size);
        if (dma_alloc_direct(dev, ops))
-               page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
-       else if (ops->alloc_pages)
-               page = ops->alloc_pages(dev, size, dma_handle, dir, gfp);
-       else
+               return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
+       if (!ops->alloc_pages)
                return NULL;
+       return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+}
 
-       debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+struct page *dma_alloc_pages(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+       struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
 
+       if (page)
+               debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
        return page;
 }
 EXPORT_SYMBOL_GPL(dma_alloc_pages);
 
-void dma_free_pages(struct device *dev, size_t size, struct page *page,
+static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
                dma_addr_t dma_handle, enum dma_data_direction dir)
 {
        const struct dma_map_ops *ops = get_dma_ops(dev);
 
        size = PAGE_ALIGN(size);
-       debug_dma_unmap_page(dev, dma_handle, size, dir);
-
        if (dma_alloc_direct(dev, ops))
                dma_direct_free_pages(dev, size, page, dma_handle, dir);
        else if (ops->free_pages)
                ops->free_pages(dev, size, page, dma_handle, dir);
 }
+
+void dma_free_pages(struct device *dev, size_t size, struct page *page,
+               dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+       debug_dma_unmap_page(dev, dma_handle, size, dir);
+       __dma_free_pages(dev, size, page, dma_handle, dir);
+}
 EXPORT_SYMBOL_GPL(dma_free_pages);
 
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+               size_t size, struct page *page)
+{
+       unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+       if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff)
+               return -ENXIO;
+       return remap_pfn_range(vma, vma->vm_start,
+                              page_to_pfn(page) + vma->vm_pgoff,
+                              vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot);
+}
+EXPORT_SYMBOL_GPL(dma_mmap_pages);
+
+static struct sg_table *alloc_single_sgt(struct device *dev, size_t size,
+               enum dma_data_direction dir, gfp_t gfp)
+{
+       struct sg_table *sgt;
+       struct page *page;
+
+       sgt = kmalloc(sizeof(*sgt), gfp);
+       if (!sgt)
+               return NULL;
+       if (sg_alloc_table(sgt, 1, gfp))
+               goto out_free_sgt;
+       page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp);
+       if (!page)
+               goto out_free_table;
+       sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+       sg_dma_len(sgt->sgl) = sgt->sgl->length;
+       return sgt;
+out_free_table:
+       sg_free_table(sgt);
+out_free_sgt:
+       kfree(sgt);
+       return NULL;
+}
+
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+               enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
+{
+       const struct dma_map_ops *ops = get_dma_ops(dev);
+       struct sg_table *sgt;
+
+       if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
+               return NULL;
+
+       if (ops && ops->alloc_noncontiguous)
+               sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
+       else
+               sgt = alloc_single_sgt(dev, size, dir, gfp);
+
+       if (sgt) {
+               sgt->nents = 1;
+               debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+       }
+       return sgt;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
+
+static void free_single_sgt(struct device *dev, size_t size,
+               struct sg_table *sgt, enum dma_data_direction dir)
+{
+       __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address,
+                        dir);
+       sg_free_table(sgt);
+       kfree(sgt);
+}
+
+void dma_free_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt, enum dma_data_direction dir)
+{
+       const struct dma_map_ops *ops = get_dma_ops(dev);
+
+       debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
+       if (ops && ops->free_noncontiguous)
+               ops->free_noncontiguous(dev, size, sgt, dir);
+       else
+               free_single_sgt(dev, size, sgt, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
+
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+               struct sg_table *sgt)
+{
+       const struct dma_map_ops *ops = get_dma_ops(dev);
+       unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+       if (ops && ops->alloc_noncontiguous)
+               return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+       return page_address(sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
+
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+       const struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (ops && ops->alloc_noncontiguous)
+               vunmap(vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
+
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+               size_t size, struct sg_table *sgt)
+{
+       const struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (ops && ops->alloc_noncontiguous) {
+               unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+               if (vma->vm_pgoff >= count ||
+                   vma_pages(vma) > count - vma->vm_pgoff)
+                       return -ENXIO;
+               return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
+       }
+       return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
+
 int dma_supported(struct device *dev, u64 mask)
 {
        const struct dma_map_ops *ops = get_dma_ops(dev);
index c10e855..8ca7d50 100644 (file)
  */
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
 
-enum swiotlb_force swiotlb_force;
-
-/*
- * Used to do a quick range check in swiotlb_tbl_unmap_single and
- * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
- * API.
- */
-phys_addr_t io_tlb_start, io_tlb_end;
-
-/*
- * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
- * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
- */
-static unsigned long io_tlb_nslabs;
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
-/*
- * The number of used IO TLB block
- */
-static unsigned long io_tlb_used;
+enum swiotlb_force swiotlb_force;
 
-/*
- * This is a free list describing the number of free entries available from
- * each index
- */
-static unsigned int *io_tlb_list;
-static unsigned int io_tlb_index;
+struct io_tlb_mem *io_tlb_default_mem;
 
 /*
  * Max segment that we can provide which (if pages are contingous) will
@@ -92,57 +71,30 @@ static unsigned int io_tlb_index;
  */
 static unsigned int max_segment;
 
-/*
- * We need to save away the original address corresponding to a mapped entry
- * for the sync operations.
- */
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-static phys_addr_t *io_tlb_orig_addr;
-
-/*
- * The mapped buffer's size should be validated during a sync operation.
- */
-static size_t *io_tlb_orig_size;
-
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
-
-static int late_alloc;
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
 
 static int __init
 setup_io_tlb_npages(char *str)
 {
        if (isdigit(*str)) {
-               io_tlb_nslabs = simple_strtoul(str, &str, 0);
                /* avoid tail segment of size < IO_TLB_SEGSIZE */
-               io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+               default_nslabs =
+                       ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
        }
        if (*str == ',')
                ++str;
-       if (!strcmp(str, "force")) {
+       if (!strcmp(str, "force"))
                swiotlb_force = SWIOTLB_FORCE;
-       } else if (!strcmp(str, "noforce")) {
+       else if (!strcmp(str, "noforce"))
                swiotlb_force = SWIOTLB_NO_FORCE;
-               io_tlb_nslabs = 1;
-       }
 
        return 0;
 }
 early_param("swiotlb", setup_io_tlb_npages);
 
-static bool no_iotlb_memory;
-
-unsigned long swiotlb_nr_tbl(void)
-{
-       return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;
-}
-EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
-
 unsigned int swiotlb_max_segment(void)
 {
-       return unlikely(no_iotlb_memory) ? 0 : max_segment;
+       return io_tlb_default_mem ? max_segment : 0;
 }
 EXPORT_SYMBOL_GPL(swiotlb_max_segment);
 
@@ -156,42 +108,34 @@ void swiotlb_set_max_segment(unsigned int val)
 
 unsigned long swiotlb_size_or_default(void)
 {
-       unsigned long size;
-
-       size = io_tlb_nslabs << IO_TLB_SHIFT;
-
-       return size ? size : (IO_TLB_DEFAULT_SIZE);
+       return default_nslabs << IO_TLB_SHIFT;
 }
 
-void __init swiotlb_adjust_size(unsigned long new_size)
+void __init swiotlb_adjust_size(unsigned long size)
 {
-       unsigned long size;
-
        /*
         * If swiotlb parameter has not been specified, give a chance to
         * architectures such as those supporting memory encryption to
         * adjust/expand SWIOTLB size for their use.
         */
-       if (!io_tlb_nslabs) {
-               size = ALIGN(new_size, IO_TLB_SIZE);
-               io_tlb_nslabs = size >> IO_TLB_SHIFT;
-               io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-
-               pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
-       }
+       if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
+               return;
+       size = ALIGN(size, IO_TLB_SIZE);
+       default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+       pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
 }
 
 void swiotlb_print_info(void)
 {
-       unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+       struct io_tlb_mem *mem = io_tlb_default_mem;
 
-       if (no_iotlb_memory) {
+       if (!mem) {
                pr_warn("No low mem\n");
                return;
        }
 
-       pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end,
-              bytes >> 20);
+       pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
+              (mem->nslabs << IO_TLB_SHIFT) >> 20);
 }
 
 static inline unsigned long io_tlb_offset(unsigned long val)
@@ -212,64 +156,51 @@ static inline unsigned long nr_slots(u64 val)
  */
 void __init swiotlb_update_mem_attributes(void)
 {
+       struct io_tlb_mem *mem = io_tlb_default_mem;
        void *vaddr;
        unsigned long bytes;
 
-       if (no_iotlb_memory || late_alloc)
+       if (!mem || mem->late_alloc)
                return;
-
-       vaddr = phys_to_virt(io_tlb_start);
-       bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+       vaddr = phys_to_virt(mem->start);
+       bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
        set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
        memset(vaddr, 0, bytes);
 }
 
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
-       unsigned long i, bytes;
+       unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+       struct io_tlb_mem *mem;
        size_t alloc_size;
 
-       bytes = nslabs << IO_TLB_SHIFT;
-
-       io_tlb_nslabs = nslabs;
-       io_tlb_start = __pa(tlb);
-       io_tlb_end = io_tlb_start + bytes;
-
-       /*
-        * Allocate and initialize the free list array.  This array is used
-        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
-        * between io_tlb_start and io_tlb_end.
-        */
-       alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int));
-       io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE);
-       if (!io_tlb_list)
-               panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-                     __func__, alloc_size, PAGE_SIZE);
+       if (swiotlb_force == SWIOTLB_NO_FORCE)
+               return 0;
 
-       alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t));
-       io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
-       if (!io_tlb_orig_addr)
-               panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-                     __func__, alloc_size, PAGE_SIZE);
+       /* protect against double initialization */
+       if (WARN_ON_ONCE(io_tlb_default_mem))
+               return -ENOMEM;
 
-       alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
-       io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
-       if (!io_tlb_orig_size)
+       alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs));
+       mem = memblock_alloc(alloc_size, PAGE_SIZE);
+       if (!mem)
                panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
                      __func__, alloc_size, PAGE_SIZE);
-
-       for (i = 0; i < io_tlb_nslabs; i++) {
-               io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
-               io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-               io_tlb_orig_size[i] = 0;
+       mem->nslabs = nslabs;
+       mem->start = __pa(tlb);
+       mem->end = mem->start + bytes;
+       mem->index = 0;
+       spin_lock_init(&mem->lock);
+       for (i = 0; i < mem->nslabs; i++) {
+               mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+               mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+               mem->slots[i].alloc_size = 0;
        }
-       io_tlb_index = 0;
-       no_iotlb_memory = false;
 
+       io_tlb_default_mem = mem;
        if (verbose)
                swiotlb_print_info();
-
-       swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+       swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
        return 0;
 }
 
@@ -280,29 +211,24 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 void  __init
 swiotlb_init(int verbose)
 {
-       size_t default_size = IO_TLB_DEFAULT_SIZE;
-       unsigned char *vstart;
-       unsigned long bytes;
-
-       if (!io_tlb_nslabs) {
-               io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
-               io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-       }
-
-       bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+       size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+       void *tlb;
 
-       /* Get IO TLB memory from the low pages */
-       vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
-       if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
+       if (swiotlb_force == SWIOTLB_NO_FORCE)
                return;
 
-       if (io_tlb_start) {
-               memblock_free_early(io_tlb_start,
-                                   PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
-               io_tlb_start = 0;
-       }
+       /* Get IO TLB memory from the low pages */
+       tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+       if (!tlb)
+               goto fail;
+       if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
+               goto fail_free_mem;
+       return;
+
+fail_free_mem:
+       memblock_free_early(__pa(tlb), bytes);
+fail:
        pr_warn("Cannot allocate buffer");
-       no_iotlb_memory = true;
 }
 
 /*
@@ -313,22 +239,22 @@ swiotlb_init(int verbose)
 int
 swiotlb_late_init_with_default_size(size_t default_size)
 {
-       unsigned long bytes, req_nslabs = io_tlb_nslabs;
+       unsigned long nslabs =
+               ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+       unsigned long bytes;
        unsigned char *vstart = NULL;
        unsigned int order;
        int rc = 0;
 
-       if (!io_tlb_nslabs) {
-               io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
-               io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-       }
+       if (swiotlb_force == SWIOTLB_NO_FORCE)
+               return 0;
 
        /*
         * Get IO TLB memory from the low pages
         */
-       order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
-       io_tlb_nslabs = SLABS_PER_PAGE << order;
-       bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+       order = get_order(nslabs << IO_TLB_SHIFT);
+       nslabs = SLABS_PER_PAGE << order;
+       bytes = nslabs << IO_TLB_SHIFT;
 
        while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
                vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
@@ -338,134 +264,99 @@ swiotlb_late_init_with_default_size(size_t default_size)
                order--;
        }
 
-       if (!vstart) {
-               io_tlb_nslabs = req_nslabs;
+       if (!vstart)
                return -ENOMEM;
-       }
+
        if (order != get_order(bytes)) {
                pr_warn("only able to allocate %ld MB\n",
                        (PAGE_SIZE << order) >> 20);
-               io_tlb_nslabs = SLABS_PER_PAGE << order;
+               nslabs = SLABS_PER_PAGE << order;
        }
-       rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
+       rc = swiotlb_late_init_with_tbl(vstart, nslabs);
        if (rc)
                free_pages((unsigned long)vstart, order);
 
        return rc;
 }
 
-static void swiotlb_cleanup(void)
-{
-       io_tlb_end = 0;
-       io_tlb_start = 0;
-       io_tlb_nslabs = 0;
-       max_segment = 0;
-}
-
 int
 swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 {
-       unsigned long i, bytes;
+       unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+       struct io_tlb_mem *mem;
 
-       bytes = nslabs << IO_TLB_SHIFT;
+       if (swiotlb_force == SWIOTLB_NO_FORCE)
+               return 0;
 
-       io_tlb_nslabs = nslabs;
-       io_tlb_start = virt_to_phys(tlb);
-       io_tlb_end = io_tlb_start + bytes;
+       /* protect against double initialization */
+       if (WARN_ON_ONCE(io_tlb_default_mem))
+               return -ENOMEM;
 
-       set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
-       memset(tlb, 0, bytes);
+       mem = (void *)__get_free_pages(GFP_KERNEL,
+               get_order(struct_size(mem, slots, nslabs)));
+       if (!mem)
+               return -ENOMEM;
 
-       /*
-        * Allocate and initialize the free list array.  This array is used
-        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
-        * between io_tlb_start and io_tlb_end.
-        */
-       io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
-                                     get_order(io_tlb_nslabs * sizeof(int)));
-       if (!io_tlb_list)
-               goto cleanup3;
-
-       io_tlb_orig_addr = (phys_addr_t *)
-               __get_free_pages(GFP_KERNEL,
-                                get_order(io_tlb_nslabs *
-                                          sizeof(phys_addr_t)));
-       if (!io_tlb_orig_addr)
-               goto cleanup4;
-
-       io_tlb_orig_size = (size_t *)
-               __get_free_pages(GFP_KERNEL,
-                                get_order(io_tlb_nslabs *
-                                          sizeof(size_t)));
-       if (!io_tlb_orig_size)
-               goto cleanup5;
-
-
-       for (i = 0; i < io_tlb_nslabs; i++) {
-               io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
-               io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-               io_tlb_orig_size[i] = 0;
+       mem->nslabs = nslabs;
+       mem->start = virt_to_phys(tlb);
+       mem->end = mem->start + bytes;
+       mem->index = 0;
+       mem->late_alloc = 1;
+       spin_lock_init(&mem->lock);
+       for (i = 0; i < mem->nslabs; i++) {
+               mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+               mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+               mem->slots[i].alloc_size = 0;
        }
-       io_tlb_index = 0;
-       no_iotlb_memory = false;
-
-       swiotlb_print_info();
 
-       late_alloc = 1;
-
-       swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+       set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
+       memset(tlb, 0, bytes);
 
+       io_tlb_default_mem = mem;
+       swiotlb_print_info();
+       swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
        return 0;
-
-cleanup5:
-       free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
-                                                             sizeof(phys_addr_t)));
-
-cleanup4:
-       free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
-                                                        sizeof(int)));
-       io_tlb_list = NULL;
-cleanup3:
-       swiotlb_cleanup();
-       return -ENOMEM;
 }
 
 void __init swiotlb_exit(void)
 {
-       if (!io_tlb_orig_addr)
+       struct io_tlb_mem *mem = io_tlb_default_mem;
+       size_t size;
+
+       if (!mem)
                return;
 
-       if (late_alloc) {
-               free_pages((unsigned long)io_tlb_orig_size,
-                          get_order(io_tlb_nslabs * sizeof(size_t)));
-               free_pages((unsigned long)io_tlb_orig_addr,
-                          get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
-               free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
-                                                                sizeof(int)));
-               free_pages((unsigned long)phys_to_virt(io_tlb_start),
-                          get_order(io_tlb_nslabs << IO_TLB_SHIFT));
-       } else {
-               memblock_free_late(__pa(io_tlb_orig_addr),
-                                  PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
-               memblock_free_late(__pa(io_tlb_orig_size),
-                                  PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
-               memblock_free_late(__pa(io_tlb_list),
-                                  PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
-               memblock_free_late(io_tlb_start,
-                                  PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
-       }
-       swiotlb_cleanup();
+       size = struct_size(mem, slots, mem->nslabs);
+       if (mem->late_alloc)
+               free_pages((unsigned long)mem, get_order(size));
+       else
+               memblock_free_late(__pa(mem), PAGE_ALIGN(size));
+       io_tlb_default_mem = NULL;
 }
 
 /*
  * Bounce: copy the swiotlb buffer from or back to the original dma location
  */
-static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
-                          size_t size, enum dma_data_direction dir)
+static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+                          enum dma_data_direction dir)
 {
+       struct io_tlb_mem *mem = io_tlb_default_mem;
+       int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
+       phys_addr_t orig_addr = mem->slots[index].orig_addr;
+       size_t alloc_size = mem->slots[index].alloc_size;
        unsigned long pfn = PFN_DOWN(orig_addr);
        unsigned char *vaddr = phys_to_virt(tlb_addr);
 
+       if (orig_addr == INVALID_PHYS_ADDR)
+               return;
+
+       if (size > alloc_size) {
+               dev_WARN_ONCE(dev, 1,
+                       "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n",
+                       alloc_size, size);
+               size = alloc_size;
+       }
+
        if (PageHighMem(pfn_to_page(pfn))) {
                /* The buffer does not have a mapping.  Map it in and copy */
                unsigned int offset = orig_addr & ~PAGE_MASK;
@@ -517,9 +408,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
        return nr_slots(boundary_mask + 1);
 }
 
-static unsigned int wrap_index(unsigned int index)
+static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
 {
-       if (index >= io_tlb_nslabs)
+       if (index >= mem->nslabs)
                return 0;
        return index;
 }
@@ -531,9 +422,10 @@ static unsigned int wrap_index(unsigned int index)
 static int find_slots(struct device *dev, phys_addr_t orig_addr,
                size_t alloc_size)
 {
+       struct io_tlb_mem *mem = io_tlb_default_mem;
        unsigned long boundary_mask = dma_get_seg_boundary(dev);
        dma_addr_t tbl_dma_addr =
-               phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask;
+               phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
        unsigned long max_slots = get_max_slots(boundary_mask);
        unsigned int iotlb_align_mask =
                dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
@@ -552,15 +444,15 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
        if (alloc_size >= PAGE_SIZE)
                stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
 
-       spin_lock_irqsave(&io_tlb_lock, flags);
-       if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+       spin_lock_irqsave(&mem->lock, flags);
+       if (unlikely(nslots > mem->nslabs - mem->used))
                goto not_found;
 
-       index = wrap = wrap_index(ALIGN(io_tlb_index, stride));
+       index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
        do {
                if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
                    (orig_addr & iotlb_align_mask)) {
-                       index = wrap_index(index + 1);
+                       index = wrap_index(mem, index + 1);
                        continue;
                }
 
@@ -572,34 +464,34 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
                if (!iommu_is_span_boundary(index, nslots,
                                            nr_slots(tbl_dma_addr),
                                            max_slots)) {
-                       if (io_tlb_list[index] >= nslots)
+                       if (mem->slots[index].list >= nslots)
                                goto found;
                }
-               index = wrap_index(index + stride);
+               index = wrap_index(mem, index + stride);
        } while (index != wrap);
 
 not_found:
-       spin_unlock_irqrestore(&io_tlb_lock, flags);
+       spin_unlock_irqrestore(&mem->lock, flags);
        return -1;
 
 found:
        for (i = index; i < index + nslots; i++)
-               io_tlb_list[i] = 0;
+               mem->slots[i].list = 0;
        for (i = index - 1;
             io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
-            io_tlb_list[i]; i--)
-               io_tlb_list[i] = ++count;
+            mem->slots[i].list; i--)
+               mem->slots[i].list = ++count;
 
        /*
         * Update the indices to avoid searching in the next round.
         */
-       if (index + nslots < io_tlb_nslabs)
-               io_tlb_index = index + nslots;
+       if (index + nslots < mem->nslabs)
+               mem->index = index + nslots;
        else
-               io_tlb_index = 0;
-       io_tlb_used += nslots;
+               mem->index = 0;
+       mem->used += nslots;
 
-       spin_unlock_irqrestore(&io_tlb_lock, flags);
+       spin_unlock_irqrestore(&mem->lock, flags);
        return index;
 }
 
@@ -607,11 +499,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
                size_t mapping_size, size_t alloc_size,
                enum dma_data_direction dir, unsigned long attrs)
 {
+       struct io_tlb_mem *mem = io_tlb_default_mem;
        unsigned int offset = swiotlb_align_offset(dev, orig_addr);
-       unsigned int index, i;
+       unsigned int i;
+       int index;
        phys_addr_t tlb_addr;
 
-       if (no_iotlb_memory)
+       if (!mem)
                panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
        if (mem_encrypt_active())
@@ -628,7 +522,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
                if (!(attrs & DMA_ATTR_NO_WARN))
                        dev_warn_ratelimited(dev,
        "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
-                                alloc_size, io_tlb_nslabs, io_tlb_used);
+                                alloc_size, mem->nslabs, mem->used);
                return (phys_addr_t)DMA_MAPPING_ERROR;
        }
 
@@ -638,49 +532,37 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
         * needed.
         */
        for (i = 0; i < nr_slots(alloc_size + offset); i++) {
-               io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
-               io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+               mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+               mem->slots[index + i].alloc_size =
+                       alloc_size - (i << IO_TLB_SHIFT);
        }
-       tlb_addr = slot_addr(io_tlb_start, index) + offset;
+       tlb_addr = slot_addr(mem->start, index) + offset;
        if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
            (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-               swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
+               swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
        return tlb_addr;
 }
 
-static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
-{
-       if (*size > orig_size) {
-               /* Warn and truncate mapping_size */
-               dev_WARN_ONCE(hwdev, 1,
-                       "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
-                       orig_size, *size);
-               *size = orig_size;
-       }
-}
-
 /*
  * tlb_addr is the physical address of the bounce buffer to unmap.
  */
 void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
-                             size_t mapping_size, size_t alloc_size,
-                             enum dma_data_direction dir, unsigned long attrs)
+                             size_t mapping_size, enum dma_data_direction dir,
+                             unsigned long attrs)
 {
+       struct io_tlb_mem *mem = io_tlb_default_mem;
        unsigned long flags;
        unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
-       int i, count, nslots = nr_slots(alloc_size + offset);
-       int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
-       phys_addr_t orig_addr = io_tlb_orig_addr[index];
-
-       validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+       int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+       int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+       int count, i;
 
        /*
         * First, sync the memory before unmapping the entry
         */
-       if (orig_addr != INVALID_PHYS_ADDR &&
-           !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-           ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
-               swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+       if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+           (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+               swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
 
        /*
         * Return the buffer to the free list by setting the corresponding
@@ -688,9 +570,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
         * While returning the entries to the free list, we merge the entries
         * with slots below and above the pool being returned.
         */
-       spin_lock_irqsave(&io_tlb_lock, flags);
+       spin_lock_irqsave(&mem->lock, flags);
        if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
-               count = io_tlb_list[index + nslots];
+               count = mem->slots[index + nslots].list;
        else
                count = 0;
 
@@ -699,9 +581,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
         * superceeding slots
         */
        for (i = index + nslots - 1; i >= index; i--) {
-               io_tlb_list[i] = ++count;
-               io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-               io_tlb_orig_size[i] = 0;
+               mem->slots[i].list = ++count;
+               mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+               mem->slots[i].alloc_size = 0;
        }
 
        /*
@@ -709,44 +591,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
         * available (non zero)
         */
        for (i = index - 1;
-            io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i];
+            io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
             i--)
-               io_tlb_list[i] = ++count;
-       io_tlb_used -= nslots;
-       spin_unlock_irqrestore(&io_tlb_lock, flags);
+               mem->slots[i].list = ++count;
+       mem->used -= nslots;
+       spin_unlock_irqrestore(&mem->lock, flags);
 }
 
-void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
-                            size_t size, enum dma_data_direction dir,
-                            enum dma_sync_target target)
+void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+               size_t size, enum dma_data_direction dir)
 {
-       int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
-       size_t orig_size = io_tlb_orig_size[index];
-       phys_addr_t orig_addr = io_tlb_orig_addr[index];
-
-       if (orig_addr == INVALID_PHYS_ADDR)
-               return;
+       if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+               swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+       else
+               BUG_ON(dir != DMA_FROM_DEVICE);
+}
 
-       validate_sync_size_and_truncate(hwdev, orig_size, &size);
-
-       switch (target) {
-       case SYNC_FOR_CPU:
-               if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-                       swiotlb_bounce(orig_addr, tlb_addr,
-                                      size, DMA_FROM_DEVICE);
-               else
-                       BUG_ON(dir != DMA_TO_DEVICE);
-               break;
-       case SYNC_FOR_DEVICE:
-               if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-                       swiotlb_bounce(orig_addr, tlb_addr,
-                                      size, DMA_TO_DEVICE);
-               else
-                       BUG_ON(dir != DMA_FROM_DEVICE);
-               break;
-       default:
-               BUG();
-       }
+void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+               size_t size, enum dma_data_direction dir)
+{
+       if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+               swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+       else
+               BUG_ON(dir != DMA_TO_DEVICE);
 }
 
 /*
@@ -770,7 +637,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
        /* Ensure that the address returned is DMA'ble */
        dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
        if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-               swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
+               swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
                        attrs | DMA_ATTR_SKIP_CPU_SYNC);
                dev_WARN_ONCE(dev, 1,
                        "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
@@ -790,22 +657,21 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 
 bool is_swiotlb_active(void)
 {
-       /*
-        * When SWIOTLB is initialized, even if io_tlb_start points to physical
-        * address zero, io_tlb_end surely doesn't.
-        */
-       return io_tlb_end != 0;
+       return io_tlb_default_mem != NULL;
 }
+EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
 
 static int __init swiotlb_create_debugfs(void)
 {
-       struct dentry *root;
+       struct io_tlb_mem *mem = io_tlb_default_mem;
 
-       root = debugfs_create_dir("swiotlb", NULL);
-       debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
-       debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
+       if (!mem)
+               return 0;
+       mem->debugfs = debugfs_create_dir("swiotlb", NULL);
+       debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
+       debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
        return 0;
 }
 
index 928b166..2e947a4 100644 (file)
@@ -581,11 +581,6 @@ static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)       { }
 
-extern __weak const char *perf_pmu_name(void)
-{
-       return "pmu";
-}
-
 static inline u64 perf_clock(void)
 {
        return local_clock();
index 0596526..fd1c041 100644 (file)
@@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
                           TASK_INTERRUPTIBLE, p);
 }
 
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+                                struct task_struct *target)
+{
+       struct task_struct *parent =
+               !ptrace ? target->real_parent : target->parent;
+
+       return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+                                    same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+       bool ptrace;
+       struct task_struct *target;
+       int retval;
+
+       ptrace = false;
+       target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+       if (target && is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       ptrace = true;
+       target = pid_task(wo->wo_pid, PIDTYPE_PID);
+       if (target && target->ptrace &&
+           is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       return 0;
+}
+
 static long do_wait(struct wait_opts *wo)
 {
-       struct task_struct *tsk;
        int retval;
 
        trace_sched_process_wait(wo->wo_pid);
@@ -1464,19 +1503,27 @@ repeat:
 
        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
-       tsk = current;
-       do {
-               retval = do_wait_thread(wo, tsk);
-               if (retval)
-                       goto end;
 
-               retval = ptrace_do_wait(wo, tsk);
+       if (wo->wo_type == PIDTYPE_PID) {
+               retval = do_wait_pid(wo);
                if (retval)
                        goto end;
+       } else {
+               struct task_struct *tsk = current;
+
+               do {
+                       retval = do_wait_thread(wo, tsk);
+                       if (retval)
+                               goto end;
 
-               if (wo->wo_flags & __WNOTHREAD)
-                       break;
-       } while_each_thread(current, tsk);
+                       retval = ptrace_do_wait(wo, tsk);
+                       if (retval)
+                               goto end;
+
+                       if (wo->wo_flags & __WNOTHREAD)
+                               break;
+               } while_each_thread(current, tsk);
+       }
        read_unlock(&tasklist_lock);
 
 notask:
index 771e0ea..dc06afd 100644 (file)
@@ -1145,7 +1145,7 @@ void mmput_async(struct mm_struct *mm)
  * invocations: in mmput() nobody alive left, in execve task is single
  * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
  * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to do avoid the need for any locks.
+ * to avoid the need for any locks.
  */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
@@ -1396,7 +1396,6 @@ fail_nomem:
 static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 {
        struct mm_struct *mm, *oldmm;
-       int retval;
 
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->nvcsw = tsk->nivcsw = 0;
@@ -1423,21 +1422,15 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
        if (clone_flags & CLONE_VM) {
                mmget(oldmm);
                mm = oldmm;
-               goto good_mm;
+       } else {
+               mm = dup_mm(tsk, current->mm);
+               if (!mm)
+                       return -ENOMEM;
        }
 
-       retval = -ENOMEM;
-       mm = dup_mm(tsk, current->mm);
-       if (!mm)
-               goto fail_nomem;
-
-good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;
-
-fail_nomem:
-       return retval;
 }
 
 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
@@ -1743,7 +1736,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
  * /proc/<pid>/status where Pid and NSpid are always shown relative to
  * the  pid namespace of the procfs instance. The difference becomes
  * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestoral relation is
+ * different branch of the tree, i.e. where no ancestral relation is
  * present between the pid namespaces:
  * - create two new pid namespaces ns1 and ns2 in the initial pid
  *   namespace (also take care to create new mount namespaces in the
@@ -2735,8 +2728,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
                return false;
 
        /*
-        * - make the CLONE_DETACHED bit reuseable for clone3
-        * - make the CSIGNAL bits reuseable for clone3
+        * - make the CLONE_DETACHED bit reusable for clone3
+        * - make the CSIGNAL bits reusable for clone3
         */
        if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
                return false;
index f62de2d..58f87a3 100644 (file)
@@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling"
 config GCOV_KERNEL
        bool "Enable gcov-based kernel profiling"
        depends on DEBUG_FS
+       depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
        select CONSTRUCTORS
        default n
        help
index 0ffe9f1..073a373 100644 (file)
@@ -49,6 +49,55 @@ void gcov_enable_events(void)
        mutex_unlock(&gcov_lock);
 }
 
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+       u32 *data;
+
+       if (buffer) {
+               data = buffer + off;
+               *data = v;
+       }
+
+       return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+       u32 *data;
+
+       if (buffer) {
+               data = buffer + off;
+
+               data[0] = (v & 0xffffffffUL);
+               data[1] = (v >> 32);
+       }
+
+       return sizeof(*data) * 2;
+}
+
 #ifdef CONFIG_MODULES
 /* Update list and generate events when modules are unloaded. */
 static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
index c466c7f..cbb0bed 100644 (file)
@@ -48,9 +48,8 @@
 #include <linux/list.h>
 #include <linux/printk.h>
 #include <linux/ratelimit.h>
-#include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "gcov.h"
 
 typedef void (*llvm_gcov_callback)(void);
@@ -70,16 +69,10 @@ struct gcov_fn_info {
 
        u32 ident;
        u32 checksum;
-#if CONFIG_CLANG_VERSION < 110000
-       u8 use_extra_checksum;
-#endif
        u32 cfg_checksum;
 
        u32 num_counters;
        u64 *counters;
-#if CONFIG_CLANG_VERSION < 110000
-       const char *function_name;
-#endif
 };
 
 static struct gcov_info *current_info;
@@ -109,16 +102,6 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
 }
 EXPORT_SYMBOL(llvm_gcov_init);
 
-#if CONFIG_CLANG_VERSION < 110000
-void llvm_gcda_start_file(const char *orig_filename, const char version[4],
-               u32 checksum)
-{
-       current_info->filename = orig_filename;
-       memcpy(&current_info->version, version, sizeof(current_info->version));
-       current_info->checksum = checksum;
-}
-EXPORT_SYMBOL(llvm_gcda_start_file);
-#else
 void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
 {
        current_info->filename = orig_filename;
@@ -126,28 +109,7 @@ void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
        current_info->checksum = checksum;
 }
 EXPORT_SYMBOL(llvm_gcda_start_file);
-#endif
-
-#if CONFIG_CLANG_VERSION < 110000
-void llvm_gcda_emit_function(u32 ident, const char *function_name,
-               u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
-{
-       struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
-
-       if (!info)
-               return;
-
-       INIT_LIST_HEAD(&info->head);
-       info->ident = ident;
-       info->checksum = func_checksum;
-       info->use_extra_checksum = use_extra_checksum;
-       info->cfg_checksum = cfg_checksum;
-       if (function_name)
-               info->function_name = kstrdup(function_name, GFP_KERNEL);
 
-       list_add_tail(&info->head, &current_info->functions);
-}
-#else
 void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
 {
        struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
@@ -161,7 +123,6 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
        info->cfg_checksum = cfg_checksum;
        list_add_tail(&info->head, &current_info->functions);
 }
-#endif
 EXPORT_SYMBOL(llvm_gcda_emit_function);
 
 void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
@@ -292,16 +253,8 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
                !list_is_last(&fn_ptr2->head, &info2->functions)) {
                if (fn_ptr1->checksum != fn_ptr2->checksum)
                        return false;
-#if CONFIG_CLANG_VERSION < 110000
-               if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
-                       return false;
-               if (fn_ptr1->use_extra_checksum &&
-                       fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
-                       return false;
-#else
                if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
                        return false;
-#endif
                fn_ptr1 = list_next_entry(fn_ptr1, head);
                fn_ptr2 = list_next_entry(fn_ptr2, head);
        }
@@ -330,35 +283,6 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
        }
 }
 
-#if CONFIG_CLANG_VERSION < 110000
-static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
-{
-       size_t cv_size; /* counter values size */
-       struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
-                       GFP_KERNEL);
-       if (!fn_dup)
-               return NULL;
-       INIT_LIST_HEAD(&fn_dup->head);
-
-       fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
-       if (!fn_dup->function_name)
-               goto err_name;
-
-       cv_size = fn->num_counters * sizeof(fn->counters[0]);
-       fn_dup->counters = vmalloc(cv_size);
-       if (!fn_dup->counters)
-               goto err_counters;
-       memcpy(fn_dup->counters, fn->counters, cv_size);
-
-       return fn_dup;
-
-err_counters:
-       kfree(fn_dup->function_name);
-err_name:
-       kfree(fn_dup);
-       return NULL;
-}
-#else
 static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
 {
        size_t cv_size; /* counter values size */
@@ -369,7 +293,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
        INIT_LIST_HEAD(&fn_dup->head);
 
        cv_size = fn->num_counters * sizeof(fn->counters[0]);
-       fn_dup->counters = vmalloc(cv_size);
+       fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL);
        if (!fn_dup->counters) {
                kfree(fn_dup);
                return NULL;
@@ -379,7 +303,6 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
 
        return fn_dup;
 }
-#endif
 
 /**
  * gcov_info_dup - duplicate profiling data set
@@ -420,99 +343,18 @@ err:
  * gcov_info_free - release memory for profiling data set duplicate
  * @info: profiling data set duplicate to free
  */
-#if CONFIG_CLANG_VERSION < 110000
-void gcov_info_free(struct gcov_info *info)
-{
-       struct gcov_fn_info *fn, *tmp;
-
-       list_for_each_entry_safe(fn, tmp, &info->functions, head) {
-               kfree(fn->function_name);
-               vfree(fn->counters);
-               list_del(&fn->head);
-               kfree(fn);
-       }
-       kfree(info->filename);
-       kfree(info);
-}
-#else
 void gcov_info_free(struct gcov_info *info)
 {
        struct gcov_fn_info *fn, *tmp;
 
        list_for_each_entry_safe(fn, tmp, &info->functions, head) {
-               vfree(fn->counters);
+               kvfree(fn->counters);
                list_del(&fn->head);
                kfree(fn);
        }
        kfree(info->filename);
        kfree(info);
 }
-#endif
-
-#define ITER_STRIDE    PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
-       struct gcov_info *info;
-       void *buffer;
-       size_t size;
-       loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-               *data = v;
-       }
-
-       return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-
-               data[0] = (v & 0xffffffffUL);
-               data[1] = (v >> 32);
-       }
-
-       return sizeof(*data) * 2;
-}
 
 /**
  * convert_to_gcda - convert profiling data set to gcda file format
@@ -521,7 +363,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
  *
  * Returns the number of bytes that were/would have been stored into the buffer.
  */
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 {
        struct gcov_fn_info *fi_ptr;
        size_t pos = 0;
@@ -535,21 +377,10 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
                u32 i;
 
                pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
-#if CONFIG_CLANG_VERSION < 110000
-               pos += store_gcov_u32(buffer, pos,
-                       fi_ptr->use_extra_checksum ? 3 : 2);
-#else
                pos += store_gcov_u32(buffer, pos, 3);
-#endif
                pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
                pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
-#if CONFIG_CLANG_VERSION < 110000
-               if (fi_ptr->use_extra_checksum)
-                       pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-#else
                pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
-#endif
-
                pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
                pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
                for (i = 0; i < fi_ptr->num_counters; i++)
@@ -558,102 +389,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 
        return pos;
 }
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
-       struct gcov_iterator *iter;
-
-       iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
-       if (!iter)
-               goto err_free;
-
-       iter->info = info;
-       /* Dry-run to get the actual buffer size. */
-       iter->size = convert_to_gcda(NULL, info);
-       iter->buffer = vmalloc(iter->size);
-       if (!iter->buffer)
-               goto err_free;
-
-       convert_to_gcda(iter->buffer, info);
-
-       return iter;
-
-err_free:
-       kfree(iter);
-       return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
-       vfree(iter->buffer);
-       kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
-       return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
-       iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
-       if (iter->pos < iter->size)
-               iter->pos += ITER_STRIDE;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
-       size_t len;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       len = ITER_STRIDE;
-       if (iter->pos + len > iter->size)
-               len = iter->size - iter->pos;
-
-       seq_write(seq, iter->buffer + iter->pos, len);
-
-       return 0;
-}
index 82babf5..5c3086c 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/mm.h>
 #include "gcov.h"
 
 /**
@@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str)
 }
 __setup("gcov_persist=", gcov_persist_setup);
 
+#define ITER_STRIDE    PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+       struct gcov_info *info;
+       size_t size;
+       loff_t pos;
+       char buffer[];
+};
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+static struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+       struct gcov_iterator *iter;
+       size_t size;
+
+       /* Dry-run to get the actual buffer size. */
+       size = convert_to_gcda(NULL, info);
+
+       iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL);
+       if (!iter)
+               return NULL;
+
+       iter->info = info;
+       iter->size = size;
+       convert_to_gcda(iter->buffer, info);
+
+       return iter;
+}
+
+
+/**
+ * gcov_iter_free - free iterator data
+ * @iter: file iterator
+ */
+static void gcov_iter_free(struct gcov_iterator *iter)
+{
+       kvfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+       return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+static void gcov_iter_start(struct gcov_iterator *iter)
+{
+       iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+static int gcov_iter_next(struct gcov_iterator *iter)
+{
+       if (iter->pos < iter->size)
+               iter->pos += ITER_STRIDE;
+
+       if (iter->pos >= iter->size)
+               return -EINVAL;
+
+       return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+       size_t len;
+
+       if (iter->pos >= iter->size)
+               return -EINVAL;
+
+       len = ITER_STRIDE;
+       if (iter->pos + len > iter->size)
+               len = iter->size - iter->pos;
+
+       seq_write(seq, iter->buffer + iter->pos, len);
+
+       return 0;
+}
+
 /*
  * seq_file.start() implementation for gcov data files. Note that the
  * gcov_iterator interface is designed to be more restrictive than seq_file
index c53408a..460c12b 100644 (file)
@@ -15,8 +15,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "gcov.h"
 
 #if (__GNUC__ >= 10)
@@ -310,7 +309,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
 
                        cv_size = sizeof(gcov_type) * sci_ptr->num;
 
-                       dci_ptr->values = vmalloc(cv_size);
+                       dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL);
 
                        if (!dci_ptr->values)
                                goto err_free;
@@ -352,7 +351,7 @@ void gcov_info_free(struct gcov_info *info)
                ci_ptr = info->functions[fi_idx]->ctrs;
 
                for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
-                       vfree(ci_ptr->values);
+                       kvfree(ci_ptr->values);
 
                kfree(info->functions[fi_idx]);
        }
@@ -363,71 +362,6 @@ free_info:
        kfree(info);
 }
 
-#define ITER_STRIDE    PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
-       struct gcov_info *info;
-       void *buffer;
-       size_t size;
-       loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-               *data = v;
-       }
-
-       return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
-       u32 *data;
-
-       if (buffer) {
-               data = buffer + off;
-
-               data[0] = (v & 0xffffffffUL);
-               data[1] = (v >> 32);
-       }
-
-       return sizeof(*data) * 2;
-}
-
 /**
  * convert_to_gcda - convert profiling data set to gcda file format
  * @buffer: the buffer to store file data or %NULL if no data should be stored
@@ -435,7 +369,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
  *
  * Returns the number of bytes that were/would have been stored into the buffer.
  */
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 {
        struct gcov_fn_info *fi_ptr;
        struct gcov_ctr_info *ci_ptr;
@@ -481,102 +415,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
 
        return pos;
 }
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
-       struct gcov_iterator *iter;
-
-       iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
-       if (!iter)
-               goto err_free;
-
-       iter->info = info;
-       /* Dry-run to get the actual buffer size. */
-       iter->size = convert_to_gcda(NULL, info);
-       iter->buffer = vmalloc(iter->size);
-       if (!iter->buffer)
-               goto err_free;
-
-       convert_to_gcda(iter->buffer, info);
-
-       return iter;
-
-err_free:
-       kfree(iter);
-       return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
-       vfree(iter->buffer);
-       kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
-       return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
-       iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
-       if (iter->pos < iter->size)
-               iter->pos += ITER_STRIDE;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
-       size_t len;
-
-       if (iter->pos >= iter->size)
-               return -EINVAL;
-
-       len = ITER_STRIDE;
-       if (iter->pos + len > iter->size)
-               len = iter->size - iter->pos;
-
-       seq_write(seq, iter->buffer + iter->pos, len);
-
-       return 0;
-}
index 6ab2c18..912b8ea 100644 (file)
@@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info);
 void gcov_info_link(struct gcov_info *info);
 void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
 bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
+size_t convert_to_gcda(char *buffer, struct gcov_info *info);
 
 /* Base interface. */
 enum gcov_action {
@@ -58,16 +59,9 @@ enum gcov_action {
 void gcov_event(enum gcov_action action, struct gcov_info *info);
 void gcov_enable_events(void);
 
-/* Iterator control. */
-struct seq_file;
-struct gcov_iterator;
-
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
-void gcov_iter_free(struct gcov_iterator *iter);
-void gcov_iter_start(struct gcov_iterator *iter);
-int gcov_iter_next(struct gcov_iterator *iter);
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+/* writing helpers */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v);
+size_t store_gcov_u64(void *buffer, size_t off, u64 v);
 
 /* gcov_info control. */
 void gcov_info_reset(struct gcov_info *info);
index f42ef86..6284443 100644 (file)
@@ -295,8 +295,8 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
 EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
 
 /**
- * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
- * @of_node: pointer to interrupt controller's device tree node.
+ * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs
+ * @fwnode: firmware node for the interrupt controller
  * @size: total number of irqs in mapping
  * @first_irq: first number of irq block assigned to the domain,
  *     pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
@@ -312,15 +312,15 @@ EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
  * irqs get mapped dynamically on the fly. However, if the controller requires
  * static virq assignments (non-DT boot) then it will set that up correctly.
  */
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
-                                        unsigned int size,
-                                        unsigned int first_irq,
-                                        const struct irq_domain_ops *ops,
-                                        void *host_data)
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+                                           unsigned int size,
+                                           unsigned int first_irq,
+                                           const struct irq_domain_ops *ops,
+                                           void *host_data)
 {
        struct irq_domain *domain;
 
-       domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
+       domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data);
        if (!domain)
                return NULL;
 
@@ -328,7 +328,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
                if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
                        /* attempt to allocated irq_descs */
                        int rc = irq_alloc_descs(first_irq, first_irq, size,
-                                                of_node_to_nid(of_node));
+                                                of_node_to_nid(to_of_node(fwnode)));
                        if (rc < 0)
                                pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
                                        first_irq);
@@ -338,7 +338,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
 
        return domain;
 }
-EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+EXPORT_SYMBOL_GPL(irq_domain_create_simple);
 
 /**
  * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
index a0b6780..f099bae 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/compiler.h>
 #include <linux/hugetlb.h>
 #include <linux/objtool.h>
+#include <linux/kmsg_dump.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -1165,7 +1166,7 @@ int kernel_kexec(void)
 #endif
        {
                kexec_in_progress = true;
-               kernel_restart_prepare(NULL);
+               kernel_restart_prepare("kexec reboot");
                migrate_to_reboot_cpu();
 
                /*
@@ -1179,6 +1180,7 @@ int kernel_kexec(void)
                machine_shutdown();
        }
 
+       kmsg_dump(KMSG_DUMP_SHUTDOWN);
        machine_kexec(kexec_image);
 
 #ifdef CONFIG_KEXEC_JUMP
index 5c3447c..33400ff 100644 (file)
@@ -740,8 +740,10 @@ static int kexec_calculate_store_digests(struct kimage *image)
 
        sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
        sha_regions = vzalloc(sha_region_sz);
-       if (!sha_regions)
+       if (!sha_regions) {
+               ret = -ENOMEM;
                goto out_free_desc;
+       }
 
        desc->tfm   = tfm;
 
index 3cd075c..b717134 100644 (file)
@@ -58,7 +58,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
 /*
        modprobe_path is set via /proc/sys.
 */
-char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
 
 static void free_modprobe_argv(struct subprocess_info *info)
 {
index 627e61b..028a5ab 100644 (file)
@@ -64,12 +64,8 @@ static DEFINE_RWLOCK(resource_lock);
 static struct resource *bootmem_resource_free;
 static DEFINE_SPINLOCK(bootmem_resource_lock);
 
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p)
 {
-       /* Caller wants to traverse through siblings only */
-       if (sibling_only)
-               return p->sibling;
-
        if (p->child)
                return p->child;
        while (!p->sibling && p->parent)
@@ -81,7 +77,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 {
        struct resource *p = v;
        (*pos)++;
-       return (void *)next_resource(p, false);
+       return (void *)next_resource(p);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -330,14 +326,10 @@ EXPORT_SYMBOL(release_resource);
  * of the resource that's within [@start..@end]; if none is found, returns
  * -ENODEV.  Returns -EINVAL for invalid parameters.
  *
- * This function walks the whole tree and not just first level children
- * unless @first_lvl is true.
- *
  * @start:     start address of the resource searched for
  * @end:       end address of same resource
  * @flags:     flags which the resource must have
  * @desc:      descriptor the resource must have
- * @first_lvl: walk only the first level children, if set
  * @res:       return ptr, if resource found
  *
  * The caller must specify @start, @end, @flags, and @desc
@@ -345,9 +337,8 @@ EXPORT_SYMBOL(release_resource);
  */
 static int find_next_iomem_res(resource_size_t start, resource_size_t end,
                               unsigned long flags, unsigned long desc,
-                              bool first_lvl, struct resource *res)
+                              struct resource *res)
 {
-       bool siblings_only = true;
        struct resource *p;
 
        if (!res)
@@ -358,7 +349,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
        read_lock(&resource_lock);
 
-       for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) {
+       for (p = iomem_resource.child; p; p = next_resource(p)) {
                /* If we passed the resource we are looking for, stop */
                if (p->start > end) {
                        p = NULL;
@@ -369,13 +360,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
                if (p->end < start)
                        continue;
 
-               /*
-                * Now that we found a range that matches what we look for,
-                * check the flags and the descriptor. If we were not asked to
-                * use only the first level, start looking at children as well.
-                */
-               siblings_only = first_lvl;
-
                if ((p->flags & flags) != flags)
                        continue;
                if ((desc != IORES_DESC_NONE) && (desc != p->desc))
@@ -402,14 +386,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
 static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
                                 unsigned long flags, unsigned long desc,
-                                bool first_lvl, void *arg,
+                                void *arg,
                                 int (*func)(struct resource *, void *))
 {
        struct resource res;
        int ret = -EINVAL;
 
        while (start < end &&
-              !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+              !find_next_iomem_res(start, end, flags, desc, &res)) {
                ret = (*func)(&res, arg);
                if (ret)
                        break;
@@ -431,7 +415,6 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
  * @arg: function argument for the callback @func
  * @func: callback function that is called for each qualifying resource area
  *
- * This walks through whole tree and not just first level children.
  * All the memory ranges which overlap start,end and also match flags and
  * desc are valid candidates.
  *
@@ -441,7 +424,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
 int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
                u64 end, void *arg, int (*func)(struct resource *, void *))
 {
-       return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
+       return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
 }
 EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
 
@@ -457,8 +440,8 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 {
        unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
-                                    arg, func);
+       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+                                    func);
 }
 
 /*
@@ -470,17 +453,14 @@ int walk_mem_res(u64 start, u64 end, void *arg,
 {
        unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 
-       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
-                                    arg, func);
+       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+                                    func);
 }
 
 /*
  * This function calls the @func callback against all memory ranges of type
  * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
  * It is to be used only for System RAM.
- *
- * This will find System RAM ranges that are children of top-level resources
- * in addition to top-level System RAM resources.
  */
 int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                          void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -495,8 +475,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
        end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
        flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        while (start < end &&
-              !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
-                                   false, &res)) {
+              !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
                pfn = PFN_UP(res.start);
                end_pfn = PFN_DOWN(res.end + 1);
                if (end_pfn > pfn)
@@ -523,6 +502,34 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
+static int __region_intersects(resource_size_t start, size_t size,
+                       unsigned long flags, unsigned long desc)
+{
+       struct resource res;
+       int type = 0; int other = 0;
+       struct resource *p;
+
+       res.start = start;
+       res.end = start + size - 1;
+
+       for (p = iomem_resource.child; p ; p = p->sibling) {
+               bool is_type = (((p->flags & flags) == flags) &&
+                               ((desc == IORES_DESC_NONE) ||
+                                (desc == p->desc)));
+
+               if (resource_overlaps(p, &res))
+                       is_type ? type++ : other++;
+       }
+
+       if (type == 0)
+               return REGION_DISJOINT;
+
+       if (other == 0)
+               return REGION_INTERSECTS;
+
+       return REGION_MIXED;
+}
+
 /**
  * region_intersects() - determine intersection of region with known resources
  * @start: region start address
@@ -546,31 +553,13 @@ EXPORT_SYMBOL_GPL(page_is_ram);
 int region_intersects(resource_size_t start, size_t size, unsigned long flags,
                      unsigned long desc)
 {
-       struct resource res;
-       int type = 0; int other = 0;
-       struct resource *p;
-
-       res.start = start;
-       res.end = start + size - 1;
+       int ret;
 
        read_lock(&resource_lock);
-       for (p = iomem_resource.child; p ; p = p->sibling) {
-               bool is_type = (((p->flags & flags) == flags) &&
-                               ((desc == IORES_DESC_NONE) ||
-                                (desc == p->desc)));
-
-               if (resource_overlaps(p, &res))
-                       is_type ? type++ : other++;
-       }
+       ret = __region_intersects(start, size, flags, desc);
        read_unlock(&resource_lock);
 
-       if (type == 0)
-               return REGION_DISJOINT;
-
-       if (other == 0)
-               return REGION_INTERSECTS;
-
-       return REGION_MIXED;
+       return ret;
 }
 EXPORT_SYMBOL_GPL(region_intersects);
 
@@ -1171,31 +1160,16 @@ struct address_space *iomem_get_mapping(void)
        return smp_load_acquire(&iomem_inode)->i_mapping;
 }
 
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static int __request_region_locked(struct resource *res, struct resource *parent,
                                   resource_size_t start, resource_size_t n,
                                   const char *name, int flags)
 {
        DECLARE_WAITQUEUE(wait, current);
-       struct resource *res = alloc_resource(GFP_KERNEL);
-       struct resource *orig_parent = parent;
-
-       if (!res)
-               return NULL;
 
        res->name = name;
        res->start = start;
        res->end = start + n - 1;
 
-       write_lock(&resource_lock);
-
        for (;;) {
                struct resource *conflict;
 
@@ -1231,13 +1205,40 @@ struct resource * __request_region(struct resource *parent,
                        continue;
                }
                /* Uhhuh, that didn't work out.. */
-               free_resource(res);
-               res = NULL;
-               break;
+               return -EBUSY;
        }
+
+       return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+                                 resource_size_t start, resource_size_t n,
+                                 const char *name, int flags)
+{
+       struct resource *res = alloc_resource(GFP_KERNEL);
+       int ret;
+
+       if (!res)
+               return NULL;
+
+       write_lock(&resource_lock);
+       ret = __request_region_locked(res, parent, start, n, name, flags);
        write_unlock(&resource_lock);
 
-       if (res && orig_parent == &iomem_resource)
+       if (ret) {
+               free_resource(res);
+               return NULL;
+       }
+
+       if (parent == &iomem_resource)
                revoke_iomem(res);
 
        return res;
@@ -1779,25 +1780,56 @@ static struct resource *__request_free_mem_region(struct device *dev,
 {
        resource_size_t end, addr;
        struct resource *res;
+       struct region_devres *dr = NULL;
 
        size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
        end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
        addr = end - size + 1UL;
 
+       res = alloc_resource(GFP_KERNEL);
+       if (!res)
+               return ERR_PTR(-ENOMEM);
+
+       if (dev) {
+               dr = devres_alloc(devm_region_release,
+                               sizeof(struct region_devres), GFP_KERNEL);
+               if (!dr) {
+                       free_resource(res);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+
+       write_lock(&resource_lock);
        for (; addr > size && addr >= base->start; addr -= size) {
-               if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+               if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
                                REGION_DISJOINT)
                        continue;
 
-               if (dev)
-                       res = devm_request_mem_region(dev, addr, size, name);
-               else
-                       res = request_mem_region(addr, size, name);
-               if (!res)
-                       return ERR_PTR(-ENOMEM);
+               if (__request_region_locked(res, &iomem_resource, addr, size,
+                                               name, 0))
+                       break;
+
+               if (dev) {
+                       dr->parent = &iomem_resource;
+                       dr->start = addr;
+                       dr->n = size;
+                       devres_add(dev, dr);
+               }
+
                res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+               write_unlock(&resource_lock);
+
+               /*
+                * A driver is claiming this region so revoke any mappings.
+                */
+               revoke_iomem(res);
                return res;
        }
+       write_unlock(&resource_lock);
+
+       free_resource(res);
+       if (dr)
+               devres_free(dr);
 
        return ERR_PTR(-ERANGE);
 }
index 1e63db4..6ecd3f3 100644 (file)
@@ -119,8 +119,11 @@ struct seccomp_kaddfd {
        int fd;
        unsigned int flags;
 
-       /* To only be set on reply */
-       int ret;
+       union {
+               bool setfd;
+               /* To only be set on reply */
+               int ret;
+       };
        struct completion completion;
        struct list_head list;
 };
@@ -1069,7 +1072,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
         * that it has been handled.
         */
        list_del_init(&addfd->list);
-       addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+       if (!addfd->setfd)
+               addfd->ret = receive_fd(addfd->file, addfd->flags);
+       else
+               addfd->ret = receive_fd_replace(addfd->fd, addfd->file,
+                                               addfd->flags);
        complete(&addfd->completion);
 }
 
@@ -1583,8 +1590,8 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
                return -EBADF;
 
        kaddfd.flags = addfd.newfd_flags;
-       kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
-                   addfd.newfd : -1;
+       kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
+       kaddfd.fd = addfd.newfd;
        init_completion(&kaddfd.completion);
 
        ret = mutex_lock_interruptible(&filter->notify_lock);
index 3d62c95..3a583a2 100644 (file)
@@ -1590,7 +1590,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 
        /*
         * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
-        * infite. In case of RLIM_INFINITY the posix CPU timer code
+        * infinite. In case of RLIM_INFINITY the posix CPU timer code
         * ignores the rlimit.
         */
         if (!retval && new_rlim && resource == RLIMIT_CPU &&
@@ -2029,7 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
        }
 
        /*
-        * arg_lock protects concurent updates but we still need mmap_lock for
+        * arg_lock protects concurrent updates but we still need mmap_lock for
         * read to exclude races with sys_brk.
         */
        mmap_read_lock(mm);
@@ -2041,7 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
         * output in procfs mostly, except
         *
         *  - @start_brk/@brk which are used in do_brk_flags but kernel lookups
-        *    for VMAs when updating these memvers so anything wrong written
+        *    for VMAs when updating these members so anything wrong written
         *    here cause kernel to swear at userspace program but won't lead
         *    to any problem in kernel itself
         */
@@ -2143,7 +2143,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
        error = -EINVAL;
 
        /*
-        * arg_lock protects concurent updates of arg boundaries, we need
+        * arg_lock protects concurrent updates of arg boundaries, we need
         * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
         * validation.
         */
@@ -2210,7 +2210,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
         * If command line arguments and environment
         * are placed somewhere else on stack, we can
         * set them up here, ARG_START/END to setup
-        * command line argumets and ENV_START/END
+        * command line arguments and ENV_START/END
         * for environment.
         */
        case PR_SET_MM_START_STACK:
@@ -2258,8 +2258,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti
 static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 {
        /*
-        * If task has has_child_subreaper - all its decendants
-        * already have these flag too and new decendants will
+        * If task has has_child_subreaper - all its descendants
+        * already have this flag too and new descendants will
         * inherit it on fork, skip them.
         *
         * If we've found child_reaper - skip descendants in
index d244317..0ea8128 100644 (file)
@@ -267,6 +267,11 @@ COND_SYSCALL(request_key);
 COND_SYSCALL(keyctl);
 COND_SYSCALL_COMPAT(keyctl);
 
+/* security/landlock/syscalls.c */
+COND_SYSCALL(landlock_create_ruleset);
+COND_SYSCALL(landlock_add_rule);
+COND_SYSCALL(landlock_restrict_self);
+
 /* arch/example/kernel/sys_example.c */
 
 /* mm/fadvise.c */
index f91d327..14edf84 100644 (file)
@@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_COMPACTION
        {
                .procname       = "compact_memory",
-               .data           = &sysctl_compact_memory,
+               .data           = NULL,
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = sysctl_compaction_handler,
index 1d1a613..2cd9025 100644 (file)
@@ -920,6 +920,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 
        clocksource_arch_init(cs);
 
+       if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+               cs->id = CSID_GENERIC;
        if (cs->vdso_clock_mode < 0 ||
            cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
                pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
index 81fe2a3..8a364aa 100644 (file)
@@ -1048,6 +1048,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                now = tk_clock_read(&tk->tkr_mono);
+               systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                base_real = ktime_add(tk->tkr_mono.base,
index 29a6ebe..b8a0d1d 100644 (file)
@@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void)
 }
 
 /**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
+ * ftrace_graph_stop - set to permanently disable function graph tracing
  *
  * In case of an error int function graph tracing, this is called
  * to try to keep function graph tracing from causing any more harm.
@@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 
        /*
         * Skip graph tracing if the return location is served by direct trampoline,
-        * since call sequence and return addresses is unpredicatable anymore.
+        * since call sequence and return addresses are unpredictable anyway.
         * Ex: BPF trampoline may call original function and may skip frame
         * depending on type of BPF programs attached.
         */
index 3ba52d4..2e8a3fd 100644 (file)
@@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = {
 };
 
 /*
- * Used by the stack undwinder to know about dynamic ftrace trampolines.
+ * Used by the stack unwinder to know about dynamic ftrace trampolines.
  */
 struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
 {
@@ -1090,7 +1090,7 @@ struct ftrace_page {
        struct ftrace_page      *next;
        struct dyn_ftrace       *records;
        int                     index;
-       int                     size;
+       int                     order;
 };
 
 #define ENTRY_SIZE sizeof(struct dyn_ftrace)
@@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
                 * When the kernel is preemptive, tasks can be preempted
                 * while on a ftrace trampoline. Just scheduling a task on
                 * a CPU is not good enough to flush them. Calling
-                * synchornize_rcu_tasks() will wait for those tasks to
+                * synchronize_rcu_tasks() will wait for those tasks to
                 * execute and either schedule voluntarily or enter user space.
                 */
                if (IS_ENABLED(CONFIG_PREEMPTION))
@@ -3156,15 +3156,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
        if (WARN_ON(!count))
                return -EINVAL;
 
+       /* We want to fill as much as possible, with no empty pages */
        pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
-       order = get_count_order(pages);
-
-       /*
-        * We want to fill as much as possible. No more than a page
-        * may be empty.
-        */
-       if (!is_power_of_2(pages))
-               order--;
+       order = fls(pages) - 1;
 
  again:
        pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
@@ -3181,7 +3175,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
        ftrace_number_of_groups++;
 
        cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
-       pg->size = cnt;
+       pg->order = order;
 
        if (cnt > count)
                cnt = count;
@@ -3194,7 +3188,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
 {
        struct ftrace_page *start_pg;
        struct ftrace_page *pg;
-       int order;
        int cnt;
 
        if (!num_to_init)
@@ -3230,13 +3223,13 @@ ftrace_allocate_pages(unsigned long num_to_init)
  free_pages:
        pg = start_pg;
        while (pg) {
-               order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-               if (order >= 0)
-                       free_pages((unsigned long)pg->records, order);
+               if (pg->records) {
+                       free_pages((unsigned long)pg->records, pg->order);
+                       ftrace_number_of_pages -= 1 << pg->order;
+               }
                start_pg = pg->next;
                kfree(pg);
                pg = start_pg;
-               ftrace_number_of_pages -= 1 << order;
                ftrace_number_of_groups--;
        }
        pr_info("ftrace: FAILED to allocate memory for functions\n");
@@ -5407,7 +5400,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
  * @reset - non zero to reset all filters before applying this filter.
  *
  * Filters denote which functions should be enabled when tracing is enabled
- * If @ip is NULL, it failes to update filter.
+ * If @ip is NULL, it fails to update filter.
  */
 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
                         int remove, int reset)
@@ -5631,7 +5624,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
        parser = &iter->parser;
        if (trace_parser_loaded(parser)) {
-               ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+               int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
+               ftrace_process_regex(iter, parser->buffer,
+                                    parser->idx, enable);
        }
 
        trace_parser_put(parser);
@@ -6221,6 +6217,7 @@ static int ftrace_process_locs(struct module *mod,
        p = start;
        pg = start_pg;
        while (p < end) {
+               unsigned long end_offset;
                addr = ftrace_call_adjust(*p++);
                /*
                 * Some architecture linkers will pad between
@@ -6231,7 +6228,8 @@ static int ftrace_process_locs(struct module *mod,
                if (!addr)
                        continue;
 
-               if (pg->index == pg->size) {
+               end_offset = (pg->index+1) * sizeof(pg->records[0]);
+               if (end_offset > PAGE_SIZE << pg->order) {
                        /* We should have allocated enough */
                        if (WARN_ON(!pg->next))
                                break;
@@ -6359,7 +6357,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
        }
 }
 
-/* Clear any records from hashs */
+/* Clear any records from hashes */
 static void clear_mod_from_hashes(struct ftrace_page *pg)
 {
        struct trace_array *tr;
@@ -6400,7 +6398,6 @@ void ftrace_release_mod(struct module *mod)
        struct ftrace_page **last_pg;
        struct ftrace_page *tmp_page = NULL;
        struct ftrace_page *pg;
-       int order;
 
        mutex_lock(&ftrace_lock);
 
@@ -6451,12 +6448,12 @@ void ftrace_release_mod(struct module *mod)
                /* Needs to be called outside of ftrace_lock */
                clear_mod_from_hashes(pg);
 
-               order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-               if (order >= 0)
-                       free_pages((unsigned long)pg->records, order);
+               if (pg->records) {
+                       free_pages((unsigned long)pg->records, pg->order);
+                       ftrace_number_of_pages -= 1 << pg->order;
+               }
                tmp_page = pg->next;
                kfree(pg);
-               ftrace_number_of_pages -= 1 << order;
                ftrace_number_of_groups--;
        }
 }
@@ -6774,7 +6771,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
        struct ftrace_mod_map *mod_map = NULL;
        struct ftrace_init_func *func, *func_next;
        struct list_head clear_hash;
-       int order;
 
        INIT_LIST_HEAD(&clear_hash);
 
@@ -6812,10 +6808,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
                ftrace_update_tot_cnt--;
                if (!pg->index) {
                        *last_pg = pg->next;
-                       order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-                       if (order >= 0)
-                               free_pages((unsigned long)pg->records, order);
-                       ftrace_number_of_pages -= 1 << order;
+                       if (pg->records) {
+                               free_pages((unsigned long)pg->records, pg->order);
+                               ftrace_number_of_pages -= 1 << pg->order;
+                       }
                        ftrace_number_of_groups--;
                        kfree(pg);
                        pg = container_of(last_pg, struct ftrace_page, next);
index 68744c5..2c0ee64 100644 (file)
@@ -287,17 +287,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define TS_MASK                ((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST  (~TS_MASK)
 
-/**
- * ring_buffer_event_time_stamp - return the event's extended timestamp
- * @event: the event to get the timestamp of
- *
- * Returns the extended timestamp associated with a data event.
- * An extended time_stamp is a 64-bit timestamp represented
- * internally in a special way that makes the best use of space
- * contained within a ring buffer event.  This function decodes
- * it and maps it to a straight u64 value.
- */
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+static u64 rb_event_time_stamp(struct ring_buffer_event *event)
 {
        u64 ts;
 
@@ -487,6 +477,8 @@ struct rb_time_struct {
 #endif
 typedef struct rb_time_struct rb_time_t;
 
+#define MAX_NEST       5
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -524,6 +516,7 @@ struct ring_buffer_per_cpu {
        unsigned long                   read_bytes;
        rb_time_t                       write_stamp;
        rb_time_t                       before_stamp;
+       u64                             event_stamp[MAX_NEST];
        u64                             read_stamp;
        /* ring buffer pages to update, > 0 to add, < 0 to remove */
        long                            nr_pages_to_update;
@@ -749,6 +742,99 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 }
 #endif
 
+/*
+ * Enable this to make sure that the event passed to
+ * ring_buffer_event_time_stamp() is not committed and also
+ * is on the buffer that was passed in.
+ */
+//#define RB_VERIFY_EVENT
+#ifdef RB_VERIFY_EVENT
+static struct list_head *rb_list_head(struct list_head *list);
+static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+                        void *event)
+{
+       struct buffer_page *page = cpu_buffer->commit_page;
+       struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page);
+       struct list_head *next;
+       long commit, write;
+       unsigned long addr = (unsigned long)event;
+       bool done = false;
+       int stop = 0;
+
+       /* Make sure the event exists and is not committed yet */
+       do {
+               if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
+                       done = true;
+               commit = local_read(&page->page->commit);
+               write = local_read(&page->write);
+               if (addr >= (unsigned long)&page->page->data[commit] &&
+                   addr < (unsigned long)&page->page->data[write])
+                       return;
+
+               next = rb_list_head(page->list.next);
+               page = list_entry(next, struct buffer_page, list);
+       } while (!done);
+       WARN_ON_ONCE(1);
+}
+#else
+static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
+                        void *event)
+{
+}
+#endif
+
+
+static inline u64 rb_time_stamp(struct trace_buffer *buffer);
+
+/**
+ * ring_buffer_event_time_stamp - return the event's current time stamp
+ * @buffer: The buffer that the event is on
+ * @event: the event to get the time stamp of
+ *
+ * Note, this must be called after @event is reserved, and before it is
+ * committed to the ring buffer. And must be called from the same
+ * context where the event was reserved (normal, softirq, irq, etc).
+ *
+ * Returns the time stamp associated with the current event.
+ * If the event has an extended time stamp, then that is used as
+ * the time stamp to return.
+ * In the highly unlikely case that the event was nested more than
+ * the max nesting, then the write_stamp of the buffer is returned,
+ * otherwise the current time is returned, but really neither of
+ * the last two cases should ever happen.
+ */
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+                                struct ring_buffer_event *event)
+{
+       struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()];
+       unsigned int nest;
+       u64 ts;
+
+       /* If the event includes an absolute time, then just use that */
+       if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
+               return rb_event_time_stamp(event);
+
+       nest = local_read(&cpu_buffer->committing);
+       verify_event(cpu_buffer, event);
+       if (WARN_ON_ONCE(!nest))
+               goto fail;
+
+       /* Read the current saved nesting level time stamp */
+       if (likely(--nest < MAX_NEST))
+               return cpu_buffer->event_stamp[nest];
+
+       /* Shouldn't happen, warn if it does */
+       WARN_ONCE(1, "nest (%d) greater than max", nest);
+
+ fail:
+       /* Can only fail on 32 bit */
+       if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
+               /* Screw it, just read the current time */
+               ts = rb_time_stamp(cpu_buffer->buffer);
+
+       return ts;
+}
+
 /**
  * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
@@ -994,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer)
        return ts << DEBUG_SHIFT;
 }
 
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer)
 {
        u64 time;
 
@@ -2710,6 +2796,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 {
        unsigned length = info->length;
        u64 delta = info->delta;
+       unsigned int nest = local_read(&cpu_buffer->committing) - 1;
+
+       if (!WARN_ON_ONCE(nest >= MAX_NEST))
+               cpu_buffer->event_stamp[nest] = info->ts;
 
        /*
         * If we need to add a timestamp, then we
@@ -2766,7 +2856,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event)
                return 0;
 
        case RINGBUF_TYPE_TIME_EXTEND:
-               return ring_buffer_event_time_stamp(event);
+               return rb_event_time_stamp(event);
 
        case RINGBUF_TYPE_TIME_STAMP:
                return 0;
@@ -3064,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * is called before preempt_count() is updated, since the check will
  * be on the NORMAL bit, the TRANSITION bit will then be set. If an
  * NMI then comes in, it will set the NMI bit, but when the NMI code
- * does the trace_recursive_unlock() it will clear the TRANSTION bit
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
  * and leave the NMI bit set. But this is fine, because the interrupt
  * code that set the TRANSITION bit will then clear the NMI bit when it
  * calls trace_recursive_unlock(). If another NMI comes in, it will
@@ -3212,13 +3302,13 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
                switch (event->type_len) {
 
                case RINGBUF_TYPE_TIME_EXTEND:
-                       delta = ring_buffer_event_time_stamp(event);
+                       delta = rb_event_time_stamp(event);
                        ts += delta;
                        pr_warn("  [%lld] delta:%lld TIME EXTEND\n", ts, delta);
                        break;
 
                case RINGBUF_TYPE_TIME_STAMP:
-                       delta = ring_buffer_event_time_stamp(event);
+                       delta = rb_event_time_stamp(event);
                        ts = delta;
                        pr_warn("  [%lld] absolute:%lld TIME STAMP\n", ts, delta);
                        break;
@@ -3289,12 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
                switch (event->type_len) {
 
                case RINGBUF_TYPE_TIME_EXTEND:
-                       delta = ring_buffer_event_time_stamp(event);
+                       delta = rb_event_time_stamp(event);
                        ts += delta;
                        break;
 
                case RINGBUF_TYPE_TIME_STAMP:
-                       delta = ring_buffer_event_time_stamp(event);
+                       delta = rb_event_time_stamp(event);
                        ts = delta;
                        break;
 
@@ -3451,7 +3541,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                                    info->after, ts)) {
                        /* Nothing came after this event between C and E */
                        info->delta = ts - info->after;
-                       info->ts = ts;
                } else {
                        /*
                         * Interrupted between C and E:
@@ -3463,6 +3552,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                         */
                        info->delta = 0;
                }
+               info->ts = ts;
                info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
        }
 
@@ -4256,12 +4346,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                return;
 
        case RINGBUF_TYPE_TIME_EXTEND:
-               delta = ring_buffer_event_time_stamp(event);
+               delta = rb_event_time_stamp(event);
                cpu_buffer->read_stamp += delta;
                return;
 
        case RINGBUF_TYPE_TIME_STAMP:
-               delta = ring_buffer_event_time_stamp(event);
+               delta = rb_event_time_stamp(event);
                cpu_buffer->read_stamp = delta;
                return;
 
@@ -4286,12 +4376,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
                return;
 
        case RINGBUF_TYPE_TIME_EXTEND:
-               delta = ring_buffer_event_time_stamp(event);
+               delta = rb_event_time_stamp(event);
                iter->read_stamp += delta;
                return;
 
        case RINGBUF_TYPE_TIME_STAMP:
-               delta = ring_buffer_event_time_stamp(event);
+               delta = rb_event_time_stamp(event);
                iter->read_stamp = delta;
                return;
 
@@ -4544,7 +4634,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 
        case RINGBUF_TYPE_TIME_STAMP:
                if (ts) {
-                       *ts = ring_buffer_event_time_stamp(event);
+                       *ts = rb_event_time_stamp(event);
                        ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
                                                         cpu_buffer->cpu, ts);
                }
@@ -4635,7 +4725,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
        case RINGBUF_TYPE_TIME_STAMP:
                if (ts) {
-                       *ts = ring_buffer_event_time_stamp(event);
+                       *ts = rb_event_time_stamp(event);
                        ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
                                                         cpu_buffer->cpu, ts);
                }
@@ -5021,6 +5111,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
        rb_time_set(&cpu_buffer->write_stamp, 0);
        rb_time_set(&cpu_buffer->before_stamp, 0);
 
+       memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp));
+
        cpu_buffer->lost_events = 0;
        cpu_buffer->last_overrun = 0;
 
index a4b4bbf..0b15e97 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Test module for in-kernel sythetic event creation and generation.
+ * Test module for in-kernel synthetic event creation and generation.
  *
  * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
  */
index 915fe87..560e4c8 100644 (file)
@@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list)
  * @filtered_pids: The list of pids to check
  * @search_pid: The PID to find in @filtered_pids
  *
- * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
  */
 bool
 trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
@@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids,
                       struct task_struct *task)
 {
        /*
-        * If filterd_no_pids is not empty, and the task's pid is listed
+        * If filtered_no_pids is not empty, and the task's pid is listed
         * in filtered_no_pids, then return true.
         * Otherwise, if filtered_pids is empty, that means we can
         * trace all tasks. If it has content, then only trace pids
@@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
 
        (*pos)++;
 
-       /* pid already is +1 of the actual prevous bit */
+       /* pid already is +1 of the actual previous bit */
        pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
 
        /* Return pid + 1 to allow zero to be represented */
@@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
        if (!buf->buffer)
                return trace_clock_local();
 
-       ts = ring_buffer_time_stamp(buf->buffer, cpu);
+       ts = ring_buffer_time_stamp(buf->buffer);
        ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
 
        return ts;
@@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock);
  * The content of events may become garbage if we allow other process consumes
  * these events concurrently:
  *   A) the page of the consumed events may become a normal page
- *      (not reader page) in ring buffer, and this page will be rewrited
+ *      (not reader page) in ring buffer, and this page will be rewritten
  *      by events producer.
  *   B) The page of the consumed events may become a page for splice_read,
  *      and this page will be returned to system.
@@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
 #undef C
 #define C(a, b) b
 
-/* These must match the bit postions in trace_iterator_flags */
+/* These must match the bit positions in trace_iterator_flags */
 static const char *trace_options[] = {
        TRACE_FLAGS
        NULL
@@ -2390,14 +2390,13 @@ static void tracing_stop_tr(struct trace_array *tr)
 
 static int trace_save_cmdline(struct task_struct *tsk)
 {
-       unsigned pid, idx;
+       unsigned tpid, idx;
 
        /* treat recording of idle task as a success */
        if (!tsk->pid)
                return 1;
 
-       if (unlikely(tsk->pid > PID_MAX_DEFAULT))
-               return 0;
+       tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
 
        /*
         * It's not the end of the world if we don't get
@@ -2408,26 +2407,15 @@ static int trace_save_cmdline(struct task_struct *tsk)
        if (!arch_spin_trylock(&trace_cmdline_lock))
                return 0;
 
-       idx = savedcmd->map_pid_to_cmdline[tsk->pid];
+       idx = savedcmd->map_pid_to_cmdline[tpid];
        if (idx == NO_CMDLINE_MAP) {
                idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
 
-               /*
-                * Check whether the cmdline buffer at idx has a pid
-                * mapped. We are going to overwrite that entry so we
-                * need to clear the map_pid_to_cmdline. Otherwise we
-                * would read the new comm for the old pid.
-                */
-               pid = savedcmd->map_cmdline_to_pid[idx];
-               if (pid != NO_CMDLINE_MAP)
-                       savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
-
-               savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
-               savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
-
+               savedcmd->map_pid_to_cmdline[tpid] = idx;
                savedcmd->cmdline_idx = idx;
        }
 
+       savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
        set_cmdline(idx, tsk->comm);
 
        arch_spin_unlock(&trace_cmdline_lock);
@@ -2438,6 +2426,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
 static void __trace_find_cmdline(int pid, char comm[])
 {
        unsigned map;
+       int tpid;
 
        if (!pid) {
                strcpy(comm, "<idle>");
@@ -2449,16 +2438,16 @@ static void __trace_find_cmdline(int pid, char comm[])
                return;
        }
 
-       if (pid > PID_MAX_DEFAULT) {
-               strcpy(comm, "<...>");
-               return;
+       tpid = pid & (PID_MAX_DEFAULT - 1);
+       map = savedcmd->map_pid_to_cmdline[tpid];
+       if (map != NO_CMDLINE_MAP) {
+               tpid = savedcmd->map_cmdline_to_pid[map];
+               if (tpid == pid) {
+                       strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+                       return;
+               }
        }
-
-       map = savedcmd->map_pid_to_cmdline[pid];
-       if (map != NO_CMDLINE_MAP)
-               strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
-       else
-               strcpy(comm, "<...>");
+       strcpy(comm, "<...>");
 }
 
 void trace_find_cmdline(int pid, char comm[])
@@ -2737,12 +2726,13 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
                          unsigned int trace_ctx)
 {
        struct ring_buffer_event *entry;
+       struct trace_array *tr = trace_file->tr;
        int val;
 
-       *current_rb = trace_file->tr->array_buffer.buffer;
+       *current_rb = tr->array_buffer.buffer;
 
-       if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
-            (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+       if (!tr->no_filter_buffering_ref &&
+           (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
            (entry = this_cpu_read(trace_buffered_event))) {
                /* Try to use the per cpu buffer first */
                val = this_cpu_inc_return(trace_buffered_event_cnt);
@@ -3116,6 +3106,40 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 
 #endif /* CONFIG_STACKTRACE */
 
+static inline void
+func_repeats_set_delta_ts(struct func_repeats_entry *entry,
+                         unsigned long long delta)
+{
+       entry->bottom_delta_ts = delta & U32_MAX;
+       entry->top_delta_ts = (delta >> 32);
+}
+
+void trace_last_func_repeats(struct trace_array *tr,
+                            struct trace_func_repeats *last_info,
+                            unsigned int trace_ctx)
+{
+       struct trace_buffer *buffer = tr->array_buffer.buffer;
+       struct func_repeats_entry *entry;
+       struct ring_buffer_event *event;
+       u64 delta;
+
+       event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS,
+                                           sizeof(*entry), trace_ctx);
+       if (!event)
+               return;
+
+       delta = ring_buffer_event_time_stamp(buffer, event) -
+               last_info->ts_last_call;
+
+       entry = ring_buffer_event_data(event);
+       entry->ip = last_info->ip;
+       entry->parent_ip = last_info->parent_ip;
+       entry->count = last_info->count;
+       func_repeats_set_delta_ts(entry, delta);
+
+       __buffer_unlock_commit(buffer, event);
+}
+
 /* created for use with alloc_percpu */
 struct trace_buffer_struct {
        int nesting;
@@ -3368,7 +3392,7 @@ int trace_array_vprintk(struct trace_array *tr,
  * buffer (use trace_printk() for that), as writing into the top level
  * buffer should only have events that can be individually disabled.
  * trace_printk() is only used for debugging a kernel, and should not
- * be ever encorporated in normal use.
+ * be ever incorporated in normal use.
  *
  * trace_array_printk() can be used, as it will not add noise to the
  * top level tracing buffer.
@@ -3562,6 +3586,204 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
        return tmp;
 }
 
+/* Returns true if the string is safe to dereference from an event */
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+{
+       unsigned long addr = (unsigned long)str;
+       struct trace_event *trace_event;
+       struct trace_event_call *event;
+
+       /* OK if part of the event data */
+       if ((addr >= (unsigned long)iter->ent) &&
+           (addr < (unsigned long)iter->ent + iter->ent_size))
+               return true;
+
+       /* OK if part of the temp seq buffer */
+       if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
+           (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+               return true;
+
+       /* Core rodata can not be freed */
+       if (is_kernel_rodata(addr))
+               return true;
+
+       if (trace_is_tracepoint_string(str))
+               return true;
+
+       /*
+        * Now this could be a module event, referencing core module
+        * data, which is OK.
+        */
+       if (!iter->ent)
+               return false;
+
+       trace_event = ftrace_find_event(iter->ent->type);
+       if (!trace_event)
+               return false;
+
+       event = container_of(trace_event, struct trace_event_call, event);
+       if (!event->mod)
+               return false;
+
+       /* Would rather have rodata, but this will suffice */
+       if (within_module_core(addr, event->mod))
+               return true;
+
+       return false;
+}
+
+static const char *show_buffer(struct trace_seq *s)
+{
+       struct seq_buf *seq = &s->seq;
+
+       seq_buf_terminate(seq);
+
+       return seq->buffer;
+}
+
+static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
+
+static int test_can_verify_check(const char *fmt, ...)
+{
+       char buf[16];
+       va_list ap;
+       int ret;
+
+       /*
+        * The verifier depends on vsnprintf() modifying the va_list
+        * passed to it, where it is sent as a reference. Some architectures
+        * (like x86_32) pass it by value, which means that vsnprintf()
+        * does not modify the va_list passed to it, and the verifier
+        * would then need to be able to understand all the values that
+        * vsnprintf can use. If it is passed by value, then the verifier
+        * is disabled.
+        */
+       va_start(ap, fmt);
+       vsnprintf(buf, 16, "%d", ap);
+       ret = va_arg(ap, int);
+       va_end(ap);
+
+       return ret;
+}
+
+static void test_can_verify(void)
+{
+       if (!test_can_verify_check("%d %d", 0, 1)) {
+               pr_info("trace event string verifier disabled\n");
+               static_branch_inc(&trace_no_verify);
+       }
+}
+
+/**
+ * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * @iter: The iterator that holds the seq buffer and the event being printed
+ * @fmt: The format used to print the event
+ * @ap: The va_list holding the data to print from @fmt.
+ *
+ * This writes the data into the @iter->seq buffer using the data from
+ * @fmt and @ap. If the format has a %s, then the source of the string
+ * is examined to make sure it is safe to print, otherwise it will
+ * warn and print "[UNSAFE-MEMORY]" in place of the dereferenced string
+ * pointer.
+ */
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+                        va_list ap)
+{
+       const char *p = fmt;
+       const char *str;
+       int i, j;
+
+       if (WARN_ON_ONCE(!fmt))
+               return;
+
+       if (static_branch_unlikely(&trace_no_verify))
+               goto print;
+
+       /* Don't bother checking when doing a ftrace_dump() */
+       if (iter->fmt == static_fmt_buf)
+               goto print;
+
+       while (*p) {
+               j = 0;
+
+               /* We only care about %s and variants */
+               for (i = 0; p[i]; i++) {
+                       if (i + 1 >= iter->fmt_size) {
+                               /*
+                                * If we can't expand the copy buffer,
+                                * just print it.
+                                */
+                               if (!trace_iter_expand_format(iter))
+                                       goto print;
+                       }
+
+                       if (p[i] == '\\' && p[i+1]) {
+                               i++;
+                               continue;
+                       }
+                       if (p[i] == '%') {
+                               /* Need to test cases like %08.*s */
+                               for (j = 1; p[i+j]; j++) {
+                                       if (isdigit(p[i+j]) ||
+                                           p[i+j] == '*' ||
+                                           p[i+j] == '.')
+                                               continue;
+                                       break;
+                               }
+                               if (p[i+j] == 's')
+                                       break;
+                       }
+                       j = 0;
+               }
+               /* If no %s found then just print normally */
+               if (!p[i])
+                       break;
+
+               /* Copy up to the %s, and print that */
+               strncpy(iter->fmt, p, i);
+               iter->fmt[i] = '\0';
+               trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+
+               /* The ap now points to the string data of the %s */
+               str = va_arg(ap, const char *);
+
+               /*
+                * If you hit this warning, it is likely that the
+                * trace event in question used %s on a string that
+                * was saved at the time of the event, but may not be
+                * around when the trace is read. Use __string(),
+                * __assign_str() and __get_str() helpers in the TRACE_EVENT()
+                * instead. See samples/trace_events/trace-events-sample.h
+                * for reference.
+                */
+               if (WARN_ONCE(!trace_safe_str(iter, str),
+                             "fmt: '%s' current_buffer: '%s'",
+                             fmt, show_buffer(&iter->seq))) {
+                       int ret;
+
+                       /* Try to safely read the string */
+                       ret = strncpy_from_kernel_nofault(iter->fmt, str,
+                                                         iter->fmt_size);
+                       if (ret < 0)
+                               trace_seq_printf(&iter->seq, "(0x%px)", str);
+                       else
+                               trace_seq_printf(&iter->seq, "(0x%px:%s)",
+                                                str, iter->fmt);
+                       str = "[UNSAFE-MEMORY]";
+                       strcpy(iter->fmt, "%s");
+               } else {
+                       strncpy(iter->fmt, p + i, j + 1);
+                       iter->fmt[j+1] = '\0';
+               }
+               trace_seq_printf(&iter->seq, iter->fmt, str);
+
+               p += i + j + 1;
+       }
+ print:
+       if (*p)
+               trace_seq_vprintf(&iter->seq, p, ap);
+}
+
 const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
 {
        const char *p, *new_fmt;
@@ -6768,7 +6990,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
        if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
                /* do not add \n before testing triggers, but add \0 */
                entry->buf[cnt] = '\0';
-               tt = event_triggers_call(tr->trace_marker_file, entry, event);
+               tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event);
        }
 
        if (entry->buf[cnt - 1] != '\n') {
@@ -6976,31 +7198,34 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
        return ret;
 }
 
-int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe)
+{
+       if (rbe == this_cpu_read(trace_buffered_event))
+               return ring_buffer_time_stamp(buffer);
+
+       return ring_buffer_event_time_stamp(buffer, rbe);
+}
+
+/*
+ * Set or disable using the per CPU trace_buffered_event when possible.
+ */
+int tracing_set_filter_buffering(struct trace_array *tr, bool set)
 {
        int ret = 0;
 
        mutex_lock(&trace_types_lock);
 
-       if (abs && tr->time_stamp_abs_ref++)
+       if (set && tr->no_filter_buffering_ref++)
                goto out;
 
-       if (!abs) {
-               if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+       if (!set) {
+               if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
                        ret = -EINVAL;
                        goto out;
                }
 
-               if (--tr->time_stamp_abs_ref)
-                       goto out;
+               --tr->no_filter_buffering_ref;
        }
-
-       ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
-       if (tr->max_buffer.buffer)
-               ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
-#endif
  out:
        mutex_unlock(&trace_types_lock);
 
@@ -7336,11 +7561,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
  * @cmd: The tracing command that caused the error
  * @str: The string to position the caret at within @cmd
  *
- * Finds the position of the first occurence of @str within @cmd.  The
+ * Finds the position of the first occurrence of @str within @cmd.  The
  * return value can be passed to tracing_log_err() for caret placement
  * within @cmd.
  *
- * Returns the index within @cmd of the first occurence of @str or 0
+ * Returns the index within @cmd of the first occurrence of @str or 0
  * if @str was not found.
  */
 unsigned int err_pos(char *cmd, const char *str)
@@ -7890,7 +8115,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
                trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
                                                                t, usec_rem);
 
-               t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+               t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer));
                usec_rem = do_div(t, USEC_PER_SEC);
                trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
        } else {
@@ -7899,7 +8124,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
                                ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
 
                trace_seq_printf(s, "now ts: %llu\n",
-                               ring_buffer_time_stamp(trace_buf->buffer, cpu));
+                               ring_buffer_time_stamp(trace_buf->buffer));
        }
 
        cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
@@ -8906,6 +9131,7 @@ static int __remove_instance(struct trace_array *tr)
        ftrace_clear_pids(tr);
        ftrace_destroy_function_files(tr);
        tracefs_remove(tr->dir);
+       free_percpu(tr->last_func_repeats);
        free_trace_buffers(tr);
 
        for (i = 0; i < tr->nr_topts; i++) {
@@ -9123,7 +9349,7 @@ int tracing_init_dentry(void)
         * As there may still be users that expect the tracing
         * files to exist in debugfs/tracing, we must automount
         * the tracefs file system there, so older tools still
-        * work with the newer kerenl.
+        * work with the newer kernel.
         */
        tr->dir = debugfs_create_automount("tracing", NULL,
                                           trace_automount, NULL);
@@ -9676,6 +9902,8 @@ __init static int tracer_alloc_buffers(void)
 
        register_snapshot_cmd();
 
+       test_can_verify();
+
        return 0;
 
 out_free_savedcmd:
index a6446c0..cd80d04 100644 (file)
@@ -45,6 +45,7 @@ enum trace_type {
        TRACE_BPUTS,
        TRACE_HWLAT,
        TRACE_RAW_DATA,
+       TRACE_FUNC_REPEATS,
 
        __TRACE_LAST_TYPE,
 };
@@ -261,6 +262,17 @@ struct cond_snapshot {
        cond_update_fn_t                update;
 };
 
+/*
+ * struct trace_func_repeats - used to keep track of the consecutive
+ * (on the same CPU) calls of a single function.
+ */
+struct trace_func_repeats {
+       unsigned long   ip;
+       unsigned long   parent_ip;
+       unsigned long   count;
+       u64             ts_last_call;
+};
+
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
@@ -352,11 +364,12 @@ struct trace_array {
        /* function tracing enabled */
        int                     function_enabled;
 #endif
-       int                     time_stamp_abs_ref;
+       int                     no_filter_buffering_ref;
        struct list_head        hist_vars;
 #ifdef CONFIG_TRACER_SNAPSHOT
        struct cond_snapshot    *cond_snapshot;
 #endif
+       struct trace_func_repeats       __percpu *last_func_repeats;
 };
 
 enum {
@@ -372,7 +385,8 @@ extern int tracing_check_open_get_tr(struct trace_array *tr);
 extern struct trace_array *trace_array_find(const char *instance);
 extern struct trace_array *trace_array_find_get(const char *instance);
 
-extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
+extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
 extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
 
 extern bool trace_clock_in_ns(struct trace_array *tr);
@@ -441,6 +455,8 @@ extern void __ftrace_bad_type(void);
                          TRACE_GRAPH_ENT);             \
                IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,      \
                          TRACE_GRAPH_RET);             \
+               IF_ASSIGN(var, ent, struct func_repeats_entry,          \
+                         TRACE_FUNC_REPEATS);                          \
                __ftrace_bad_type();                                    \
        } while (0)
 
@@ -581,7 +597,10 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
                                        struct ring_buffer_event *event);
 
+bool trace_is_tracepoint_string(const char *str);
 const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+                        va_list ap);
 
 int trace_empty(struct trace_iterator *iter);
 
@@ -676,6 +695,10 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
 }
 #endif /* CONFIG_STACKTRACE */
 
+void trace_last_func_repeats(struct trace_array *tr,
+                            struct trace_func_repeats *last_info,
+                            unsigned int trace_ctx);
+
 extern u64 ftrace_now(int cpu);
 
 extern void trace_find_cmdline(int pid, char comm[]);
@@ -1329,7 +1352,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
        unsigned long eflags = file->flags;
 
        if (eflags & EVENT_FILE_FL_TRIGGER_COND)
-               *tt = event_triggers_call(file, entry, event);
+               *tt = event_triggers_call(file, buffer, entry, event);
 
        if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
            (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1343,7 +1366,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 
 /**
  * event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
@@ -1370,7 +1393,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
 
 /**
  * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
@@ -1626,7 +1649,7 @@ extern int register_trigger_hist_enable_disable_cmds(void);
  */
 struct event_trigger_ops {
        void                    (*func)(struct event_trigger_data *data,
-                                       void *rec,
+                                       struct trace_buffer *buffer, void *rec,
                                        struct ring_buffer_event *rbe);
        int                     (*init)(struct event_trigger_ops *ops,
                                        struct event_trigger_data *data);
index aaf6793..c1637f9 100644 (file)
@@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void)
 {
        unsigned long flags;
        int this_cpu;
-       u64 now;
+       u64 now, prev_time;
 
        raw_local_irq_save(flags);
 
        this_cpu = raw_smp_processor_id();
-       now = sched_clock_cpu(this_cpu);
+
        /*
-        * If in an NMI context then dont risk lockups and return the
-        * cpu_clock() time:
+        * The global clock "guarantees" that the events are ordered
+        * between CPUs. But if two events on two different CPUs call
+        * trace_clock_global at roughly the same time, it really does
+        * not matter which one gets the earlier time. Just make sure
+        * that the same CPU will always show a monotonic clock.
+        *
+        * Use a read memory barrier to get the latest written
+        * time that was recorded.
         */
-       if (unlikely(in_nmi()))
-               goto out;
+       smp_rmb();
+       prev_time = READ_ONCE(trace_clock_struct.prev_time);
+       now = sched_clock_cpu(this_cpu);
 
-       arch_spin_lock(&trace_clock_struct.lock);
+       /* Make sure that now is always greater than prev_time */
+       if ((s64)(now - prev_time) < 0)
+               now = prev_time + 1;
 
        /*
-        * TODO: if this happens often then maybe we should reset
-        * my_scd->clock to prev_time+1, to make sure
-        * we start ticking with the local clock from now on?
+        * If in an NMI context then don't risk lockups and simply return
+        * the current time.
         */
-       if ((s64)(now - trace_clock_struct.prev_time) < 0)
-               now = trace_clock_struct.prev_time + 1;
+       if (unlikely(in_nmi()))
+               goto out;
 
-       trace_clock_struct.prev_time = now;
+       /* Tracing can cause strange recursion, always use a try lock */
+       if (arch_spin_trylock(&trace_clock_struct.lock)) {
+               /* Reread prev_time in case it was already updated */
+               prev_time = READ_ONCE(trace_clock_struct.prev_time);
+               if ((s64)(now - prev_time) < 0)
+                       now = prev_time + 1;
 
-       arch_spin_unlock(&trace_clock_struct.lock);
+               trace_clock_struct.prev_time = now;
 
+               /* The unlock acts as the wmb for the above rmb */
+               arch_spin_unlock(&trace_clock_struct.lock);
+       }
  out:
        raw_local_irq_restore(flags);
 
index 4547ac5..251c819 100644 (file)
@@ -338,3 +338,25 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
                 __entry->nmi_total_ts,
                 __entry->nmi_count)
 );
+
+#define FUNC_REPEATS_GET_DELTA_TS(entry) /* rebuild the 48-bit delta */ \
+       (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts)
+
+FTRACE_ENTRY(func_repeats, func_repeats_entry,
+
+       TRACE_FUNC_REPEATS,
+
+       F_STRUCT(
+               __field(        unsigned long,  ip              )
+               __field(        unsigned long,  parent_ip       )
+               __field(        u16     ,       count           )
+               __field(        u16     ,       top_delta_ts    ) /* delta bits 47..32 */
+               __field(        u32     ,       bottom_delta_ts ) /* delta bits 31..0 */
+       ),
+
+       F_printk(" %ps <-%ps\t(repeats:%u  delta: -%llu)",
+                (void *)__entry->ip,
+                (void *)__entry->parent_ip,
+                __entry->count,
+                FUNC_REPEATS_GET_DELTA_TS(__entry))
+);
index 288ad2c..03be443 100644 (file)
@@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
 /*
  * Force it to be aligned to unsigned long to avoid misaligned accesses
- * suprises
+ * surprises
  */
 typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
        perf_trace_t;
index a3563af..80e9698 100644 (file)
@@ -217,6 +217,214 @@ int trace_event_get_offsets(struct trace_event_call *call)
        return tail->offset + tail->size;
 }
 
+/*
+ * Return true only if @fmt starts with "REC->" and names a field of @call
+ * whose type string contains '[' (an array, which is OK to dereference).
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+       struct trace_event_fields *field = call->class->fields_array;
+       const char *array_descriptor;
+       const char *p = fmt;
+       int len;
+
+       if (!(len = str_has_prefix(fmt, "REC->")))
+               return false;
+       fmt += len; /* step past the matched prefix */
+       for (p = fmt; *p; p++) {
+               if (!isalnum(*p) && *p != '_')
+                       break;
+       }
+       len = p - fmt; /* length of the field identifier */
+
+       for (; field->type; field++) {
+               if (strncmp(field->name, fmt, len) ||
+                   field->name[len])
+                       continue;
+               array_descriptor = strchr(field->type, '[');
+               /* Only an array type ('[' present) is OK to dereference. */
+               return array_descriptor != NULL;
+       }
+       return false;
+}
+
+/*
+ * Examine the print fmt of the event, looking for %p* specifiers that
+ * dereference a pointer recorded in the trace event, since the pointer
+ * may be printed long after its target was freed. A dereference is OK
+ * when the argument is the address of, or an array in, the event itself.
+ */
+static void test_event_printk(struct trace_event_call *call)
+{
+       u64 dereference_flags = 0; /* bit N: arg N is dereferenced by %p* */
+       bool first = true; /* still inside the leading format string */
+       const char *fmt, *c, *r, *a;
+       int parens = 0;
+       char in_quote = 0;
+       int start_arg = 0;
+       int arg = 0;
+       int i;
+
+       fmt = call->print_fmt;
+
+       if (!fmt)
+               return;
+
+       for (i = 0; fmt[i]; i++) {
+               switch (fmt[i]) {
+               case '\\':
+                       i++; /* skip the escaped character */
+                       if (!fmt[i])
+                               return;
+                       continue;
+               case '"':
+               case '\'':
+                       /*
+                        * The print fmt starts with a string that
+                        * is processed first to find %p* usage,
+                        * then after the first string, the print fmt
+                        * contains arguments that are used to check
+                        * if the dereferenced %p* usage is safe.
+                        */
+                       if (first) {
+                               if (fmt[i] == '\'')
+                                       continue;
+                               if (in_quote) {
+                                       arg = 0;
+                                       first = false;
+                                       /*
+                                        * If there were no %p* uses,
+                                        * the fmt is OK.
+                                        */
+                                       if (!dereference_flags)
+                                               return;
+                               }
+                       }
+                       if (in_quote) {
+                               if (in_quote == fmt[i])
+                                       in_quote = 0;
+                       } else {
+                               in_quote = fmt[i];
+                       }
+                       continue;
+               case '%':
+                       if (!first || !in_quote)
+                               continue;
+                       i++;
+                       if (!fmt[i])
+                               return;
+                       switch (fmt[i]) {
+                       case '%':
+                               continue;
+                       case 'p':
+                               /* Find dereferencing fields */
+                               switch (fmt[i + 1]) {
+                               case 'B': case 'R': case 'r':
+                               case 'b': case 'M': case 'm':
+                               case 'I': case 'i': case 'E':
+                               case 'U': case 'V': case 'N':
+                               case 'a': case 'd': case 'D':
+                               case 'g': case 't': case 'C':
+                               case 'O': case 'f':
+                                       if (WARN_ONCE(arg == 63,
+                                                     "Too many args for event: %s",
+                                                     trace_event_name(call)))
+                                               return;
+                                       dereference_flags |= 1ULL << arg;
+                               }
+                               break;
+                       default:
+                       {
+                               bool star = false;
+                               int j;
+
+                               /* %*s consumes an extra argument: count it. */
+                               for (j = 0; fmt[i + j]; j++) {
+                                       if (isdigit(fmt[i + j]) ||
+                                           fmt[i + j] == '.')
+                                               continue;
+                                       if (fmt[i + j] == '*') {
+                                               star = true;
+                                               continue;
+                                       }
+                                       if ((fmt[i + j] == 's') && star)
+                                               arg++;
+                                       break;
+                               }
+                               break;
+                       } /* default */
+
+                       } /* switch */
+                       arg++;
+                       continue;
+               case '(':
+                       if (in_quote)
+                               continue;
+                       parens++;
+                       continue;
+               case ')':
+                       if (in_quote)
+                               continue;
+                       parens--;
+                       if (WARN_ONCE(parens < 0,
+                                     "Paren mismatch for event: %s\narg='%s'\n%*s",
+                                     trace_event_name(call),
+                                     fmt + start_arg,
+                                     (i - start_arg) + 5, "^"))
+                               return;
+                       continue;
+               case ',':
+                       if (in_quote || parens)
+                               continue;
+                       i++;
+                       while (isspace(fmt[i]))
+                               i++;
+                       start_arg = i;
+                       if (!(dereference_flags & (1ULL << arg)))
+                               goto next_arg;
+
+                       /* Find the REC-> in the argument */
+                       c = strchr(fmt + i, ',');
+                       r = strstr(fmt + i, "REC->");
+                       if (r && (!c || r < c)) {
+                               /*
+                                * Addresses of events on the buffer,
+                                * or an array on the buffer is
+                                * OK to dereference.
+                                * There are ways to fool this, but
+                                * this is to catch common mistakes,
+                                * not malicious code.
+                                */
+                               a = strchr(fmt + i, '&');
+                               if ((a && (a < r)) || test_field(r, call))
+                                       dereference_flags &= ~(1ULL << arg);
+                       }
+               next_arg:
+                       i--; /* the loop's i++ then revisits fmt[start_arg] */
+                       arg++;
+               }
+       }
+
+       /*
+        * If you triggered the below warning, the trace event reported
+        * uses an unsafe dereference pointer %p*. As the data stored
+        * at the trace event time may no longer exist when the trace
+        * event is printed, dereferencing to the original source is
+        * unsafe. The source of the dereference must be copied into the
+        * event itself, and the dereference must access the copy instead.
+        */
+       if (WARN_ON_ONCE(dereference_flags)) {
+               arg = 1; /* report the argument position 1-based */
+               while (!(dereference_flags & 1)) {
+                       dereference_flags >>= 1;
+                       arg++;
+               }
+               pr_warn("event %s has unsafe dereference of argument %d\n",
+                       trace_event_name(call), arg);
+               pr_warn("print_fmt: %s\n", fmt);
+       }
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
        int id;
@@ -225,6 +433,8 @@ int trace_event_raw_init(struct trace_event_call *call)
        if (!id)
                return -ENODEV;
 
+       test_event_printk(call);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
@@ -2436,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
                }
 
                /*
-                * Since calls are grouped by systems, the likelyhood that the
+                * Since calls are grouped by systems, the likelihood that the
                 * next call in the iteration belongs to the same system as the
                 * previous call is high. As an optimization, we skip searching
                 * for a map[] that matches the call's system if the last call
@@ -2496,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
 }
 
 /*
- * Just create a decriptor for early init. A descriptor is required
+ * Just create a descriptor for early init. A descriptor is required
  * for enabling events at boot. We want to enable events before
  * the filesystem is initialized.
  */
index e91259f..c912403 100644 (file)
@@ -256,7 +256,7 @@ enum {
  * is "&&" we don't call update_preds(). Instead continue to "c". As the
  * next token after "c" is not "&&" but the end of input, we first process the
  * "&&" by calling update_preds() for the "&&" then we process the "||" by
- * callin updates_preds() with the values for processing "||".
+ * calling updates_preds() with the values for processing "||".
  *
  * What does that mean? What update_preds() does is to first save the "target"
  * of the program entry indexed by the current program entry's "target"
@@ -296,7 +296,7 @@ enum {
  * and "FALSE" the program entry after that, we are now done with the first
  * pass.
  *
- * Making the above "a || b && c" have a progam of:
+ * Making the above "a || b && c" have a program of:
  *  prog[0] = { "a", 1, 2 }
  *  prog[1] = { "b", 0, 2 }
  *  prog[2] = { "c", 0, 3 }
@@ -390,7 +390,7 @@ enum {
  * F: return FALSE
  *
  * As "r = a; if (!r) goto n5;" is obviously the same as
- * "if (!a) goto n5;" without doing anything we can interperate the
+ * "if (!a) goto n5;" without doing anything we can interpret the
  * program as:
  * n1: if (!a) goto n5;
  * n2: if (!b) goto n5;
@@ -1693,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
 
 /**
  * create_filter - create a filter for a trace_event_call
+ * @tr: the trace array associated with these events
  * @call: trace_event_call to create a filter for
  * @filter_str: filter string
  * @set_str: remember @filter_str and enable detailed error in filter
@@ -1741,8 +1742,8 @@ int create_event_filter(struct trace_array *tr,
 }
 
 /**
- * create_system_filter - create a filter for an event_subsystem
- * @system: event_subsystem to create a filter for
+ * create_system_filter - create a filter for an event subsystem
+ * @dir: the descriptor for the subsystem directory
  * @filter_str: filter string
  * @filterp: out param for created filter (always updated on return)
  *
@@ -1750,7 +1751,6 @@ int create_event_filter(struct trace_array *tr,
  * and always remembers @filter_str.
  */
 static int create_system_filter(struct trace_subsystem_dir *dir,
-                               struct trace_array *tr,
                                char *filter_str, struct event_filter **filterp)
 {
        struct filter_parse_error *pe = NULL;
@@ -1758,13 +1758,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
 
        err = create_filter_start(filter_str, true, &pe, filterp);
        if (!err) {
-               err = process_system_preds(dir, tr, pe, filter_str);
+               err = process_system_preds(dir, dir->tr, pe, filter_str);
                if (!err) {
                        /* System filters just show a default message */
                        kfree((*filterp)->filter_string);
                        (*filterp)->filter_string = NULL;
                } else {
-                       append_filter_err(tr, pe, *filterp);
+                       append_filter_err(dir->tr, pe, *filterp);
                }
        }
        create_filter_finish(pe);
@@ -1852,7 +1852,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
                goto out_unlock;
        }
 
-       err = create_system_filter(dir, tr, filter_string, &filter);
+       err = create_system_filter(dir, filter_string, &filter);
        if (filter) {
                /*
                 * No event actually uses the system filter
index 39ebe18..c1abd63 100644 (file)
@@ -81,6 +81,7 @@ struct hist_field;
 
 typedef u64 (*hist_field_fn_t) (struct hist_field *field,
                                struct tracing_map_elt *elt,
+                               struct trace_buffer *buffer,
                                struct ring_buffer_event *rbe,
                                void *event);
 
@@ -153,6 +154,7 @@ struct hist_field {
 
 static u64 hist_field_none(struct hist_field *field,
                           struct tracing_map_elt *elt,
+                          struct trace_buffer *buffer,
                           struct ring_buffer_event *rbe,
                           void *event)
 {
@@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field,
 
 static u64 hist_field_counter(struct hist_field *field,
                              struct tracing_map_elt *elt,
+                             struct trace_buffer *buffer,
                              struct ring_buffer_event *rbe,
                              void *event)
 {
@@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field,
 
 static u64 hist_field_string(struct hist_field *hist_field,
                             struct tracing_map_elt *elt,
+                            struct trace_buffer *buffer,
                             struct ring_buffer_event *rbe,
                             void *event)
 {
@@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field,
 
 static u64 hist_field_dynstring(struct hist_field *hist_field,
                                struct tracing_map_elt *elt,
+                               struct trace_buffer *buffer,
                                struct ring_buffer_event *rbe,
                                void *event)
 {
@@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field,
 
 static u64 hist_field_pstring(struct hist_field *hist_field,
                              struct tracing_map_elt *elt,
+                             struct trace_buffer *buffer,
                              struct ring_buffer_event *rbe,
                              void *event)
 {
@@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field,
 
 static u64 hist_field_log2(struct hist_field *hist_field,
                           struct tracing_map_elt *elt,
+                          struct trace_buffer *buffer,
                           struct ring_buffer_event *rbe,
                           void *event)
 {
        struct hist_field *operand = hist_field->operands[0];
 
-       u64 val = operand->fn(operand, elt, rbe, event);
+       u64 val = operand->fn(operand, elt, buffer, rbe, event);
 
        return (u64) ilog2(roundup_pow_of_two(val));
 }
 
 static u64 hist_field_plus(struct hist_field *hist_field,
                           struct tracing_map_elt *elt,
+                          struct trace_buffer *buffer,
                           struct ring_buffer_event *rbe,
                           void *event)
 {
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, rbe, event);
-       u64 val2 = operand2->fn(operand2, elt, rbe, event);
+       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+       u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
 
        return val1 + val2;
 }
 
 static u64 hist_field_minus(struct hist_field *hist_field,
                            struct tracing_map_elt *elt,
+                           struct trace_buffer *buffer,
                            struct ring_buffer_event *rbe,
                            void *event)
 {
        struct hist_field *operand1 = hist_field->operands[0];
        struct hist_field *operand2 = hist_field->operands[1];
 
-       u64 val1 = operand1->fn(operand1, elt, rbe, event);
-       u64 val2 = operand2->fn(operand2, elt, rbe, event);
+       u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+       u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
 
        return val1 - val2;
 }
 
 static u64 hist_field_unary_minus(struct hist_field *hist_field,
                                  struct tracing_map_elt *elt,
+                                 struct trace_buffer *buffer,
                                  struct ring_buffer_event *rbe,
                                  void *event)
 {
        struct hist_field *operand = hist_field->operands[0];
 
-       s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+       s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
        u64 val = (u64)-sval;
 
        return val;
@@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
 #define DEFINE_HIST_FIELD_FN(type)                                     \
        static u64 hist_field_##type(struct hist_field *hist_field,     \
                                     struct tracing_map_elt *elt,       \
+                                    struct trace_buffer *buffer,       \
                                     struct ring_buffer_event *rbe,     \
                                     void *event)                       \
 {                                                                      \
@@ -380,7 +391,8 @@ struct hist_trigger_data {
 struct action_data;
 
 typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
-                            struct tracing_map_elt *elt, void *rec,
+                            struct tracing_map_elt *elt,
+                            struct trace_buffer *buffer, void *rec,
                             struct ring_buffer_event *rbe, void *key,
                             struct action_data *data, u64 *var_ref_vals);
 
@@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
 }
 
 static void action_trace(struct hist_trigger_data *hist_data,
-                        struct tracing_map_elt *elt, void *rec,
+                        struct tracing_map_elt *elt,
+                        struct trace_buffer *buffer, void *rec,
                         struct ring_buffer_event *rbe, void *key,
                         struct action_data *data, u64 *var_ref_vals)
 {
@@ -624,13 +637,14 @@ struct hist_var_data {
 
 static u64 hist_field_timestamp(struct hist_field *hist_field,
                                struct tracing_map_elt *elt,
+                               struct trace_buffer *buffer,
                                struct ring_buffer_event *rbe,
                                void *event)
 {
        struct hist_trigger_data *hist_data = hist_field->hist_data;
        struct trace_array *tr = hist_data->event_file->tr;
 
-       u64 ts = ring_buffer_event_time_stamp(rbe);
+       u64 ts = ring_buffer_event_time_stamp(buffer, rbe);
 
        if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
                ts = ns2usecs(ts);
@@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field,
 
 static u64 hist_field_cpu(struct hist_field *hist_field,
                          struct tracing_map_elt *elt,
+                         struct trace_buffer *buffer,
                          struct ring_buffer_event *rbe,
                          void *event)
 {
@@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
 
 static u64 hist_field_var_ref(struct hist_field *hist_field,
                              struct tracing_map_elt *elt,
+                             struct trace_buffer *buffer,
                              struct ring_buffer_event *rbe,
                              void *event)
 {
@@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data,
 }
 
 static inline void __update_field_vars(struct tracing_map_elt *elt,
+                                      struct trace_buffer *buffer,
                                       struct ring_buffer_event *rbe,
                                       void *rec,
                                       struct field_var **field_vars,
@@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
                struct hist_field *var = field_var->var;
                struct hist_field *val = field_var->val;
 
-               var_val = val->fn(val, elt, rbe, rec);
+               var_val = val->fn(val, elt, buffer, rbe, rec);
                var_idx = var->var.idx;
 
                if (val->flags & HIST_FIELD_FL_STRING) {
@@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
 
 static void update_field_vars(struct hist_trigger_data *hist_data,
                              struct tracing_map_elt *elt,
+                             struct trace_buffer *buffer,
                              struct ring_buffer_event *rbe,
                              void *rec)
 {
-       __update_field_vars(elt, rbe, rec, hist_data->field_vars,
+       __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars,
                            hist_data->n_field_vars, 0);
 }
 
 static void save_track_data_vars(struct hist_trigger_data *hist_data,
-                                struct tracing_map_elt *elt, void *rec,
+                                struct tracing_map_elt *elt,
+                                struct trace_buffer *buffer,  void *rec,
                                 struct ring_buffer_event *rbe, void *key,
                                 struct action_data *data, u64 *var_ref_vals)
 {
-       __update_field_vars(elt, rbe, rec, hist_data->save_vars,
+       __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars,
                            hist_data->n_save_vars, hist_data->n_field_var_str);
 }
 
@@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data,
 }
 
 static void save_track_data(struct hist_trigger_data *hist_data,
-                           struct tracing_map_elt *elt, void *rec,
+                           struct tracing_map_elt *elt,
+                           struct trace_buffer *buffer, void *rec,
                            struct ring_buffer_event *rbe, void *key,
                            struct action_data *data, u64 *var_ref_vals)
 {
        if (data->track_data.save_data)
-               data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+               data->track_data.save_data(hist_data, elt, buffer, rec, rbe,
+                                          key, data, var_ref_vals);
 }
 
 static bool check_track_val(struct tracing_map_elt *elt,
@@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
 }
 
 static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
-                                    struct tracing_map_elt *elt, void *rec,
+                                    struct tracing_map_elt *elt,
+                                    struct trace_buffer *buffer, void *rec,
                                     struct ring_buffer_event *rbe, void *key,
                                     struct action_data *data,
                                     u64 *var_ref_vals)
@@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
        return false;
 }
 static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
-                                    struct tracing_map_elt *elt, void *rec,
+                                    struct tracing_map_elt *elt,
+                                    struct trace_buffer *buffer, void *rec,
                                     struct ring_buffer_event *rbe, void *key,
                                     struct action_data *data,
                                     u64 *var_ref_vals) {}
@@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m,
 }
 
 static void ontrack_action(struct hist_trigger_data *hist_data,
-                          struct tracing_map_elt *elt, void *rec,
+                          struct tracing_map_elt *elt,
+                          struct trace_buffer *buffer, void *rec,
                           struct ring_buffer_event *rbe, void *key,
                           struct action_data *data, u64 *var_ref_vals)
 {
@@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data,
 
        if (check_track_val(elt, data, var_val)) {
                save_track_val(hist_data, elt, data, var_val);
-               save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+               save_track_data(hist_data, elt, buffer, rec, rbe,
+                               key, data, var_ref_vals);
        }
 }
 
@@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits,
 }
 
 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
-                                   struct tracing_map_elt *elt, void *rec,
+                                   struct tracing_map_elt *elt,
+                                   struct trace_buffer *buffer, void *rec,
                                    struct ring_buffer_event *rbe,
                                    u64 *var_ref_vals)
 {
@@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 
        for_each_hist_val_field(i, hist_data) {
                hist_field = hist_data->fields[i];
-               hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+               hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
                if (hist_field->flags & HIST_FIELD_FL_VAR) {
                        var_idx = hist_field->var.idx;
 
@@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
        for_each_hist_key_field(i, hist_data) {
                hist_field = hist_data->fields[i];
                if (hist_field->flags & HIST_FIELD_FL_VAR) {
-                       hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+                       hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
                        var_idx = hist_field->var.idx;
                        tracing_map_set_var(elt, var_idx, hist_val);
                }
        }
 
-       update_field_vars(hist_data, elt, rbe, rec);
+       update_field_vars(hist_data, elt, buffer, rbe, rec);
 }
 
 static inline void add_to_key(char *compound_key, void *key,
@@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key,
 
 static void
 hist_trigger_actions(struct hist_trigger_data *hist_data,
-                    struct tracing_map_elt *elt, void *rec,
+                    struct tracing_map_elt *elt,
+                    struct trace_buffer *buffer, void *rec,
                     struct ring_buffer_event *rbe, void *key,
                     u64 *var_ref_vals)
 {
@@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
 
        for (i = 0; i < hist_data->n_actions; i++) {
                data = hist_data->actions[i];
-               data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+               data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals);
        }
 }
 
-static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+static void event_hist_trigger(struct event_trigger_data *data,
+                              struct trace_buffer *buffer, void *rec,
                               struct ring_buffer_event *rbe)
 {
        struct hist_trigger_data *hist_data = data->private_data;
@@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
                                         HIST_STACKTRACE_SKIP);
                        key = entries;
                } else {
-                       field_contents = key_field->fn(key_field, elt, rbe, rec);
+                       field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
                        if (key_field->flags & HIST_FIELD_FL_STRING) {
                                key = (void *)(unsigned long)field_contents;
                                use_compound_key = true;
@@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
        if (!elt)
                return;
 
-       hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+       hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
 
        if (resolve_var_refs(hist_data, key, var_ref_vals, true))
-               hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
+               hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
 }
 
 static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5456,7 +5484,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
                        goto out;
                }
 
-               tracing_set_time_stamp_abs(file->tr, true);
+               tracing_set_filter_buffering(file->tr, true);
        }
 
        if (named_data)
@@ -5564,7 +5592,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 
        if (hist_data->enable_timestamps) {
                if (!hist_data->remove || unregistered)
-                       tracing_set_time_stamp_abs(file->tr, false);
+                       tracing_set_filter_buffering(file->tr, false);
        }
 }
 
@@ -5611,7 +5639,7 @@ static void hist_unreg_all(struct trace_event_file *file)
 
                        update_cond_flag(file);
                        if (hist_data->enable_timestamps)
-                               tracing_set_time_stamp_abs(file->tr, false);
+                               tracing_set_filter_buffering(file->tr, false);
                        if (test->ops->free)
                                test->ops->free(test->ops, test);
                }
@@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void)
 }
 
 static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_trigger(struct event_trigger_data *data,
+                   struct trace_buffer *buffer,  void *rec,
                    struct ring_buffer_event *event)
 {
        struct enable_trigger_data *enable_data = data->private_data;
@@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_count_trigger(struct event_trigger_data *data,
+                         struct trace_buffer *buffer,  void *rec,
                          struct ring_buffer_event *event)
 {
        if (!data->count)
@@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
        if (data->count != -1)
                (data->count)--;
 
-       hist_enable_trigger(data, rec, event);
+       hist_enable_trigger(data, buffer, rec, event);
 }
 
 static struct event_trigger_ops hist_enable_trigger_ops = {
index 8d71e6c..2ac75eb 100644 (file)
@@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se)
 
 /**
  * synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
+ * @event_name: The name of the synthetic event to delete
  *
  * Delete a synthetic event that was created with synth_event_create().
  *
index f725802..b8bfa85 100644 (file)
@@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data)
  * any trigger that should be deferred, ETT_NONE if nothing to defer.
  */
 enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec,
+event_triggers_call(struct trace_event_file *file,
+                   struct trace_buffer *buffer, void *rec,
                    struct ring_buffer_event *event)
 {
        struct event_trigger_data *data;
@@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
                if (data->paused)
                        continue;
                if (!rec) {
-                       data->ops->func(data, rec, event);
+                       data->ops->func(data, buffer, rec, event);
                        continue;
                }
                filter = rcu_dereference_sched(data->filter);
@@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
                        tt |= data->cmd_ops->trigger_type;
                        continue;
                }
-               data->ops->func(data, rec, event);
+               data->ops->func(data, buffer, rec, event);
        }
        return tt;
 }
@@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file,
                if (data->paused)
                        continue;
                if (data->cmd_ops->trigger_type & tt)
-                       data->ops->func(data, NULL, NULL);
+                       data->ops->func(data, NULL, NULL, NULL);
        }
 }
 EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data)
 }
 
 static void
-traceon_trigger(struct event_trigger_data *data, void *rec,
+traceon_trigger(struct event_trigger_data *data,
+               struct trace_buffer *buffer, void *rec,
                struct ring_buffer_event *event)
 {
        if (tracing_is_on())
@@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec,
+traceon_count_trigger(struct event_trigger_data *data,
+                     struct trace_buffer *buffer, void *rec,
                      struct ring_buffer_event *event)
 {
        if (tracing_is_on())
@@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-traceoff_trigger(struct event_trigger_data *data, void *rec,
+traceoff_trigger(struct event_trigger_data *data,
+                struct trace_buffer *buffer, void *rec,
                 struct ring_buffer_event *event)
 {
        if (!tracing_is_on())
@@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+traceoff_count_trigger(struct event_trigger_data *data,
+                      struct trace_buffer *buffer, void *rec,
                       struct ring_buffer_event *event)
 {
        if (!tracing_is_on())
@@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = {
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 static void
-snapshot_trigger(struct event_trigger_data *data, void *rec,
+snapshot_trigger(struct event_trigger_data *data,
+                struct trace_buffer *buffer, void *rec,
                 struct ring_buffer_event *event)
 {
        struct trace_event_file *file = data->private_data;
@@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+snapshot_count_trigger(struct event_trigger_data *data,
+                      struct trace_buffer *buffer, void *rec,
                       struct ring_buffer_event *event)
 {
        if (!data->count)
@@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec,
        if (data->count != -1)
                (data->count)--;
 
-       snapshot_trigger(data, rec, event);
+       snapshot_trigger(data, buffer, rec, event);
 }
 
 static int
@@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
 #endif
 
 static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_trigger(struct event_trigger_data *data,
+                  struct trace_buffer *buffer,  void *rec,
                   struct ring_buffer_event *event)
 {
        trace_dump_stack(STACK_SKIP);
 }
 
 static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_count_trigger(struct event_trigger_data *data,
+                        struct trace_buffer *buffer, void *rec,
                         struct ring_buffer_event *event)
 {
        if (!data->count)
@@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
        if (data->count != -1)
                (data->count)--;
 
-       stacktrace_trigger(data, rec, event);
+       stacktrace_trigger(data, buffer, rec, event);
 }
 
 static int
@@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 }
 
 static void
-event_enable_trigger(struct event_trigger_data *data, void *rec,
+event_enable_trigger(struct event_trigger_data *data,
+                    struct trace_buffer *buffer,  void *rec,
                     struct ring_buffer_event *event)
 {
        struct enable_trigger_data *enable_data = data->private_data;
@@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+event_enable_count_trigger(struct event_trigger_data *data,
+                          struct trace_buffer *buffer,  void *rec,
                           struct ring_buffer_event *event)
 {
        struct enable_trigger_data *enable_data = data->private_data;
@@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec,
        if (data->count != -1)
                (data->count)--;
 
-       event_enable_trigger(data, rec, event);
+       event_enable_trigger(data, buffer, rec, event);
 }
 
 int event_enable_trigger_print(struct seq_file *m,
index f93723c..1f0e63f 100644 (file)
@@ -27,13 +27,28 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 static void
 function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
                          struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
+function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+                              struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
+function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+                                    struct ftrace_ops *op,
+                                    struct ftrace_regs *fregs);
 static struct tracer_flags func_flags;
 
 /* Our option */
 enum {
-       TRACE_FUNC_OPT_STACK    = 0x1,
+
+       TRACE_FUNC_NO_OPTS              = 0x0, /* No flags set. */
+       TRACE_FUNC_OPT_STACK            = 0x1,
+       TRACE_FUNC_OPT_NO_REPEATS       = 0x2,
+
+       /* Must stay one bit above the highest option bit. */
+       TRACE_FUNC_OPT_HIGHEST_BIT      = 0x4
 };
 
+#define TRACE_FUNC_OPT_MASK    (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
+
 int ftrace_allocate_ftrace_ops(struct trace_array *tr)
 {
        struct ftrace_ops *ops;
@@ -86,6 +101,34 @@ void ftrace_destroy_function_files(struct trace_array *tr)
        ftrace_free_ftrace_ops(tr);
 }
 
+static ftrace_func_t select_trace_function(u32 flags_val)
+{
+       switch (flags_val & TRACE_FUNC_OPT_MASK) {
+       case TRACE_FUNC_NO_OPTS:
+               return function_trace_call;
+       case TRACE_FUNC_OPT_STACK:
+               return function_stack_trace_call;
+       case TRACE_FUNC_OPT_NO_REPEATS:
+               return function_no_repeats_trace_call;
+       case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS:
+               return function_stack_no_repeats_trace_call;
+       default:
+               return NULL;
+       }
+}
+
+static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
+{
+       if (!tr->last_func_repeats &&
+           (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) {
+               tr->last_func_repeats = alloc_percpu(struct trace_func_repeats);
+               if (!tr->last_func_repeats)
+                       return false;
+       }
+
+       return true;
+}
+
 static int function_trace_init(struct trace_array *tr)
 {
        ftrace_func_t func;
@@ -97,12 +140,12 @@ static int function_trace_init(struct trace_array *tr)
        if (!tr->ops)
                return -ENOMEM;
 
-       /* Currently only the global instance can do stack tracing */
-       if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
-           func_flags.val & TRACE_FUNC_OPT_STACK)
-               func = function_stack_trace_call;
-       else
-               func = function_trace_call;
+       func = select_trace_function(func_flags.val);
+       if (!func)
+               return -EINVAL;
+
+       if (!handle_func_repeats(tr, func_flags.val))
+               return -ENOMEM;
 
        ftrace_init_array_ops(tr, func);
 
@@ -205,15 +248,137 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
        local_irq_restore(flags);
 }
 
+static inline bool is_repeat_check(struct trace_array *tr,
+                                  struct trace_func_repeats *last_info,
+                                  unsigned long ip, unsigned long parent_ip)
+{
+       if (last_info->ip == ip &&
+           last_info->parent_ip == parent_ip &&
+           last_info->count < U16_MAX) {
+               last_info->ts_last_call =
+                       ring_buffer_time_stamp(tr->array_buffer.buffer);
+               last_info->count++;
+               return true;
+       }
+
+       return false;
+}
+
+static inline void process_repeats(struct trace_array *tr,
+                                  unsigned long ip, unsigned long parent_ip,
+                                  struct trace_func_repeats *last_info,
+                                  unsigned int trace_ctx)
+{
+       if (last_info->count) {
+               trace_last_func_repeats(tr, last_info, trace_ctx);
+               last_info->count = 0;
+       }
+
+       last_info->ip = ip;
+       last_info->parent_ip = parent_ip;
+}
+
+static void
+function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+                              struct ftrace_ops *op,
+                              struct ftrace_regs *fregs)
+{
+       struct trace_func_repeats *last_info;
+       struct trace_array *tr = op->private;
+       struct trace_array_cpu *data;
+       unsigned int trace_ctx;
+       unsigned long flags;
+       int bit;
+       int cpu;
+
+       if (unlikely(!tr->function_enabled))
+               return;
+
+       bit = ftrace_test_recursion_trylock(ip, parent_ip);
+       if (bit < 0)
+               return;
+
+       preempt_disable_notrace();
+
+       cpu = smp_processor_id();
+       data = per_cpu_ptr(tr->array_buffer.data, cpu);
+       if (atomic_read(&data->disabled))
+               goto out;
+
+       /*
+        * An interrupt may happen at any place here. But as far as I can see,
+        * the only damage that this can cause is to mess up the repetition
+        * counter without valuable data being lost.
+        * TODO: think about a solution that is better than just hoping to be
+        * lucky.
+        */
+       last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+       if (is_repeat_check(tr, last_info, ip, parent_ip))
+               goto out;
+
+       local_save_flags(flags);
+       trace_ctx = tracing_gen_ctx_flags(flags);
+       process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
+
+       trace_function(tr, ip, parent_ip, trace_ctx);
+
+out:
+       ftrace_test_recursion_unlock(bit);
+       preempt_enable_notrace();
+}
+
+static void
+function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
+                                    struct ftrace_ops *op,
+                                    struct ftrace_regs *fregs)
+{
+       struct trace_func_repeats *last_info;
+       struct trace_array *tr = op->private;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       unsigned int trace_ctx;
+
+       if (unlikely(!tr->function_enabled))
+               return;
+
+       /*
+        * Need to use raw, since this must be called before the
+        * recursive protection is performed.
+        */
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = per_cpu_ptr(tr->array_buffer.data, cpu);
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1)) {
+               last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+               if (is_repeat_check(tr, last_info, ip, parent_ip))
+                       goto out;
+
+               trace_ctx = tracing_gen_ctx_flags(flags);
+               process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
+
+               trace_function(tr, ip, parent_ip, trace_ctx);
+               __trace_stack(tr, trace_ctx, STACK_SKIP);
+       }
+
+ out:
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+}
+
 static struct tracer_opt func_opts[] = {
 #ifdef CONFIG_STACKTRACE
        { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
 #endif
+       { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
        { } /* Always set a last empty entry */
 };
 
 static struct tracer_flags func_flags = {
-       .val = 0, /* By default: all flags disabled */
+       .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */
        .opts = func_opts
 };
 
@@ -235,30 +400,32 @@ static struct tracer function_trace;
 static int
 func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
 {
-       switch (bit) {
-       case TRACE_FUNC_OPT_STACK:
-               /* do nothing if already set */
-               if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
-                       break;
-
-               /* We can change this flag when not running. */
-               if (tr->current_trace != &function_trace)
-                       break;
+       ftrace_func_t func;
+       u32 new_flags;
 
-               unregister_ftrace_function(tr->ops);
+       /* Do nothing if already set. */
+       if (!!set == !!(func_flags.val & bit))
+               return 0;
 
-               if (set) {
-                       tr->ops->func = function_stack_trace_call;
-                       register_ftrace_function(tr->ops);
-               } else {
-                       tr->ops->func = function_trace_call;
-                       register_ftrace_function(tr->ops);
-               }
+       /* Nothing to re-register unless this is the current tracer. */
+       if (tr->current_trace != &function_trace)
+               return 0;
 
-               break;
-       default:
+       new_flags = (func_flags.val & ~bit) | (set ? bit : 0);
+       func = select_trace_function(new_flags);
+       if (!func)
                return -EINVAL;
-       }
+
+       /* Check if there's anything to change. */
+       if (tr->ops->func == func)
+               return 0;
+
+       if (!handle_func_repeats(tr, new_flags))
+               return -ENOMEM;
+
+       unregister_ftrace_function(tr->ops);
+       tr->ops->func = func;
+       register_ftrace_function(tr->ops);
 
        return 0;
 }
index 0aa6e6f..0de6837 100644 (file)
@@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
  *  - we are inside irq code
  *  - we just entered irq code
  *
- * retunns 0 if
+ * returns 0 if
  *  - funcgraph-interrupts option is set
  *  - we are not inside irq code
  */
index 34dc1a7..632ef88 100644 (file)
@@ -83,7 +83,7 @@ struct hwlat_sample {
        u64                     nmi_total_ts;   /* Total time spent in NMIs */
        struct timespec64       timestamp;      /* wall time */
        int                     nmi_count;      /* # NMIs during this sample */
-       int                     count;          /* # of iteratons over threash */
+       int                     count;          /* # of iterations over thresh */
 };
 
 /* keep the global state somewhere. */
@@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr)
 }
 
 /**
- * stop_kthread - Inform the hardware latency samping/detector kthread to stop
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
  *
  * This kicks the running hardware latency sampling/detector kernel thread and
  * tells it to stop sampling now. Use this on unload and at system shutdown.
index 6fe770d..ea6178c 100644 (file)
@@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
        if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
                kretprobe_perf_func(tk, ri, regs);
 #endif
-       return 0;       /* We don't tweek kernel, so just return 0 */
+       return 0;       /* We don't tweak kernel, so just return 0 */
 }
 NOKPROBE_SYMBOL(kretprobe_dispatcher);
 
index 61255ba..d0368a5 100644 (file)
@@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
        va_list ap;
 
        va_start(ap, fmt);
-       trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+       trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
        va_end(ap);
 }
 EXPORT_SYMBOL(trace_event_printf);
@@ -587,13 +587,26 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
        return !trace_seq_has_overflowed(s);
 }
 
+static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter,
+                            unsigned long long ts)
+{
+       unsigned long secs, usec_rem;
+       unsigned long long t;
+
+       if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
+               t = ns2usecs(ts);
+               usec_rem = do_div(t, USEC_PER_SEC);
+               secs = (unsigned long)t;
+               trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem);
+       } else
+               trace_seq_printf(s, " %12llu", ts);
+}
+
 int trace_print_context(struct trace_iterator *iter)
 {
        struct trace_array *tr = iter->tr;
        struct trace_seq *s = &iter->seq;
        struct trace_entry *entry = iter->ent;
-       unsigned long long t;
-       unsigned long secs, usec_rem;
        char comm[TASK_COMM_LEN];
 
        trace_find_cmdline(entry->pid, comm);
@@ -614,13 +627,8 @@ int trace_print_context(struct trace_iterator *iter)
        if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
                trace_print_lat_fmt(s, entry);
 
-       if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
-               t = ns2usecs(iter->ts);
-               usec_rem = do_div(t, USEC_PER_SEC);
-               secs = (unsigned long)t;
-               trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
-       } else
-               trace_seq_printf(s, " %12llu: ", iter->ts);
+       trace_print_time(s, iter, iter->ts);
+       trace_seq_puts(s, ": ");
 
        return !trace_seq_has_overflowed(s);
 }
@@ -837,6 +845,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
        return trace_handle_return(&iter->seq);
 }
 
+static void print_fn_trace(struct trace_seq *s, unsigned long ip,
+                          unsigned long parent_ip, int flags)
+{
+       seq_print_ip_sym(s, ip, flags);
+
+       if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
+               trace_seq_puts(s, " <-");
+               seq_print_ip_sym(s, parent_ip, flags);
+       }
+}
+
 /* TRACE_FN */
 static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
                                        struct trace_event *event)
@@ -846,13 +865,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
 
        trace_assign_type(field, iter->ent);
 
-       seq_print_ip_sym(s, field->ip, flags);
-
-       if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
-               trace_seq_puts(s, " <-");
-               seq_print_ip_sym(s, field->parent_ip, flags);
-       }
-
+       print_fn_trace(s, field->ip, field->parent_ip, flags);
        trace_seq_putc(s, '\n');
 
        return trace_handle_return(s);
@@ -1373,6 +1386,51 @@ static struct trace_event trace_raw_data_event = {
        .funcs          = &trace_raw_data_funcs,
 };
 
+static enum print_line_t
+trace_func_repeats_raw(struct trace_iterator *iter, int flags,
+                        struct trace_event *event)
+{
+       struct func_repeats_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       trace_seq_printf(s, "%lu %lu %u %llu\n",
+                        field->ip,
+                        field->parent_ip,
+                        field->count,
+                        FUNC_REPEATS_GET_DELTA_TS(field));
+
+       return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_func_repeats_print(struct trace_iterator *iter, int flags,
+                        struct trace_event *event)
+{
+       struct func_repeats_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       print_fn_trace(s, field->ip, field->parent_ip, flags);
+       trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
+       trace_print_time(s, iter,
+                        iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
+       trace_seq_puts(s, ")\n");
+
+       return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_func_repeats_funcs = {
+       .trace          = trace_func_repeats_print,
+       .raw            = trace_func_repeats_raw,
+};
+
+static struct trace_event trace_func_repeats_event = {
+       .type           = TRACE_FUNC_REPEATS,
+       .funcs          = &trace_func_repeats_funcs,
+};
 
 static struct trace_event *events[] __initdata = {
        &trace_fn_event,
@@ -1385,6 +1443,7 @@ static struct trace_event *events[] __initdata = {
        &trace_print_event,
        &trace_hwlat_event,
        &trace_raw_data_event,
+       &trace_func_repeats_event,
        NULL
 };
 
index ff32476..4b320fe 100644 (file)
@@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
+bool trace_is_tracepoint_string(const char *str)
+{
+       const char **ptr = __start___tracepoint_str;
+
+       for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) {
+               if (str == *ptr)
+                       return true;
+       }
+       return false;
+}
+
 static const char **find_next(void *v, loff_t *pos)
 {
        const char **fmt = v;
index ec589a4..15413ad 100644 (file)
@@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type)
        if (!trace_probe_log.argv)
                return;
 
-       /* Recalcurate the length and allocate buffer */
+       /* Recalculate the length and allocate buffer */
        for (i = 0; i < trace_probe_log.argc; i++) {
                if (i == trace_probe_log.index)
                        pos = len;
@@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type)
                /**
                 * Set the error position is next to the last arg + space.
                 * Note that len includes the terminal null and the cursor
-                * appaers at pos + 1.
+                * appears at pos + 1.
                 */
                pos = len;
                offset = 0;
@@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
        }
 
        /*
-        * Since $comm and immediate string can not be dereferred,
+        * Since $comm and immediate string can not be dereferenced,
         * we can find those by strcmp.
         */
        if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
index 7ce4027..227d518 100644 (file)
@@ -134,7 +134,7 @@ struct fetch_type {
        size_t                  size;           /* Byte size of type */
        int                     is_signed;      /* Signed flag */
        print_type_func_t       print;          /* Print functions */
-       const char              *fmt;           /* Fromat string */
+       const char              *fmt;           /* Format string */
        const char              *fmttype;       /* Name in format file */
 };
 
index e528282..f003c5d 100644 (file)
@@ -167,7 +167,7 @@ array:
        return code->op == FETCH_OP_END ? ret : -EILSEQ;
 }
 
-/* Sum up total data length for dynamic arraies (strings) */
+/* Sum up total data length for dynamic arrays (strings) */
 static nokprobe_inline int
 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
 {
index 73ef120..adf7ef1 100644 (file)
@@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
        int ret;
 
        /*
-        * Now that the big kernel lock is no longer preemptable,
+        * Now that the big kernel lock is no longer preemptible,
         * and this is called with the BKL held, it will always
         * fail. If preemption is already disabled, simply
         * pass the test. When the BKL is removed, or becomes
@@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
        int ret;
 
        /*
-        * Now that the big kernel lock is no longer preemptable,
+        * Now that the big kernel lock is no longer preemptible,
         * and this is called with the BKL held, it will always
         * fail. If preemption is already disabled, simply
         * pass the test. When the BKL is removed, or becomes
index 1d84fcc..9c90b3a 100644 (file)
@@ -16,7 +16,7 @@
  * The buffer size is currently PAGE_SIZE, although it may become dynamic
  * in the future.
  *
- * A write to the buffer will either succed or fail. That is, unlike
+ * A write to the buffer will either succeed or fail. That is, unlike
  * sprintf() there will not be a partial write (well it may write into
  * the buffer but it wont update the pointers). This allows users to
  * try to write something into the trace_seq buffer and if it fails
@@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
  * @fmt: printf format string
  *
  * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
  * trace_seq_printf() is used to store strings into a special
  * buffer (@s). Then the output may be either used by
  * the sequencer or pulled into another buffer.
@@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
  * @fmt: printf format string
  *
  * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
  * trace_seq_printf is used to store strings into a special
  * buffer (@s). Then the output may be either used by
  * the sequencer or pulled into another buffer.
@@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
  * @c: simple character to record
  *
  * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple charater
+ * copy to user routines. This function records a simple character
  * into a special buffer (@s) for later retrieval by a sequencer
  * or other mechanism.
  */
@@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path)
 EXPORT_SYMBOL_GPL(trace_seq_path);
 
 /**
- * trace_seq_to_user - copy the squence buffer to user space
+ * trace_seq_to_user - copy the sequence buffer to user space
  * @s: trace sequence descriptor
  * @ubuf: The userspace memory location to copy to
  * @cnt: The amount to copy
@@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
  *
  * On failure it returns -EBUSY if all of the content in the
  * sequence has been already read, which includes nothing in the
- * sequenc (@s->len == @s->readpos).
+ * sequence (@s->len == @s->readpos).
  *
  * Returns -EFAULT if the copy to userspace fails.
  */
index 3f64661..36c1233 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
+#include <linux/initrd.h>
 
 #include <trace/events/module.h>
 
@@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data)
 
        commit_creds(new);
 
+       wait_for_initramfs();
        retval = kernel_execve(sub_info->path,
                               (const char *const *)sub_info->argv,
                               (const char *const *)sub_info->envp);
@@ -336,8 +338,8 @@ static void helper_unlock(void)
  * @argv: arg vector for process
  * @envp: environment for process
  * @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
  * @init: an init function
+ * @cleanup: a cleanup function
  * @data: arbitrary context sensitive data
  *
  * Returns either %NULL on allocation failure, or a subprocess_info
@@ -348,7 +350,7 @@ static void helper_unlock(void)
  * exec.  A non-zero return code causes the process to error out, exit,
  * and return the failure to the calling process
  *
- * The cleanup function is just before ethe subprocess_info is about to
+ * The cleanup function is called just before the subprocess_info is about to
  * be freed.  This can be used for freeing the argv and envp.  The
  * Function must be runnable in either a process context or the
  * context in which call_usermodehelper_exec is called.
@@ -384,7 +386,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
 
 /**
  * call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
+ * @sub_info: information about the subprocess
  * @wait: wait for the application to finish and return status.
  *        when UMH_NO_WAIT don't wait at all, but you get no useful error back
  *        when the program couldn't be exec'ed. This makes it safe to call
index bf20b4a..df50828 100644 (file)
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(smp_call_function_single_async);
 
 /*
  * Preemption is disabled here to make sure the cond_func is called under the
- * same condtions in UP and SMP.
+ * same conditions in UP and SMP.
  */
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask)
index 9a4b980..8d62863 100644 (file)
@@ -85,7 +85,7 @@ int create_user_ns(struct cred *new)
        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
-        * by verifing that the root directory is at the root of the
+        * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
@@ -1014,7 +1014,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
                        goto out;
                ret = -EINVAL;
        }
-       /* Be very certaint the new map actually exists */
+       /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;
 
@@ -1169,7 +1169,7 @@ static bool new_idmap_permitted(const struct file *file,
 
        /* Allow the specified ids if we have the appropriate capability
         * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
-        * And the opener of the id file also had the approprpiate capability.
+        * And the opener of the id file also has the appropriate capability.
         */
        if (ns_capable(ns->parent, cap_setid) &&
            file_ns_capable(file, ns->parent, cap_setid))
index 78f50cc..e641add 100644 (file)
@@ -7,6 +7,7 @@ menuconfig KFENCE
        bool "KFENCE: low-overhead sampling-based memory safety error detector"
        depends on HAVE_ARCH_KFENCE && (SLAB || SLUB)
        select STACKTRACE
+       select IRQ_WORK
        help
          KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
          access, use-after-free, and invalid-free errors. KFENCE is designed
index 7c031ee..c8095f3 100644 (file)
--- a/lib/bch.c
+++ b/lib/bch.c
@@ -584,7 +584,7 @@ static int find_affine4_roots(struct bch_control *bch, unsigned int a,
        k = a_log(bch, a);
        rows[0] = c;
 
-       /* buid linear system to solve X^4+aX^2+bX+c = 0 */
+       /* build linear system to solve X^4+aX^2+bX+c = 0 */
        for (i = 0; i < m; i++) {
                rows[i+1] = bch->a_pow_tab[4*i]^
                        (a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^
index 9f4626a..74ceb02 100644 (file)
@@ -3,17 +3,19 @@
  * lib/bitmap.c
  * Helper functions for bitmap.h.
  */
-#include <linux/export.h>
-#include <linux/thread_info.h>
-#include <linux/ctype.h>
-#include <linux/errno.h>
+
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/thread_info.h>
 #include <linux/uaccess.h>
 
 #include <asm/page.h>
@@ -1271,6 +1273,38 @@ void bitmap_free(const unsigned long *bitmap)
 }
 EXPORT_SYMBOL(bitmap_free);
 
+static void devm_bitmap_free(void *data)
+{
+       unsigned long *bitmap = data;
+
+       bitmap_free(bitmap);
+}
+
+unsigned long *devm_bitmap_alloc(struct device *dev,
+                                unsigned int nbits, gfp_t flags)
+{
+       unsigned long *bitmap;
+       int ret;
+
+       bitmap = bitmap_alloc(nbits, flags);
+       if (!bitmap)
+               return NULL;
+
+       ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap);
+       if (ret)
+               return NULL;
+
+       return bitmap;
+}
+EXPORT_SYMBOL_GPL(devm_bitmap_alloc);
+
+unsigned long *devm_bitmap_zalloc(struct device *dev,
+                                 unsigned int nbits, gfp_t flags)
+{
+       return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);
+
 #if BITS_PER_LONG == 64
 /**
  * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
index 8f9d537..45a0584 100644 (file)
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -127,6 +127,22 @@ static inline struct bug_entry *module_find_bug(unsigned long bugaddr)
 }
 #endif
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+                      unsigned int *line)
+{
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+       *file = bug->file;
+#else
+       *file = (const char *)bug + bug->file_disp;
+#endif
+       *line = bug->line;
+#else
+       *file = NULL;
+       *line = 0;
+#endif
+}
+
 struct bug_entry *find_bug(unsigned long bugaddr)
 {
        struct bug_entry *bug;
@@ -153,32 +169,20 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
        disable_trace_on_warning();
 
-       file = NULL;
-       line = 0;
-       warning = 0;
+       bug_get_file_line(bug, &file, &line);
 
-       if (bug) {
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-               file = bug->file;
-#else
-               file = (const char *)bug + bug->file_disp;
-#endif
-               line = bug->line;
-#endif
-               warning = (bug->flags & BUGFLAG_WARNING) != 0;
-               once = (bug->flags & BUGFLAG_ONCE) != 0;
-               done = (bug->flags & BUGFLAG_DONE) != 0;
-
-               if (warning && once) {
-                       if (done)
-                               return BUG_TRAP_TYPE_WARN;
-
-                       /*
-                        * Since this is the only store, concurrency is not an issue.
-                        */
-                       bug->flags |= BUGFLAG_DONE;
-               }
+       warning = (bug->flags & BUGFLAG_WARNING) != 0;
+       once = (bug->flags & BUGFLAG_ONCE) != 0;
+       done = (bug->flags & BUGFLAG_DONE) != 0;
+
+       if (warning && once) {
+               if (done)
+                       return BUG_TRAP_TYPE_WARN;
+
+               /*
+                * Since this is the only store, concurrency is not an issue.
+                */
+               bug->flags |= BUGFLAG_DONE;
        }
 
        /*
index 5d474c6..5546bf5 100644 (file)
@@ -272,3 +272,4 @@ char *next_arg(char *args, char **param, char **val)
        /* Chew up trailing spaces. */
        return skip_spaces(args);
 }
+EXPORT_SYMBOL(next_arg);
index 595a5a7..1ad8e50 100644 (file)
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(crc8_populate_lsb);
  * @nbytes: number of bytes in data buffer.
  * @crc: previous returned crc8 value.
  */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc)
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc)
 {
        /* loop over the buffer data */
        while (nbytes-- > 0)
index 1cf409e..20a8580 100644 (file)
@@ -391,7 +391,7 @@ static inline int INIT process_bit0(struct writer *wr, struct rc *rc,
 static inline int INIT process_bit1(struct writer *wr, struct rc *rc,
                                            struct cstate *cst, uint16_t *p,
                                            int pos_state, uint16_t *prob) {
-  int offset;
+       int offset;
        uint16_t *prob_len;
        int num_bits;
        int len;
index c70d634..921d0a6 100644 (file)
@@ -396,7 +396,7 @@ static int ddebug_parse_query(char *words[], int nwords,
                        /* tail :$info is function or line-range */
                        fline = strchr(query->filename, ':');
                        if (!fline)
-                               break;
+                               continue;
                        *fline++ = '\0';
                        if (isalpha(*fline) || *fline == '*' || *fline == '?') {
                                /* take as function name */
index f67f86f..0f8e2e3 100644 (file)
@@ -29,7 +29,7 @@
  *    searching it for one bits.
  *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static unsigned long _find_next_bit(const unsigned long *addr1,
+unsigned long _find_next_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long nbits,
                unsigned long start, unsigned long invert, unsigned long le)
 {
@@ -68,44 +68,14 @@ static unsigned long _find_next_bit(const unsigned long *addr1,
 
        return min(start + __ffs(tmp), nbits);
 }
-#endif
-
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-                           unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
-}
-EXPORT_SYMBOL(find_next_bit);
-#endif
-
-#ifndef find_next_zero_bit
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-                                unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
-#endif
-
-#if !defined(find_next_and_bit)
-unsigned long find_next_and_bit(const unsigned long *addr1,
-               const unsigned long *addr2, unsigned long size,
-               unsigned long offset)
-{
-       return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
-}
-EXPORT_SYMBOL(find_next_and_bit);
+EXPORT_SYMBOL(_find_next_bit);
 #endif
 
 #ifndef find_first_bit
 /*
  * Find the first set bit in a memory region.
  */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -116,14 +86,14 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 
        return size;
 }
-EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(_find_first_bit);
 #endif
 
 #ifndef find_first_zero_bit
 /*
  * Find the first cleared bit in a memory region.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -134,11 +104,11 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 
        return size;
 }
-EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(_find_first_zero_bit);
 #endif
 
 #ifndef find_last_bit
-unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
 {
        if (size) {
                unsigned long val = BITMAP_LAST_WORD_MASK(size);
@@ -154,31 +124,9 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
        }
        return size;
 }
-EXPORT_SYMBOL(find_last_bit);
-#endif
-
-#ifdef __BIG_ENDIAN
-
-#ifndef find_next_zero_bit_le
-unsigned long find_next_zero_bit_le(const void *addr, unsigned
-               long size, unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
-}
-EXPORT_SYMBOL(find_next_zero_bit_le);
-#endif
-
-#ifndef find_next_bit_le
-unsigned long find_next_bit_le(const void *addr, unsigned
-               long size, unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
-}
-EXPORT_SYMBOL(find_next_bit_le);
+EXPORT_SYMBOL(_find_last_bit);
 #endif
 
-#endif /* __BIG_ENDIAN */
-
 unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
                               unsigned long size, unsigned long offset)
 {
index 5dcf9cd..9a57257 100644 (file)
@@ -642,6 +642,7 @@ EXPORT_SYMBOL(gen_pool_set_algo);
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
  * @pool: pool to find the fit region memory from
+ * @start_addr: not used in this function
  */
 unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data,
@@ -660,6 +661,7 @@ EXPORT_SYMBOL(gen_pool_first_fit);
  * @nr: The number of zeroed bits we're looking for
  * @data: data for alignment
  * @pool: pool to get order from
+ * @start_addr: start addr of alloction chunk
  */
 unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data,
@@ -687,6 +689,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_align);
  * @nr: The number of zeroed bits we're looking for
  * @data: data for alignment
  * @pool: pool to get order from
+ * @start_addr: not used in this function
  */
 unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size,
                unsigned long start, unsigned int nr, void *data,
@@ -721,6 +724,7 @@ EXPORT_SYMBOL(gen_pool_fixed_alloc);
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
  * @pool: pool to find the fit region memory from
+ * @start_addr: not used in this function
  */
 unsigned long gen_pool_first_fit_order_align(unsigned long *map,
                unsigned long size, unsigned long start,
@@ -735,13 +739,14 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align);
 
 /**
  * gen_pool_best_fit - find the best fitting region of memory
- * macthing the size requirement (no alignment constraint)
+ * matching the size requirement (no alignment constraint)
  * @map: The address to base the search on
  * @size: The bitmap size in bits
  * @start: The bitnumber to start searching at
  * @nr: The number of zeroed bits we're looking for
  * @data: additional data - unused
  * @pool: pool to find the fit region memory from
+ * @start_addr: not used in this function
  *
  * Iterate over the bitmap to find the smallest free region
  * which we can allocate the memory.
index 61228a6..c701b7a 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/fault-inject-usercopy.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
+#include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/splice.h>
@@ -507,13 +508,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static void memzero_page(struct page *page, size_t offset, size_t len)
-{
-       char *addr = kmap_atomic(page);
-       memset(addr + offset, 0, len);
-       kunmap_atomic(addr);
-}
-
 static inline bool allocated(struct pipe_buffer *buf)
 {
        return buf->ops == &default_pipe_buf_ops;
index a926d96..1e1e377 100644 (file)
@@ -137,7 +137,7 @@ static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
  *
  *
  * The merging is controlled by "count", the number of elements in the
- * pending lists.  This is beautiully simple code, but rather subtle.
+ * pending lists.  This is beautifully simple code, but rather subtle.
  *
  * Each time we increment "count", we set one bit (bit k) and clear
  * bits k-1 .. 0.  Each time this happens (except the very first time
index 5b6116e..1d051ef 100644 (file)
@@ -828,7 +828,7 @@ int nla_strcmp(const struct nlattr *nla, const char *str)
        int attrlen = nla_len(nla);
        int d;
 
-       if (attrlen > 0 && buf[attrlen - 1] == '\0')
+       while (attrlen > 0 && buf[attrlen - 1] == '\0')
                attrlen--;
 
        d = attrlen - len;
index 7a5769d..f1a6d90 100644 (file)
@@ -98,7 +98,7 @@ static int match_one(char *s, const char *p, substring_t args[])
  * locations.
  *
  * Description: Detects which if any of a set of token strings has been passed
- * to it. Tokens can include up to MAX_OPT_ARGS instances of basic c-style
+ * to it. Tokens can include up to %MAX_OPT_ARGS instances of basic c-style
  * format identifiers which will be taken into account when matching the
  * tokens, and whose locations will be returned in the @args array.
  */
@@ -120,8 +120,10 @@ EXPORT_SYMBOL(match_token);
  * @base: base to use when converting string
  *
  * Description: Given a &substring_t and a base, attempts to parse the substring
- * as a number in that base. On success, sets @result to the integer represented
- * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * as a number in that base.
+ *
+ * Return: On success, sets @result to the integer represented by the
+ * string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 static int match_number(substring_t *s, int *result, int base)
 {
@@ -153,8 +155,10 @@ static int match_number(substring_t *s, int *result, int base)
  * @base: base to use when converting string
  *
  * Description: Given a &substring_t and a base, attempts to parse the substring
- * as a number in that base. On success, sets @result to the integer represented
- * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * as a number in that base.
+ *
+ * Return: On success, sets @result to the integer represented by the
+ * string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 static int match_u64int(substring_t *s, u64 *result, int base)
 {
@@ -178,9 +182,10 @@ static int match_u64int(substring_t *s, u64 *result, int base)
  * @s: substring_t to be scanned
  * @result: resulting integer on success
  *
- * Description: Attempts to parse the &substring_t @s as a decimal integer. On
- * success, sets @result to the integer represented by the string and returns 0.
- * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * Description: Attempts to parse the &substring_t @s as a decimal integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_int(substring_t *s, int *result)
 {
@@ -188,14 +193,15 @@ int match_int(substring_t *s, int *result)
 }
 EXPORT_SYMBOL(match_int);
 
-/*
+/**
  * match_uint - scan a decimal representation of an integer from a substring_t
  * @s: substring_t to be scanned
  * @result: resulting integer on success
  *
- * Description: Attempts to parse the &substring_t @s as a decimal integer. On
- * success, sets @result to the integer represented by the string and returns 0.
- * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * Description: Attempts to parse the &substring_t @s as a decimal integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_uint(substring_t *s, unsigned int *result)
 {
@@ -217,9 +223,10 @@ EXPORT_SYMBOL(match_uint);
  * @result: resulting unsigned long long on success
  *
  * Description: Attempts to parse the &substring_t @s as a long decimal
- * integer. On success, sets @result to the integer represented by the
- * string and returns 0.
- * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_u64(substring_t *s, u64 *result)
 {
@@ -232,9 +239,10 @@ EXPORT_SYMBOL(match_u64);
  * @s: substring_t to be scanned
  * @result: resulting integer on success
  *
- * Description: Attempts to parse the &substring_t @s as an octal integer. On
- * success, sets @result to the integer represented by the string and returns
- * 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ * Description: Attempts to parse the &substring_t @s as an octal integer.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_octal(substring_t *s, int *result)
 {
@@ -248,8 +256,9 @@ EXPORT_SYMBOL(match_octal);
  * @result: resulting integer on success
  *
  * Description: Attempts to parse the &substring_t @s as a hexadecimal integer.
- * On success, sets @result to the integer represented by the string and
- * returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ *
+ * Return: On success, sets @result to the integer represented by the string
+ * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 int match_hex(substring_t *s, int *result)
 {
@@ -263,10 +272,11 @@ EXPORT_SYMBOL(match_hex);
  * @str: the string to be parsed
  *
  * Description: Parse the string @str to check if matches wildcard
- * pattern @pattern. The pattern may contain two type wildcardes:
+ * pattern @pattern. The pattern may contain two types of wildcards:
  *   '*' - matches zero or more characters
  *   '?' - matches one character
- * If it's matched, return true, else return false.
+ *
+ * Return: If the @str matches the @pattern, return true, else return false.
  */
 bool match_wildcard(const char *pattern, const char *str)
 {
@@ -316,7 +326,9 @@ EXPORT_SYMBOL(match_wildcard);
  *
  * Description: Copy the characters in &substring_t @src to the
  * c-style string @dest.  Copy no more than @size - 1 characters, plus
- * the terminating NUL.  Return length of @src.
+ * the terminating NUL.
+ *
+ * Return: length of @src.
  */
 size_t match_strlcpy(char *dest, const substring_t *src, size_t size)
 {
@@ -338,6 +350,9 @@ EXPORT_SYMBOL(match_strlcpy);
  * Description: Allocates and returns a string filled with the contents of
  * the &substring_t @s. The caller is responsible for freeing the returned
  * string with kfree().
+ *
+ * Return: the address of the newly allocated NUL-terminated string or
+ * %NULL on error.
  */
 char *match_strdup(const substring_t *s)
 {
index 00f666d..ed610b7 100644 (file)
@@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
-/**
+/*
  * This function is both preempt and irq safe. The former is due to explicit
  * preemption disable. The latter is guaranteed by the fact that the slow path
  * is explicitly protected by an irq-safe spinlock whereas the fast patch uses
index 49f67a0..df9179f 100644 (file)
@@ -71,7 +71,7 @@ static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
 static int depot_index;
 static int next_slab_inited;
 static size_t depot_offset;
-static DEFINE_SPINLOCK(depot_lock);
+static DEFINE_RAW_SPINLOCK(depot_lock);
 
 static bool init_stack_slab(void **prealloc)
 {
@@ -305,7 +305,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
                        prealloc = page_address(page);
        }
 
-       spin_lock_irqsave(&depot_lock, flags);
+       raw_spin_lock_irqsave(&depot_lock, flags);
 
        found = find_stack(*bucket, entries, nr_entries, hash);
        if (!found) {
@@ -329,7 +329,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
                WARN_ON(!init_stack_slab(&prealloc));
        }
 
-       spin_unlock_irqrestore(&depot_lock, flags);
+       raw_spin_unlock_irqrestore(&depot_lock, flags);
 exit:
        if (prealloc) {
                /* Nobody used this memory, ok to free it. */
index 3636da2..02d44e3 100644 (file)
@@ -148,6 +148,9 @@ config MEMORY_ISOLATION
 config HAVE_BOOTMEM_INFO_NODE
        def_bool n
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+       bool
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
@@ -176,12 +179,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
          Say N here if you want the default policy to keep all hot-plugged
          memory blocks in 'offline' state.
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+       bool
+
 config MEMORY_HOTREMOVE
        bool "Allow for memory hot remove"
        select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
        depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
        depends on MIGRATION
 
+config MHP_MEMMAP_ON_MEMORY
+       def_bool y
+       depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+       depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
@@ -273,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 config ARCH_ENABLE_THP_MIGRATION
        bool
 
+config HUGETLB_PAGE_SIZE_VARIABLE
+       def_bool n
+       help
+         Allows the pageblock_order value to be dynamic instead of just standard
+         HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
+         on a platform.
+
 config CONTIG_ALLOC
        def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
 
@@ -511,6 +529,13 @@ config CMA_DEBUGFS
        help
          Turns on the DebugFS interface for CMA.
 
+config CMA_SYSFS
+       bool "CMA information through sysfs interface"
+       depends on CMA && SYSFS
+       help
+         This option exposes some sysfs attributes to get information
+         from CMA.
+
 config CMA_AREAS
        int "Maximum count of the CMA areas"
        depends on CMA
@@ -758,6 +783,9 @@ config IDLE_PAGE_TRACKING
          See Documentation/admin-guide/mm/idle_page_tracking.rst for
          more details.
 
+config ARCH_HAS_CACHE_LINE_SIZE
+       bool
+
 config ARCH_HAS_PTE_DEVMAP
        bool
 
index c0135e3..bf71e29 100644 (file)
@@ -58,9 +58,13 @@ obj-y                        := filemap.o mempool.o oom_kill.o fadvise.o \
 page-alloc-y := page_alloc.o
 page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
 
+# Give 'memory_hotplug' its own module-parameter namespace
+memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+
 obj-y += page-alloc.o
 obj-y += init-mm.o
 obj-y += memblock.o
+obj-y += $(memory-hotplug-y)
 
 ifdef CONFIG_MMU
        obj-$(CONFIG_ADVISE_SYSCALLS)   += madvise.o
@@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KASAN)    += kasan/
 obj-$(CONFIG_KFENCE) += kfence/
 obj-$(CONFIG_FAILSLAB) += failslab.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MEMTEST)          += memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
@@ -109,6 +112,7 @@ obj-$(CONFIG_CMA)   += cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
index 26de020..907fefd 100644 (file)
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
 /**
  * balloon_page_list_dequeue() - removes pages from balloon's page list and
  *                              returns a list of the pages.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
  * @pages: pointer to the list of pages that would be returned to the caller.
  * @n_req_pages: number of requested pages.
  *
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
 /*
  * balloon_page_dequeue - removes a page from balloon's page list and returns
  *                       its address to allow the driver to release the page.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
  *
  * Driver must call this function to properly dequeue a previously enqueued page
  * before definitively releasing it back to the guest system.
index 54eee21..995e154 100644 (file)
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -24,7 +24,6 @@
 #include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/mm.h>
-#include <linux/mutex.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
 #include <linux/log2.h>
@@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
 }
 
 static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
-                            unsigned int count)
+                            unsigned long count)
 {
        unsigned long bitmap_no, bitmap_count;
+       unsigned long flags;
 
        bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
        bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
-       mutex_lock(&cma->lock);
+       spin_lock_irqsave(&cma->lock, flags);
        bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
-       mutex_unlock(&cma->lock);
+       spin_unlock_irqrestore(&cma->lock, flags);
 }
 
 static void __init cma_activate_area(struct cma *cma)
@@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma)
             pfn += pageblock_nr_pages)
                init_cma_reserved_pageblock(pfn_to_page(pfn));
 
-       mutex_init(&cma->lock);
+       spin_lock_init(&cma->lock);
 
 #ifdef CONFIG_CMA_DEBUGFS
        INIT_HLIST_HEAD(&cma->mem_head);
@@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma)
        unsigned long nr_part, nr_total = 0;
        unsigned long nbits = cma_bitmap_maxno(cma);
 
-       mutex_lock(&cma->lock);
+       spin_lock_irq(&cma->lock);
        pr_info("number of available pages: ");
        for (;;) {
                next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
@@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma)
                start = next_zero_bit + nr_zero;
        }
        pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
-       mutex_unlock(&cma->lock);
+       spin_unlock_irq(&cma->lock);
 }
 #else
 static inline void cma_debug_show_areas(struct cma *cma) { }
@@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
  * This function allocates part of contiguous memory on specific
  * contiguous memory area.
  */
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
-                      bool no_warn)
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+                      unsigned int align, bool no_warn)
 {
        unsigned long mask, offset;
        unsigned long pfn = -1;
        unsigned long start = 0;
        unsigned long bitmap_maxno, bitmap_no, bitmap_count;
-       size_t i;
+       unsigned long i;
        struct page *page = NULL;
        int ret = -ENOMEM;
 
        if (!cma || !cma->count || !cma->bitmap)
-               return NULL;
+               goto out;
 
-       pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+       pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
                 count, align);
 
        if (!count)
-               return NULL;
+               goto out;
+
+       trace_cma_alloc_start(cma->name, count, align);
 
        mask = cma_bitmap_aligned_mask(cma, align);
        offset = cma_bitmap_aligned_offset(cma, align);
@@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
        bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
        if (bitmap_count > bitmap_maxno)
-               return NULL;
+               goto out;
 
        for (;;) {
-               mutex_lock(&cma->lock);
+               spin_lock_irq(&cma->lock);
                bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
                                bitmap_maxno, start, bitmap_count, mask,
                                offset);
                if (bitmap_no >= bitmap_maxno) {
-                       mutex_unlock(&cma->lock);
+                       spin_unlock_irq(&cma->lock);
                        break;
                }
                bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
@@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
                 * our exclusive use. If the migration fails we will take the
                 * lock again and unmark it.
                 */
-               mutex_unlock(&cma->lock);
+               spin_unlock_irq(&cma->lock);
 
                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
@@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 
                pr_debug("%s(): memory range at %p is busy, retrying\n",
                         __func__, pfn_to_page(pfn));
+
+               trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
+                                          count, align);
                /* try again with a bit different memory target */
                start = bitmap_no + mask + 1;
        }
 
-       trace_cma_alloc(pfn, page, count, align);
+       trace_cma_alloc_finish(cma->name, pfn, page, count, align);
 
        /*
         * CMA can allocate multiple page blocks, which results in different
@@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
        }
 
        if (ret && !no_warn) {
-               pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
-                      __func__, cma->name, count, ret);
+               pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
+                                  __func__, cma->name, count, ret);
                cma_debug_show_areas(cma);
        }
 
        pr_debug("%s(): returned %p\n", __func__, page);
+out:
+       if (page) {
+               count_vm_event(CMA_ALLOC_SUCCESS);
+               cma_sysfs_account_success_pages(cma, count);
+       } else {
+               count_vm_event(CMA_ALLOC_FAIL);
+               if (cma)
+                       cma_sysfs_account_fail_pages(cma, count);
+       }
+
        return page;
 }
 
@@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
  * It returns false when provided pages do not belong to contiguous area and
  * true otherwise.
  */
-bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+bool cma_release(struct cma *cma, const struct page *pages,
+                unsigned long count)
 {
        unsigned long pfn;
 
        if (!cma || !pages)
                return false;
 
-       pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
+       pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
 
        pfn = page_to_pfn(pages);
 
@@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
 
        free_contig_range(pfn, count);
        cma_clear_bitmap(cma, pfn, count);
-       trace_cma_release(pfn, pages, count);
+       trace_cma_release(cma->name, pfn, pages, count);
 
        return true;
 }
index 42ae082..2c77587 100644 (file)
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -3,19 +3,33 @@
 #define __MM_CMA_H__
 
 #include <linux/debugfs.h>
+#include <linux/kobject.h>
+
+/*
+ * kobject wrapper for one CMA area.  It is allocated separately from
+ * struct cma; @cma points back at the area this kobject represents.
+ */
+struct cma_kobject {
+       struct kobject kobj;
+       struct cma *cma;
+};
 
 struct cma {
        unsigned long   base_pfn;
        unsigned long   count;
        unsigned long   *bitmap;
        unsigned int order_per_bit; /* Order of pages represented by one bit */
-       struct mutex    lock;
+       spinlock_t      lock;
 #ifdef CONFIG_CMA_DEBUGFS
        struct hlist_head mem_head;
        spinlock_t mem_head_lock;
        struct debugfs_u32_array dfs_bitmap;
 #endif
        char name[CMA_MAX_NAME];
+#ifdef CONFIG_CMA_SYSFS
+       /* the number of CMA page successful allocations */
+       atomic64_t nr_pages_succeeded;
+       /* the number of CMA page allocation failures */
+       atomic64_t nr_pages_failed;
+       /* kobject requires dynamic object */
+       struct cma_kobject *cma_kobj;
+#endif
 };
 
 extern struct cma cma_areas[MAX_CMA_AREAS];
@@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
        return cma->count >> cma->order_per_bit;
 }
 
+#ifdef CONFIG_CMA_SYSFS
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+#else /* !CONFIG_CMA_SYSFS: the accounting hooks are empty inline stubs */
+static inline void cma_sysfs_account_success_pages(struct cma *cma,
+                                                  unsigned long nr_pages) {}
+static inline void cma_sysfs_account_fail_pages(struct cma *cma,
+                                               unsigned long nr_pages) {}
+#endif /* CONFIG_CMA_SYSFS */
 #endif
index d5bf8aa..2e77049 100644 (file)
@@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val)
        struct cma *cma = data;
        unsigned long used;
 
-       mutex_lock(&cma->lock);
+       spin_lock_irq(&cma->lock);
        /* pages counter is smaller than sizeof(int) */
        used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
-       mutex_unlock(&cma->lock);
+       spin_unlock_irq(&cma->lock);
        *val = (u64)used << cma->order_per_bit;
 
        return 0;
@@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
        unsigned long start, end = 0;
        unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
 
-       mutex_lock(&cma->lock);
+       spin_lock_irq(&cma->lock);
        for (;;) {
                start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
                if (start >= bitmap_maxno)
@@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
                end = find_next_bit(cma->bitmap, bitmap_maxno, start);
                maxchunk = max(end - start, maxchunk);
        }
-       mutex_unlock(&cma->lock);
+       spin_unlock_irq(&cma->lock);
        *val = (u64)maxchunk << cma->order_per_bit;
 
        return 0;
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
new file mode 100644 (file)
index 0000000..eb2f39c
--- /dev/null
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA SysFS Interface
+ *
+ * Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
+ */
+
+#include <linux/cma.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "cma.h"
+
+/* Define a static kobj_attribute called <_name>_attr using __ATTR_RO(_name). */
+#define CMA_ATTR_RO(_name) \
+       static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+/*
+ * Add @nr_pages to @cma's nr_pages_succeeded counter, the value reported
+ * by the alloc_pages_success attribute.
+ */
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages)
+{
+       atomic64_t *succeeded = &cma->nr_pages_succeeded;
+
+       atomic64_add(nr_pages, succeeded);
+}
+
+/*
+ * Add @nr_pages to @cma's nr_pages_failed counter, the value reported by
+ * the alloc_pages_fail attribute.
+ */
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
+{
+       atomic64_t *failed = &cma->nr_pages_failed;
+
+       atomic64_add(nr_pages, failed);
+}
+
+/* Map an embedded kobject back to the CMA area recorded in its wrapper. */
+static inline struct cma *cma_from_kobj(struct kobject *kobj)
+{
+       struct cma_kobject *wrapper;
+
+       wrapper = container_of(kobj, struct cma_kobject, kobj);
+
+       return wrapper->cma;
+}
+
+/*
+ * Show handler for "alloc_pages_success": writes the area's
+ * nr_pages_succeeded counter, followed by a newline, into @buf.
+ */
+static ssize_t alloc_pages_success_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr, char *buf)
+{
+       struct cma *area = cma_from_kobj(kobj);
+       s64 succeeded = atomic64_read(&area->nr_pages_succeeded);
+
+       return sysfs_emit(buf, "%llu\n", succeeded);
+}
+CMA_ATTR_RO(alloc_pages_success);
+
+/*
+ * Show handler for "alloc_pages_fail": writes the area's nr_pages_failed
+ * counter, followed by a newline, into @buf.
+ */
+static ssize_t alloc_pages_fail_show(struct kobject *kobj,
+                                    struct kobj_attribute *attr, char *buf)
+{
+       struct cma *area = cma_from_kobj(kobj);
+       s64 failed = atomic64_read(&area->nr_pages_failed);
+
+       return sysfs_emit(buf, "%llu\n", failed);
+}
+CMA_ATTR_RO(alloc_pages_fail);
+
+/*
+ * ktype release callback: free the wrapper that the owning CMA area points
+ * at and clear that pointer so the area no longer refers to freed memory.
+ */
+static void cma_kobj_release(struct kobject *kobj)
+{
+       struct cma *area = cma_from_kobj(kobj);
+
+       kfree(area->cma_kobj);
+       area->cma_kobj = NULL;
+}
+
+/* NULL-terminated list of the attributes attached to each CMA kobject. */
+static struct attribute *cma_attrs[] = {
+       &alloc_pages_success_attr.attr,
+       &alloc_pages_fail_attr.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(cma);
+
+/* kobject type shared by all per-area CMA kobjects. */
+static struct kobj_type cma_ktype = {
+       .sysfs_ops      = &kobj_sysfs_ops,
+       .default_groups = cma_groups,
+       .release        = cma_kobj_release,
+};
+
+/*
+ * Create the "cma" kobject under mm_kobj and add one child kobject, named
+ * after cma->name, for each of the cma_area_count entries of cma_areas[].
+ *
+ * Returns 0 on success.  On failure the kobjects added so far and the "cma"
+ * root are put again and the error code is returned.
+ */
+static int __init cma_sysfs_init(void)
+{
+       struct kobject *root;
+       int idx, ret;
+
+       root = kobject_create_and_add("cma", mm_kobj);
+       if (!root)
+               return -ENOMEM;
+
+       for (idx = 0; idx < cma_area_count; idx++) {
+               struct cma *area = &cma_areas[idx];
+               struct cma_kobject *entry;
+
+               entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+               if (!entry) {
+                       ret = -ENOMEM;
+                       goto unwind;
+               }
+
+               /*
+                * cma_kobj_release() goes kobj -> cma -> cma_kobj, so both
+                * links must be in place before the kobject is registered.
+                */
+               area->cma_kobj = entry;
+               entry->cma = area;
+
+               ret = kobject_init_and_add(&entry->kobj, &cma_ktype, root,
+                                          "%s", area->name);
+               if (ret) {
+                       /* Drop the reference rather than calling kfree(). */
+                       kobject_put(&entry->kobj);
+                       goto unwind;
+               }
+       }
+
+       return 0;
+
+unwind:
+       /* Entry @idx is already dealt with; put the ones added before it. */
+       for (idx--; idx >= 0; idx--)
+               kobject_put(&cma_areas[idx].cma_kobj->kobj);
+       kobject_put(root);
+
+       return ret;
+}
+subsys_initcall(cma_sysfs_init);
index e04f447..84fde27 100644 (file)
@@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat)
  *
  * Isolate all pages that can be migrated from the range specified by
  * [low_pfn, end_pfn). The range is expected to be within same pageblock.
- * Returns zero if there is a fatal signal pending, otherwise PFN of the
- * first page that was not scanned (which may be both less, equal to or more
- * than end_pfn).
+ * Returns a negative errno, e.g. -EAGAIN on congestion, -EINTR when a fatal
+ * signal is pending or -ENOMEM when a page could not be allocated, or 0.
+ * cc->migrate_pfn will contain the next pfn to scan.
  *
  * The pages are isolated on cc->migratepages list (not required to be empty),
- * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
- * is neither read nor updated.
+ * and cc->nr_migratepages is updated accordingly.
  */
-static unsigned long
+static int
 isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        unsigned long end_pfn, isolate_mode_t isolate_mode)
 {
@@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
        bool skip_on_failure = false;
        unsigned long next_skip_pfn = 0;
        bool skip_updated = false;
+       int ret = 0;
+
+       cc->migrate_pfn = low_pfn;
 
        /*
         * Ensure that there are not too many pages isolated from the LRU
@@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
        while (unlikely(too_many_isolated(pgdat))) {
                /* stop isolation if there are still pages not migrated */
                if (cc->nr_migratepages)
-                       return 0;
+                       return -EAGAIN;
 
                /* async migration should just abort */
                if (cc->mode == MIGRATE_ASYNC)
-                       return 0;
+                       return -EAGAIN;
 
                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
                if (fatal_signal_pending(current))
-                       return 0;
+                       return -EINTR;
        }
 
        cond_resched();
@@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
                        if (fatal_signal_pending(current)) {
                                cc->contended = true;
+                               ret = -EINTR;
 
-                               low_pfn = 0;
                                goto fatal_pending;
                        }
 
@@ -904,6 +906,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        valid_page = page;
                }
 
+               if (PageHuge(page) && cc->alloc_contig) {
+                       ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+
+                       /*
+                        * Fail isolation in case isolate_or_dissolve_huge_page()
+                        * reports an error. In case of -ENOMEM, abort right away.
+                        */
+                       if (ret < 0) {
+                                /* Do not report -EBUSY down the chain */
+                               if (ret == -EBUSY)
+                                       ret = 0;
+                               low_pfn += (1UL << compound_order(page)) - 1;
+                               goto isolate_fail;
+                       }
+
+                       if (PageHuge(page)) {
+                               /*
+                                * Hugepage was successfully isolated and placed
+                                * on the cc->migratepages list.
+                                */
+                               low_pfn += compound_nr(page) - 1;
+                               goto isolate_success_no_list;
+                       }
+
+                       /*
+                        * Ok, the hugepage was dissolved. Now these pages are
+                        * Buddy and cannot be re-allocated because they are
+                        * isolated. Fall-through as the check below handles
+                        * Buddy pages.
+                        */
+               }
+
                /*
                 * Skip if free. We read page order here without zone lock
                 * which is generally unsafe, but the race window is small and
@@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 isolate_success:
                list_add(&page->lru, &cc->migratepages);
+isolate_success_no_list:
                cc->nr_migratepages += compound_nr(page);
                nr_isolated += compound_nr(page);
 
@@ -1063,7 +1098,7 @@ isolate_fail_put:
                put_page(page);
 
 isolate_fail:
-               if (!skip_on_failure)
+               if (!skip_on_failure && ret != -ENOMEM)
                        continue;
 
                /*
@@ -1089,6 +1124,9 @@ isolate_fail:
                         */
                        next_skip_pfn += 1UL << cc->order;
                }
+
+               if (ret == -ENOMEM)
+                       break;
        }
 
        /*
@@ -1130,7 +1168,9 @@ fatal_pending:
        if (nr_isolated)
                count_compact_events(COMPACTISOLATED, nr_isolated);
 
-       return low_pfn;
+       cc->migrate_pfn = low_pfn;
+
+       return ret;
 }
 
 /**
@@ -1139,15 +1179,15 @@ fatal_pending:
  * @start_pfn: The first PFN to start isolating.
  * @end_pfn:   The one-past-last PFN.
  *
- * Returns zero if isolation fails fatally due to e.g. pending signal.
- * Otherwise, function returns one-past-the-last PFN of isolated page
- * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
  */
-unsigned long
+int
 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                                        unsigned long end_pfn)
 {
        unsigned long pfn, block_start_pfn, block_end_pfn;
+       int ret = 0;
 
        /* Scan block by block. First and last block may be incomplete */
        pfn = start_pfn;
@@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                        block_end_pfn, cc->zone))
                        continue;
 
-               pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
-                                                       ISOLATE_UNEVICTABLE);
+               ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
+                                                ISOLATE_UNEVICTABLE);
 
-               if (!pfn)
+               if (ret)
                        break;
 
                if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
                        break;
        }
 
-       return pfn;
+       return ret;
 }
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
@@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
         */
        for (; block_end_pfn <= cc->free_pfn;
                        fast_find_block = false,
-                       low_pfn = block_end_pfn,
+                       cc->migrate_pfn = low_pfn = block_end_pfn,
                        block_start_pfn = block_end_pfn,
                        block_end_pfn += pageblock_nr_pages) {
 
@@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
                }
 
                /* Perform the isolation */
-               low_pfn = isolate_migratepages_block(cc, low_pfn,
-                                               block_end_pfn, isolate_mode);
-
-               if (!low_pfn)
+               if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+                                               isolate_mode))
                        return ISOLATE_ABORT;
 
                /*
@@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
                break;
        }
 
-       /* Record where migration scanner will be restarted. */
-       cc->migrate_pfn = low_pfn;
-
        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
@@ -1977,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
        unsigned int wmark_low;
 
        /*
-        * Cap the low watermak to avoid excessive compaction
-        * activity in case a user sets the proactivess tunable
+        * Cap the low watermark to avoid excessive compaction
+        * activity in case a user sets the proactiveness tunable
         * close to 100 (maximum).
         */
        wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
@@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync);
 
-       migrate_prep_local();
+       /* lru_add_drain_all() could be expensive as it involves other CPUs */
+       lru_add_drain();
 
        while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
                int err;
@@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
         */
        WRITE_ONCE(current->capture_control, NULL);
        *capture = READ_ONCE(capc.page);
+       /*
+        * Technically, it is also possible that compaction is skipped but
+        * the page is still captured by luck (an IRQ came and freed the page).
+        * Returning COMPACT_SUCCESS in such cases helps in properly accounting
+        * the COMPACT[STALL|FAIL] when compaction is skipped.
+        */
+       if (*capture)
+               ret = COMPACT_SUCCESS;
 
        return ret;
 }
@@ -2657,9 +2701,6 @@ static void compact_nodes(void)
                compact_node(nid);
 }
 
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
-
 /*
  * Tunable for proactive compaction. It determines how
  * aggressively the kernel should compact memory in the
@@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
  */
 static int kcompactd(void *p)
 {
-       pg_data_t *pgdat = (pg_data_t*)p;
+       pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        unsigned int proactive_defer = 0;
 
index 5be57ba..66f7e9f 100644 (file)
@@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping,
 
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
-
-       if (shadow) {
-               mapping->nrexceptional += nr;
-               /*
-                * Make sure the nrexceptional update is committed before
-                * the nrpages update so that final truncate racing
-                * with reclaim does not see both counters 0 at the
-                * same time and miss a shadow entry.
-                */
-               smp_wmb();
-       }
        mapping->nrpages -= nr;
 }
 
@@ -629,9 +618,6 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
 /* Returns true if writeback might be needed or already in progress. */
 static bool mapping_needs_writeback(struct address_space *mapping)
 {
-       if (dax_mapping(mapping))
-               return mapping->nrexceptional;
-
        return mapping->nrpages;
 }
 
@@ -925,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page,
                if (xas_error(&xas))
                        goto unlock;
 
-               if (old)
-                       mapping->nrexceptional--;
                mapping->nrpages++;
 
                /* hugetlb pages do not participate in page cache accounting */
@@ -2771,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page)
  * entirely memory-based such as tmpfs, and filesystems which support
  * unwritten extents.
  *
- * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
  * SEEK_DATA and there is no data after @start.  There is an implicit hole
  * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
  * and @end contain data.
@@ -3283,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
 
 /* This is used for a general mmap of a disk file */
 
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct address_space *mapping = file->f_mapping;
 
@@ -3308,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
 {
        return VM_FAULT_SIGBUS;
 }
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        return -ENOSYS;
 }
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 {
        return -ENOSYS;
 }
@@ -3740,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write);
 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
-       struct address_space * mapping = file->f_mapping;
+       struct address_space *mapping = file->f_mapping;
        struct inode    *inode = mapping->host;
        ssize_t         written = 0;
        ssize_t         err;
index 2183a56..130e301 100644 (file)
@@ -60,16 +60,20 @@ static u64 frontswap_succ_stores;
 static u64 frontswap_failed_stores;
 static u64 frontswap_invalidates;
 
-static inline void inc_frontswap_loads(void) {
+static inline void inc_frontswap_loads(void)
+{
        data_race(frontswap_loads++);
 }
-static inline void inc_frontswap_succ_stores(void) {
+static inline void inc_frontswap_succ_stores(void)
+{
        data_race(frontswap_succ_stores++);
 }
-static inline void inc_frontswap_failed_stores(void) {
+static inline void inc_frontswap_failed_stores(void)
+{
        data_race(frontswap_failed_stores++);
 }
-static inline void inc_frontswap_invalidates(void) {
+static inline void inc_frontswap_invalidates(void)
+{
        data_race(frontswap_invalidates++);
 }
 #else
index 71e546e..0697134 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
                int orig_refs = refs;
 
                /*
-                * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
-                * path, so fail and let the caller fall back to the slow path.
+                * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+                * right zone, so fail and let the caller fall back to the slow
+                * path.
                 */
-               if (unlikely(flags & FOLL_LONGTERM) &&
-                               is_migrate_cma_page(page))
+               if (unlikely((flags & FOLL_LONGTERM) &&
+                            !is_pinnable_page(page)))
                        return NULL;
 
                /*
@@ -1527,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
 {
        struct vm_area_struct *vma;
        unsigned long vm_flags;
-       int i;
+       long i;
 
        /* calculate required read or write permissions.
         * If FOLL_FORCE is set, we only require the "MAY" flags.
@@ -1574,7 +1575,7 @@ finish_or_fault:
  * Returns NULL on any kind of failure - a hole must then be inserted into
  * the corefile, to preserve alignment with its headers; and also returns
  * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
+ * allowing a hole to be left in the corefile to save disk space.
  *
  * Called without mmap_lock (takes and releases the mmap_lock by itself).
  */
@@ -1600,112 +1601,92 @@ struct page *get_dump_page(unsigned long addr)
 }
 #endif /* CONFIG_ELF_CORE */
 
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-                                       unsigned long start,
-                                       unsigned long nr_pages,
-                                       struct page **pages,
-                                       struct vm_area_struct **vmas,
-                                       unsigned int gup_flags)
+#ifdef CONFIG_MIGRATION
+/*
+ * Check whether all pages are pinnable; if so, return the number of pages.  If
+ * some pages are not pinnable, drop the references on all pages and migrate the
+ * unpinnable ones.  Return zero if pages were migrated, or if some pages were
+ * not successfully isolated.  Return a negative error if migration fails.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+                                           struct page **pages,
+                                           unsigned int gup_flags)
 {
        unsigned long i;
-       unsigned long step;
+       unsigned long isolation_error_count = 0;
        bool drain_allow = true;
-       bool migrate_allow = true;
-       LIST_HEAD(cma_page_list);
-       long ret = nr_pages;
+       LIST_HEAD(movable_page_list);
+       long ret = 0;
+       struct page *prev_head = NULL;
+       struct page *head;
        struct migration_target_control mtc = {
                .nid = NUMA_NO_NODE,
-               .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+               .gfp_mask = GFP_USER | __GFP_NOWARN,
        };
 
-check_again:
-       for (i = 0; i < nr_pages;) {
-
-               struct page *head = compound_head(pages[i]);
-
-               /*
-                * gup may start from a tail page. Advance step by the left
-                * part.
-                */
-               step = compound_nr(head) - (pages[i] - head);
+       for (i = 0; i < nr_pages; i++) {
+               head = compound_head(pages[i]);
+               if (head == prev_head)
+                       continue;
+               prev_head = head;
                /*
-                * If we get a page from the CMA zone, since we are going to
-                * be pinning these entries, we might as well move them out
-                * of the CMA zone if possible.
+                * If we get a movable page, since we are going to be pinning
+                * these entries, try to move them out if possible.
                 */
-               if (is_migrate_cma_page(head)) {
-                       if (PageHuge(head))
-                               isolate_huge_page(head, &cma_page_list);
-                       else {
+               if (!is_pinnable_page(head)) {
+                       if (PageHuge(head)) {
+                               if (!isolate_huge_page(head, &movable_page_list))
+                                       isolation_error_count++;
+                       } else {
                                if (!PageLRU(head) && drain_allow) {
                                        lru_add_drain_all();
                                        drain_allow = false;
                                }
 
-                               if (!isolate_lru_page(head)) {
-                                       list_add_tail(&head->lru, &cma_page_list);
-                                       mod_node_page_state(page_pgdat(head),
-                                                           NR_ISOLATED_ANON +
-                                                           page_is_file_lru(head),
-                                                           thp_nr_pages(head));
+                               if (isolate_lru_page(head)) {
+                                       isolation_error_count++;
+                                       continue;
                                }
+                               list_add_tail(&head->lru, &movable_page_list);
+                               mod_node_page_state(page_pgdat(head),
+                                                   NR_ISOLATED_ANON +
+                                                   page_is_file_lru(head),
+                                                   thp_nr_pages(head));
                        }
                }
-
-               i += step;
        }
 
-       if (!list_empty(&cma_page_list)) {
-               /*
-                * drop the above get_user_pages reference.
-                */
-               if (gup_flags & FOLL_PIN)
-                       unpin_user_pages(pages, nr_pages);
-               else
-                       for (i = 0; i < nr_pages; i++)
-                               put_page(pages[i]);
-
-               if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
-                       /*
-                        * some of the pages failed migration. Do get_user_pages
-                        * without migration.
-                        */
-                       migrate_allow = false;
+       /*
+        * If the list is empty and there were no isolation errors, all pages
+        * are in the correct zone.
+        */
+       if (list_empty(&movable_page_list) && !isolation_error_count)
+               return nr_pages;
 
-                       if (!list_empty(&cma_page_list))
-                               putback_movable_pages(&cma_page_list);
-               }
-               /*
-                * We did migrate all the pages, Try to get the page references
-                * again migrating any new CMA pages which we failed to isolate
-                * earlier.
-                */
-               ret = __get_user_pages_locked(mm, start, nr_pages,
-                                                  pages, vmas, NULL,
-                                                  gup_flags);
-
-               if ((ret > 0) && migrate_allow) {
-                       nr_pages = ret;
-                       drain_allow = true;
-                       goto check_again;
-               }
+       if (gup_flags & FOLL_PIN) {
+               unpin_user_pages(pages, nr_pages);
+       } else {
+               for (i = 0; i < nr_pages; i++)
+                       put_page(pages[i]);
+       }
+       if (!list_empty(&movable_page_list)) {
+               ret = migrate_pages(&movable_page_list, alloc_migration_target,
+                                   NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+                                   MR_LONGTERM_PIN);
+               if (ret && !list_empty(&movable_page_list))
+                       putback_movable_pages(&movable_page_list);
        }
 
-       return ret;
+       return ret > 0 ? -ENOMEM : ret;
 }
 #else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-                                       unsigned long start,
-                                       unsigned long nr_pages,
-                                       struct page **pages,
-                                       struct vm_area_struct **vmas,
-                                       unsigned int gup_flags)
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+                                           struct page **pages,
+                                           unsigned int gup_flags)
 {
        return nr_pages;
 }
-#endif /* CONFIG_CMA */
+#endif /* CONFIG_MIGRATION */
 
 /*
  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1718,21 +1699,22 @@ static long __gup_longterm_locked(struct mm_struct *mm,
                                  struct vm_area_struct **vmas,
                                  unsigned int gup_flags)
 {
-       unsigned long flags = 0;
+       unsigned int flags;
        long rc;
 
-       if (gup_flags & FOLL_LONGTERM)
-               flags = memalloc_nocma_save();
-
-       rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
-                                    gup_flags);
+       if (!(gup_flags & FOLL_LONGTERM))
+               return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+                                              NULL, gup_flags);
+       flags = memalloc_pin_save();
+       do {
+               rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+                                            NULL, gup_flags);
+               if (rc <= 0)
+                       break;
+               rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
+       } while (!rc);
+       memalloc_pin_restore(flags);
 
-       if (gup_flags & FOLL_LONGTERM) {
-               if (rc > 0)
-                       rc = check_and_migrate_cma_pages(mm, start, rc, pages,
-                                                        vmas, gup_flags);
-               memalloc_nocma_restore(flags);
-       }
        return rc;
 }
 
index e3cf78e..d974dec 100644 (file)
@@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
 
                                dump_page(page, "gup_test failure");
                                break;
+                       } else if (cmd == PIN_LONGTERM_BENCHMARK &&
+                               WARN(!is_pinnable_page(page),
+                                    "pages[%lu] is NOT pinnable but pinned\n",
+                                    i)) {
+                               dump_page(page, "gup_test failure");
+                               break;
                        }
                }
                break;
@@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd,
 {
        ktime_t start_time, end_time;
        unsigned long i, nr_pages, addr, next;
-       int nr;
+       long nr;
        struct page **pages;
        int ret = 0;
        bool needs_mmap_lock =
@@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd,
                        nr = (next - addr) / PAGE_SIZE;
                }
 
-               /* Filter out most gup flags: only allow a tiny subset here: */
-               gup->flags &= FOLL_WRITE;
-
                switch (cmd) {
                case GUP_FAST_BENCHMARK:
-                       nr = get_user_pages_fast(addr, nr, gup->flags,
+                       nr = get_user_pages_fast(addr, nr, gup->gup_flags,
                                                 pages + i);
                        break;
                case GUP_BASIC_TEST:
-                       nr = get_user_pages(addr, nr, gup->flags, pages + i,
+                       nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
                                            NULL);
                        break;
                case PIN_FAST_BENCHMARK:
-                       nr = pin_user_pages_fast(addr, nr, gup->flags,
+                       nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
                                                 pages + i);
                        break;
                case PIN_BASIC_TEST:
-                       nr = pin_user_pages(addr, nr, gup->flags, pages + i,
+                       nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
                                            NULL);
                        break;
                case PIN_LONGTERM_BENCHMARK:
                        nr = pin_user_pages(addr, nr,
-                                           gup->flags | FOLL_LONGTERM,
+                                           gup->gup_flags | FOLL_LONGTERM,
                                            pages + i, NULL);
                        break;
                case DUMP_USER_PAGES_TEST:
-                       if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
-                               nr = pin_user_pages(addr, nr, gup->flags,
+                       if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+                               nr = pin_user_pages(addr, nr, gup->gup_flags,
                                                    pages + i, NULL);
                        else
-                               nr = get_user_pages(addr, nr, gup->flags,
+                               nr = get_user_pages(addr, nr, gup->gup_flags,
                                                    pages + i, NULL);
                        break;
                default:
@@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd,
 
        start_time = ktime_get();
 
-       put_back_pages(cmd, pages, nr_pages, gup->flags);
+       put_back_pages(cmd, pages, nr_pages, gup->test_flags);
 
        end_time = ktime_get();
        gup->put_delta_usec = ktime_us_delta(end_time, start_time);
index 90a6713..887ac1d 100644 (file)
@@ -21,7 +21,8 @@ struct gup_test {
        __u64 addr;
        __u64 size;
        __u32 nr_pages_per_call;
-       __u32 flags;
+       __u32 gup_flags;
+       __u32 test_flags;
        /*
         * Each non-zero entry is the number of the page (1-based: first page is
         * page 1, so that zero entries mean "do nothing") from the .addr base.
index 6ef8f5e..4fb51d7 100644 (file)
@@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
 atomic_long_t _totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(_totalhigh_pages);
 
-unsigned int __nr_free_highpages (void)
+unsigned int __nr_free_highpages(void)
 {
        struct zone *zone;
        unsigned int pages = 0;
@@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void)
 static int pkmap_count[LAST_PKMAP];
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
 
-pte_t * pkmap_page_table;
+pte_t *pkmap_page_table;
 
 /*
  * Most architectures have no use for kmap_high_get(), so let's abstract
@@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr)
 
        if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
                int i = PKMAP_NR(addr);
+
                return pte_page(pkmap_page_table[i]);
        }
 
@@ -278,9 +279,8 @@ void *kmap_high(struct page *page)
        pkmap_count[PKMAP_NR(vaddr)]++;
        BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
        unlock_kmap();
-       return (void*) vaddr;
+       return (void *) vaddr;
 }
-
 EXPORT_SYMBOL(kmap_high);
 
 #ifdef ARCH_NEEDS_KMAP_HIGH_GET
@@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page)
                pkmap_count[PKMAP_NR(vaddr)]++;
        }
        unlock_kmap_any(flags);
-       return (void*) vaddr;
+       return (void *) vaddr;
 }
 #endif
 
@@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
 
        /*
         * Disable migration so resulting virtual address is stable
-        * accross preemption.
+        * across preemption.
         */
        migrate_disable();
        preempt_disable();
@@ -737,7 +737,6 @@ done:
        spin_unlock_irqrestore(&pas->lock, flags);
        return ret;
 }
-
 EXPORT_SYMBOL(page_address);
 
 /**
index ae907a9..63ed6b2 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/highmem.h>
@@ -77,18 +78,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
        return false;
 }
 
-static struct page *get_huge_zero_page(void)
+static bool get_huge_zero_page(void)
 {
        struct page *zero_page;
 retry:
        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-               return READ_ONCE(huge_zero_page);
+               return true;
 
        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                        HPAGE_PMD_ORDER);
        if (!zero_page) {
                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
-               return NULL;
+               return false;
        }
        count_vm_event(THP_ZERO_PAGE_ALLOC);
        preempt_disable();
@@ -101,7 +102,7 @@ retry:
        /* We take additional reference here. It will be put back by shrinker */
        atomic_set(&huge_zero_refcount, 2);
        preempt_enable();
-       return READ_ONCE(huge_zero_page);
+       return true;
 }
 
 static void put_huge_zero_page(void)
@@ -624,14 +625,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 
                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
-                       vm_fault_t ret2;
-
                        spin_unlock(vmf->ptl);
                        put_page(page);
                        pte_free(vma->vm_mm, pgtable);
-                       ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
-                       VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
-                       return ret2;
+                       ret = handle_userfault(vmf, VM_UFFD_MISSING);
+                       VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       return ret;
                }
 
                entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -1293,7 +1292,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
        }
 
        page = pmd_page(orig_pmd);
-       VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+       VM_BUG_ON_PAGE(!PageHead(page), page);
 
        /* Lock page for reuse_swap_page() */
        if (!trylock_page(page)) {
@@ -1464,12 +1463,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
         */
        page_locked = trylock_page(page);
        target_nid = mpol_misplaced(page, vma, haddr);
-       if (target_nid == NUMA_NO_NODE) {
-               /* If the page was locked, there are no parallel migrations */
-               if (page_locked)
-                       goto clear_pmdnuma;
-       }
-
        /* Migration could have started since the pmd_trans_migrating check */
        if (!page_locked) {
                page_nid = NUMA_NO_NODE;
@@ -1478,6 +1471,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                spin_unlock(vmf->ptl);
                put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
                goto out;
+       } else if (target_nid == NUMA_NO_NODE) {
+               /* There are no parallel migrations and page is in the right
+                * node. Clear the numa hinting info in this pmd.
+                */
+               goto clear_pmdnuma;
        }
 
        /*
@@ -1696,7 +1694,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
-                       page = pfn_to_page(swp_offset(entry));
+                       page = migration_entry_to_page(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1794,8 +1792,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 /*
  * Returns
  *  - 0 if PMD could not be locked
- *  - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
@@ -2104,7 +2102,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                swp_entry_t entry;
 
                entry = pmd_to_swp_entry(old_pmd);
-               page = pfn_to_page(swp_offset(entry));
+               page = migration_entry_to_page(entry);
                write = is_write_migration_entry(entry);
                young = false;
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
@@ -2303,44 +2301,38 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
        __split_huge_pmd(vma, pmd, address, freeze, page);
 }
 
+static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
+{
+       /*
+        * If the new address isn't hpage aligned and it could previously
+        * contain a hugepage: check if we need to split a huge pmd.
+        */
+       if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
+           range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
+                        ALIGN(address, HPAGE_PMD_SIZE)))
+               split_huge_pmd_address(vma, address, false, NULL);
+}
+
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
                             unsigned long start,
                             unsigned long end,
                             long adjust_next)
 {
-       /*
-        * If the new start address isn't hpage aligned and it could
-        * previously contain an hugepage: check if we need to split
-        * an huge pmd.
-        */
-       if (start & ~HPAGE_PMD_MASK &&
-           (start & HPAGE_PMD_MASK) >= vma->vm_start &&
-           (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, start, false, NULL);
+       /* Check if we need to split start first. */
+       split_huge_pmd_if_needed(vma, start);
 
-       /*
-        * If the new end address isn't hpage aligned and it could
-        * previously contain an hugepage: check if we need to split
-        * an huge pmd.
-        */
-       if (end & ~HPAGE_PMD_MASK &&
-           (end & HPAGE_PMD_MASK) >= vma->vm_start &&
-           (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, end, false, NULL);
+       /* Check if we need to split end next. */
+       split_huge_pmd_if_needed(vma, end);
 
        /*
-        * If we're also updating the vma->vm_next->vm_start, if the new
-        * vm_next->vm_start isn't hpage aligned and it could previously
-        * contain an hugepage: check if we need to split an huge pmd.
+        * If we're also updating the vma->vm_next->vm_start,
+        * check if we need to split it.
         */
        if (adjust_next > 0) {
                struct vm_area_struct *next = vma->vm_next;
                unsigned long nstart = next->vm_start;
                nstart += adjust_next;
-               if (nstart & ~HPAGE_PMD_MASK &&
-                   (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
-                   (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_pmd_address(next, nstart, false, NULL);
+               split_huge_pmd_if_needed(next, nstart);
        }
 }
 
@@ -2477,7 +2469,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                xa_lock(&swap_cache->i_pages);
        }
 
-       /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+       /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
        lruvec = lock_page_lruvec(head);
 
        for (i = nr - 1; i >= 1; i--) {
@@ -2838,8 +2830,8 @@ void deferred_split_huge_page(struct page *page)
                ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
                if (memcg)
-                       memcg_set_shrinker_bit(memcg, page_to_nid(page),
-                                              deferred_split_shrinker.id);
+                       set_shrinker_bit(memcg, page_to_nid(page),
+                                        deferred_split_shrinker.id);
 #endif
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2924,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = {
 };
 
 #ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
 {
        struct zone *zone;
        struct page *page;
        unsigned long pfn, max_zone_pfn;
        unsigned long total = 0, split = 0;
 
-       if (val != 1)
-               return -EINVAL;
-
+       pr_debug("Split all THPs\n");
        for_each_populated_zone(zone) {
                max_zone_pfn = zone_end_pfn(zone);
                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2957,15 +2947,243 @@ static int split_huge_pages_set(void *data, u64 val)
                        unlock_page(page);
 next:
                        put_page(page);
+                       cond_resched();
                }
        }
 
-       pr_info("%lu of %lu THP split\n", split, total);
+       pr_debug("%lu of %lu THP split\n", split, total);
+}
 
-       return 0;
+static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
+{
+       return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
+                   is_vm_hugetlb_page(vma);
+}
+
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+                               unsigned long vaddr_end)
+{
+       int ret = 0;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       unsigned long total = 0, split = 0;
+       unsigned long addr;
+
+       vaddr_start &= PAGE_MASK;
+       vaddr_end &= PAGE_MASK;
+
+       /* Find the task_struct from pid */
+       rcu_read_lock();
+       task = find_task_by_vpid(pid);
+       if (!task) {
+               rcu_read_unlock();
+               ret = -ESRCH;
+               goto out;
+       }
+       get_task_struct(task);
+       rcu_read_unlock();
+
+       /* Find the mm_struct */
+       mm = get_task_mm(task);
+       put_task_struct(task);
+
+       if (!mm) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+                pid, vaddr_start, vaddr_end);
+
+       mmap_read_lock(mm);
+       /*
+        * always increase addr by PAGE_SIZE, since we could have a PTE page
+        * table filled with PTE-mapped THPs, each of which is distinct.
+        */
+       for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+               struct vm_area_struct *vma = find_vma(mm, addr);
+               unsigned int follflags;
+               struct page *page;
+
+               if (!vma || addr < vma->vm_start)
+                       break;
+
+               /* skip special VMA and hugetlb VMA */
+               if (vma_not_suitable_for_thp_split(vma)) {
+                       addr = vma->vm_end;
+                       continue;
+               }
+
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               follflags = FOLL_GET | FOLL_DUMP;
+               page = follow_page(vma, addr, follflags);
+
+               if (IS_ERR(page))
+                       continue;
+               if (!page)
+                       continue;
+
+               if (!is_transparent_hugepage(page))
+                       goto next;
+
+               total++;
+               if (!can_split_huge_page(compound_head(page), NULL))
+                       goto next;
+
+               if (!trylock_page(page))
+                       goto next;
+
+               if (!split_huge_page(page))
+                       split++;
+
+               unlock_page(page);
+next:
+               put_page(page);
+               cond_resched();
+       }
+       mmap_read_unlock(mm);
+       mmput(mm);
+
+       pr_debug("%lu of %lu THP split\n", split, total);
+
+out:
+       return ret;
+}
+
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+                               pgoff_t off_end)
+{
+       struct filename *file;
+       struct file *candidate;
+       struct address_space *mapping;
+       int ret = -EINVAL;
+       pgoff_t index;
+       int nr_pages = 1;
+       unsigned long total = 0, split = 0;
+
+       file = getname_kernel(file_path);
+       if (IS_ERR(file))
+               return ret;
+
+       candidate = file_open_name(file, O_RDONLY, 0);
+       if (IS_ERR(candidate))
+               goto out;
+
+       pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
+                file_path, off_start, off_end);
+
+       mapping = candidate->f_mapping;
+
+       for (index = off_start; index < off_end; index += nr_pages) {
+               struct page *fpage = pagecache_get_page(mapping, index,
+                                               FGP_ENTRY | FGP_HEAD, 0);
+
+               nr_pages = 1;
+               if (xa_is_value(fpage) || !fpage)
+                       continue;
+
+               if (!is_transparent_hugepage(fpage))
+                       goto next;
+
+               total++;
+               nr_pages = thp_nr_pages(fpage);
+
+               if (!trylock_page(fpage))
+                       goto next;
+
+               if (!split_huge_page(fpage))
+                       split++;
+
+               unlock_page(fpage);
+next:
+               put_page(fpage);
+               cond_resched();
+       }
+
+       filp_close(candidate, NULL);
+       ret = 0;
+
+       pr_debug("%lu of %lu file-backed THP split\n", split, total);
+out:
+       putname(file);
+       return ret;
 }
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
-               "%llu\n");
+
+#define MAX_INPUT_BUF_SZ 255
+
+/*
+ * Parse "1" (split all), "<pid>,0x<start>,0x<end>" or "/<path>,0x<off>,0x<off>"
+ * and split the matching THPs.  Returns the parsed string length or -errno.
+ */
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppops)
+{
+       static DEFINE_MUTEX(split_debug_mutex);
+       ssize_t ret;
+       /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+       char input_buf[MAX_INPUT_BUF_SZ];
+       int pid;
+       unsigned long vaddr_start, vaddr_end;
+
+       ret = mutex_lock_interruptible(&split_debug_mutex);
+       if (ret)
+               return ret;
+
+       ret = -EFAULT;
+       memset(input_buf, 0, MAX_INPUT_BUF_SZ);
+       if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
+               goto out;
+       input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
+
+       if (input_buf[0] == '/') {
+               char *tok;
+               char *rest = input_buf;
+               char file_path[MAX_INPUT_BUF_SZ];
+               pgoff_t off_start = 0, off_end = 0;
+               size_t input_len = strlen(input_buf);
+
+               /* strsep() sets rest to NULL when the input has no ','. */
+               tok = strsep(&rest, ",");
+               if (!tok || !rest) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+
+               ret = sscanf(rest, "0x%lx,0x%lx", &off_start, &off_end);
+               if (ret != 2) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = split_huge_pages_in_file(file_path, off_start, off_end);
+               if (!ret)
+                       ret = input_len;
+               goto out;
+       }
+
+       ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+       if (ret == 1 && pid == 1) {
+               split_huge_pages_all();
+               ret = strlen(input_buf);
+               goto out;
+       } else if (ret != 3) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+       if (!ret)
+               ret = strlen(input_buf);
+out:
+       mutex_unlock(&split_debug_mutex);
+       return ret;
+}
+
+static const struct file_operations split_huge_pages_fops = {
+       .owner   = THIS_MODULE,
+       .write   = split_huge_pages_write,
+       .llseek  = no_llseek,
+};
 
 static int __init split_huge_pages_debugfs(void)
 {
index 6c72433..3db405d 100644 (file)
@@ -39,7 +39,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
@@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool)
        return true;
 }
 
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+                                               unsigned long irq_flags)
 {
-       spin_unlock(&spool->lock);
+       spin_unlock_irqrestore(&spool->lock, irq_flags);
 
        /* If no pages are used, and no other handles to the subpool
         * remain, give up any reservations based on minimum size and
@@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 
 void hugepage_put_subpool(struct hugepage_subpool *spool)
 {
-       spin_lock(&spool->lock);
+       unsigned long flags;
+
+       spin_lock_irqsave(&spool->lock, flags);
        BUG_ON(!spool->count);
        spool->count--;
-       unlock_or_release_subpool(spool);
+       unlock_or_release_subpool(spool, flags);
 }
 
 /*
@@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
        if (!spool)
                return ret;
 
-       spin_lock(&spool->lock);
+       spin_lock_irq(&spool->lock);
 
        if (spool->max_hpages != -1) {          /* maximum size accounting */
                if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
        }
 
 unlock_ret:
-       spin_unlock(&spool->lock);
+       spin_unlock_irq(&spool->lock);
        return ret;
 }
 
@@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
 {
        long ret = delta;
+       unsigned long flags;
 
        if (!spool)
                return delta;
 
-       spin_lock(&spool->lock);
+       spin_lock_irqsave(&spool->lock, flags);
 
        if (spool->max_hpages != -1)            /* maximum size accounting */
                spool->used_hpages -= delta;
@@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
         * If hugetlbfs_put_super couldn't free spool due to an outstanding
         * quota reference, free it now.
         */
-       unlock_or_release_subpool(spool);
+       unlock_or_release_subpool(spool, flags);
 
        return ret;
 }
@@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
                              resv->region_cache_count;
 
                /* At this point, we should have enough entries in the cache
-                * for all the existings adds_in_progress. We should only be
+                * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -553,7 +556,6 @@ retry:
        resv->adds_in_progress -= in_regions_needed;
 
        spin_unlock(&resv->lock);
-       VM_BUG_ON(add < 0);
        return add;
 }
 
@@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
 {
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
+       bool reserved = false;
 
        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (rsv_adjust) {
+       if (rsv_adjust > 0) {
                struct hstate *h = hstate_inode(inode);
 
-               hugetlb_acct_memory(h, 1);
+               if (!hugetlb_acct_memory(h, 1))
+                       reserved = true;
+       } else if (!rsv_adjust) {
+               reserved = true;
        }
+
+       if (!reserved)
+               pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
 }
 
 /*
@@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
        int nid = page_to_nid(page);
+
+       lockdep_assert_held(&hugetlb_lock);
        list_move(&page->lru, &h->hugepage_freelists[nid]);
        h->free_huge_pages++;
        h->free_huge_pages_node[nid]++;
@@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 {
        struct page *page;
-       bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+       bool pin = !!(current->flags & PF_MEMALLOC_PIN);
 
+       lockdep_assert_held(&hugetlb_lock);
        list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-               if (nocma && is_migrate_cma_page(page))
+               if (pin && !is_pinnable_page(page))
                        continue;
 
                if (PageHWPoison(page))
@@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
 }
 
 /*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
  * node ["this node"] from which to free a huge page.  Advance the
  * next node id whether or not we find a free huge page to free so
  * that the next attempt to free addresses the next node.
@@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
 {
-       unsigned long nr_pages = 1UL << huge_page_order(h);
+       unsigned long nr_pages = pages_per_huge_page(h);
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
 
@@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
 #endif
 
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page.  A reference is held on the page.
+ *
+ * @h: hstate whose pool counters are updated
+ * @page: hugetlb page to remove; its lru linkage, HPageFreed flag and
+ *        compound dtor are accessed directly.  NOTE(review): presumably this
+ *        must be the head page -- confirm at every call site.
+ * @adjust_surplus: when true, the global and per-node surplus counts are
+ *                  decremented as well
+ *
+ * The free counts are only decremented when HPageFreed(page) is set, while
+ * nr_huge_pages and its per-node counterpart are always decremented.  The
+ * page itself is not freed here.
+ *
+ * Does nothing (beyond the VM_BUG_ON checks) for a gigantic hstate when
+ * gigantic_page_runtime_supported() is false.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+       lockdep_assert_held(&hugetlb_lock);
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;         /* nothing is unlinked or accounted in this case */
+
+       list_del(&page->lru);
+
+       if (HPageFreed(page)) {
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+       }
+       if (adjust_surplus) {
+               h->surplus_huge_pages--;
+               h->surplus_huge_pages_node[nid]--;
+       }
+
+       set_page_refcounted(page);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+       h->nr_huge_pages--;
+       h->nr_huge_pages_node[nid]--;
+}
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
@@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       h->nr_huge_pages--;
-       h->nr_huge_pages_node[page_to_nid(page)]--;
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
-       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
-       set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
-               /*
-                * Temporarily drop the hugetlb_lock, because
-                * we might block in free_gigantic_page().
-                */
-               spin_unlock(&hugetlb_lock);
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
-               spin_lock(&hugetlb_lock);
        } else {
                __free_pages(page, huge_page_order(h));
        }
 }
 
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+       struct page *page, *t_page;     /* t_page: next-entry cursor for the _safe iterator */
+
+       list_for_each_entry_safe(page, t_page, list, lru) {
+               update_and_free_page(h, page);  /* release each page linked on @list through its lru field */
+               cond_resched(); /* may reschedule: the callers visible in this file drop hugetlb_lock first */
+       }
+}
+
 struct hstate *size_to_hstate(unsigned long size)
 {
        struct hstate *h;
@@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size)
        return NULL;
 }
 
-static void __free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
 {
        /*
         * Can't pass hstate in here because it is called from the
@@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page)
        int nid = page_to_nid(page);
        struct hugepage_subpool *spool = hugetlb_page_subpool(page);
        bool restore_reserve;
+       unsigned long flags;
 
        VM_BUG_ON_PAGE(page_count(page), page);
        VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page)
                        restore_reserve = true;
        }
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irqsave(&hugetlb_lock, flags);
        ClearHPageMigratable(page);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
                                     pages_per_huge_page(h), page);
@@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page)
                h->resv_huge_pages++;
 
        if (HPageTemporary(page)) {
-               list_del(&page->lru);
-               ClearHPageTemporary(page);
+               remove_hugetlb_page(h, page, false);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_page(h, page);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
-               list_del(&page->lru);
+               remove_hugetlb_page(h, page, true);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_page(h, page);
-               h->surplus_huge_pages--;
-               h->surplus_huge_pages_node[nid]--;
        } else {
                arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
        }
-       spin_unlock(&hugetlb_lock);
 }
 
 /*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
+ * Must be called with the hugetlb lock held
  */
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
 {
-       struct llist_node *node;
-       struct page *page;
-
-       node = llist_del_all(&hpage_freelist);
-
-       while (node) {
-               page = container_of((struct address_space **)node,
-                                    struct page, mapping);
-               node = node->next;
-               __free_huge_page(page);
-       }
-}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-
-void free_huge_page(struct page *page)
-{
-       /*
-        * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
-        */
-       if (!in_task()) {
-               /*
-                * Only call schedule_work() if hpage_freelist is previously
-                * empty. Otherwise, schedule_work() had been called but the
-                * workfn hasn't retrieved the list yet.
-                */
-               if (llist_add((struct llist_node *)&page->mapping,
-                             &hpage_freelist))
-                       schedule_work(&free_hpage_work);
-               return;
-       }
-
-       __free_huge_page(page);
+       lockdep_assert_held(&hugetlb_lock);
+       h->nr_huge_pages++;
+       h->nr_huge_pages_node[nid]++;
 }
 
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void __prep_new_huge_page(struct page *page)
 {
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
        set_hugetlb_cgroup(page, NULL);
        set_hugetlb_cgroup_rsvd(page, NULL);
-       spin_lock(&hugetlb_lock);
-       h->nr_huge_pages++;
-       h->nr_huge_pages_node[nid]++;
-       ClearHPageFreed(page);
-       spin_unlock(&hugetlb_lock);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+       __prep_new_huge_page(page);
+       spin_lock_irq(&hugetlb_lock);
+       __prep_account_new_huge_page(h, nid);
+       spin_unlock_irq(&hugetlb_lock);
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1693,17 +1704,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 }
 
 /*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove huge page from pool from next node to free.  Attempt to keep
+ * persistent huge pages more or less balanced over allowed nodes.
+ * This routine only 'removes' the hugetlb page.  The caller must make
+ * an additional call to free the page to low level allocators.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
-                                                        bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+                                               nodemask_t *nodes_allowed,
+                                                bool acct_surplus)
 {
        int nr_nodes, node;
-       int ret = 0;
+       struct page *page = NULL;
 
+       lockdep_assert_held(&hugetlb_lock);
        for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                /*
                 * If we're returning unused surplus pages, only examine
@@ -1711,23 +1725,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                 */
                if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
                    !list_empty(&h->hugepage_freelists[node])) {
-                       struct page *page =
-                               list_entry(h->hugepage_freelists[node].next,
+                       page = list_entry(h->hugepage_freelists[node].next,
                                          struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[node]--;
-                       if (acct_surplus) {
-                               h->surplus_huge_pages--;
-                               h->surplus_huge_pages_node[node]--;
-                       }
-                       update_and_free_page(h, page);
-                       ret = 1;
+                       remove_hugetlb_page(h, page, acct_surplus);
                        break;
                }
        }
 
-       return ret;
+       return page;
 }
 
 /*
@@ -1749,7 +1754,7 @@ retry:
        if (!PageHuge(page))
                return 0;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (!PageHuge(page)) {
                rc = 0;
                goto out;
@@ -1758,7 +1763,6 @@ retry:
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
-               int nid = page_to_nid(head);
                if (h->free_huge_pages - h->resv_huge_pages == 0)
                        goto out;
 
@@ -1767,7 +1771,7 @@ retry:
                 * when it is dissolved.
                 */
                if (unlikely(!HPageFreed(head))) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                        cond_resched();
 
                        /*
@@ -1789,15 +1793,14 @@ retry:
                        SetPageHWPoison(page);
                        ClearPageHWPoison(head);
                }
-               list_del(&head->lru);
-               h->free_huge_pages--;
-               h->free_huge_pages_node[nid]--;
+               remove_hugetlb_page(h, page, false);
                h->max_huge_pages--;
+               spin_unlock_irq(&hugetlb_lock);
                update_and_free_page(h, head);
-               rc = 0;
+               return 0;
        }
 out:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return rc;
 }
 
@@ -1839,16 +1842,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
        if (hstate_is_gigantic(h))
                return NULL;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
                goto out_unlock;
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        /*
         * We could have raced with the pool size change.
         * Double check that and simply deallocate the new page
@@ -1858,7 +1861,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetHPageTemporary(page);
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
                put_page(page);
                return NULL;
        } else {
@@ -1867,7 +1870,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
        }
 
 out_unlock:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        return page;
 }
@@ -1917,17 +1920,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask)
 {
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (h->free_huge_pages - h->resv_huge_pages > 0) {
                struct page *page;
 
                page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
                if (page) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                        return page;
                }
        }
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
@@ -1964,6 +1967,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
        long needed, allocated;
        bool alloc_ok = true;
 
+       lockdep_assert_held(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
        if (needed <= 0) {
                h->resv_huge_pages += delta;
@@ -1975,7 +1979,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
 
        ret = -ENOMEM;
 retry:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                NUMA_NO_NODE, NULL);
@@ -1992,7 +1996,7 @@ retry:
         * After retaking hugetlb_lock, we need to recalculate 'needed'
         * because either resv_huge_pages or free_huge_pages may have changed.
         */
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) -
                        (h->free_huge_pages + allocated);
        if (needed > 0) {
@@ -2032,12 +2036,12 @@ retry:
                enqueue_huge_page(h, page);
        }
 free:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        /* Free unnecessary surplus pages to the buddy allocator */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru)
                put_page(page);
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
 
        return ret;
 }
@@ -2049,17 +2053,17 @@ free:
  *    to the associated reservation map.
  * 2) Free any unused surplus pages that may have been allocated to satisfy
  *    the reservation.  As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held.  However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing.  Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
  */
 static void return_unused_surplus_pages(struct hstate *h,
                                        unsigned long unused_resv_pages)
 {
        unsigned long nr_pages;
+       struct page *page;
+       LIST_HEAD(page_list);
+
+       lockdep_assert_held(&hugetlb_lock);
+       /* Uncommit the reservation */
+       h->resv_huge_pages -= unused_resv_pages;
 
        /* Cannot return gigantic pages currently */
        if (hstate_is_gigantic(h))
@@ -2076,24 +2080,21 @@ static void return_unused_surplus_pages(struct hstate *h,
         * evenly across all nodes with memory. Iterate across these nodes
         * until we can no longer free unreserved surplus pages. This occurs
         * when the nodes with surplus pages have no free pages.
-        * free_pool_huge_page() will balance the freed pages across the
+        * remove_pool_huge_page() will balance the freed pages across the
         * on-line nodes with memory and will handle the hstate accounting.
-        *
-        * Note that we decrement resv_huge_pages as we free the pages.  If
-        * we drop the lock, resv_huge_pages will still be sufficiently large
-        * to cover subsequent pages we may free.
         */
        while (nr_pages--) {
-               h->resv_huge_pages--;
-               unused_resv_pages--;
-               if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+               page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+               if (!page)
                        goto out;
-               cond_resched_lock(&hugetlb_lock);
+
+               list_add(&page->lru, &page_list);
        }
 
 out:
-       /* Fully uncommit the reservation */
-       h->resv_huge_pages -= unused_resv_pages;
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
 }
 
 
@@ -2175,27 +2176,26 @@ static long __vma_reservation_common(struct hstate *h,
 
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
-       else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-               /*
-                * In most cases, reserves always exist for private mappings.
-                * However, a file associated with mapping could have been
-                * hole punched or truncated after reserves were consumed.
-                * As subsequent fault on such a range will not use reserves.
-                * Subtle - The reserve map for private mappings has the
-                * opposite meaning than that of shared mappings.  If NO
-                * entry is in the reserve map, it means a reservation exists.
-                * If an entry exists in the reserve map, it means the
-                * reservation has already been consumed.  As a result, the
-                * return value of this routine is the opposite of the
-                * value returned from reserve map manipulation routines above.
-                */
-               if (ret)
-                       return 0;
-               else
-                       return 1;
-       }
-       else
-               return ret < 0 ? ret : 0;
+       /*
+        * We know private mapping must have HPAGE_RESV_OWNER set.
+        *
+        * In most cases, reserves always exist for private mappings.
+        * However, a file associated with mapping could have been
+        * hole punched or truncated after reserves were consumed.
+        * As subsequent fault on such a range will not use reserves.
+        * Subtle - The reserve map for private mappings has the
+        * opposite meaning than that of shared mappings.  If NO
+        * entry is in the reserve map, it means a reservation exists.
+        * If an entry exists in the reserve map, it means the
+        * reservation has already been consumed.  As a result, the
+        * return value of this routine is the opposite of the
+        * value returned from reserve map manipulation routines above.
+        */
+       if (ret > 0)
+               return 0;
+       if (ret == 0)
+               return 1;
+       return ret;
 }
 
 static long vma_needs_reservation(struct hstate *h,
@@ -2266,6 +2266,134 @@ static void restore_reserve_on_error(struct hstate *h,
        }
 }
 
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ *
+ * A replacement page is allocated up front on old_page's node
+ * (__GFP_THISNODE).  It is either swapped into the pool in place of
+ * old_page, or freed again on every path that does not perform the swap.
+ *
+ * Returns 0 on success, otherwise negated error:
+ *   -ENOMEM  the replacement page could not be allocated
+ *   -EBUSY   old_page is in use and isolate_huge_page() failed
+ * 0 is also returned when old_page is no longer a hugetlb page, and when
+ * old_page is in use but was successfully isolated onto @list.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+                                       struct list_head *list)
+{
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+       int nid = page_to_nid(old_page);
+       struct page *new_page;
+       int ret = 0;
+
+       /*
+        * Before dissolving the page, we need to allocate a new one for the
+        * pool to remain stable. Using alloc_buddy_huge_page() means we do
+        * not have to deal with prep_new_huge_page() or with any counters
+        * at this point. This simplifies things and lets us do the whole
+        * replacement under the lock.
+        */
+       new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+       if (!new_page)
+               return -ENOMEM;
+
+retry:
+       spin_lock_irq(&hugetlb_lock);
+       if (!PageHuge(old_page)) {
+               /*
+                * Freed from under us. Drop new_page too.
+                * ret is still 0 here, so this is reported as success.
+                */
+               goto free_new;
+       } else if (page_count(old_page)) {
+               /*
+                * Someone has grabbed the page, try to isolate it here.
+                * Fail with -EBUSY if not possible.
+                * The lock is dropped around isolate_huge_page() and retaken
+                * only because free_new expects to release it.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               if (!isolate_huge_page(old_page, list))
+                       ret = -EBUSY;
+               spin_lock_irq(&hugetlb_lock);
+               goto free_new;
+       } else if (!HPageFreed(old_page)) {
+               /*
+                * Page's refcount is 0 but it has not been enqueued in the
+                * freelist yet. Race window is small, so we can succeed here if
+                * we retry.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               cond_resched();
+               goto retry;
+       } else {
+               /*
+                * Ok, old_page is still a genuine free hugepage. Remove it from
+                * the freelist and decrease the counters. These will be
+                * incremented again when calling __prep_account_new_huge_page()
+                * and enqueue_huge_page() for new_page. The counters will remain
+                * stable since this happens under the lock.
+                */
+               remove_hugetlb_page(h, old_page, false);
+
+               /*
+                * new_page needs to be initialized with the standard hugetlb
+                * state. This is normally done by prep_new_huge_page() but
+                * that takes hugetlb_lock which is already held so we need to
+                * open code it here.
+                * The reference count trick is needed because the allocator
+                * gives us a referenced page but the pool requires pages with
+                * a refcount of 0.
+                */
+               __prep_new_huge_page(new_page);
+               __prep_account_new_huge_page(h, nid);
+               page_ref_dec(new_page);
+               enqueue_huge_page(h, new_page);
+
+               /*
+                * Pages have been replaced, we can safely free the old one.
+                * The free itself is done after dropping the lock.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               update_and_free_page(h, old_page);
+       }
+
+       return ret;
+
+free_new:
+       spin_unlock_irq(&hugetlb_lock);
+       __free_pages(new_page, huge_page_order(h));
+
+       return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+       struct hstate *h;
+       struct page *head;
+       int ret = -EBUSY;       /* result when the page is in use but cannot be isolated */
+
+       /*
+        * Either isolate the hugetlb page containing @page onto @list (when
+        * it is in use) or replace it in the pool with a freshly allocated
+        * page and free it (when its refcount is 0).
+        *
+        * The page might have been dissolved from under our feet, so make sure
+        * to carefully check the state under the lock.
+        * Return success when racing as if we dissolved the page ourselves.
+        */
+       spin_lock_irq(&hugetlb_lock);
+       if (PageHuge(page)) {
+               head = compound_head(page);
+               h = page_hstate(head);
+       } else {
+               spin_unlock_irq(&hugetlb_lock);
+               return 0;
+       }
+       spin_unlock_irq(&hugetlb_lock);
+
+       /*
+        * Fence off gigantic pages as there is a cyclic dependency between
+        * alloc_contig_range and them. Return -ENOMEM as this has the effect
+        * of bailing out right away without further retrying.
+        */
+       if (hstate_is_gigantic(h))
+               return -ENOMEM;
+
+       if (page_count(head) && isolate_huge_page(head, list))
+               ret = 0;        /* in use and now isolated onto @list */
+       else if (!page_count(head))
+               ret = alloc_and_dissolve_huge_page(h, head, list);      /* unlocked check; the callee re-checks under hugetlb_lock */
+
+       return ret;
+}
+
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
 {
@@ -2316,7 +2444,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
        /* If this allocation is not consuming a reservation, charge it now.
         */
-       deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+       deferred_reserve = map_chg || avoid_reserve;
        if (deferred_reserve) {
                ret = hugetlb_cgroup_charge_cgroup_rsvd(
                        idx, pages_per_huge_page(h), &h_cg);
@@ -2328,7 +2456,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        if (ret)
                goto out_uncharge_cgroup_reservation;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        /*
         * glb_chg is passed to indicate whether or not a page must be taken
         * from the global free pool (global change).  gbl_chg == 0 indicates
@@ -2336,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
         */
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
                page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
@@ -2344,7 +2472,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                        SetHPageRestoreReserve(page);
                        h->resv_huge_pages--;
                }
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                list_add(&page->lru, &h->hugepage_activelist);
                /* Fall through */
        }
@@ -2357,7 +2485,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                                                  h_cg, page);
        }
 
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        hugetlb_set_page_subpool(page, spool);
 
@@ -2547,24 +2675,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
                                                nodemask_t *nodes_allowed)
 {
        int i;
+       LIST_HEAD(page_list);
 
+       lockdep_assert_held(&hugetlb_lock);
        if (hstate_is_gigantic(h))
                return;
 
+       /*
+        * Collect pages to be freed on a list, and free after dropping lock
+        */
        for_each_node_mask(i, *nodes_allowed) {
                struct page *page, *next;
                struct list_head *freel = &h->hugepage_freelists[i];
                list_for_each_entry_safe(page, next, freel, lru) {
                        if (count >= h->nr_huge_pages)
-                               return;
+                               goto out;
                        if (PageHighMem(page))
                                continue;
-                       list_del(&page->lru);
-                       update_and_free_page(h, page);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[page_to_nid(page)]--;
+                       remove_hugetlb_page(h, page, false);
+                       list_add(&page->lru, &page_list);
                }
        }
+
+out:
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
 }
 #else
 static inline void try_to_free_low(struct hstate *h, unsigned long count,
@@ -2583,6 +2719,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 {
        int nr_nodes, node;
 
+       lockdep_assert_held(&hugetlb_lock);
        VM_BUG_ON(delta != -1 && delta != 1);
 
        if (delta < 0) {
@@ -2610,6 +2747,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                              nodemask_t *nodes_allowed)
 {
        unsigned long min_count, ret;
+       struct page *page;
+       LIST_HEAD(page_list);
        NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
 
        /*
@@ -2622,7 +2761,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        else
                return -ENOMEM;
 
-       spin_lock(&hugetlb_lock);
+       /*
+        * resize_lock mutex prevents concurrent adjustments to number of
+        * pages in hstate via the proc/sysfs interfaces.
+        */
+       mutex_lock(&h->resize_lock);
+       spin_lock_irq(&hugetlb_lock);
 
        /*
         * Check for a node specific request.
@@ -2653,7 +2797,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         */
        if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                if (count > persistent_huge_pages(h)) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
+                       mutex_unlock(&h->resize_lock);
                        NODEMASK_FREE(node_alloc_noretry);
                        return -EINVAL;
                }
@@ -2682,14 +2827,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                 * page, free_huge_page will handle it by freeing the page
                 * and reducing the surplus.
                 */
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
 
                /* yield cpu to avoid soft lockup */
                cond_resched();
 
                ret = alloc_pool_huge_page(h, nodes_allowed,
                                                node_alloc_noretry);
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                if (!ret)
                        goto out;
 
@@ -2716,18 +2861,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
        min_count = max(count, min_count);
        try_to_free_low(h, min_count, nodes_allowed);
+
+       /*
+        * Collect pages to be removed on list without dropping lock
+        */
        while (min_count < persistent_huge_pages(h)) {
-               if (!free_pool_huge_page(h, nodes_allowed, 0))
+               page = remove_pool_huge_page(h, nodes_allowed, 0);
+               if (!page)
                        break;
-               cond_resched_lock(&hugetlb_lock);
+
+               list_add(&page->lru, &page_list);
        }
+       /* free the pages after dropping lock */
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
+
        while (count < persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, 1))
                        break;
        }
 out:
        h->max_huge_pages = persistent_huge_pages(h);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
+       mutex_unlock(&h->resize_lock);
 
        NODEMASK_FREE(node_alloc_noretry);
 
@@ -2882,9 +3039,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
        if (err)
                return err;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        h->nr_overcommit_huge_pages = input;
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
 
        return count;
 }
@@ -3215,6 +3372,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order == 0);
        h = &hstates[hugetlb_max_hstate++];
+       mutex_init(&h->resize_lock);
        h->order = order;
        h->mask = ~(huge_page_size(h) - 1);
        for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3425,10 @@ static int __init hugepages_setup(char *s)
 
        /*
         * Global state is always initialized later in hugetlb_init.
-        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * But we need to allocate gigantic hstates here early to still
         * use the bootmem allocator.
         */
-       if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+       if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
                hugetlb_hstate_alloc_pages(parsed_hstate);
 
        last_mhp = mhp;
@@ -3470,9 +3628,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                goto out;
 
        if (write) {
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                h->nr_overcommit_huge_pages = tmp;
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
        }
 out:
        return ret;
@@ -3568,7 +3726,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
        if (!delta)
                return 0;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        /*
         * When cpuset is configured, it breaks the strict hugetlb page
         * reservation as the accounting is done on a global variable. Such
@@ -3607,7 +3765,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
                return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return ret;
 }
 
@@ -3795,7 +3953,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, addr, sz);
+               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4310,6 +4468,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
        return 0;
 }
 
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+                                                 struct address_space *mapping,
+                                                 pgoff_t idx,
+                                                 unsigned int flags,
+                                                 unsigned long haddr,
+                                                 unsigned long reason)
+{
+       vm_fault_t ret;
+       u32 hash;
+       struct vm_fault vmf = {
+               .vma = vma,
+               .address = haddr,
+               .flags = flags,
+
+               /*
+                * Hard to debug if it ends up being
+                * used by a callee that assumes
+                * something about the other
+                * uninitialized fields... same as in
+                * memory.c
+                */
+       };
+
+       /*
+        * hugetlb_fault_mutex and i_mmap_rwsem must be
+        * dropped before handling userfault.  Reacquire
+        * after handling fault to make calling code simpler.
+        */
+       hash = hugetlb_fault_mutex_hash(mapping, idx);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
+       ret = handle_userfault(&vmf, reason);
+       i_mmap_lock_read(mapping);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       return ret;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
@@ -4348,35 +4544,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
-               /*
-                * Check for page in userfault range
-                */
+               /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
-                       u32 hash;
-                       struct vm_fault vmf = {
-                               .vma = vma,
-                               .address = haddr,
-                               .flags = flags,
-                               /*
-                                * Hard to debug if it ends up being
-                                * used by a callee that assumes
-                                * something about the other
-                                * uninitialized fields... same as in
-                                * memory.c
-                                */
-                       };
-
-                       /*
-                        * hugetlb_fault_mutex and i_mmap_rwsem must be
-                        * dropped before handling userfault.  Reacquire
-                        * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(mapping, idx);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       i_mmap_unlock_read(mapping);
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-                       i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MISSING);
                        goto out;
                }
 
@@ -4395,13 +4567,10 @@ retry:
                         * sure there really is no pte entry.
                         */
                        ptl = huge_pte_lock(h, mm, ptep);
-                       if (!huge_pte_none(huge_ptep_get(ptep))) {
-                               ret = 0;
-                               spin_unlock(ptl);
-                               goto out;
-                       }
+                       ret = 0;
+                       if (huge_pte_none(huge_ptep_get(ptep)))
+                               ret = vmf_error(PTR_ERR(page));
                        spin_unlock(ptl);
-                       ret = vmf_error(PTR_ERR(page));
                        goto out;
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4435,6 +4604,16 @@ retry:
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                        goto backout_unlocked;
                }
+
+               /* Check for page in userfault range. */
+               if (userfaultfd_minor(vma)) {
+                       unlock_page(page);
+                       put_page(page);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MINOR);
+                       goto out;
+               }
        }
 
        /*
@@ -4563,7 +4742,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        mapping = vma->vm_file->f_mapping;
        i_mmap_lock_read(mapping);
-       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
        if (!ptep) {
                i_mmap_unlock_read(mapping);
                return VM_FAULT_OOM;
@@ -4675,6 +4854,7 @@ out_mutex:
        return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
 /*
  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
  * modifications for huge pages.
@@ -4684,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
+                           enum mcopy_atomic_mode mode,
                            struct page **pagep)
 {
+       bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct address_space *mapping;
        pgoff_t idx;
        unsigned long size;
@@ -4695,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        spinlock_t *ptl;
        int ret;
        struct page *page;
+       int writable;
+
+       mapping = dst_vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 
-       if (!*pagep) {
+       if (is_continue) {
+               ret = -EFAULT;
+               page = find_lock_page(mapping, idx);
+               if (!page)
+                       goto out;
+       } else if (!*pagep) {
                ret = -ENOMEM;
                page = alloc_huge_page(dst_vma, dst_addr, 0);
                if (IS_ERR(page))
@@ -4725,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         */
        __SetPageUptodate(page);
 
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-       /*
-        * If shared, add to page cache
-        */
-       if (vm_shared) {
+       /* Add shared, newly allocated pages to the page cache. */
+       if (vm_shared && !is_continue) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                ret = -EFAULT;
                if (idx >= size)
@@ -4776,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
 
-       _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
-       if (dst_vma->vm_flags & VM_WRITE)
+       /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+       if (is_continue && !vm_shared)
+               writable = 0;
+       else
+               writable = dst_vma->vm_flags & VM_WRITE;
+
+       _dst_pte = make_huge_pte(dst_vma, page, writable);
+       if (writable)
                _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
 
@@ -4791,20 +4983,22 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
        spin_unlock(ptl);
-       SetHPageMigratable(page);
-       if (vm_shared)
+       if (!is_continue)
+               SetHPageMigratable(page);
+       if (vm_shared || is_continue)
                unlock_page(page);
        ret = 0;
 out:
        return ret;
 out_release_unlock:
        spin_unlock(ptl);
-       if (vm_shared)
+       if (vm_shared || is_continue)
                unlock_page(page);
 out_release_nounlock:
        put_page(page);
        goto out;
 }
+#endif /* CONFIG_USERFAULTFD */
 
 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
                                 int refs, struct page **pages,
@@ -4996,14 +5190,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        return i ? i : err;
 }
 
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
-#endif
-
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
 {
@@ -5280,6 +5466,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
        /*
         * If the subpool has a minimum size, the number of global
         * reservations to be released may be adjusted.
+        *
+        * Note that !resv_map implies freed == 0. So (chg - freed)
+        * won't go negative.
         */
        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
        hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5515,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
        return false;
 }
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+       if (uffd_disable_huge_pmd_share(vma))
+               return false;
+#endif
+       return vma_shareable(vma, addr);
+}
+
 /*
  * Determine if start,end range within vma could be mapped by shared pmd.
  * If yes, adjust start and end to cover range associated with possible
@@ -5338,8 +5536,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
 
        /*
-        * vma need span at least one aligned PUD size and the start,end range
-        * must at least partialy within it.
+        * vma needs to span at least one aligned PUD size, and the range
+        * must be at least partially within it.
         */
        if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
                (*end <= v_start) || (*start >= v_end))
@@ -5370,9 +5568,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
  * only required for subsequent processing.
  */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
-       struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
@@ -5382,9 +5580,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
-
        i_mmap_assert_locked(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
@@ -5448,9 +5643,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
 }
-#define want_pmd_share()       (1)
+
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
        return NULL;
 }
@@ -5465,11 +5661,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
 {
 }
-#define want_pmd_share()       (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+       return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
@@ -5487,8 +5687,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
-                       if (want_pmd_share() && pud_none(*pud))
-                               pte = huge_pmd_share(mm, addr, pud);
+                       if (want_pmd_share(vma, addr) && pud_none(*pud))
+                               pte = huge_pmd_share(mm, vma, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
@@ -5632,7 +5832,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 {
        bool ret = true;
 
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        if (!PageHeadHuge(page) ||
            !HPageMigratable(page) ||
            !get_page_unless_zero(page)) {
@@ -5642,16 +5842,16 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
        ClearHPageMigratable(page);
        list_move_tail(&page->lru, list);
 unlock:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        SetHPageMigratable(page);
        list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        put_page(page);
 }
 
@@ -5679,13 +5879,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                SetHPageTemporary(oldpage);
                ClearHPageTemporary(newpage);
 
-               spin_lock(&hugetlb_lock);
+               /*
+                * There is no need to transfer the per-node surplus state
+                * when we do not cross the node.
+                */
+               if (new_nid == old_nid)
+                       return;
+               spin_lock_irq(&hugetlb_lock);
                if (h->surplus_huge_pages_node[old_nid]) {
                        h->surplus_huge_pages_node[old_nid]--;
                        h->surplus_huge_pages_node[new_nid]++;
                }
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
+       }
+}
+
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_notifier_range range;
+       unsigned long address, start, end;
+       spinlock_t *ptl;
+       pte_t *ptep;
+
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       start = ALIGN(vma->vm_start, PUD_SIZE);
+       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+       if (start >= end)
+               return;
+
+       /*
+        * No need to call adjust_range_if_pmd_sharing_possible(), because
+        * we have already done the PUD_SIZE alignment.
+        */
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               start, end);
+       mmu_notifier_invalidate_range_start(&range);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+       for (address = start; address < end; address += PUD_SIZE) {
+               unsigned long tmp = address;
+
+               ptep = huge_pte_offset(mm, address, sz);
+               if (!ptep)
+                       continue;
+               ptl = huge_pte_lock(h, mm, ptep);
+               /* We don't want 'address' to be changed */
+               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               spin_unlock(ptl);
        }
+       flush_hugetlb_tlb_range(vma, start, end);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       /*
+        * No need to call mmu_notifier_invalidate_range(), see
+        * Documentation/vm/mmu_notifier.rst.
+        */
+       mmu_notifier_invalidate_range_end(&range);
 }
 
 #ifdef CONFIG_CMA
index 603a131..5383023 100644 (file)
@@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
        do {
                idx = 0;
                for_each_hstate(h) {
-                       spin_lock(&hugetlb_lock);
+                       spin_lock_irq(&hugetlb_lock);
                        list_for_each_entry(page, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(idx, h_cg, page);
 
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                        idx++;
                }
                cond_resched();
@@ -784,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
        if (hugetlb_cgroup_disabled())
                return;
 
-       VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
        h_cg = hugetlb_cgroup_from_page(oldhpage);
        h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
        set_hugetlb_cgroup(oldhpage, NULL);
@@ -795,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
        set_hugetlb_cgroup(newhpage, h_cg);
        set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
        list_move(&newhpage->lru, &h->hugepage_activelist);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
        return;
 }
 
index ef5f336..54bd0dc 100644 (file)
@@ -244,7 +244,13 @@ struct compact_control {
        unsigned int nr_freepages;      /* Number of isolated free pages */
        unsigned int nr_migratepages;   /* Number of pages to migrate */
        unsigned long free_pfn;         /* isolate_freepages search base */
-       unsigned long migrate_pfn;      /* isolate_migratepages search base */
+       /*
+        * Acts as an in/out parameter to page isolation for migration.
+        * isolate_migratepages uses it as a search base.
+        * isolate_migratepages_block will update the value to the next pfn
+        * after the last isolated one.
+        */
+       unsigned long migrate_pfn;
        unsigned long fast_start_pfn;   /* a pfn to start linear scan from */
        struct zone *zone;
        unsigned long total_migrate_scanned;
@@ -280,7 +286,7 @@ struct capture_control {
 unsigned long
 isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
-unsigned long
+int
 isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);
 int find_suitable_fallback(struct free_area *area, unsigned int order,
@@ -328,7 +334,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
 }
 
 /*
- * Stack area - atomatically grows in one direction
+ * Stack area - automatically grows in one direction
  *
  * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
  * do_mmap() forbids all other combinations.
index 3820ca5..8f450bc 100644 (file)
@@ -55,9 +55,9 @@ extern bool kasan_flag_async __ro_after_init;
 #define KASAN_TAG_MAX          0xFD /* maximum value for random tags */
 
 #ifdef CONFIG_KASAN_HW_TAGS
-#define KASAN_TAG_MIN          0xF0 /* mimimum value for random tags */
+#define KASAN_TAG_MIN          0xF0 /* minimum value for random tags */
 #else
-#define KASAN_TAG_MIN          0x00 /* mimimum value for random tags */
+#define KASAN_TAG_MIN          0x00 /* minimum value for random tags */
 #endif
 
 #ifdef CONFIG_KASAN_GENERIC
@@ -403,7 +403,7 @@ static inline bool kasan_byte_accessible(const void *addr)
 #else /* CONFIG_KASAN_HW_TAGS */
 
 /**
- * kasan_poison - mark the memory range as unaccessible
+ * kasan_poison - mark the memory range as inaccessible
  * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
  * @size - range size, must be aligned to KASAN_GRANULE_SIZE
  * @value - value that's written to metadata for the range
@@ -434,7 +434,7 @@ bool kasan_byte_accessible(const void *addr);
 
 /**
  * kasan_poison_last_granule - mark the last granule of the memory range as
- * unaccessible
+ * inaccessible
  * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
  * @size - range size
  *
index 728fb24..d8ccff4 100644 (file)
@@ -27,7 +27,7 @@
 /* Data structure and operations for quarantine queues. */
 
 /*
- * Each queue is a signle-linked list, which also stores the total size of
+ * Each queue is a singly-linked list, which also stores the total size of
  * objects inside of it.
  */
 struct qlist_head {
@@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
                local_irq_save(flags);
 
        /*
-        * As the object now gets freed from the quaratine, assume that its
+        * As the object now gets freed from the quarantine, assume that its
         * free track is no longer valid.
         */
        *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
index 727ad46..082ee5b 100644 (file)
@@ -316,7 +316,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
         * // rest of vmalloc process           <data dependency>
         * STORE p, a                           LOAD shadow(x+99)
         *
-        * If there is no barrier between the end of unpoisioning the shadow
+        * If there is no barrier between the end of unpoisoning the shadow
         * and the store of the result to p, the stores could be committed
         * in a different order by CPU#0, and CPU#1 could erroneously observe
         * poison in the shadow.
@@ -384,7 +384,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  * How does this work?
  * -------------------
  *
- * We have a region that is page aligned, labelled as A.
+ * We have a region that is page aligned, labeled as A.
  * That might not map onto the shadow in a way that is page-aligned:
  *
  *                    start                     end
index d53c91f..e18fbbd 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/debugfs.h>
+#include <linux/irq_work.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
@@ -19,6 +20,7 @@
 #include <linux/moduleparam.h>
 #include <linux/random.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/sysctl.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -372,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 
        /* Restore page protection if there was an OOB access. */
        if (meta->unprotected_page) {
+               memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
                kfence_protect(meta->unprotected_page);
                meta->unprotected_page = 0;
        }
@@ -586,6 +589,17 @@ late_initcall(kfence_debugfs_init);
 
 /* === Allocation Gate Timer ================================================ */
 
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
+static void wake_up_kfence_timer(struct irq_work *work)
+{
+       wake_up(&allocation_wait);
+}
+static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
+#endif
+
 /*
  * Set up delayed work, which will enable and disable the static key. We need to
  * use a work queue (rather than a simple timer), since enabling and disabling a
@@ -603,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work)
        if (!READ_ONCE(kfence_enabled))
                return;
 
-       /* Enable static key, and await allocation to happen. */
        atomic_set(&kfence_allocation_gate, 0);
 #ifdef CONFIG_KFENCE_STATIC_KEYS
+       /* Enable static key, and await allocation to happen. */
        static_branch_enable(&kfence_allocation_key);
-       /*
-        * Await an allocation. Timeout after 1 second, in case the kernel stops
-        * doing allocations, to avoid stalling this worker task for too long.
-        */
-       {
-               unsigned long end_wait = jiffies + HZ;
-
-               do {
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       if (atomic_read(&kfence_allocation_gate) != 0)
-                               break;
-                       schedule_timeout(1);
-               } while (time_before(jiffies, end_wait));
-               __set_current_state(TASK_RUNNING);
+
+       if (sysctl_hung_task_timeout_secs) {
+               /*
+                * During low activity with no allocations we might wait a
+                * while; let's avoid the hung task warning.
+                */
+               wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
+                                  sysctl_hung_task_timeout_secs * HZ / 2);
+       } else {
+               wait_event(allocation_wait, atomic_read(&kfence_allocation_gate));
        }
+
        /* Disable static key and reset timer. */
        static_branch_disable(&kfence_allocation_key);
 #endif
-       schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+       queue_delayed_work(system_power_efficient_wq, &kfence_timer,
+                          msecs_to_jiffies(kfence_sample_interval));
 }
 static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
 
@@ -654,7 +666,7 @@ void __init kfence_init(void)
        }
 
        WRITE_ONCE(kfence_enabled, true);
-       schedule_delayed_work(&kfence_timer, 0);
+       queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
        pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
                CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
                (void *)(__kfence_pool + KFENCE_POOL_SIZE));
@@ -728,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
         */
        if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
                return NULL;
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+       /*
+        * waitqueue_active() is fully ordered after the update of
+        * kfence_allocation_gate per atomic_inc_return().
+        */
+       if (waitqueue_active(&allocation_wait)) {
+               /*
+                * Calling wake_up() here may deadlock when allocations happen
+                * from within timer code. Use an irq_work to defer it.
+                */
+               irq_work_queue(&wake_up_kfence_timer_work);
+       }
+#endif
 
        if (!READ_ONCE(kfence_enabled))
                return NULL;
index e3f7145..2a319c2 100644 (file)
@@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
        if (panic_on_warn)
                panic("panic_on_warn set ...\n");
 
-       /* We encountered a memory unsafety error, taint the kernel! */
+       /* We encountered a memory safety error, taint the kernel! */
        add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
 }
index a7d6cb9..6c0185f 100644 (file)
@@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm)
                return -ENOMEM;
 
        /* __khugepaged_exit() must not run from under us */
-       VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+       VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
                free_mm_slot(mm_slot);
                return 0;
@@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 *
                 * The page table that maps the page has been already unlinked
                 * from the page table tree and this process cannot get
-                * an additinal pin on the page.
+                * an additional pin on the page.
                 *
                 * New pins can come later if the page is shared across fork,
                 * but not from this process. The other process cannot write to
@@ -716,17 +716,17 @@ next:
                if (pte_write(pteval))
                        writable = true;
        }
-       if (likely(writable)) {
-               if (likely(referenced)) {
-                       result = SCAN_SUCCEED;
-                       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
-                                                           referenced, writable, result);
-                       return 1;
-               }
-       } else {
+
+       if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
+       } else if (unlikely(!referenced)) {
+               result = SCAN_LACK_REFERENCED_PAGE;
+       } else {
+               result = SCAN_SUCCEED;
+               trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+                                                   referenced, writable, result);
+               return 1;
        }
-
 out:
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(page, none_or_zero,
@@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid)
         * If node_reclaim_mode is disabled, then no extra effort is made to
         * allocate memory locally.
         */
-       if (!node_reclaim_mode)
+       if (!node_reclaim_enabled())
                return false;
 
        /* If there is a count for this node already, it must be acceptable */
@@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm,
        mmap_write_lock(mm);
        result = hugepage_vma_revalidate(mm, address, &vma);
        if (result)
-               goto out;
+               goto out_up_write;
        /* check if the pmd is still valid */
        if (mm_find_pmd(mm, address) != pmd)
-               goto out;
+               goto out_up_write;
 
        anon_vma_lock_write(vma->anon_vma);
 
@@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
                result = SCAN_FAIL;
-               goto out;
+               goto out_up_write;
        }
 
        /*
@@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm,
        __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
                        &compound_pagelist);
        pte_unmap(pte);
+       /*
+        * spin_lock() below is not the equivalent of smp_wmb(), but
+        * the smp_wmb() inside __SetPageUptodate() can be reused to
+        * avoid the copy_huge_page writes to become visible after
+        * the set_pmd_at() write.
+        */
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
 
        _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
-       /*
-        * spin_lock() below is not the equivalent of smp_wmb(), so
-        * this is needed to avoid the copy_huge_page writes to become
-        * visible after the set_pmd_at() write.
-        */
-       smp_wmb();
-
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address, true);
@@ -1216,8 +1215,6 @@ out_nolock:
                mem_cgroup_uncharge(*hpage);
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
-out:
-       goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                                goto out_unmap;
                        }
                }
-               if (!pte_present(pteval)) {
-                       result = SCAN_PTE_NON_PRESENT;
-                       goto out_unmap;
-               }
                if (pte_uffd_wp(pteval)) {
                        /*
                         * Don't collapse the page if any of the small
@@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
        int i;
 
        if (!vma || !vma->vm_file ||
-           vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+           !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
                return;
 
        /*
@@ -1533,16 +1526,16 @@ abort:
        goto drop_hpage;
 }
 
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 {
        struct mm_struct *mm = mm_slot->mm;
        int i;
 
        if (likely(mm_slot->nr_pte_mapped_thp == 0))
-               return 0;
+               return;
 
        if (!mmap_write_trylock(mm))
-               return -EBUSY;
+               return;
 
        if (unlikely(khugepaged_test_exit(mm)))
                goto out;
@@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 out:
        mm_slot->nr_pte_mapped_thp = 0;
        mmap_write_unlock(mm);
-       return 0;
 }
 
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
@@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
        BUILD_BUG();
 }
 
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 {
-       return 0;
 }
 #endif
 
@@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void)
 {
        struct page *hpage = NULL;
        unsigned int progress = 0, pass_through_head = 0;
-       unsigned int pages = khugepaged_pages_to_scan;
+       unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
        bool wait = true;
 
-       barrier(); /* write khugepaged_pages_to_scan to local stack */
-
        lru_add_drain_all();
 
        while (progress < pages) {
index 9694ee2..6bbe314 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -215,8 +215,6 @@ struct rmap_item {
 #define SEQNR_MASK     0x0ff   /* low bits of unstable tree seqnr */
 #define UNSTABLE_FLAG  0x100   /* is a node of the unstable tree */
 #define STABLE_FLAG    0x200   /* is listed from the stable tree */
-#define KSM_FLAG_MASK  (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
-                               /* to mask all the flags */
 
 /* The stable and unstable tree heads */
 static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -461,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
- * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ * mmap of /dev/mem, where we would not want to touch it.
  *
  * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
  * of the process that owns 'vma'.  We also do not want to enforce
@@ -778,12 +776,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                struct page *page;
 
                stable_node = rmap_item->head;
-               page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
+               page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
                if (!page)
                        goto out;
 
                hlist_del(&rmap_item->hlist);
-               unlock_page(page);
                put_page(page);
 
                if (!hlist_empty(&stable_node->hlist))
@@ -794,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                stable_node->rmap_hlist_len--;
 
                put_anon_vma(rmap_item->anon_vma);
+               rmap_item->head = NULL;
                rmap_item->address &= PAGE_MASK;
 
        } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -817,8 +815,7 @@ out:
        cond_resched();         /* we're called from many long loops */
 }
 
-static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
-                                      struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
 {
        while (*rmap_list) {
                struct rmap_item *rmap_item = *rmap_list;
@@ -989,7 +986,7 @@ static int unmerge_and_remove_all_rmap_items(void)
                                goto error;
                }
 
-               remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+               remove_trailing_rmap_items(&mm_slot->rmap_list);
                mmap_read_unlock(mm);
 
                spin_lock(&ksm_mmlist_lock);
@@ -1068,7 +1065,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                /*
                 * Ok this is tricky, when get_user_pages_fast() run it doesn't
                 * take any lock, therefore the check that we are going to make
-                * with the pagecount against the mapcount is racey and
+                * with the pagecount against the mapcount is racy and
                 * O_DIRECT can happen right after the check.
                 * So we clear the pte and flush the tlb before the check
                 * this assure us that no O_DIRECT can happen after the check
@@ -1438,7 +1435,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
                         */
                        *_stable_node = found;
                        /*
-                        * Just for robustneess as stable_node is
+                        * Just for robustness, as stable_node is
                         * otherwise left as a stable pointer, the
                         * compiler shall optimize it away at build
                         * time.
@@ -1771,7 +1768,6 @@ chain_append:
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
-               VM_BUG_ON(is_stable_node_chain(stable_node_dup));
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* chain is missing so create it */
                stable_node = alloc_stable_node_chain(stable_node_dup,
@@ -1785,7 +1781,6 @@ chain_append:
         * of the current nid for this page
         * content.
         */
-       VM_BUG_ON(!is_stable_node_chain(stable_node));
        VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
        VM_BUG_ON(page_node->head != &migrate_nodes);
        list_del(&page_node->list);
@@ -2337,7 +2332,7 @@ next_mm:
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
-       remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+       remove_trailing_rmap_items(ksm_scan.rmap_list);
 
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -2634,7 +2629,7 @@ again:
                        vma = vmac->vma;
 
                        /* Ignore the stable/unstable/sqnr flags */
-                       addr = rmap_item->address & ~KSM_FLAG_MASK;
+                       addr = rmap_item->address & PAGE_MASK;
 
                        if (addr < vma->vm_start || addr >= vma->vm_end)
                                continue;
index 6f067b6..cd58790 100644 (file)
@@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
                list_add_tail(item, &l->list);
                /* Set shrinker bit if the first element was added */
                if (!l->nr_items++)
-                       memcg_set_shrinker_bit(memcg, nid,
-                                              lru_shrinker_id(lru));
+                       set_shrinker_bit(memcg, nid,
+                                        lru_shrinker_id(lru));
                nlru->nr_items++;
                spin_unlock(&nlru->lock);
                return true;
@@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
 
        if (src->nr_items) {
                dst->nr_items += src->nr_items;
-               memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+               set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
                src->nr_items = 0;
        }
 
index 01fef79..63e489e 100644 (file)
@@ -799,7 +799,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                if (end > vma->vm_end) {
                        /*
                         * Don't fail if end > vma->vm_end. If the old
-                        * vma was splitted while the mmap_lock was
+                        * vma was split while the mmap_lock was
                         * released the effect of the concurrent
                         * operation may not cause madvise() to
                         * have an undefined result. There may be an
@@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior)
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
  *  MADV_COLD - the application is not expected to use this memory soon,
  *             deactivate pages in this range so that they can be reclaimed
- *             easily if memory pressure hanppens.
+ *             easily if memory pressure happens.
  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
  *             page out the pages in this range immediately.
  *
index c100265..64ada9e 100644 (file)
@@ -215,7 +215,7 @@ enum res_type {
 #define MEMFILE_PRIVATE(x, val)        ((x) << 16 | (val))
 #define MEMFILE_TYPE(val)      ((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)      ((val) & 0xffff)
-/* Used for OOM nofiier */
+/* Used for OOM notifier */
 #define OOM_CONTROL            (0)
 
 /*
@@ -400,130 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 #endif
 
-static int memcg_shrinker_map_size;
-static DEFINE_MUTEX(memcg_shrinker_map_mutex);
-
-static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
-{
-       kvfree(container_of(head, struct memcg_shrinker_map, rcu));
-}
-
-static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
-                                        int size, int old_size)
-{
-       struct memcg_shrinker_map *new, *old;
-       struct mem_cgroup_per_node *pn;
-       int nid;
-
-       lockdep_assert_held(&memcg_shrinker_map_mutex);
-
-       for_each_node(nid) {
-               pn = memcg->nodeinfo[nid];
-               old = rcu_dereference_protected(pn->shrinker_map, true);
-               /* Not yet online memcg */
-               if (!old)
-                       return 0;
-
-               new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
-               if (!new)
-                       return -ENOMEM;
-
-               /* Set all old bits, clear all new bits */
-               memset(new->map, (int)0xff, old_size);
-               memset((void *)new->map + old_size, 0, size - old_size);
-
-               rcu_assign_pointer(pn->shrinker_map, new);
-               call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
-       }
-
-       return 0;
-}
-
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
-{
-       struct mem_cgroup_per_node *pn;
-       struct memcg_shrinker_map *map;
-       int nid;
-
-       if (mem_cgroup_is_root(memcg))
-               return;
-
-       for_each_node(nid) {
-               pn = memcg->nodeinfo[nid];
-               map = rcu_dereference_protected(pn->shrinker_map, true);
-               kvfree(map);
-               rcu_assign_pointer(pn->shrinker_map, NULL);
-       }
-}
-
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-       struct memcg_shrinker_map *map;
-       int nid, size, ret = 0;
-
-       if (mem_cgroup_is_root(memcg))
-               return 0;
-
-       mutex_lock(&memcg_shrinker_map_mutex);
-       size = memcg_shrinker_map_size;
-       for_each_node(nid) {
-               map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
-               if (!map) {
-                       memcg_free_shrinker_maps(memcg);
-                       ret = -ENOMEM;
-                       break;
-               }
-               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
-       }
-       mutex_unlock(&memcg_shrinker_map_mutex);
-
-       return ret;
-}
-
-int memcg_expand_shrinker_maps(int new_id)
-{
-       int size, old_size, ret = 0;
-       struct mem_cgroup *memcg;
-
-       size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
-       old_size = memcg_shrinker_map_size;
-       if (size <= old_size)
-               return 0;
-
-       mutex_lock(&memcg_shrinker_map_mutex);
-       if (!root_mem_cgroup)
-               goto unlock;
-
-       for_each_mem_cgroup(memcg) {
-               if (mem_cgroup_is_root(memcg))
-                       continue;
-               ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
-               if (ret) {
-                       mem_cgroup_iter_break(NULL, memcg);
-                       goto unlock;
-               }
-       }
-unlock:
-       if (!ret)
-               memcg_shrinker_map_size = size;
-       mutex_unlock(&memcg_shrinker_map_mutex);
-       return ret;
-}
-
-void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
-       if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
-               struct memcg_shrinker_map *map;
-
-               rcu_read_lock();
-               map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
-               /* Pairs with smp mb in shrink_slab() */
-               smp_mb__before_atomic();
-               set_bit(shrinker_id, map->map);
-               rcu_read_unlock();
-       }
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -910,7 +786,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
  * __count_memcg_events - account VM events in a cgroup
  * @memcg: the memory cgroup
  * @idx: the event item
- * @count: the number of events that occured
+ * @count: the number of events that occurred
  */
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count)
@@ -1028,7 +904,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
        rcu_read_lock();
        do {
                /*
-                * Page cache insertions can happen withou an
+                * Page cache insertions can happen without an
                 * actual mm context, e.g. during disk probing
                 * on boot, loopback IO, acct() writes etc.
                 */
@@ -1836,7 +1712,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
        struct mem_cgroup *iter;
 
        /*
-        * Be careful about under_oom underflows becase a child memcg
+        * Be careful about under_oom underflows because a child memcg
         * could have been added after mem_cgroup_mark_under_oom.
         */
        spin_lock(&memcg_oom_lock);
@@ -2008,7 +1884,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
                /*
                 * There is no guarantee that an OOM-lock contender
                 * sees the wakeups triggered by the OOM kill
-                * uncharges.  Wake any sleepers explicitely.
+                * uncharges.  Wake any sleepers explicitly.
                 */
                memcg_oom_recover(memcg);
        }
@@ -4488,7 +4364,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * Foreign dirty flushing
  *
  * There's an inherent mismatch between memcg and writeback.  The former
- * trackes ownership per-page while the latter per-inode.  This was a
+ * tracks ownership per-page while the latter per-inode.  This was a
  * deliberate design decision because honoring per-page ownership in the
  * writeback path is complicated, may lead to higher CPU and IO overheads
  * and deemed unnecessary given that write-sharing an inode across
@@ -4503,9 +4379,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * triggering background writeback.  A will be slowed down without a way to
  * make writeback of the dirty pages happen.
  *
- * Conditions like the above can lead to a cgroup getting repatedly and
+ * Conditions like the above can lead to a cgroup getting repeatedly and
  * severely throttled after making some progress after each
- * dirty_expire_interval while the underyling IO device is almost
+ * dirty_expire_interval while the underlying IO device is almost
  * completely idle.
  *
  * Solving this problem completely requires matching the ownership tracking
@@ -5242,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
        /*
-        * A memcg must be visible for memcg_expand_shrinker_maps()
+        * A memcg must be visible for expand_shrinker_info()
         * by the time the maps are allocated. So, we allocate maps
         * here, when for_each_mem_cgroup() can't skip it.
         */
-       if (memcg_alloc_shrinker_maps(memcg)) {
+       if (alloc_shrinker_info(memcg)) {
                mem_cgroup_id_remove(memcg);
                return -ENOMEM;
        }
@@ -5278,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        page_counter_set_low(&memcg->memory, 0);
 
        memcg_offline_kmem(memcg);
+       reparent_shrinker_deferred(memcg);
        wb_memcg_offline(memcg);
 
        drain_all_stock(memcg);
@@ -5310,7 +5187,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
        vmpressure_cleanup(&memcg->vmpressure);
        cancel_work_sync(&memcg->high_work);
        mem_cgroup_remove_from_trees(memcg);
-       memcg_free_shrinker_maps(memcg);
+       free_shrinker_info(memcg);
        memcg_free_kmem(memcg);
        mem_cgroup_free(memcg);
 }
@@ -5897,7 +5774,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
                return 0;
 
        /*
-        * We are now commited to this value whatever it is. Changes in this
+        * We are now committed to this value whatever it is. Changes in this
         * tunable will only affect upcoming migrations, not the current one.
         * So we need to save it, and keep it going.
         */
index bd39454..85ad98c 100644 (file)
@@ -75,7 +75,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
                if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
                        /*
                         * We could fail to take off the target page from buddy
-                        * for example due to racy page allocaiton, but that's
+                        * for example due to racy page allocation, but that's
                         * acceptable because soft-offlined page is not broken
                         * and if someone really want to use it, they should
                         * take it.
index cbdc2cd..730daa0 100644 (file)
@@ -3339,7 +3339,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }
 
 
-       delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry, vma, vmf->address);
        swapcache = page;
 
@@ -3388,7 +3388,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                                        vmf->address, &vmf->ptl);
                        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
-                       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                        goto unlock;
                }
 
@@ -3402,13 +3402,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
-               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                goto out_release;
        }
 
        locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
-       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
                goto out_release;
@@ -3727,7 +3727,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                return ret;
 
        /*
-        * Archs like ppc64 need additonal space to store information
+        * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -4503,7 +4503,7 @@ retry_pud:
 }
 
 /**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
  *
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
@@ -4512,9 +4512,9 @@ retry_pud:
  * @flags: the fault flags.
  * @ret: the fault retcode.
  *
- * This will take care of most of the page fault accountings.  Meanwhile, it
+ * This will take care of most of the page fault accounting.  Meanwhile, it
  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
 static inline void mm_account_fault(struct pt_regs *regs,
@@ -4848,7 +4848,7 @@ out:
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
  * @buf: buffer to read/write
  * @len: length of transfer
  * @write: set to FOLL_WRITE when writing, otherwise reading
index 0cdbbfb..70620d0 100644 (file)
 #include "internal.h"
 #include "shuffle.h"
 
+
+/*
+ * memory_hotplug.memmap_on_memory parameter
+ */
+static bool memmap_on_memory __ro_after_init;
+#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
+module_param(memmap_on_memory, bool, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+#endif
+
 /*
  * online_page_callback contains pointer to current page onlining function.
  * Initially it is generic_online_page(). If it is required it could be
@@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
         * decide to not expose all pages to the buddy (e.g., expose them
         * later). We account all pages as being online and belonging to this
         * zone ("present").
+        * When using memmap_on_memory, the range might not be aligned to
+        * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
+        * this and the first chunk to online will be pageblock_nr_pages.
         */
-       for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
-               (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+       for (pfn = start_pfn; pfn < end_pfn;) {
+               int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+
+               (*online_page_callback)(pfn_to_page(pfn), order);
+               pfn += (1UL << order);
+       }
 
        /* mark all involved sections as online */
        online_mem_sections(start_pfn, end_pfn);
@@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
        return movable_node_enabled ? movable_zone : kernel_zone;
 }
 
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
                unsigned long nr_pages)
 {
        if (online_type == MMOP_ONLINE_KERNEL)
@@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
        return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-                      int online_type, int nid)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
+{
+       unsigned long flags;
+
+       zone->present_pages += nr_pages;
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       zone->zone_pgdat->node_present_pages += nr_pages;
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+                             struct zone *zone)
+{
+       unsigned long end_pfn = pfn + nr_pages;
+       int ret;
+
+       ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+       if (ret)
+               return ret;
+
+       move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+       /*
+        * It might be that the vmemmap_pages fully span sections. If that is
+        * the case, mark those sections online here as otherwise they will be
+        * left offline.
+        */
+       if (nr_pages >= PAGES_PER_SECTION)
+               online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+       return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+       unsigned long end_pfn = pfn + nr_pages;
+
+       /*
+        * It might be that the vmemmap_pages fully span sections. If that is
+        * the case, mark those sections offline here as otherwise they will be
+        * left online.
+        */
+       if (nr_pages >= PAGES_PER_SECTION)
+               offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+        /*
+        * The pages associated with this vmemmap have been offlined, so
+        * we can reset its state here.
+        */
+       remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+       kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
 {
        unsigned long flags;
-       struct zone *zone;
        int need_zonelists_rebuild = 0;
+       const int nid = zone_to_nid(zone);
        int ret;
        struct memory_notify arg;
 
-       /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+       /*
+        * {on,off}lining is constrained to full memory sections (or more
+        * precisely to memory blocks from the user space POV).
+        * memmap_on_memory is an exception because it reserves initial part
+        * of the physical memory space for vmemmaps. That space is pageblock
+        * aligned.
+        */
        if (WARN_ON_ONCE(!nr_pages ||
-                        !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+                        !IS_ALIGNED(pfn, pageblock_nr_pages) ||
+                        !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
                return -EINVAL;
 
        mem_hotplug_begin();
 
        /* associate pfn range with the zone */
-       zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
        move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
 
        arg.start_pfn = pfn;
@@ -877,11 +956,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
        }
 
        online_pages_range(pfn, nr_pages);
-       zone->present_pages += nr_pages;
-
-       pgdat_resize_lock(zone->zone_pgdat, &flags);
-       zone->zone_pgdat->node_present_pages += nr_pages;
-       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+       adjust_present_page_count(zone, nr_pages);
 
        node_states_set_node(nid, &arg);
        if (need_zonelists_rebuild)
@@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
        return device_online(&mem->dev);
 }
 
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+       unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+       unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+       unsigned long remaining_size = size - vmemmap_size;
+
+       /*
+        * Besides having arch support and the feature enabled at runtime, we
+        * need a few more assumptions to hold true:
+        *
+        * a) We span a single memory block: memory onlining/offlining happens
+        *    in memory block granularity. We don't want the vmemmap of online
+        *    memory blocks to reside on offline memory blocks. In the future,
+        *    we might want to support variable-sized memory blocks to make the
+        *    feature more versatile.
+        *
+        * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+        *    to populate memory from the altmap for unrelated parts (i.e.,
+        *    other memory blocks)
+        *
+        * c) The vmemmap pages (and thereby the pages that will be exposed to
+        *    the buddy) have to cover full pageblocks: memory onlining/offlining
+        *    code requires applicable ranges to be page-aligned, for example, to
+        *    set the migratetypes properly.
+        *
+        * TODO: Although we have a check here to make sure that vmemmap pages
+        *       fully populate a PMD, it is not the right place to check for
+        *       this. A much better solution involves improving vmemmap code
+        *       to fall back to base pages when trying to populate vmemmap using
+        *       altmap as an alternative source of memory, and we do not exactly
+        *       populate a single PMD.
+        */
+       return memmap_on_memory &&
+              IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+              size == memory_block_size_bytes() &&
+              IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+              IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
  * and online/offline operations (triggered e.g. by sysfs).
@@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
        struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+       struct vmem_altmap mhp_altmap = {};
        u64 start, size;
        bool new_node = false;
        int ret;
@@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
                goto error;
        new_node = ret;
 
+       /*
+        * Self hosted memmap array
+        */
+       if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+               if (!mhp_supports_memmap_on_memory(size)) {
+                       ret = -EINVAL;
+                       goto error;
+               }
+               mhp_altmap.free = PHYS_PFN(size);
+               mhp_altmap.base_pfn = PHYS_PFN(start);
+               params.altmap = &mhp_altmap;
+       }
+
        /* call arch's memory hotadd */
        ret = arch_add_memory(nid, start, size, &params);
        if (ret < 0)
                goto error;
 
        /* create memory block devices after memory was added */
-       ret = create_memory_block_devices(start, size);
+       ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
        if (ret) {
                arch_remove_memory(nid, start, size, NULL);
                goto error;
@@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
        int ret, node;
        char *reason;
 
-       /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+       /*
+        * {on,off}lining is constrained to full memory sections (or more
+        * precisely to memory blocks from the user space POV).
+        * memmap_on_memory is an exception because it reserves initial part
+        * of the physical memory space for vmemmaps. That space is pageblock
+        * aligned.
+        */
        if (WARN_ON_ONCE(!nr_pages ||
-                        !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+                        !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
+                        !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
                return -EINVAL;
 
        mem_hotplug_begin();
@@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
         * in a way that pages from isolated pageblock are left on pcplists.
         */
        zone_pcp_disable(zone);
+       lru_cache_disable();
 
        /* set above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn,
@@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
                        }
 
                        cond_resched();
-                       lru_add_drain_all();
 
                        ret = scan_movable_pages(pfn, end_pfn, &pfn);
                        if (!ret) {
@@ -1687,15 +1822,12 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
        zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
        spin_unlock_irqrestore(&zone->lock, flags);
 
+       lru_cache_enable();
        zone_pcp_enable(zone);
 
        /* removal success */
        adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
-       zone->present_pages -= nr_pages;
-
-       pgdat_resize_lock(zone->zone_pgdat, &flags);
-       zone->zone_pgdat->node_present_pages -= nr_pages;
-       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+       adjust_present_page_count(zone, -nr_pages);
 
        init_per_zone_wmark_min();
 
@@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
        return 0;
 }
 
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+       /*
+        * If not set, continue with the next block.
+        */
+       return mem->nr_vmemmap_pages;
+}
+
 static int check_cpu_on_node(pg_data_t *pgdat)
 {
        int cpu;
@@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
        int rc = 0;
+       struct vmem_altmap mhp_altmap = {};
+       struct vmem_altmap *altmap = NULL;
+       unsigned long nr_vmemmap_pages;
 
        BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
        if (rc)
                return rc;
 
+       /*
+        * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+        * the same granularity it was added - a single memory block.
+        */
+       if (memmap_on_memory) {
+               nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+                                                     get_nr_vmemmap_pages_cb);
+               if (nr_vmemmap_pages) {
+                       if (size != memory_block_size_bytes()) {
+                               pr_warn("Refuse to remove %#llx - %#llx,"
+                                       "wrong granularity\n",
+                                       start, start + size);
+                               return -EINVAL;
+                       }
+
+                       /*
+                        * Let remove_pmd_table->free_hugepage_table do the
+                        * right thing if we used vmem_altmap when hot-adding
+                        * the range.
+                        */
+                       mhp_altmap.alloc = nr_vmemmap_pages;
+                       altmap = &mhp_altmap;
+               }
+       }
+
        /* remove memmap entry */
        firmware_map_remove(start, start + size, "System RAM");
 
@@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 
        mem_hotplug_begin();
 
-       arch_remove_memory(nid, start, size, NULL);
+       arch_remove_memory(nid, start, size, altmap);
 
        if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
                memblock_free(start, size);
index cd02955..d79fa29 100644 (file)
@@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
        else if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
        else {
-               nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+               nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
                                                                *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }
@@ -994,7 +994,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, lookup_node()
-                        * wil drop the mmap_lock, so after calling
+                        * will drop the mmap_lock, so after calling
                         * lookup_node() only "pol" remains valid, "vma"
                         * is stale.
                         */
@@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
        int err = 0;
        nodemask_t tmp;
 
-       migrate_prep();
+       lru_cache_disable();
 
        mmap_read_lock(mm);
 
@@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
        tmp = *from;
        while (!nodes_empty(tmp)) {
-               int s,d;
+               int s, d;
                int source = NUMA_NO_NODE;
                int dest = 0;
 
@@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                        break;
        }
        mmap_read_unlock(mm);
+
+       lru_cache_enable();
        if (err < 0)
                return err;
        return busy;
@@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-               migrate_prep();
+               lru_cache_disable();
        }
        {
                NODEMASK_SCRATCH(scratch);
@@ -1371,6 +1373,8 @@ up_out:
        mmap_write_unlock(mm);
 mpol_out:
        mpol_put(new);
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+               lru_cache_enable();
        return err;
 }
 
@@ -1863,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->v.nodes is intersect with node_states[N_MEMORY].
-        * so if the following test faile, it implies
+        * so if the following test fails, it implies
         * policy->v.nodes has movable memory only.
         */
        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
@@ -2094,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
  *
  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
  * policy.  Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
+ * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
  * policy, always return true since it may allocate elsewhere on fallback.
  *
  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
index fe19d29..a258cf4 100644 (file)
@@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init);
 mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                                mempool_free_t *free_fn, void *pool_data)
 {
-       return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+       return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
                                   GFP_KERNEL, NUMA_NO_NODE);
 }
 EXPORT_SYMBOL(mempool_create);
index 47df0df..b234c3f 100644 (file)
 
 #include "internal.h"
 
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-void migrate_prep(void)
-{
-       /*
-        * Clear the LRU lists so pages can be isolated.
-        * Note that pages may be moved off the LRU after we have
-        * drained them. Those pages will fail to migrate like other
-        * pages that may be busy.
-        */
-       lru_add_drain_all();
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-void migrate_prep_local(void)
-{
-       lru_add_drain();
-}
-
 int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
        struct address_space *mapping;
@@ -140,15 +118,10 @@ out:
        return -EBUSY;
 }
 
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_page(struct page *page)
 {
        struct address_space *mapping;
 
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       VM_BUG_ON_PAGE(!PageMovable(page), page);
-       VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
        mapping = page_mapping(page);
        mapping->a_ops->putback_page(page);
        __ClearPageIsolated(page);
@@ -1375,7 +1348,7 @@ out_unlock:
 out:
        if (rc == MIGRATEPAGE_SUCCESS)
                putback_active_hugepage(hpage);
-       else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS)
+       else if (rc != -EAGAIN)
                list_move_tail(&hpage->lru, ret);
 
        /*
@@ -1445,6 +1418,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int rc, nr_subpages;
        LIST_HEAD(ret_pages);
 
+       trace_mm_migrate_pages_start(mode, reason);
+
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
 
@@ -1769,7 +1744,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
        int start, i;
        int err = 0, err1;
 
-       migrate_prep();
+       lru_cache_disable();
 
        for (i = start = 0; i < nr_pages; i++) {
                const void __user *p;
@@ -1838,6 +1813,7 @@ out_flush:
        if (err >= 0)
                err = err1;
 out:
+       lru_cache_enable();
        return err;
 }
 
@@ -2110,17 +2086,6 @@ bool pmd_trans_migrating(pmd_t pmd)
        return PageLocked(page);
 }
 
-static inline bool is_shared_exec_page(struct vm_area_struct *vma,
-                                      struct page *page)
-{
-       if (page_mapcount(page) != 1 &&
-           (page_is_file_lru(page) || vma_is_shmem(vma)) &&
-           (vma->vm_flags & VM_EXEC))
-               return true;
-
-       return false;
-}
-
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -2138,7 +2103,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
         * Don't migrate file pages that are mapped in multiple processes
         * with execute permissions as they are probably shared libraries.
         */
-       if (is_shared_exec_page(vma, page))
+       if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+           (vma->vm_flags & VM_EXEC))
                goto out;
 
        /*
@@ -2193,9 +2159,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        int page_lru = page_is_file_lru(page);
        unsigned long start = address & HPAGE_PMD_MASK;
 
-       if (is_shared_exec_page(vma, page))
-               goto out;
-
        new_page = alloc_pages_node(node,
                (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
                HPAGE_PMD_ORDER);
@@ -2307,7 +2270,6 @@ out_fail:
 
 out_unlock:
        unlock_page(page);
-out:
        put_page(page);
        return 0;
 }
@@ -2316,44 +2278,38 @@ out:
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_hole(unsigned long start,
+static int migrate_vma_collect_skip(unsigned long start,
                                    unsigned long end,
-                                   __always_unused int depth,
                                    struct mm_walk *walk)
 {
        struct migrate_vma *migrate = walk->private;
        unsigned long addr;
 
-       /* Only allow populating anonymous memory. */
-       if (!vma_is_anonymous(walk->vma)) {
-               for (addr = start; addr < end; addr += PAGE_SIZE) {
-                       migrate->src[migrate->npages] = 0;
-                       migrate->dst[migrate->npages] = 0;
-                       migrate->npages++;
-               }
-               return 0;
-       }
-
        for (addr = start; addr < end; addr += PAGE_SIZE) {
-               migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
                migrate->dst[migrate->npages] = 0;
-               migrate->npages++;
-               migrate->cpages++;
+               migrate->src[migrate->npages++] = 0;
        }
 
        return 0;
 }
 
-static int migrate_vma_collect_skip(unsigned long start,
+static int migrate_vma_collect_hole(unsigned long start,
                                    unsigned long end,
+                                   __always_unused int depth,
                                    struct mm_walk *walk)
 {
        struct migrate_vma *migrate = walk->private;
        unsigned long addr;
 
+       /* Only allow populating anonymous memory. */
+       if (!vma_is_anonymous(walk->vma))
+               return migrate_vma_collect_skip(start, end, walk);
+
        for (addr = start; addr < end; addr += PAGE_SIZE) {
+               migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
                migrate->dst[migrate->npages] = 0;
-               migrate->src[migrate->npages++] = 0;
+               migrate->npages++;
+               migrate->cpages++;
        }
 
        return 0;
@@ -2823,11 +2779,11 @@ restore:
  *
  * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
  * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing the caller to allocate device memory for those unback virtual
- * address.  For this the caller simply has to allocate device memory and
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses.  For this the caller simply has to allocate device memory and
  * properly set the destination entry like for regular migration.  Note that
- * this can still fails and thus inside the device driver must check if the
- * migration was successful for those entries after calling migrate_vma_pages()
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
  * just like for regular migration.
  *
  * After that, the callers must call migrate_vma_pages() to go over each entry
@@ -2973,6 +2929,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
                        swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
                        entry = swp_entry_to_pte(swp_entry);
+               } else {
+                       /*
+                        * For now we only support migrating to un-addressable
+                        * device memory.
+                        */
+                       pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+                       goto abort;
                }
        } else {
                entry = mk_pte(page, vma->vm_page_prot);
index f8f8cc3..df590fd 100644 (file)
@@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
 {
        unsigned long nstart, end, tmp;
-       struct vm_area_struct * vma, * prev;
+       struct vm_area_struct *vma, *prev;
        int error;
 
        VM_BUG_ON(offset_in_page(start));
@@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
  */
 static int apply_mlockall_flags(int flags)
 {
-       struct vm_area_struct * vma, * prev = NULL;
+       struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;
 
        current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
index 347ef9b..0584e54 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
        unsigned long nr_pages = 0;
        struct vm_area_struct *vma;
 
-       /* Find first overlaping mapping */
+       /* Find first overlapping mapping */
        vma = find_vma_intersection(mm, addr, end);
        if (!vma)
                return 0;
@@ -2875,7 +2875,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
        if (unlikely(uf)) {
                /*
                 * If userfaultfd_unmap_prep returns an error the vmas
-                * will remain splitted, but userland will get a
+                * will remain split, but userland will get a
                 * highly unexpected error anyway. This is no
                 * different than the case where the first of the two
                 * __split_vma fails, but we don't undo the first
@@ -3029,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 
        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
-       if (vma->vm_flags & VM_LOCKED) {
-               struct vm_area_struct *tmp;
+       if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
 
-               /* drop PG_Mlocked flag for over-mapped range */
-               for (tmp = vma; tmp->vm_start >= start + size;
-                               tmp = tmp->vm_next) {
-                       /*
-                        * Split pmd and munlock page on the border
-                        * of the range.
-                        */
-                       vma_adjust_trans_huge(tmp, start, start + size, 0);
-
-                       munlock_vma_pages_range(tmp,
-                                       max(tmp->vm_start, start),
-                                       min(tmp->vm_end, start + size));
-               }
-       }
-
        file = get_file(vma->vm_file);
        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, pgoff, &populate, NULL);
index 94188df..e7a4431 100644 (file)
@@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
        mmap_write_unlock(current->mm);
 
        /*
-        * We could provie warnings or errors if any VMA still
+        * We could provide warnings or errors if any VMA still
         * has the pkey set here.
         */
        return ret;
index d22629f..47c255b 100644 (file)
@@ -730,7 +730,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
         * So, to avoid such scenario we can pre-compute if the whole
         * operation has high chances to success map-wise.
         * Worst-scenario case is when both vma's (new_addr and old_addr) get
-        * split in 3 before unmaping it.
+        * split in 3 before unmapping it.
         * That means 2 more maps (1 for each) to the ones we already hold.
         * Check whether current map count plus 2 still leads us to 4 maps below
         * the threshold, otherwise return -ENOMEM here to be more safe.
index 5c9ab79..85a3a68 100644 (file)
@@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count)
        return count;
 }
 
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-       /* Don't allow overflow */
-       if ((unsigned long) addr + count < count)
-               count = -(unsigned long) addr;
-
-       memcpy(addr, buf, count);
-       return count;
-}
-
 /*
  *     vmalloc  -  allocate virtually contiguous memory
  *
index fa1cf18..eefd3f5 100644 (file)
@@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
 
 #ifdef CONFIG_NUMA
 /**
- * oom_cpuset_eligible() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
  * @start: task struct of which task to consider
  * @oc: pointer to struct oom_control
  *
@@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
        if (oom_group) {
                mem_cgroup_print_oom_group(oom_group);
                mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
-                                     (void*)message);
+                                     (void *)message);
                mem_cgroup_put(oom_group);
        }
 }
index 5e761fb..0062d5c 100644 (file)
@@ -1806,7 +1806,7 @@ pause:
                        break;
 
                /*
-                * In the case of an unresponding NFS server and the NFS dirty
+                * In the case of an unresponsive NFS server and the NFS dirty
                 * pages exceeds dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
@@ -2216,7 +2216,7 @@ int write_cache_pages(struct address_space *mapping,
                         * Page truncated or invalidated. We can freely skip it
                         * then, even for data integrity operations: the page
                         * has disappeared concurrently, so there could be no
-                        * real expectation of this data interity operation
+                        * real expectation of this data integrity operation
                         * even if there is now a new, dirty page at the same
                         * pagecache address.
                         */
index 6b208b1..aaa1655 100644 (file)
@@ -893,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
                return false;
 
        /*
-        * Do not let lower order allocations polluate a movable pageblock.
+        * Do not let lower order allocations pollute a movable pageblock.
         * This might let an unmovable request use a reclaimable pageblock
         * and vice-versa but no more than normal fallback logic which can
         * have trouble finding a high-order free page.
@@ -2776,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
                        /*
                         * In page freeing path, migratetype change is racy so
                         * we can counter several free pages in a pageblock
-                        * in this loop althoug we changed the pageblock type
+                        * in this loop although we changed the pageblock type
                         * from highatomic to ac->migratetype. So we should
                         * adjust the count once.
                         */
@@ -3080,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work)
         * drain_all_pages doesn't use proper cpu hotplug protection so
         * we can race with cpu offline when the WQ can move this from
         * a cpu pinned worker to an unbound one. We can operate on a different
-        * cpu which is allright but we also have to make sure to not move to
+        * cpu which is alright but we also have to make sure to not move to
         * a different one.
         */
        preempt_disable();
@@ -3859,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
        return alloc_flags;
 }
 
-static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
-                                       unsigned int alloc_flags)
+/* Must be called after current_gfp_context() which can change gfp_mask */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+                                                 unsigned int alloc_flags)
 {
 #ifdef CONFIG_CMA
-       unsigned int pflags = current->flags;
-
-       if (!(pflags & PF_MEMALLOC_NOCMA) &&
-                       gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+       if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
-
 #endif
        return alloc_flags;
 }
@@ -3968,7 +3965,7 @@ retry:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (node_reclaim_mode == 0 ||
+                       if (!node_reclaim_enabled() ||
                            !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                                continue;
 
@@ -4176,7 +4173,7 @@ out:
 }
 
 /*
- * Maximum number of compaction retries wit a progress before OOM
+ * Maximum number of compaction retries with a progress before OOM
  * killer is consider as the only way to move forward.
  */
 #define MAX_COMPACT_RETRIES 16
@@ -4204,6 +4201,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        memalloc_noreclaim_restore(noreclaim_flag);
        psi_memstall_leave(&pflags);
 
+       if (*compact_result == COMPACT_SKIPPED)
+               return NULL;
        /*
         * At least in one zone compaction wasn't deferred or skipped, so let's
         * count a compaction stall
@@ -4524,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
-       alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+       alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
 
        return alloc_flags;
 }
@@ -4826,7 +4825,7 @@ retry:
 
        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
        if (reserve_flags)
-               alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+               alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
 
        /*
         * Reset the nodemask and zonelist iterators if memory policies can be
@@ -4995,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        if (should_fail_alloc_page(gfp_mask, order))
                return false;
 
-       *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+       *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
 
        /* Dirty zone balancing only done in the fast path */
        ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -5178,6 +5177,14 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
        }
 
        gfp &= gfp_allowed_mask;
+       /*
+        * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+        * resp. GFP_NOIO which has to be inherited for all allocation requests
+        * from a particular context which has been marked by
+        * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
+        * movable zones are not used during allocation.
+        */
+       gfp = current_gfp_context(gfp);
        alloc_gfp = gfp;
        if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
                        &alloc_gfp, &alloc_flags))
@@ -5194,13 +5201,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
        if (likely(page))
                goto out;
 
-       /*
-        * Apply scoped allocation constraints. This is mainly about GFP_NOFS
-        * resp. GFP_NOIO which has to be inherited for all allocation requests
-        * from a particular context which has been marked by
-        * memalloc_no{fs,io}_{save,restore}.
-        */
-       alloc_gfp = current_gfp_context(gfp);
+       alloc_gfp = gfp;
        ac.spread_dirty_pages = false;
 
        /*
@@ -5928,7 +5929,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
 static int __parse_numa_zonelist_order(char *s)
 {
        /*
-        * We used to support different zonlists modes but they turned
+        * We used to support different zonelists modes but they turned
         * out to be just not useful. Let's keep the warning in place
         * if somebody still use the cmd line parameter so that we do
         * not fail it silently
@@ -7669,7 +7670,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
 }
 
 /*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
  * such cases we allow max_zone_pfn sorted in the descending order
  */
 bool __weak arch_has_descending_max_zone_pfns(void)
@@ -8679,7 +8680,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
        };
 
-       migrate_prep();
+       lru_cache_disable();
 
        while (pfn < end || !list_empty(&cc->migratepages)) {
                if (fatal_signal_pending(current)) {
@@ -8689,14 +8690,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
                if (list_empty(&cc->migratepages)) {
                        cc->nr_migratepages = 0;
-                       pfn = isolate_migratepages_range(cc, pfn, end);
-                       if (!pfn) {
-                               ret = -EINTR;
+                       ret = isolate_migratepages_range(cc, pfn, end);
+                       if (ret && ret != -EAGAIN)
                                break;
-                       }
+                       pfn = cc->migrate_pfn;
                        tries = 0;
                } else if (++tries == 5) {
-                       ret = ret < 0 ? ret : -EBUSY;
+                       ret = -EBUSY;
                        break;
                }
 
@@ -8706,7 +8706,16 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
                ret = migrate_pages(&cc->migratepages, alloc_migration_target,
                                NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+
+               /*
+                * On -ENOMEM, migrate_pages() bails out right away. It is pointless
+                * to retry again over this error, so do the same here.
+                */
+               if (ret == -ENOMEM)
+                       break;
        }
+
+       lru_cache_enable();
        if (ret < 0) {
                alloc_contig_dump_pages(&cc->migratepages);
                putback_movable_pages(&cc->migratepages);
@@ -8719,7 +8728,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:     start PFN to allocate
  * @end:       one-past-the-last PFN to allocate
- * @migratetype:       migratetype of the underlaying pageblocks (either
+ * @migratetype:       migratetype of the underlying pageblocks (either
  *                     #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
  *                     in range must have the same migratetype and it must
  *                     be either of the two.
@@ -8799,7 +8808,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
-       ret =0;
+       ret = 0;
 
        /*
         * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
@@ -8892,12 +8901,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
 
                if (PageReserved(page))
                        return false;
-
-               if (page_count(page) > 0)
-                       return false;
-
-               if (PageHuge(page))
-                       return false;
        }
        return true;
 }
@@ -8969,9 +8972,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
 }
 #endif /* CONFIG_CONTIG_ALLOC */
 
-void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
 {
-       unsigned int count = 0;
+       unsigned long count = 0;
 
        for (; nr_pages--; pfn++) {
                struct page *page = pfn_to_page(pfn);
@@ -8979,13 +8982,13 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
                count += page_count(page) != 1;
                __free_page(page);
        }
-       WARN(count != 0, "%d pages are still in use!\n", count);
+       WARN(count != 0, "%lu pages are still in use!\n", count);
 }
 EXPORT_SYMBOL(free_contig_range);
 
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
+ * page high values need to be recalculated.
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
@@ -9017,12 +9020,9 @@ void zone_pcp_enable(struct zone *zone)
 
 void zone_pcp_reset(struct zone *zone)
 {
-       unsigned long flags;
        int cpu;
        struct per_cpu_pageset *pset;
 
-       /* avoid races with drain_pages()  */
-       local_irq_save(flags);
        if (zone->pageset != &boot_pageset) {
                for_each_online_cpu(cpu) {
                        pset = per_cpu_ptr(zone->pageset, cpu);
@@ -9031,7 +9031,6 @@ void zone_pcp_reset(struct zone *zone)
                free_percpu(zone->pageset);
                zone->pageset = &boot_pageset;
        }
-       local_irq_restore(flags);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
index 9661d53..adfabb5 100644 (file)
@@ -233,7 +233,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
        /*
         * We don't clear the bit on the oldpage as it's going to be freed
         * after migration. Until then, the info can be useful in case of
-        * a bug, and the overal stats will be off a bit only temporarily.
+        * a bug, and the overall stats will be off a bit only temporarily.
         * Also, migrate_misplaced_transhuge_page() can still fail the
         * migration and then we want the oldpage to retain the info. But
         * in that case we also don't need to explicitly clear the info from
index 86e3a36..2cf01d9 100644 (file)
@@ -134,7 +134,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
  * regardless of which page table level the page is mapped at. @pvmw->pmd is
  * NULL.
  *
- * Retruns false if there are no more page table entries for the page in
+ * Returns false if there are no more page table entries for the page in
  * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
  *
  * If you need to stop the walk before page_vma_mapped_walk() returned false,
index 095d7ea..ae26b11 100644 (file)
@@ -170,7 +170,7 @@ struct percpu_stats {
        u64 nr_max_alloc;       /* max # of live allocations */
        u32 nr_chunks;          /* current # of live chunks */
        u32 nr_max_chunks;      /* max # of live chunks */
-       size_t min_alloc_size;  /* min allocaiton size */
+       size_t min_alloc_size;  /* min allocation size */
        size_t max_alloc_size;  /* max allocation size */
 };
 
index 2330811..f99e930 100644 (file)
@@ -1862,7 +1862,7 @@ fail:
                        pr_info("limit reached, disable warning\n");
        }
        if (is_atomic) {
-               /* see the flag handling in pcpu_blance_workfn() */
+               /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
index 1dcc865..e9e879d 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PGALLLC_TRACK_H
-#define _LINUX_PGALLLC_TRACK_H
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
 
 #if defined(CONFIG_MMU)
 static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
@@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
          (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
                NULL: pte_offset_kernel(pmd, address))
 
-#endif /* _LINUX_PGALLLC_TRACK_H */
+#endif /* _LINUX_PGALLOC_TRACK_H */
index f5fee9c..4bcc119 100644 (file)
@@ -9,7 +9,6 @@
 #include <linux/mm.h>
 #include <linux/uio.h>
 #include <linux/sched.h>
-#include <linux/compat.h>
 #include <linux/sched/mm.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
index b0fc27e..693a610 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
  *
- * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
  * anon_vma_fork(). The first three want an exact copy of src, while the last
  * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
  * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
index 162d8f8..a08cede 100644 (file)
@@ -3508,7 +3508,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
                        }
                }
                if (*this_char) {
-                       char *value = strchr(this_char,'=');
+                       char *value = strchr(this_char, '=');
                        size_t len = 0;
                        int err;
 
index df45c43..d0f7256 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
 
 #define BATCHREFILL_LIMIT      16
 /*
- * Optimization question: fewer reaps means less probability for unnessary
+ * Optimization question: fewer reaps means less probability for unnecessary
  * cpucache drain/refill cycles.
  *
  * OTOH the cpuarrays can contain lots of objects,
@@ -2284,7 +2284,7 @@ void __kmem_cache_release(struct kmem_cache *cachep)
  * Because if it is the case, that means we defer the creation of
  * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
  * And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
  * This is a "chicken-and-egg" problem.
  *
  * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
@@ -2381,8 +2381,8 @@ union freelist_init_state {
 };
 
 /*
- * Initialize the state based on the randomization methode available.
- * return true if the pre-computed list is available, false otherwize.
+ * Initialize the state based on the randomization method available.
+ * return true if the pre-computed list is available, false otherwise.
  */
 static bool freelist_state_initialize(union freelist_init_state *state,
                                struct kmem_cache *cachep,
index 68123b2..feda53a 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3391,7 +3391,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
  */
 
 /*
- * Mininum / Maximum order of slab pages. This influences locking overhead
+ * Minimum / Maximum order of slab pages. This influences locking overhead
  * and slab fragmentation. A higher order reduces the number of partial slabs
  * and increases the number of allocations possible without having to
  * take the list_lock.
index 33406ea..b2ada9d 100644 (file)
@@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
        if (unlikely(!mem_section)) {
                unsigned long size, align;
 
-               size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+               size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
                align = 1 << (INTERNODE_CACHE_SHIFT);
                mem_section = memblock_alloc(size, align);
                if (!mem_section)
@@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
        }
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 /* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
                ms->section_mem_map &= ~SECTION_IS_ONLINE;
        }
 }
-#endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static struct page * __meminit populate_section_memmap(unsigned long pfn,
index 31b844d..dfb48cf 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,6 +36,7 @@
 #include <linux/hugetlb.h>
 #include <linux/page_idle.h>
 #include <linux/local_lock.h>
+#include <linux/buffer_head.h>
 
 #include "internal.h"
 
@@ -235,6 +236,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
        }
 }
 
+/* Add @page to @pvec; return true if the caller needs to drain the pagevec */
+static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
+{
+       bool ret = false;
+
+       if (!pagevec_add(pvec, page) || PageCompound(page) ||
+                       lru_cache_disabled())
+               ret = true;
+
+       return ret;
+}
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
@@ -252,7 +265,7 @@ void rotate_reclaimable_page(struct page *page)
                get_page(page);
                local_lock_irqsave(&lru_rotate.lock, flags);
                pvec = this_cpu_ptr(&lru_rotate.pvec);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }
@@ -343,7 +356,7 @@ static void activate_page(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.activate_page);
                get_page(page);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, __activate_page);
                local_unlock(&lru_pvecs.lock);
        }
@@ -458,7 +471,7 @@ void lru_cache_add(struct page *page)
        get_page(page);
        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-       if (!pagevec_add(pvec, page) || PageCompound(page))
+       if (pagevec_add_and_need_flush(pvec, page))
                __pagevec_lru_add(pvec);
        local_unlock(&lru_pvecs.lock);
 }
@@ -483,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
        if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);
                /*
-                * We use the irq-unsafe __mod_zone_page_stat because this
+                * We use the irq-unsafe __mod_zone_page_state because this
                 * counter is not modified from interrupt context, and the pte
                 * lock is held(spinlock), which implies preemption disabled.
                 */
@@ -629,6 +642,7 @@ void lru_add_drain_cpu(int cpu)
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
 
        activate_page_drain(cpu);
+       invalidate_bh_lrus_cpu(cpu);
 }
 
 /**
@@ -654,7 +668,7 @@ void deactivate_file_page(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
 
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
                local_unlock(&lru_pvecs.lock);
        }
@@ -676,7 +690,7 @@ void deactivate_page(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
                get_page(page);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_deactivate_fn);
                local_unlock(&lru_pvecs.lock);
        }
@@ -698,7 +712,7 @@ void mark_page_lazyfree(struct page *page)
                local_lock(&lru_pvecs.lock);
                pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
                get_page(page);
-               if (!pagevec_add(pvec, page) || PageCompound(page))
+               if (pagevec_add_and_need_flush(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
                local_unlock(&lru_pvecs.lock);
        }
@@ -735,7 +749,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  * Calling this function with cpu hotplug locks held can actually lead
  * to obscure indirect dependencies via WQ context.
  */
-void lru_add_drain_all(void)
+inline void __lru_add_drain_all(bool force_all_cpus)
 {
        /*
         * lru_drain_gen - Global pages generation number
@@ -780,7 +794,7 @@ void lru_add_drain_all(void)
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         */
-       if (unlikely(this_gen != lru_drain_gen))
+       if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
                goto done;
 
        /*
@@ -794,7 +808,7 @@ void lru_add_drain_all(void)
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
-        * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+        * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
@@ -810,12 +824,14 @@ void lru_add_drain_all(void)
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 
-               if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+               if (force_all_cpus ||
+                   pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
                    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
-                   need_activate_page_drain(cpu)) {
+                   need_activate_page_drain(cpu) ||
+                   has_bh_in_lru(cpu, NULL)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
@@ -828,6 +844,11 @@ void lru_add_drain_all(void)
 done:
        mutex_unlock(&lock);
 }
+
+void lru_add_drain_all(void)
+{
+       __lru_add_drain_all(false);
+}
 #else
 void lru_add_drain_all(void)
 {
@@ -835,6 +856,34 @@ void lru_add_drain_all(void)
 }
 #endif /* CONFIG_SMP */
 
+atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+/*
+ * lru_cache_disable() needs to be called before we start compiling
+ * a list of pages to be migrated using isolate_lru_page().
+ * It drains pages on LRU cache and then disables it on all cpus until
+ * lru_cache_enable() is called.
+ *
+ * Must be paired with a call to lru_cache_enable().
+ */
+void lru_cache_disable(void)
+{
+       atomic_inc(&lru_disable_count);
+#ifdef CONFIG_SMP
+       /*
+        * lru_add_drain_all in the force mode will schedule draining on
+        * all online CPUs so any calls of lru_cache_disabled wrapped by
+        * local_lock or preemption disabled would be ordered by that.
+        * The atomic operation doesn't need to have stronger ordering
+        * requirements because that is enforced by the scheduling
+        * guarantees.
+        */
+       __lru_add_drain_all(true);
+#else
+       lru_add_drain();
+#endif
+}
+
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
index be9de6d..6248d10 100644 (file)
@@ -16,7 +16,7 @@
  * to local caches without needing to acquire swap_info
  * lock.  We do not reuse the returned slots directly but
  * move them back to the global pool in a batch.  This
- * allows the slots to coaellesce and reduce fragmentation.
+ * allows the slots to coalesce and reduce fragmentation.
  *
  * The swap entry allocated is marked with SWAP_HAS_CACHE
  * flag in map_count that prevents it from being allocated
index fb7efa0..272ea21 100644 (file)
@@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
                        xas_store(&xas, page);
                        xas_next(&xas);
                }
-               address_space->nrexceptional -= nr_shadows;
                address_space->nrpages += nr;
                __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
                __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
@@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page,
                xas_next(&xas);
        }
        ClearPageSwapCache(page);
-       if (shadow)
-               address_space->nrexceptional += nr;
        address_space->nrpages -= nr;
        __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
        __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
@@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
                        xas_store(&xas, NULL);
                        nr_shadows++;
                }
-               address_space->nrexceptional -= nr_shadows;
                xa_unlock_irq(&address_space->i_pages);
 
                /* search the next swapcache until we meet end */
@@ -796,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf,
  *
  * Returns the struct page for entry and addr, after queueing swapin.
  *
- * Primitive swap readahead code. We simply read in a few pages whoes
+ * Primitive swap readahead code. We simply read in a few pages whose
  * virtual addresses are around the fault address in the same vma.
  *
  * Caller must hold read mmap_lock if vmf->vma is not NULL.
index 084a5b9..149e774 100644 (file)
@@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v)
        unsigned int bytes, inuse;
 
        if (si == SEQ_START_TOKEN) {
-               seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+               seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }
 
@@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                                         sizeof(long),
                                         GFP_KERNEL);
 
-       if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+       if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
                /*
                 * When discard is enabled for swap with no particular
                 * policy flagged, we set all swap discard flags here in
index 4559442..95af244 100644 (file)
@@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
        if (xas_load(&xas) != entry)
                return;
        xas_store(&xas, NULL);
-       mapping->nrexceptional--;
 }
 
 static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
@@ -295,7 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
        pgoff_t         index;
        int             i;
 
-       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+       if (mapping_empty(mapping))
                goto out;
 
        /* Offsets within partial pages */
@@ -440,9 +439,6 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-       unsigned long nrexceptional;
-       unsigned long nrpages;
-
        /*
         * Page reclaim can not participate in regular inode lifetime
         * management (can't call iput()) and thus can race with the
@@ -452,16 +448,7 @@ void truncate_inode_pages_final(struct address_space *mapping)
         */
        mapping_set_exiting(mapping);
 
-       /*
-        * When reclaim installs eviction entries, it increases
-        * nrexceptional first, then decreases nrpages.  Make sure we see
-        * this in the right order or we might miss an entry.
-        */
-       nrpages = mapping->nrpages;
-       smp_rmb();
-       nrexceptional = mapping->nrexceptional;
-
-       if (nrpages || nrexceptional) {
+       if (!mapping_empty(mapping)) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
@@ -633,7 +620,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
        int ret2 = 0;
        int did_range_unmap = 0;
 
-       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+       if (mapping_empty(mapping))
                goto out;
 
        pagevec_init(&pvec);
index 9a3d451..e14b382 100644 (file)
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
-                                             bool zeropage)
+                                             enum mcopy_atomic_mode mode)
 {
        int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
         * by THP.  Since we can not reliably insert a zero page, this
         * feature is not supported.
         */
-       if (zeropage) {
+       if (mode == MCOPY_ATOMIC_ZEROPAGE) {
                mmap_read_unlock(dst_mm);
                return -EINVAL;
        }
@@ -273,8 +273,6 @@ retry:
        }
 
        while (src_addr < src_start + len) {
-               pte_t dst_pteval;
-
                BUG_ON(dst_addr >= dst_start + len);
 
                /*
@@ -290,23 +288,23 @@ retry:
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                err = -ENOMEM;
-               dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+               dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                if (!dst_pte) {
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        i_mmap_unlock_read(mapping);
                        goto out_unlock;
                }
 
-               err = -EEXIST;
-               dst_pteval = huge_ptep_get(dst_pte);
-               if (!huge_pte_none(dst_pteval)) {
+               if (mode != MCOPY_ATOMIC_CONTINUE &&
+                   !huge_pte_none(huge_ptep_get(dst_pte))) {
+                       err = -EEXIST;
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        i_mmap_unlock_read(mapping);
                        goto out_unlock;
                }
 
                err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-                                               dst_addr, src_addr, &page);
+                                              dst_addr, src_addr, mode, &page);
 
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                i_mmap_unlock_read(mapping);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                      unsigned long dst_start,
                                      unsigned long src_start,
                                      unsigned long len,
-                                     bool zeropage);
+                                     enum mcopy_atomic_mode mode);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
-                                             bool zeropage,
+                                             enum mcopy_atomic_mode mcopy_mode,
                                              bool *mmap_changing,
                                              __u64 mode)
 {
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
        long copied;
        struct page *page;
        bool wp_copy;
+       bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
 
        /*
         * Sanitize the command parameters:
@@ -527,10 +526,12 @@ retry:
         */
        if (is_vm_hugetlb_page(dst_vma))
                return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-                                               src_start, len, zeropage);
+                                               src_start, len, mcopy_mode);
 
        if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                goto out_unlock;
+       if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+               goto out_unlock;
 
        /*
         * Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len,
                     bool *mmap_changing, __u64 mode)
 {
-       return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-                             mmap_changing, mode);
+       return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+                             MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len, bool *mmap_changing)
 {
-       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+       return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+                             mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+                      unsigned long len, bool *mmap_changing)
+{
+       return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+                             mmap_changing, 0);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
index 083c5c4..a8bf17f 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -765,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
         * The deviation of sync_overcommit_as could be big with loose policy
         * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
         * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
-        * with the strict "NEVER", and to avoid possible race condtion (even
+        * with the strict "NEVER", and to avoid possible race condition (even
         * though user usually won't too frequently do the switching to policy
         * OVERCOMMIT_NEVER), the switch is done in the following order:
         *      1. changing the batch
@@ -987,22 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
  */
 void mem_dump_obj(void *object)
 {
+       const char *type;
+
        if (kmem_valid_obj(object)) {
                kmem_dump_obj(object);
                return;
        }
+
        if (vmalloc_dump_obj(object))
                return;
-       if (!virt_addr_valid(object)) {
-               if (object == NULL)
-                       pr_cont(" NULL pointer.\n");
-               else if (object == ZERO_SIZE_PTR)
-                       pr_cont(" zero-size pointer.\n");
-               else
-                       pr_cont(" non-paged memory.\n");
-               return;
-       }
-       pr_cont(" non-slab/vmalloc memory.\n");
+
+       if (virt_addr_valid(object))
+               type = "non-slab/vmalloc memory";
+       else if (object == NULL)
+               type = "NULL pointer";
+       else if (object == ZERO_SIZE_PTR)
+               type = "zero-size pointer";
+       else
+               type = "non-paged memory";
+
+       pr_cont(" %s\n", type);
 }
 EXPORT_SYMBOL_GPL(mem_dump_obj);
 #endif
index d33894d..a13ac52 100644 (file)
@@ -1583,7 +1583,7 @@ static unsigned long lazy_max_pages(void)
 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
 
 /*
- * Serialize vmap purging.  There is no actual criticial section protected
+ * Serialize vmap purging.  There is no actual critical section protected
  * by this look, but we want to avoid concurrent calls for performance
  * reasons and to make the pcpu_get_vm_areas more deterministic.
  */
@@ -2628,7 +2628,7 @@ static void __vfree(const void *addr)
  * May sleep if called *not* from interrupt context.
  * Must not be called in NMI context (strictly speaking, it could be
  * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea).
+ * conventions for vfree() arch-dependent would be a really bad idea).
  */
 void vfree(const void *addr)
 {
@@ -3083,7 +3083,7 @@ EXPORT_SYMBOL(vzalloc_node);
  * 64b systems should always have either DMA or DMA32 zones. For others
  * GFP_DMA32 should do the right thing and use the normal zone.
  */
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
 #endif
 
 /**
@@ -3141,15 +3141,12 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
                /*
                 * To do safe access to this _mapped_ area, we need
                 * lock. But adding lock here means that we need to add
-                * overhead of vmalloc()/vfree() calles for this _debug_
+                * overhead of vmalloc()/vfree() calls for this _debug_
                 * interface, rarely used. Instead of that, we'll use
                 * kmap() and get small overhead in this access function.
                 */
                if (p) {
-                       /*
-                        * we can expect USER0 is not used (see vread/vwrite's
-                        * function description)
-                        */
+                       /* We can expect USER0 is not used -- see vread() */
                        void *map = kmap_atomic(p);
                        memcpy(buf, map + offset, length);
                        kunmap_atomic(map);
@@ -3164,43 +3161,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
        return copied;
 }
 
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
-{
-       struct page *p;
-       int copied = 0;
-
-       while (count) {
-               unsigned long offset, length;
-
-               offset = offset_in_page(addr);
-               length = PAGE_SIZE - offset;
-               if (length > count)
-                       length = count;
-               p = vmalloc_to_page(addr);
-               /*
-                * To do safe access to this _mapped_ area, we need
-                * lock. But adding lock here means that we need to add
-                * overhead of vmalloc()/vfree() calles for this _debug_
-                * interface, rarely used. Instead of that, we'll use
-                * kmap() and get small overhead in this access function.
-                */
-               if (p) {
-                       /*
-                        * we can expect USER0 is not used (see vread/vwrite's
-                        * function description)
-                        */
-                       void *map = kmap_atomic(p);
-                       memcpy(map + offset, buf, length);
-                       kunmap_atomic(map);
-               }
-               addr += length;
-               buf += length;
-               copied += length;
-               count -= length;
-       }
-       return copied;
-}
-
 /**
  * vread() - read vmalloc area in a safe way.
  * @buf:     buffer for reading data
@@ -3219,7 +3179,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
  * Note: In usual ops, vread() is never necessary because the caller
  * should know vmalloc() area is valid and can use memcpy().
  * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
+ * any information, as /proc/kcore.
  *
  * Return: number of bytes for which addr and buf should be increased
  * (same number as @count) or %0 if [addr...addr+count) doesn't
@@ -3283,80 +3243,6 @@ finished:
        return buflen;
 }
 
-/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf:      buffer for source data
- * @addr:     vm address.
- * @count:    number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
- *
- * Return: number of bytes for which addr and buf should be
- * increased (same number as @count) or %0 if [addr...addr+count)
- * doesn't include any intersection with valid vmalloc area
- */
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-       struct vmap_area *va;
-       struct vm_struct *vm;
-       char *vaddr;
-       unsigned long n, buflen;
-       int copied = 0;
-
-       /* Don't allow overflow */
-       if ((unsigned long) addr + count < count)
-               count = -(unsigned long) addr;
-       buflen = count;
-
-       spin_lock(&vmap_area_lock);
-       list_for_each_entry(va, &vmap_area_list, list) {
-               if (!count)
-                       break;
-
-               if (!va->vm)
-                       continue;
-
-               vm = va->vm;
-               vaddr = (char *) vm->addr;
-               if (addr >= vaddr + get_vm_area_size(vm))
-                       continue;
-               while (addr < vaddr) {
-                       if (count == 0)
-                               goto finished;
-                       buf++;
-                       addr++;
-                       count--;
-               }
-               n = vaddr + get_vm_area_size(vm) - addr;
-               if (n > count)
-                       n = count;
-               if (!(vm->flags & VM_IOREMAP)) {
-                       aligned_vwrite(buf, addr, n);
-                       copied++;
-               }
-               buf += n;
-               addr += n;
-               count -= n;
-       }
-finished:
-       spin_unlock(&vmap_area_lock);
-       if (!copied)
-               return 0;
-       return buflen;
-}
-
 /**
  * remap_vmalloc_range_partial - map vmalloc pages to userspace
  * @vma:               vma to cover
index 562e87c..5199b96 100644 (file)
@@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+static int shrinker_nr_max;
+
+/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
+static inline int shrinker_map_size(int nr_items)
+{
+       return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+}
+
+static inline int shrinker_defer_size(int nr_items)
+{
+       return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
+
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+                                                    int nid)
+{
+       return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+                                        lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+                                   int map_size, int defer_size,
+                                   int old_map_size, int old_defer_size)
+{
+       struct shrinker_info *new, *old;
+       struct mem_cgroup_per_node *pn;
+       int nid;
+       int size = map_size + defer_size;
+
+       for_each_node(nid) {
+               pn = memcg->nodeinfo[nid];
+               old = shrinker_info_protected(memcg, nid);
+               /* Not yet online memcg */
+               if (!old)
+                       return 0;
+
+               new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+               if (!new)
+                       return -ENOMEM;
+
+               new->nr_deferred = (atomic_long_t *)(new + 1);
+               new->map = (void *)new->nr_deferred + defer_size;
+
+               /* map: set all old bits, clear all new bits */
+               memset(new->map, (int)0xff, old_map_size);
+               memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+               /* nr_deferred: copy old values, clear all new values */
+               memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+               memset((void *)new->nr_deferred + old_defer_size, 0,
+                      defer_size - old_defer_size);
+
+               rcu_assign_pointer(pn->shrinker_info, new);
+               kvfree_rcu(old, rcu);
+       }
+
+       return 0;
+}
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_node *pn;
+       struct shrinker_info *info;
+       int nid;
+
+       for_each_node(nid) {
+               pn = memcg->nodeinfo[nid];
+               info = rcu_dereference_protected(pn->shrinker_info, true);
+               kvfree(info);
+               rcu_assign_pointer(pn->shrinker_info, NULL);
+       }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+       int nid, size, ret = 0;
+       int map_size, defer_size = 0;
+
+       down_write(&shrinker_rwsem);
+       map_size = shrinker_map_size(shrinker_nr_max);
+       defer_size = shrinker_defer_size(shrinker_nr_max);
+       size = map_size + defer_size;
+       for_each_node(nid) {
+               info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+               if (!info) {
+                       free_shrinker_info(memcg);
+                       ret = -ENOMEM;
+                       break;
+               }
+               info->nr_deferred = (atomic_long_t *)(info + 1);
+               info->map = (void *)info->nr_deferred + defer_size;
+               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+       }
+       up_write(&shrinker_rwsem);
+
+       return ret;
+}
+
+static inline bool need_expand(int nr_max)
+{
+       return round_up(nr_max, BITS_PER_LONG) >
+              round_up(shrinker_nr_max, BITS_PER_LONG);
+}
+
+static int expand_shrinker_info(int new_id)
+{
+       int ret = 0;
+       int new_nr_max = new_id + 1;
+       int map_size, defer_size = 0;
+       int old_map_size, old_defer_size = 0;
+       struct mem_cgroup *memcg;
+
+       if (!need_expand(new_nr_max))
+               goto out;
+
+       if (!root_mem_cgroup)
+               goto out;
+
+       lockdep_assert_held(&shrinker_rwsem);
+
+       map_size = shrinker_map_size(new_nr_max);
+       defer_size = shrinker_defer_size(new_nr_max);
+       old_map_size = shrinker_map_size(shrinker_nr_max);
+       old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+                                              old_map_size, old_defer_size);
+               if (ret) {
+                       mem_cgroup_iter_break(NULL, memcg);
+                       goto out;
+               }
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+       if (!ret)
+               shrinker_nr_max = new_nr_max;
+
+       return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+       if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+               struct shrinker_info *info;
+
+               rcu_read_lock();
+               info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+               /* Pairs with the memory barrier in shrink_slab_memcg() */
+               smp_mb__before_atomic();
+               set_bit(shrinker_id, info->map);
+               rcu_read_unlock();
+       }
+}
 
 static DEFINE_IDR(shrinker_idr);
-static int shrinker_nr_max;
 
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
        int id, ret = -ENOMEM;
 
+       if (mem_cgroup_disabled())
+               return -ENOSYS;
+
        down_write(&shrinker_rwsem);
        /* This may call shrinker, so it must use down_read_trylock() */
-       id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+       id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
        if (id < 0)
                goto unlock;
 
        if (id >= shrinker_nr_max) {
-               if (memcg_expand_shrinker_maps(id)) {
+               if (expand_shrinker_info(id)) {
                        idr_remove(&shrinker_idr, id);
                        goto unlock;
                }
-
-               shrinker_nr_max = id + 1;
        }
        shrinker->id = id;
        ret = 0;
@@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 
        BUG_ON(id < 0);
 
-       down_write(&shrinker_rwsem);
+       lockdep_assert_held(&shrinker_rwsem);
+
        idr_remove(&shrinker_idr, id);
-       up_write(&shrinker_rwsem);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+       int i, nid;
+       long nr;
+       struct mem_cgroup *parent;
+       struct shrinker_info *child_info, *parent_info;
+
+       parent = parent_mem_cgroup(memcg);
+       if (!parent)
+               parent = root_mem_cgroup;
+
+       /* Prevent concurrent shrinker_info expansion */
+       down_read(&shrinker_rwsem);
+       for_each_node(nid) {
+               child_info = shrinker_info_protected(memcg, nid);
+               parent_info = shrinker_info_protected(parent, nid);
+               for (i = 0; i < shrinker_nr_max; i++) {
+                       nr = atomic_long_read(&child_info->nr_deferred[i]);
+                       atomic_long_add(nr, &parent_info->nr_deferred[i]);
+               }
+       }
+       up_read(&shrinker_rwsem);
 }
 
 static bool cgroup_reclaim(struct scan_control *sc)
@@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 #else
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
-       return 0;
+       return -ENOSYS;
 }
 
 static void unregister_memcg_shrinker(struct shrinker *shrinker)
 {
 }
 
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 static bool cgroup_reclaim(struct scan_control *sc)
 {
        return false;
@@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static long xchg_nr_deferred(struct shrinker *shrinker,
+                            struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return xchg_nr_deferred_memcg(nid, shrinker,
+                                             sc->memcg);
+
+       return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+                           struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return add_nr_deferred_memcg(nr, nid, shrinker,
+                                            sc->memcg);
+
+       return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
  */
 int prealloc_shrinker(struct shrinker *shrinker)
 {
-       unsigned int size = sizeof(*shrinker->nr_deferred);
+       unsigned int size;
+       int err;
+
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               err = prealloc_memcg_shrinker(shrinker);
+               if (err != -ENOSYS)
+                       return err;
+
+               shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+       }
 
+       size = sizeof(*shrinker->nr_deferred);
        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;
 
@@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker)
        if (!shrinker->nr_deferred)
                return -ENOMEM;
 
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
-               if (prealloc_memcg_shrinker(shrinker))
-                       goto free_deferred;
-       }
-
        return 0;
-
-free_deferred:
-       kfree(shrinker->nr_deferred);
-       shrinker->nr_deferred = NULL;
-       return -ENOMEM;
 }
 
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
-               return;
-
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               down_write(&shrinker_rwsem);
                unregister_memcg_shrinker(shrinker);
+               up_write(&shrinker_rwsem);
+               return;
+       }
 
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
@@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+       shrinker->flags |= SHRINKER_REGISTERED;
        up_write(&shrinker_rwsem);
 }
 
@@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
+       if (!(shrinker->flags & SHRINKER_REGISTERED))
                return;
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               unregister_memcg_shrinker(shrinker);
+
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
+       shrinker->flags &= ~SHRINKER_REGISTERED;
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+               unregister_memcg_shrinker(shrinker);
        up_write(&shrinker_rwsem);
+
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
 }
@@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        long freeable;
        long nr;
        long new_nr;
-       int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;
 
-       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-               nid = 0;
-
        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;
@@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
-       nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+       nr = xchg_nr_deferred(shrinker, shrinkctl);
 
-       total_scan = nr;
        if (shrinker->seeks) {
                delta = freeable >> priority;
                delta *= 4;
@@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                delta = freeable / 2;
        }
 
+       total_scan = nr >> priority;
        total_scan += delta;
-       if (total_scan < 0) {
-               pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
-                      shrinker->scan_objects, total_scan);
-               total_scan = freeable;
-               next_deferred = nr;
-       } else
-               next_deferred = total_scan;
-
-       /*
-        * We need to avoid excessive windup on filesystem shrinkers
-        * due to large numbers of GFP_NOFS allocations causing the
-        * shrinkers to return -1 all the time. This results in a large
-        * nr being built up so when a shrink that can do some work
-        * comes along it empties the entire cache due to nr >>>
-        * freeable. This is bad for sustaining a working set in
-        * memory.
-        *
-        * Hence only allow the shrinker to scan the entire cache when
-        * a large delta change is calculated directly.
-        */
-       if (delta < freeable / 4)
-               total_scan = min(total_scan, freeable / 2);
-
-       /*
-        * Avoid risking looping forever due to too large nr value:
-        * never try to free more than twice the estimate number of
-        * freeable entries.
-        */
-       if (total_scan > freeable * 2)
-               total_scan = freeable * 2;
+       total_scan = min(total_scan, (2 * freeable));
 
        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);
@@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                cond_resched();
        }
 
-       if (next_deferred >= scanned)
-               next_deferred -= scanned;
-       else
-               next_deferred = 0;
+       /*
+        * The deferred work is increased by any new work (delta) that wasn't
+        * done, decreased by old deferred work that was done now.
+        *
+        * It is capped at twice the number of freeable items.
+        */
+       next_deferred = max_t(long, (nr + delta - scanned), 0);
+       next_deferred = min(next_deferred, (2 * freeable));
+
        /*
         * move the unused scan count back into the shrinker in a
-        * manner that handles concurrent updates. If we exhausted the
-        * scan, there is no need to do an update.
+        * manner that handles concurrent updates.
         */
-       if (next_deferred > 0)
-               new_nr = atomic_long_add_return(next_deferred,
-                                               &shrinker->nr_deferred[nid]);
-       else
-               new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+       new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
 
-       trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+       trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
        return freed;
 }
 
@@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
 {
-       struct memcg_shrinker_map *map;
+       struct shrinker_info *info;
        unsigned long ret, freed = 0;
        int i;
 
@@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
        if (!down_read_trylock(&shrinker_rwsem))
                return 0;
 
-       map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
-                                       true);
-       if (unlikely(!map))
+       info = shrinker_info_protected(memcg, nid);
+       if (unlikely(!info))
                goto unlock;
 
-       for_each_set_bit(i, map->map, shrinker_nr_max) {
+       for_each_set_bit(i, info->map, shrinker_nr_max) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
@@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                struct shrinker *shrinker;
 
                shrinker = idr_find(&shrinker_idr, i);
-               if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+               if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
                        if (!shrinker)
-                               clear_bit(i, map->map);
+                               clear_bit(i, info->map);
                        continue;
                }
 
@@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 
                ret = do_shrink_slab(&sc, shrinker, priority);
                if (ret == SHRINK_EMPTY) {
-                       clear_bit(i, map->map);
+                       clear_bit(i, info->map);
                        /*
                         * After the shrinker reported that it had no objects to
                         * free, but before we cleared the corresponding bit in
@@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                         * case, we invoke the shrinker one more time and reset
                         * the bit if it reports that it is not empty anymore.
                         * The memory barrier here pairs with the barrier in
-                        * memcg_set_shrinker_bit():
+                        * set_shrinker_bit():
                         *
                         * list_lru_add()     shrink_slab_memcg()
                         *   list_add_tail()    clear_bit()
@@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        if (ret == SHRINK_EMPTY)
                                ret = 0;
                        else
-                               memcg_set_shrinker_bit(memcg, nid, i);
+                               set_shrinker_bit(memcg, nid, i);
                }
                freed += ret;
 
@@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
        LIST_HEAD(clean_pages);
 
        list_for_each_entry_safe(page, next, page_list, lru) {
-               if (page_is_file_lru(page) && !PageDirty(page) &&
-                   !__PageMovable(page) && !PageUnevictable(page)) {
+               if (!PageHuge(page) && page_is_file_lru(page) &&
+                   !PageDirty(page) && !__PageMovable(page) &&
+                   !PageUnevictable(page)) {
                        ClearPageActive(page);
                        list_move(&page->lru, &clean_pages);
                }
@@ -3862,7 +4059,7 @@ static int kswapd(void *p)
 {
        unsigned int alloc_order, reclaim_order;
        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
-       pg_data_t *pgdat = (pg_data_t*)p;
+       pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
@@ -4085,14 +4282,6 @@ module_init(kswapd_init)
  */
 int node_reclaim_mode __read_mostly;
 
-/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI.  New bits are OK, but existing bits can never change.
- */
-#define RECLAIM_ZONE  (1<<0)   /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
-
 /*
  * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
index 74b2c37..cccee36 100644 (file)
@@ -934,7 +934,7 @@ void cpu_vm_stats_fold(int cpu)
 
 /*
  * this is only called if !populated_zone(zone), which implies no other users of
- * pset->vm_stat_diff[] exsist.
+ * pset->vm_stat_diff[] exist.
  */
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 {
@@ -1312,6 +1312,10 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_HUGETLB_PAGE
        "htlb_buddy_alloc_success",
        "htlb_buddy_alloc_fail",
+#endif
+#ifdef CONFIG_CMA
+       "cma_alloc_success",
+       "cma_alloc_fail",
 #endif
        "unevictable_pgs_culled",
        "unevictable_pgs_scanned",
@@ -1365,6 +1369,10 @@ const char * const vmstat_text[] = {
        "swap_ra",
        "swap_ra_hit",
 #endif
+#ifdef CONFIG_X86
+       "direct_map_level2_splits",
+       "direct_map_level3_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1854,25 +1862,34 @@ int vmstat_refresh(struct ctl_table *table, int write,
        if (err)
                return err;
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+               /*
+                * Skip checking stats known to go negative occasionally.
+                */
+               switch (i) {
+               case NR_ZONE_WRITE_PENDING:
+               case NR_FREE_CMA_PAGES:
+                       continue;
+               }
                val = atomic_long_read(&vm_zone_stat[i]);
                if (val < 0) {
                        pr_warn("%s: %s %ld\n",
                                __func__, zone_stat_name(i), val);
-                       err = -EINVAL;
                }
        }
-#ifdef CONFIG_NUMA
-       for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
-               val = atomic_long_read(&vm_numa_stat[i]);
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+               /*
+                * Skip checking stats known to go negative occasionally.
+                */
+               switch (i) {
+               case NR_WRITEBACK:
+                       continue;
+               }
+               val = atomic_long_read(&vm_node_stat[i]);
                if (val < 0) {
                        pr_warn("%s: %s %ld\n",
-                               __func__, numa_stat_name(i), val);
-                       err = -EINVAL;
+                               __func__, node_stat_name(i), val);
                }
        }
-#endif
-       if (err)
-               return err;
        if (write)
                *ppos += *lenp;
        else
index cd39902..b7cdeca 100644 (file)
@@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
                goto out_invalid;
        if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
-       mapping->nrexceptional -= node->nr_values;
        xa_delete_node(node, workingset_update_node);
        __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
 
index 9d889ad..7fe7ada 100644 (file)
@@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
 {
        if (pool->inode)
                iput(pool->inode);
- }
+}
 
 /* Initializes the z3fold header of a newly allocated z3fold page */
 static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
index 5ed7120..6d9ed48 100644 (file)
@@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages,
  * This may hold locks, disable interrupts, and/or preemption,
  * and the zpool_unmap_handle() must be called to undo those
  * actions.  The code that uses the mapped handle should complete
- * its operatons on the mapped handle memory quickly and unmap
+ * its operations on the mapped handle memory quickly and unmap
  * as soon as possible.  As the implementation may use per-cpu
  * data, multiple handles should not be mapped concurrently on
  * any cpu.
index 30c358b..19b563b 100644 (file)
@@ -61,7 +61,7 @@
 #define ZSPAGE_MAGIC   0x58
 
 /*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
  * These two conditions ensure that any 'struct link_free' itself doesn't
  * span more than 1 page which avoids complex case of mapping 2 pages simply
  * to restore link_free pointer values.
@@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage,
  * class maintains a list of zspages where each zspage is divided
  * into equal sized chunks. Each allocation falls into one of these
  * classes depending on its size. This function returns index of the
- * size class which has chunk size big enough to hold the give size.
+ * size class which has chunk size big enough to hold the given size.
  */
 static int get_size_class_index(int size)
 {
@@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
  * zs_map_object - get address of allocated object from handle.
  * @pool: pool from which the object was allocated
  * @handle: handle returned from zs_malloc
- * @mm: maping mode to use
+ * @mm: mapping mode to use
  *
  * Before using an object allocated from zs_malloc, it must be mapped using
  * this function. When done with the object, it must be unmapped using
@@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
                head = obj_to_head(page, addr);
                if (head & OBJ_ALLOCATED_TAG) {
                        handle = head & ~OBJ_ALLOCATED_TAG;
-                       if (!testpin_tag(handle))
-                               BUG();
+                       BUG_ON(!testpin_tag(handle));
 
                        old_obj = handle_to_obj(handle);
                        obj_to_location(old_obj, &dummy, &obj_idx);
@@ -2035,8 +2034,7 @@ unpin_objects:
                head = obj_to_head(page, addr);
                if (head & OBJ_ALLOCATED_TAG) {
                        handle = head & ~OBJ_ALLOCATED_TAG;
-                       if (!testpin_tag(handle))
-                               BUG();
+                       BUG_ON(!testpin_tag(handle));
                        unpin_tag(handle);
                }
        }
index 578d9f2..2076326 100644 (file)
@@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
        }
        pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
 
-       strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+       strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
        pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
        if (!pool->acomp_ctx) {
index 0456593..e4e6e99 100644 (file)
@@ -103,8 +103,9 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
 
        rcu_read_lock();
        if (netif_is_bridge_port(dev)) {
-               p = br_port_get_rcu(dev);
-               vg = nbp_vlan_group_rcu(p);
+               p = br_port_get_check_rcu(dev);
+               if (p)
+                       vg = nbp_vlan_group_rcu(p);
        } else if (dev->priv_flags & IFF_EBRIDGE) {
                br = netdev_priv(dev);
                vg = br_vlan_group_rcu(br);
index eb261aa..de407e8 100644 (file)
@@ -36,6 +36,20 @@ static int init_protocol(struct ceph_auth_client *ac, int proto)
        }
 }
 
+static void set_global_id(struct ceph_auth_client *ac, u64 global_id)
+{
+       dout("%s global_id %llu\n", __func__, global_id);
+
+       if (!global_id)
+               pr_err("got zero global_id\n");
+
+       if (ac->global_id && global_id != ac->global_id)
+               pr_err("global_id changed from %llu to %llu\n", ac->global_id,
+                      global_id);
+
+       ac->global_id = global_id;
+}
+
 /*
  * setup, teardown.
  */
@@ -222,11 +236,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 
        payload_end = payload + payload_len;
 
-       if (global_id && ac->global_id != global_id) {
-               dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
-               ac->global_id = global_id;
-       }
-
        if (ac->negotiating) {
                /* server does not support our protocols? */
                if (!protocol && result < 0) {
@@ -253,11 +262,16 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 
        ret = ac->ops->handle_reply(ac, result, payload, payload_end,
                                    NULL, NULL, NULL, NULL);
-       if (ret == -EAGAIN)
+       if (ret == -EAGAIN) {
                ret = build_request(ac, true, reply_buf, reply_len);
-       else if (ret)
+               goto out;
+       } else if (ret) {
                pr_err("auth protocol '%s' mauth authentication failed: %d\n",
                       ceph_auth_proto_name(ac->protocol), result);
+               goto out;
+       }
+
+       set_global_id(ac, global_id);
 
 out:
        mutex_unlock(&ac->mutex);
@@ -484,15 +498,11 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
        int ret;
 
        mutex_lock(&ac->mutex);
-       if (global_id && ac->global_id != global_id) {
-               dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
-                    global_id);
-               ac->global_id = global_id;
-       }
-
        ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
                                    session_key, session_key_len,
                                    con_secret, con_secret_len);
+       if (!ret)
+               set_global_id(ac, global_id);
        mutex_unlock(&ac->mutex);
        return ret;
 }
index ca44c32..79641c4 100644 (file)
@@ -526,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
                if (ret < 0)
                        return ret;
 
-               auth->struct_v = 2;  /* nautilus+ */
+               auth->struct_v = 3;  /* nautilus+ */
                auth->key = 0;
                for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
                        auth->key ^= *(__le64 *)u;
index b44f765..bc109a1 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/inet.h>
 
 #include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>  /* for ceph_pr_addr() */
 
 static int
 ceph_decode_entity_addr_versioned(void **p, void *end,
@@ -110,6 +111,7 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
        }
 
        ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+       dout("%s addr_cnt %d\n", __func__, addr_cnt);
 
        found = false;
        for (i = 0; i < addr_cnt; i++) {
@@ -117,6 +119,7 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
                if (ret)
                        return ret;
 
+               dout("%s i %d addr %s\n", __func__, i, ceph_pr_addr(&tmp_addr));
                if (tmp_addr.type == my_type) {
                        if (found) {
                                pr_err("another match of type %d in addrvec\n",
@@ -128,13 +131,18 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
                        found = true;
                }
        }
-       if (!found && addr_cnt != 0) {
-               pr_err("no match of type %d in addrvec\n",
-                      le32_to_cpu(my_type));
-               return -ENOENT;
-       }
 
-       return 0;
+       if (found)
+               return 0;
+
+       if (!addr_cnt)
+               return 0;  /* normal -- e.g. unused OSD id/slot */
+
+       if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr)))
+               return 0;  /* weird but effectively the same as !addr_cnt */
+
+       pr_err("no match of type %d in addrvec\n", le32_to_cpu(my_type));
+       return -ENOENT;
 
 e_inval:
        return -EINVAL;
index 290012d..88d8a02 100644 (file)
@@ -387,7 +387,8 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
        int ret;
 
        ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
-                          &ethtool_genl_family, 0, ctx->ops->reply_cmd);
+                          &ethtool_genl_family, NLM_F_MULTI,
+                          ctx->ops->reply_cmd);
        if (!ehdr)
                return -EMSGSIZE;
 
index b218e45..6852e9b 100644 (file)
@@ -520,6 +520,10 @@ static int fill_frame_info(struct hsr_frame_info *frame,
        struct ethhdr *ethhdr;
        __be16 proto;
 
+       /* Check if skb contains hsr_ethhdr */
+       if (skb->mac_len < sizeof(struct hsr_ethhdr))
+               return -EINVAL;
+
        memset(frame, 0, sizeof(*frame));
        frame->is_supervision = is_supervision_frame(port->hsr, skb);
        frame->node_src = hsr_get_node(port, &hsr->node_db, skb,
index cf20316..c53f14b 100644 (file)
@@ -1556,13 +1556,12 @@ out_free:
        return ret;
 }
 
-void arpt_unregister_table_pre_exit(struct net *net, const char *name,
-                                   const struct nf_hook_ops *ops)
+void arpt_unregister_table_pre_exit(struct net *net, const char *name)
 {
        struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
 
        if (table)
-               nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+               nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
 }
 EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
 
index b8f45e9..6922612 100644 (file)
@@ -54,7 +54,7 @@ static int __net_init arptable_filter_table_init(struct net *net)
 
 static void __net_exit arptable_filter_net_pre_exit(struct net *net)
 {
-       arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops);
+       arpt_unregister_table_pre_exit(net, "filter");
 }
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
index e14fd0c..f1c1f9e 100644 (file)
@@ -2039,6 +2039,7 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
                (__kernel_size_t)zc->msg_controllen;
        cmsg_dummy.msg_flags = in_compat_syscall()
                ? MSG_CMSG_COMPAT : 0;
+       cmsg_dummy.msg_control_is_user = true;
        zc->msg_flags = 0;
        if (zc->msg_control == msg_control_addr &&
            zc->msg_controllen == cmsg_dummy.msg_controllen) {
index 563d016..db5831e 100644 (file)
@@ -230,6 +230,10 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
                ret = -ENOENT;
        } else if (!bpf_try_module_get(ca, ca->owner)) {
                ret = -EBUSY;
+       } else if (!net_eq(net, &init_net) &&
+                       !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
+               /* Only init netns can set default to a restricted algorithm */
+               ret = -EPERM;
        } else {
                prev = xchg(&net->ipv4.tcp_congestion_control, ca);
                if (prev)
index d2f8138..e412817 100644 (file)
@@ -122,9 +122,6 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
        hinfo = seg6_hmac_info_lookup(net, hmackeyid);
 
        if (!slen) {
-               if (!hinfo)
-                       err = -ENOENT;
-
                err = seg6_hmac_info_del(net, hmackeyid);
 
                goto out_unlock;
index bd71408..4ff38cb 100644 (file)
@@ -93,6 +93,35 @@ struct seg6_end_dt_info {
        int hdrlen;
 };
 
+struct pcpu_seg6_local_counters {
+       u64_stats_t packets;
+       u64_stats_t bytes;
+       u64_stats_t errors;
+
+       struct u64_stats_sync syncp;
+};
+
+/* This struct groups all the SRv6 Behavior counters supported so far.
+ *
+ * put_nla_counters() makes use of this data structure to collect all counter
+ * values after the per-CPU counter evaluation has been performed.
+ * Finally, each counter value (in seg6_local_counters) is stored in the
+ * corresponding netlink attribute and sent to user space.
+ *
+ * NB: we don't want to expose this structure to user space!
+ */
+struct seg6_local_counters {
+       __u64 packets;
+       __u64 bytes;
+       __u64 errors;
+};
+
+#define seg6_local_alloc_pcpu_counters(__gfp)                          \
+       __netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters,      \
+                                 ((__gfp) | __GFP_ZERO))
+
+#define SEG6_F_LOCAL_COUNTERS  SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
+
 struct seg6_local_lwt {
        int action;
        struct ipv6_sr_hdr *srh;
@@ -105,6 +134,7 @@ struct seg6_local_lwt {
 #ifdef CONFIG_NET_L3_MASTER_DEV
        struct seg6_end_dt_info dt_info;
 #endif
+       struct pcpu_seg6_local_counters __percpu *pcpu_counters;
 
        int headroom;
        struct seg6_action_desc *desc;
@@ -878,36 +908,43 @@ static struct seg6_action_desc seg6_action_table[] = {
        {
                .action         = SEG6_LOCAL_ACTION_END,
                .attrs          = 0,
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_X,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_x,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_T,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_t,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DX2,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_OIF),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_dx2,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DX6,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_dx6,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DX4,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_NH4),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_dx4,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_DT4,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
 #ifdef CONFIG_NET_L3_MASTER_DEV
                .input          = input_action_end_dt4,
                .slwt_ops       = {
@@ -919,30 +956,35 @@ static struct seg6_action_desc seg6_action_table[] = {
                .action         = SEG6_LOCAL_ACTION_END_DT6,
 #ifdef CONFIG_NET_L3_MASTER_DEV
                .attrs          = 0,
-               .optattrs       = SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
+               .optattrs       = SEG6_F_LOCAL_COUNTERS         |
+                                 SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
                                  SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
                .slwt_ops       = {
                                        .build_state = seg6_end_dt6_build,
                                  },
 #else
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
 #endif
                .input          = input_action_end_dt6,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_B6,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_b6,
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_B6_ENCAP,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_b6_encap,
                .static_headroom        = sizeof(struct ipv6hdr),
        },
        {
                .action         = SEG6_LOCAL_ACTION_END_BPF,
                .attrs          = SEG6_F_ATTR(SEG6_LOCAL_BPF),
+               .optattrs       = SEG6_F_LOCAL_COUNTERS,
                .input          = input_action_end_bpf,
        },
 
@@ -963,11 +1005,36 @@ static struct seg6_action_desc *__get_action_desc(int action)
        return NULL;
 }
 
+static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
+{
+       return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
+}
+
+static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
+                                      unsigned int len, int err)
+{
+       struct pcpu_seg6_local_counters *pcounters;
+
+       pcounters = this_cpu_ptr(slwt->pcpu_counters);
+       u64_stats_update_begin(&pcounters->syncp);
+
+       if (likely(!err)) {
+               u64_stats_inc(&pcounters->packets);
+               u64_stats_add(&pcounters->bytes, len);
+       } else {
+               u64_stats_inc(&pcounters->errors);
+       }
+
+       u64_stats_update_end(&pcounters->syncp);
+}
+
 static int seg6_local_input(struct sk_buff *skb)
 {
        struct dst_entry *orig_dst = skb_dst(skb);
        struct seg6_action_desc *desc;
        struct seg6_local_lwt *slwt;
+       unsigned int len = skb->len;
+       int rc;
 
        if (skb->protocol != htons(ETH_P_IPV6)) {
                kfree_skb(skb);
@@ -977,7 +1044,14 @@ static int seg6_local_input(struct sk_buff *skb)
        slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
        desc = slwt->desc;
 
-       return desc->input(skb, slwt);
+       rc = desc->input(skb, slwt);
+
+       if (!seg6_lwtunnel_counters_enabled(slwt))
+               return rc;
+
+       seg6_local_update_counters(slwt, len, rc);
+
+       return rc;
 }
 
 static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
@@ -992,6 +1066,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
        [SEG6_LOCAL_IIF]        = { .type = NLA_U32 },
        [SEG6_LOCAL_OIF]        = { .type = NLA_U32 },
        [SEG6_LOCAL_BPF]        = { .type = NLA_NESTED },
+       [SEG6_LOCAL_COUNTERS]   = { .type = NLA_NESTED },
 };
 
 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -1296,6 +1371,112 @@ static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
                bpf_prog_put(slwt->bpf.prog);
 }
 
+static const struct
+nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
+       [SEG6_LOCAL_CNT_PACKETS]        = { .type = NLA_U64 },
+       [SEG6_LOCAL_CNT_BYTES]          = { .type = NLA_U64 },
+       [SEG6_LOCAL_CNT_ERRORS]         = { .type = NLA_U64 },
+};
+
+static int parse_nla_counters(struct nlattr **attrs,
+                             struct seg6_local_lwt *slwt)
+{
+       struct pcpu_seg6_local_counters __percpu *pcounters;
+       struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
+       int ret;
+
+       ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
+                                         attrs[SEG6_LOCAL_COUNTERS],
+                                         seg6_local_counters_policy, NULL);
+       if (ret < 0)
+               return ret;
+
+       /* basic support for SRv6 Behavior counters requires at least:
+        * packets, bytes and errors.
+        */
+       if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
+           !tb[SEG6_LOCAL_CNT_ERRORS])
+               return -EINVAL;
+
+       /* counters are always zero initialized */
+       pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
+       if (!pcounters)
+               return -ENOMEM;
+
+       slwt->pcpu_counters = pcounters;
+
+       return 0;
+}
+
+static int seg6_local_fill_nla_counters(struct sk_buff *skb,
+                                       struct seg6_local_counters *counters)
+{
+       if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
+                             SEG6_LOCAL_CNT_PAD))
+               return -EMSGSIZE;
+
+       if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
+                             SEG6_LOCAL_CNT_PAD))
+               return -EMSGSIZE;
+
+       if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
+                             SEG6_LOCAL_CNT_PAD))
+               return -EMSGSIZE;
+
+       return 0;
+}
+
+static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+       struct seg6_local_counters counters = { 0, 0, 0 };
+       struct nlattr *nest;
+       int rc, i;
+
+       nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
+       if (!nest)
+               return -EMSGSIZE;
+
+       for_each_possible_cpu(i) {
+               struct pcpu_seg6_local_counters *pcounters;
+               u64 packets, bytes, errors;
+               unsigned int start;
+
+               pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
+               do {
+                       start = u64_stats_fetch_begin_irq(&pcounters->syncp);
+
+                       packets = u64_stats_read(&pcounters->packets);
+                       bytes = u64_stats_read(&pcounters->bytes);
+                       errors = u64_stats_read(&pcounters->errors);
+
+               } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
+
+               counters.packets += packets;
+               counters.bytes += bytes;
+               counters.errors += errors;
+       }
+
+       rc = seg6_local_fill_nla_counters(skb, &counters);
+       if (rc < 0) {
+               nla_nest_cancel(skb, nest);
+               return rc;
+       }
+
+       return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+       /* a and b are equal if both have pcpu_counters set or not */
+       return (!!((unsigned long)a->pcpu_counters)) ^
+               (!!((unsigned long)b->pcpu_counters));
+}
+
+static void destroy_attr_counters(struct seg6_local_lwt *slwt)
+{
+       free_percpu(slwt->pcpu_counters);
+}
+
 struct seg6_action_param {
        int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
        int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -1343,6 +1524,10 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
                                    .put = put_nla_vrftable,
                                    .cmp = cmp_nla_vrftable },
 
+       [SEG6_LOCAL_COUNTERS]   = { .parse = parse_nla_counters,
+                                   .put = put_nla_counters,
+                                   .cmp = cmp_nla_counters,
+                                   .destroy = destroy_attr_counters },
 };
 
 /* call the destroy() callback (if available) for each set attribute in
@@ -1645,6 +1830,15 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
        if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
                nlsize += nla_total_size(4);
 
+       if (attrs & SEG6_F_LOCAL_COUNTERS)
+               nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
+                         /* SEG6_LOCAL_CNT_PACKETS */
+                         nla_total_size_64bit(sizeof(__u64)) +
+                         /* SEG6_LOCAL_CNT_BYTES */
+                         nla_total_size_64bit(sizeof(__u64)) +
+                         /* SEG6_LOCAL_CNT_ERRORS */
+                         nla_total_size_64bit(sizeof(__u64));
+
        return nlsize;
 }
 
index 82e91b0..a5ede35 100644 (file)
@@ -546,8 +546,7 @@ static void mptcp_sock_destruct(struct sock *sk)
         * ESTABLISHED state and will not have the SOCK_DEAD flag.
         * Both result in warnings from inet_sock_destruct.
         */
-
-       if (sk->sk_state == TCP_ESTABLISHED) {
+       if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
                sk->sk_state = TCP_CLOSE;
                WARN_ON_ONCE(sk->sk_socket);
                sock_orphan(sk);
index b22801f..a414274 100644 (file)
@@ -413,7 +413,10 @@ static int help(struct sk_buff *skb,
 
        spin_lock_bh(&nf_ftp_lock);
        fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
-       BUG_ON(fb_ptr == NULL);
+       if (!fb_ptr) {
+               spin_unlock_bh(&nf_ftp_lock);
+               return NF_ACCEPT;
+       }
 
        ends_in_nl = (fb_ptr[datalen - 1] == '\n');
        seq = ntohl(th->seq) + datalen;
index 8ba037b..aafaff0 100644 (file)
@@ -146,7 +146,8 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
                /* Get first TPKT pointer */
                tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
                                          h323_buffer);
-               BUG_ON(tpkt == NULL);
+               if (!tpkt)
+                       goto clear_out;
 
                /* Validate TPKT identifier */
                if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
index e40988a..08ee4e7 100644 (file)
@@ -143,7 +143,10 @@ static int help(struct sk_buff *skb, unsigned int protoff,
        spin_lock_bh(&irc_buffer_lock);
        ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
                                    irc_buffer);
-       BUG_ON(ib_ptr == NULL);
+       if (!ib_ptr) {
+               spin_unlock_bh(&irc_buffer_lock);
+               return NF_ACCEPT;
+       }
 
        data = ib_ptr;
        data_limit = ib_ptr + skb->len - dataoff;
index 5105d42..7d5708b 100644 (file)
@@ -544,7 +544,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
 
        nexthdr_off = protoff;
        tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
-       BUG_ON(!tcph);
+       if (!tcph)
+               return NF_ACCEPT;
+
        nexthdr_off += tcph->doff * 4;
        datalen = tcplen - tcph->doff * 4;
 
index 318b8f7..34e2241 100644 (file)
@@ -338,7 +338,8 @@ static void tcp_options(const struct sk_buff *skb,
 
        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                                 length, buff);
-       BUG_ON(ptr == NULL);
+       if (!ptr)
+               return;
 
        state->td_scale =
        state->flags = 0;
@@ -394,7 +395,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 
        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                                 length, buff);
-       BUG_ON(ptr == NULL);
+       if (!ptr)
+               return;
 
        /* Fast path for timestamp-only option */
        if (length == TCPOLEN_TSTAMP_ALIGNED
index 1aebd65..fcb33b1 100644 (file)
@@ -95,7 +95,10 @@ static int help(struct sk_buff *skb,
 
        spin_lock_bh(&nf_sane_lock);
        sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
-       BUG_ON(sb_ptr == NULL);
+       if (!sb_ptr) {
+               spin_unlock_bh(&nf_sane_lock);
+               return NF_ACCEPT;
+       }
 
        if (dir == IP_CT_DIR_ORIGINAL) {
                if (datalen != sizeof(struct sane_request))
index 0b7fe0a..d63d2d8 100644 (file)
@@ -4184,6 +4184,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
        unsigned char *udata;
        struct nft_set *set;
        struct nft_ctx ctx;
+       size_t alloc_size;
        u64 timeout;
        char *name;
        int err, i;
@@ -4329,8 +4330,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
        size = 0;
        if (ops->privsize != NULL)
                size = ops->privsize(nla, &desc);
-
-       set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
+       alloc_size = sizeof(*set) + size + udlen;
+       if (alloc_size < size)
+               return -ENOMEM;
+       set = kvzalloc(alloc_size, GFP_KERNEL);
        if (!set)
                return -ENOMEM;
 
@@ -6615,9 +6618,9 @@ err_obj_ht:
        INIT_LIST_HEAD(&obj->list);
        return err;
 err_trans:
-       kfree(obj->key.name);
-err_userdata:
        kfree(obj->udata);
+err_userdata:
+       kfree(obj->key.name);
 err_strdup:
        if (obj->ops->destroy)
                obj->ops->destroy(&ctx, obj);
index d7a9628..e8dbd83 100644 (file)
@@ -295,6 +295,7 @@ replay:
                        nfnl_unlock(subsys_id);
                        break;
                default:
+                       rcu_read_unlock();
                        err = -EINVAL;
                        break;
                }
index e8f8875..0fa2e20 100644 (file)
@@ -186,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
 
                ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
                                sizeof(struct tcphdr), ctx->optsize, opts);
+               if (!ctx->optp)
+                       return NULL;
        }
 
        return tcp;
index 58f576a..7b3d0a7 100644 (file)
@@ -412,9 +412,17 @@ static void nft_rhash_destroy(const struct nft_set *set)
                                    (void *)set);
 }
 
+/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
+#define NFT_MAX_BUCKETS (1U << 31)
+
 static u32 nft_hash_buckets(u32 size)
 {
-       return roundup_pow_of_two(size * 4 / 3);
+       u64 val = div_u64((u64)size * 4, 3);
+
+       if (val >= NFT_MAX_BUCKETS)
+               return NFT_MAX_BUCKETS;
+
+       return roundup_pow_of_two(val);
 }
 
 static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features,
@@ -615,7 +623,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[],
                             const struct nft_set_desc *desc)
 {
        return sizeof(struct nft_hash) +
-              nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
+              (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
 }
 
 static int nft_hash_init(const struct nft_set *set,
@@ -655,8 +663,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
                return false;
 
        est->size   = sizeof(struct nft_hash) +
-                     nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
-                     desc->size * sizeof(struct nft_hash_elem);
+                     (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+                     (u64)desc->size * sizeof(struct nft_hash_elem);
        est->lookup = NFT_SET_CLASS_O_1;
        est->space  = NFT_SET_CLASS_O_N;
 
@@ -673,8 +681,8 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
                return false;
 
        est->size   = sizeof(struct nft_hash) +
-                     nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
-                     desc->size * sizeof(struct nft_hash_elem);
+                     (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+                     (u64)desc->size * sizeof(struct nft_hash_elem);
        est->lookup = NFT_SET_CLASS_O_1;
        est->space  = NFT_SET_CLASS_O_N;
 
index 75625d1..498a0bf 100644 (file)
@@ -24,10 +24,9 @@ MODULE_ALIAS("ip6t_SECMARK");
 static u8 mode;
 
 static unsigned int
-secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info)
 {
        u32 secmark = 0;
-       const struct xt_secmark_target_info *info = par->targinfo;
 
        switch (mode) {
        case SECMARK_MODE_SEL:
@@ -41,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
        return XT_CONTINUE;
 }
 
-static int checkentry_lsm(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info_v1 *info)
 {
        int err;
 
@@ -73,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info)
        return 0;
 }
 
-static int secmark_tg_check(const struct xt_tgchk_param *par)
+static int
+secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info)
 {
-       struct xt_secmark_target_info *info = par->targinfo;
        int err;
 
-       if (strcmp(par->table, "mangle") != 0 &&
-           strcmp(par->table, "security") != 0) {
+       if (strcmp(table, "mangle") != 0 &&
+           strcmp(table, "security") != 0) {
                pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n",
-                                   par->table);
+                                   table);
                return -EINVAL;
        }
 
@@ -116,25 +115,76 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
        }
 }
 
-static struct xt_target secmark_tg_reg __read_mostly = {
-       .name       = "SECMARK",
-       .revision   = 0,
-       .family     = NFPROTO_UNSPEC,
-       .checkentry = secmark_tg_check,
-       .destroy    = secmark_tg_destroy,
-       .target     = secmark_tg,
-       .targetsize = sizeof(struct xt_secmark_target_info),
-       .me         = THIS_MODULE,
+static int secmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+       struct xt_secmark_target_info *info = par->targinfo;
+       struct xt_secmark_target_info_v1 newinfo = {
+               .mode   = info->mode,
+       };
+       int ret;
+
+       memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX);
+
+       ret = secmark_tg_check(par->table, &newinfo);
+       info->secid = newinfo.secid;
+
+       return ret;
+}
+
+static unsigned int
+secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       const struct xt_secmark_target_info *info = par->targinfo;
+       struct xt_secmark_target_info_v1 newinfo = {
+               .secid  = info->secid,
+       };
+
+       return secmark_tg(skb, &newinfo);
+}
+
+static int secmark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+       return secmark_tg_check(par->table, par->targinfo);
+}
+
+static unsigned int
+secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+       return secmark_tg(skb, par->targinfo);
+}
+
+static struct xt_target secmark_tg_reg[] __read_mostly = {
+       {
+               .name           = "SECMARK",
+               .revision       = 0,
+               .family         = NFPROTO_UNSPEC,
+               .checkentry     = secmark_tg_check_v0,
+               .destroy        = secmark_tg_destroy,
+               .target         = secmark_tg_v0,
+               .targetsize     = sizeof(struct xt_secmark_target_info),
+               .me             = THIS_MODULE,
+       },
+       {
+               .name           = "SECMARK",
+               .revision       = 1,
+               .family         = NFPROTO_UNSPEC,
+               .checkentry     = secmark_tg_check_v1,
+               .destroy        = secmark_tg_destroy,
+               .target         = secmark_tg_v1,
+               .targetsize     = sizeof(struct xt_secmark_target_info_v1),
+               .usersize       = offsetof(struct xt_secmark_target_info_v1, secid),
+               .me             = THIS_MODULE,
+       },
 };
 
 static int __init secmark_tg_init(void)
 {
-       return xt_register_target(&secmark_tg_reg);
+       return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
 }
 
 static void __exit secmark_tg_exit(void)
 {
-       xt_unregister_target(&secmark_tg_reg);
+       xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
 }
 
 module_init(secmark_tg_init);
index a3b46f8..53dbe73 100644 (file)
@@ -109,12 +109,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
                                          GFP_KERNEL);
        if (!llcp_sock->service_name) {
                nfc_llcp_local_put(llcp_sock->local);
+               llcp_sock->local = NULL;
                ret = -ENOMEM;
                goto put_dev;
        }
        llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock);
        if (llcp_sock->ssap == LLCP_SAP_MAX) {
                nfc_llcp_local_put(llcp_sock->local);
+               llcp_sock->local = NULL;
                kfree(llcp_sock->service_name);
                llcp_sock->service_name = NULL;
                ret = -EADDRINUSE;
@@ -709,6 +711,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr,
        llcp_sock->ssap = nfc_llcp_get_local_ssap(local);
        if (llcp_sock->ssap == LLCP_SAP_MAX) {
                nfc_llcp_local_put(llcp_sock->local);
+               llcp_sock->local = NULL;
                ret = -ENOMEM;
                goto put_dev;
        }
@@ -756,6 +759,7 @@ sock_unlink:
 sock_llcp_release:
        nfc_llcp_put_ssap(local, llcp_sock->ssap);
        nfc_llcp_local_put(llcp_sock->local);
+       llcp_sock->local = NULL;
 
 put_dev:
        nfc_put_device(dev);
index 92a0b67..77d924a 100644 (file)
@@ -827,17 +827,17 @@ static void ovs_fragment(struct net *net, struct vport *vport,
        }
 
        if (key->eth.type == htons(ETH_P_IP)) {
-               struct dst_entry ovs_dst;
+               struct rtable ovs_rt = { 0 };
                unsigned long orig_dst;
 
                prepare_frag(vport, skb, orig_network_offset,
                             ovs_key_mac_proto(key));
-               dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
+               dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
                         DST_OBSOLETE_NONE, DST_NOCOUNT);
-               ovs_dst.dev = vport->dev;
+               ovs_rt.dst.dev = vport->dev;
 
                orig_dst = skb->_skb_refdst;
-               skb_dst_set_noref(skb, &ovs_dst);
+               skb_dst_set_noref(skb, &ovs_rt.dst);
                IPCB(skb)->frag_max_size = mru;
 
                ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
index f5cbe96..26b069e 100644 (file)
@@ -67,31 +67,6 @@ static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
        }
 }
 
-/*
- * Tune RNR behavior. Without flow control, we use a rather
- * low timeout, but not the absolute minimum - this should
- * be tunable.
- *
- * We already set the RNR retry count to 7 (which is the
- * smallest infinite number :-) above.
- * If flow control is off, we want to change this back to 0
- * so that we learn quickly when our credit accounting is
- * buggy.
- *
- * Caller passes in a qp_attr pointer - don't waste stack spacv
- * by allocation this twice.
- */
-static void
-rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
-{
-       int ret;
-
-       attr->min_rnr_timer = IB_RNR_TIMER_000_32;
-       ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
-       if (ret)
-               printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
-}
-
 /*
  * Connection established.
  * We get here for both outgoing and incoming connection.
@@ -100,7 +75,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        const union rds_ib_conn_priv *dp = NULL;
-       struct ib_qp_attr qp_attr;
        __be64 ack_seq = 0;
        __be32 credit = 0;
        u8 major = 0;
@@ -168,14 +142,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
         * the posted credit count. */
        rds_ib_recv_refill(conn, 1, GFP_KERNEL);
 
-       /* Tune RNR behavior */
-       rds_ib_tune_rnr(ic, &qp_attr);
-
-       qp_attr.qp_state = IB_QPS_RTS;
-       err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
-       if (err)
-               printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
-
        /* update ib_device with this local ipaddr */
        err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
        if (err)
@@ -947,6 +913,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                                  event->param.conn.responder_resources,
                                  event->param.conn.initiator_depth, isv6);
 
+       rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
        /* rdma_accept() calls rdma_reject() internally if it fails */
        if (rdma_accept(cm_id, &conn_param))
                rds_ib_conn_error(conn, "rdma_accept failed\n");
index 5f741e5..a9e4ff9 100644 (file)
@@ -87,6 +87,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
 
        case RDMA_CM_EVENT_ADDR_RESOLVED:
                rdma_set_service_type(cm_id, conn->c_tos);
+               rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
                /* XXX do we need to clean up if this fails? */
                ret = rdma_resolve_route(cm_id,
                                         RDS_RDMA_RESOLVE_TIMEOUT_MS);
index e1e77d3..8c06381 100644 (file)
@@ -90,16 +90,16 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
        }
 
        if (skb_protocol(skb, true) == htons(ETH_P_IP)) {
-               struct dst_entry sch_frag_dst;
+               struct rtable sch_frag_rt = { 0 };
                unsigned long orig_dst;
 
                sch_frag_prepare_frag(skb, xmit);
-               dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1,
+               dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1,
                         DST_OBSOLETE_NONE, DST_NOCOUNT);
-               sch_frag_dst.dev = skb->dev;
+               sch_frag_rt.dst.dev = skb->dev;
 
                orig_dst = skb->_skb_refdst;
-               skb_dst_set_noref(skb, &sch_frag_dst);
+               skb_dst_set_noref(skb, &sch_frag_rt.dst);
                IPCB(skb)->frag_max_size = mru;
 
                ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
index 5f9a7c0..5b44d22 100644 (file)
@@ -858,11 +858,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
        struct sctp_chunk *retval;
        __u32 ctsn;
 
-       if (chunk && chunk->asoc)
-               ctsn = sctp_tsnmap_get_ctsn(&chunk->asoc->peer.tsn_map);
-       else
-               ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
-
+       ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
        shut.cum_tsn_ack = htonl(ctsn);
 
        retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
index 0948f14..ce15d59 100644 (file)
@@ -826,28 +826,6 @@ static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds,
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto;
 }
 
-static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds,
-                                 struct sctp_association *asoc,
-                                 struct sctp_association *new)
-{
-       struct net *net = asoc->base.net;
-       struct sctp_chunk *abort;
-
-       if (!sctp_assoc_update(asoc, new))
-               return;
-
-       abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr));
-       if (abort) {
-               sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
-               sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
-       }
-       sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED));
-       sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED,
-                       SCTP_PERR(SCTP_ERROR_RSRC_LOW));
-       SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
-       SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
-}
-
 /* Helper function to change the state of an association. */
 static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
                               struct sctp_association *asoc,
@@ -1301,10 +1279,6 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
                        sctp_endpoint_add_asoc(ep, asoc);
                        break;
 
-               case SCTP_CMD_UPDATE_ASSOC:
-                      sctp_cmd_assoc_update(commands, asoc, cmd->obj.asoc);
-                      break;
-
                case SCTP_CMD_PURGE_OUTQUEUE:
                       sctp_outq_teardown(&asoc->outqueue);
                       break;
index 7632714..fd1e319 100644 (file)
@@ -1773,6 +1773,30 @@ enum sctp_disposition sctp_sf_do_5_2_3_initack(
                return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
 }
 
+static int sctp_sf_do_assoc_update(struct sctp_association *asoc,
+                                  struct sctp_association *new,
+                                  struct sctp_cmd_seq *cmds)
+{
+       struct net *net = asoc->base.net;
+       struct sctp_chunk *abort;
+
+       if (!sctp_assoc_update(asoc, new))
+               return 0;
+
+       abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr));
+       if (abort) {
+               sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
+               sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+       }
+       sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED));
+       sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED,
+                       SCTP_PERR(SCTP_ERROR_RSRC_LOW));
+       SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+       SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+
+       return -ENOMEM;
+}
+
 /* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A')
  *
  * Section 5.2.4
@@ -1852,20 +1876,22 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
        sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL());
 
-       repl = sctp_make_cookie_ack(new_asoc, chunk);
+       /* Update the content of current association. */
+       if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands))
+               goto nomem;
+
+       repl = sctp_make_cookie_ack(asoc, chunk);
        if (!repl)
                goto nomem;
 
        /* Report association restart to upper layer. */
        ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0,
-                                            new_asoc->c.sinit_num_ostreams,
-                                            new_asoc->c.sinit_max_instreams,
+                                            asoc->c.sinit_num_ostreams,
+                                            asoc->c.sinit_max_instreams,
                                             NULL, GFP_ATOMIC);
        if (!ev)
                goto nomem_ev;
 
-       /* Update the content of current association. */
-       sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
        if ((sctp_state(asoc, SHUTDOWN_PENDING) ||
             sctp_state(asoc, SHUTDOWN_SENT)) &&
@@ -1925,14 +1951,17 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
        if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
                return SCTP_DISPOSITION_DISCARD;
 
-       /* Update the content of current association.  */
-       sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_ESTABLISHED));
-       SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
+       if (asoc->state < SCTP_STATE_ESTABLISHED)
+               SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
 
-       repl = sctp_make_cookie_ack(new_asoc, chunk);
+       /* Update the content of current association.  */
+       if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands))
+               goto nomem;
+
+       repl = sctp_make_cookie_ack(asoc, chunk);
        if (!repl)
                goto nomem;
 
index b7b9013..40f9f6c 100644 (file)
@@ -357,6 +357,18 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
        return af;
 }
 
+static void sctp_auto_asconf_init(struct sctp_sock *sp)
+{
+       struct net *net = sock_net(&sp->inet.sk);
+
+       if (net->sctp.default_auto_asconf) {
+               spin_lock(&net->sctp.addr_wq_lock);
+               list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist);
+               spin_unlock(&net->sctp.addr_wq_lock);
+               sp->do_auto_asconf = 1;
+       }
+}
+
 /* Bind a local address either to an endpoint or to an association.  */
 static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 {
@@ -418,8 +430,10 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
                return -EADDRINUSE;
 
        /* Refresh ephemeral port.  */
-       if (!bp->port)
+       if (!bp->port) {
                bp->port = inet_sk(sk)->inet_num;
+               sctp_auto_asconf_init(sp);
+       }
 
        /* Add the address to the bind address list.
         * Use GFP_ATOMIC since BHs will be disabled.
@@ -1520,9 +1534,11 @@ static void sctp_close(struct sock *sk, long timeout)
 
        /* Supposedly, no process has access to the socket, but
         * the net layers still may.
+        * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
+        * held and that should be grabbed before socket lock.
         */
-       local_bh_disable();
-       bh_lock_sock(sk);
+       spin_lock_bh(&net->sctp.addr_wq_lock);
+       bh_lock_sock_nested(sk);
 
        /* Hold the sock, since sk_common_release() will put sock_put()
         * and we have just a little more cleanup.
@@ -1531,7 +1547,7 @@ static void sctp_close(struct sock *sk, long timeout)
        sk_common_release(sk);
 
        bh_unlock_sock(sk);
-       local_bh_enable();
+       spin_unlock_bh(&net->sctp.addr_wq_lock);
 
        sock_put(sk);
 
@@ -4991,16 +5007,6 @@ static int sctp_init_sock(struct sock *sk)
        sk_sockets_allocated_inc(sk);
        sock_prot_inuse_add(net, sk->sk_prot, 1);
 
-       if (net->sctp.default_auto_asconf) {
-               spin_lock(&sock_net(sk)->sctp.addr_wq_lock);
-               list_add_tail(&sp->auto_asconf_list,
-                   &net->sctp.auto_asconf_splist);
-               sp->do_auto_asconf = 1;
-               spin_unlock(&sock_net(sk)->sctp.addr_wq_lock);
-       } else {
-               sp->do_auto_asconf = 0;
-       }
-
        local_bh_enable();
 
        return 0;
@@ -5025,9 +5031,7 @@ static void sctp_destroy_sock(struct sock *sk)
 
        if (sp->do_auto_asconf) {
                sp->do_auto_asconf = 0;
-               spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
                list_del(&sp->auto_asconf_list);
-               spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
        }
        sctp_endpoint_free(sp->ep);
        local_bh_disable();
@@ -9398,6 +9402,8 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
                        return err;
        }
 
+       sctp_auto_asconf_init(newsp);
+
        /* Move any messages in the old socket's receive queue that are for the
         * peeled off association to the new socket's receive queue.
         */
index be3e80b..5eff7cc 100644 (file)
@@ -2161,6 +2161,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
        struct smc_sock *smc;
        int val, rc;
 
+       if (level == SOL_TCP && optname == TCP_ULP)
+               return -EOPNOTSUPP;
+
        smc = smc_sk(sk);
 
        /* generic setsockopts reaching us here always apply to the
@@ -2185,7 +2188,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
        if (rc || smc->use_fallback)
                goto out;
        switch (optname) {
-       case TCP_ULP:
        case TCP_FASTOPEN:
        case TCP_FASTOPEN_CONNECT:
        case TCP_FASTOPEN_KEY:
index 612f0a6..f555d33 100644 (file)
@@ -1799,7 +1799,6 @@ call_allocate(struct rpc_task *task)
 
        status = xprt->ops->buf_alloc(task);
        trace_rpc_buf_alloc(task, status);
-       xprt_inject_disconnect(xprt);
        if (status == 0)
                return;
        if (status != -ENOMEM) {
@@ -2457,12 +2456,6 @@ call_decode(struct rpc_task *task)
                task->tk_flags &= ~RPC_CALL_MAJORSEEN;
        }
 
-       /*
-        * Ensure that we see all writes made by xprt_complete_rqst()
-        * before it changed req->rq_reply_bytes_recvd.
-        */
-       smp_rmb();
-
        /*
         * Did we ever call xprt_complete_rqst()? If not, we should assume
         * the message is incomplete.
@@ -2471,6 +2464,11 @@ call_decode(struct rpc_task *task)
        if (!req->rq_reply_bytes_recvd)
                goto out;
 
+       /* Ensure that we see all writes made by xprt_complete_rqst()
+        * before it changed req->rq_reply_bytes_recvd.
+        */
+       smp_rmb();
+
        req->rq_rcv_buf.len = req->rq_private_buf.len;
        trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf);
 
index 38fe2ce..647b323 100644 (file)
@@ -344,13 +344,15 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
                                    const char *hostname,
                                    struct sockaddr *srvaddr, size_t salen,
                                    int proto, u32 version,
-                                   const struct cred *cred)
+                                   const struct cred *cred,
+                                   const struct rpc_timeout *timeo)
 {
        struct rpc_create_args args = {
                .net            = net,
                .protocol       = proto,
                .address        = srvaddr,
                .addrsize       = salen,
+               .timeout        = timeo,
                .servername     = hostname,
                .nodename       = nodename,
                .program        = &rpcb_program,
@@ -705,7 +707,8 @@ void rpcb_getport_async(struct rpc_task *task)
                                clnt->cl_nodename,
                                xprt->servername, sap, salen,
                                xprt->prot, bind_version,
-                               clnt->cl_cred);
+                               clnt->cl_cred,
+                               task->tk_client->cl_timeout);
        if (IS_ERR(rpcb_clnt)) {
                status = PTR_ERR(rpcb_clnt);
                goto bailout_nofree;
index d76dc9d..0de918c 100644 (file)
@@ -846,7 +846,8 @@ void
 svc_rqst_free(struct svc_rqst *rqstp)
 {
        svc_release_buffer(rqstp);
-       put_page(rqstp->rq_scratch_page);
+       if (rqstp->rq_scratch_page)
+               put_page(rqstp->rq_scratch_page);
        kfree(rqstp->rq_resp);
        kfree(rqstp->rq_argp);
        kfree(rqstp->rq_auth_data);
index 9eb5b6b..478f857 100644 (file)
@@ -1174,7 +1174,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
        tcp_sock_set_cork(svsk->sk_sk, true);
        err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
        xdr_free_bvec(xdr);
-       trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
+       trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent);
        if (err < 0 || sent != (xdr->len + sizeof(marker)))
                goto out_close;
        if (atomic_dec_and_test(&svsk->sk_sendqlen))
index 691ccf8..e5b5a96 100644 (file)
@@ -698,9 +698,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
        const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
        int status = 0;
 
-       if (time_before(jiffies, req->rq_minortimeo))
-               return status;
        if (time_before(jiffies, req->rq_majortimeo)) {
+               if (time_before(jiffies, req->rq_minortimeo))
+                       return status;
                if (to->to_exponential)
                        req->rq_timeout <<= 1;
                else
@@ -1352,6 +1352,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
                list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
                INIT_LIST_HEAD(&req->rq_xmit2);
 out:
+               atomic_long_inc(&xprt->xmit_queuelen);
                set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
                spin_unlock(&xprt->queue_lock);
        }
@@ -1381,6 +1382,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
                }
        } else
                list_del(&req->rq_xmit2);
+       atomic_long_dec(&req->rq_xprt->xmit_queuelen);
 }
 
 /**
@@ -1469,8 +1471,6 @@ bool xprt_prepare_transmit(struct rpc_task *task)
        struct rpc_xprt *xprt = req->rq_xprt;
 
        if (!xprt_lock_write(xprt, task)) {
-               trace_xprt_transmit_queued(xprt, task);
-
                /* Race breaker: someone may have transmitted us */
                if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate))
                        rpc_wake_up_queued_task_set_status(&xprt->sending,
@@ -1483,7 +1483,10 @@ bool xprt_prepare_transmit(struct rpc_task *task)
 
 void xprt_end_transmit(struct rpc_task *task)
 {
-       xprt_release_write(task->tk_rqstp->rq_xprt, task);
+       struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+       xprt_inject_disconnect(xprt);
+       xprt_release_write(xprt, task);
 }
 
 /**
@@ -1537,8 +1540,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
                return status;
        }
 
-       if (is_retrans)
+       if (is_retrans) {
                task->tk_client->cl_stats->rpcretrans++;
+               trace_xprt_retransmit(req);
+       }
 
        xprt_inject_disconnect(xprt);
 
@@ -1885,7 +1890,6 @@ void xprt_release(struct rpc_task *task)
        spin_unlock(&xprt->transport_lock);
        if (req->rq_buffer)
                xprt->ops->buf_free(task);
-       xprt_inject_disconnect(xprt);
        xdr_free_bvec(&req->rq_rcv_buf);
        xdr_free_bvec(&req->rq_snd_buf);
        if (req->rq_cred != NULL)
index a249837..1151efd 100644 (file)
@@ -155,9 +155,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
 void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 {
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_rep *rep = req->rl_reply;
        struct rpc_xprt *xprt = rqst->rq_xprt;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
-       rpcrdma_recv_buffer_put(req->rl_reply);
+       rpcrdma_rep_put(&r_xprt->rx_buf, rep);
        req->rl_reply = NULL;
 
        spin_lock(&xprt->bc_pa_lock);
index 766a104..229fcc9 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-/**
- * frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_mr_init
- *
- */
-void frwr_release_mr(struct rpcrdma_mr *mr)
+static void frwr_cid_init(struct rpcrdma_ep *ep,
+                         struct rpcrdma_mr *mr)
 {
-       int rc;
+       struct rpc_rdma_cid *cid = &mr->mr_cid;
 
-       rc = ib_dereg_mr(mr->frwr.fr_mr);
-       if (rc)
-               trace_xprtrdma_frwr_dereg(mr, rc);
-       kfree(mr->mr_sg);
-       kfree(mr);
+       cid->ci_queue_id = ep->re_attr.send_cq->res.id;
+       cid->ci_completion_id = mr->mr_ibmr->res.id;
 }
 
 static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
@@ -75,20 +68,22 @@ static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
        }
 }
 
-static void frwr_mr_recycle(struct rpcrdma_mr *mr)
+/**
+ * frwr_mr_release - Destroy one MR
+ * @mr: MR allocated by frwr_mr_init
+ *
+ */
+void frwr_mr_release(struct rpcrdma_mr *mr)
 {
-       struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
-       trace_xprtrdma_mr_recycle(mr);
-
-       frwr_mr_unmap(r_xprt, mr);
+       int rc;
 
-       spin_lock(&r_xprt->rx_buf.rb_lock);
-       list_del(&mr->mr_all);
-       r_xprt->rx_stats.mrs_recycled++;
-       spin_unlock(&r_xprt->rx_buf.rb_lock);
+       frwr_mr_unmap(mr->mr_xprt, mr);
 
-       frwr_release_mr(mr);
+       rc = ib_dereg_mr(mr->mr_ibmr);
+       if (rc)
+               trace_xprtrdma_frwr_dereg(mr, rc);
+       kfree(mr->mr_sg);
+       kfree(mr);
 }
 
 static void frwr_mr_put(struct rpcrdma_mr *mr)
@@ -144,10 +139,11 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
                goto out_list_err;
 
        mr->mr_xprt = r_xprt;
-       mr->frwr.fr_mr = frmr;
+       mr->mr_ibmr = frmr;
        mr->mr_device = NULL;
        INIT_LIST_HEAD(&mr->mr_list);
-       init_completion(&mr->frwr.fr_linv_done);
+       init_completion(&mr->mr_linv_done);
+       frwr_cid_init(ep, mr);
 
        sg_init_table(sg, depth);
        mr->mr_sg = sg;
@@ -257,6 +253,7 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
        ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
        ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
        ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+       ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
        ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
 
        ep->re_max_rdma_segs =
@@ -326,7 +323,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
                goto out_dmamap_err;
        mr->mr_device = ep->re_id->device;
 
-       ibmr = mr->frwr.fr_mr;
+       ibmr = mr->mr_ibmr;
        n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
        if (n != dma_nents)
                goto out_mapmr_err;
@@ -336,7 +333,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
        key = (u8)(ibmr->rkey & 0x000000FF);
        ib_update_fast_reg_key(ibmr, ++key);
 
-       reg_wr = &mr->frwr.fr_regwr;
+       reg_wr = &mr->mr_regwr;
        reg_wr->mr = ibmr;
        reg_wr->key = ibmr->rkey;
        reg_wr->access = writing ?
@@ -364,29 +361,19 @@ out_mapmr_err:
  * @cq: completion queue
  * @wc: WCE for a completed FastReg WR
  *
+ * Each flushed MR gets destroyed after the QP has drained.
  */
 static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid);
-       /* The MR will get recycled when the associated req is retransmitted */
+       trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid);
 
        rpcrdma_flush_disconnect(cq->cq_context, wc);
 }
 
-static void frwr_cid_init(struct rpcrdma_ep *ep,
-                         struct rpcrdma_frwr *frwr)
-{
-       struct rpc_rdma_cid *cid = &frwr->fr_cid;
-
-       cid->ci_queue_id = ep->re_attr.send_cq->res.id;
-       cid->ci_completion_id = frwr->fr_mr->res.id;
-}
-
 /**
  * frwr_send - post Send WRs containing the RPC Call message
  * @r_xprt: controlling transport instance
@@ -403,27 +390,36 @@ static void frwr_cid_init(struct rpcrdma_ep *ep,
  */
 int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
+       struct ib_send_wr *post_wr, *send_wr = &req->rl_wr;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       struct ib_send_wr *post_wr;
        struct rpcrdma_mr *mr;
+       unsigned int num_wrs;
 
-       post_wr = &req->rl_wr;
+       num_wrs = 1;
+       post_wr = send_wr;
        list_for_each_entry(mr, &req->rl_registered, mr_list) {
-               struct rpcrdma_frwr *frwr;
-
-               frwr = &mr->frwr;
-
-               frwr->fr_cqe.done = frwr_wc_fastreg;
-               frwr_cid_init(ep, frwr);
-               frwr->fr_regwr.wr.next = post_wr;
-               frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
-               frwr->fr_regwr.wr.num_sge = 0;
-               frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
-               frwr->fr_regwr.wr.send_flags = 0;
+               trace_xprtrdma_mr_fastreg(mr);
+
+               mr->mr_cqe.done = frwr_wc_fastreg;
+               mr->mr_regwr.wr.next = post_wr;
+               mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
+               mr->mr_regwr.wr.num_sge = 0;
+               mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
+               mr->mr_regwr.wr.send_flags = 0;
+               post_wr = &mr->mr_regwr.wr;
+               ++num_wrs;
+       }
 
-               post_wr = &frwr->fr_regwr.wr;
+       if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) {
+               send_wr->send_flags |= IB_SEND_SIGNALED;
+               ep->re_send_count = min_t(unsigned int, ep->re_send_batch,
+                                         num_wrs - ep->re_send_count);
+       } else {
+               send_wr->send_flags &= ~IB_SEND_SIGNALED;
+               ep->re_send_count -= num_wrs;
        }
 
+       trace_xprtrdma_post_send(req);
        return ib_post_send(ep->re_id->qp, post_wr, NULL);
 }
 
@@ -440,6 +436,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
        list_for_each_entry(mr, mrs, mr_list)
                if (mr->mr_handle == rep->rr_inv_rkey) {
                        list_del_init(&mr->mr_list);
+                       trace_xprtrdma_mr_reminv(mr);
                        frwr_mr_put(mr);
                        break;  /* only one invalidated MR per RPC */
                }
@@ -447,9 +444,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 
 static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
 {
-       if (wc->status != IB_WC_SUCCESS)
-               frwr_mr_recycle(mr);
-       else
+       if (likely(wc->status == IB_WC_SUCCESS))
                frwr_mr_put(mr);
 }
 
@@ -462,12 +457,10 @@ static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
 static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-       struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_li(wc, &frwr->fr_cid);
+       trace_xprtrdma_wc_li(wc, &mr->mr_cid);
        frwr_mr_done(wc, mr);
 
        rpcrdma_flush_disconnect(cq->cq_context, wc);
@@ -483,14 +476,12 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-       struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid);
+       trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid);
        frwr_mr_done(wc, mr);
-       complete(&frwr->fr_linv_done);
+       complete(&mr->mr_linv_done);
 
        rpcrdma_flush_disconnect(cq->cq_context, wc);
 }
@@ -511,7 +502,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
        struct ib_send_wr *first, **prev, *last;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
        const struct ib_send_wr *bad_wr;
-       struct rpcrdma_frwr *frwr;
        struct rpcrdma_mr *mr;
        int rc;
 
@@ -520,35 +510,34 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * Chain the LOCAL_INV Work Requests and post them with
         * a single ib_post_send() call.
         */
-       frwr = NULL;
        prev = &first;
        while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
 
                trace_xprtrdma_mr_localinv(mr);
                r_xprt->rx_stats.local_inv_needed++;
 
-               frwr = &mr->frwr;
-               frwr->fr_cqe.done = frwr_wc_localinv;
-               frwr_cid_init(ep, frwr);
-               last = &frwr->fr_invwr;
+               last = &mr->mr_invwr;
                last->next = NULL;
-               last->wr_cqe = &frwr->fr_cqe;
+               last->wr_cqe = &mr->mr_cqe;
                last->sg_list = NULL;
                last->num_sge = 0;
                last->opcode = IB_WR_LOCAL_INV;
                last->send_flags = IB_SEND_SIGNALED;
                last->ex.invalidate_rkey = mr->mr_handle;
 
+               last->wr_cqe->done = frwr_wc_localinv;
+
                *prev = last;
                prev = &last->next;
        }
+       mr = container_of(last, struct rpcrdma_mr, mr_invwr);
 
        /* Strong send queue ordering guarantees that when the
         * last WR in the chain completes, all WRs in the chain
         * are complete.
         */
-       frwr->fr_cqe.done = frwr_wc_localinv_wake;
-       reinit_completion(&frwr->fr_linv_done);
+       last->wr_cqe->done = frwr_wc_localinv_wake;
+       reinit_completion(&mr->mr_linv_done);
 
        /* Transport disconnect drains the receive CQ before it
         * replaces the QP. The RPC reply handler won't call us
@@ -562,22 +551,12 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * not happen, so don't wait in that case.
         */
        if (bad_wr != first)
-               wait_for_completion(&frwr->fr_linv_done);
+               wait_for_completion(&mr->mr_linv_done);
        if (!rc)
                return;
 
-       /* Recycle MRs in the LOCAL_INV chain that did not get posted.
-        */
+       /* On error, the MRs get destroyed once the QP has drained. */
        trace_xprtrdma_post_linv_err(req, rc);
-       while (bad_wr) {
-               frwr = container_of(bad_wr, struct rpcrdma_frwr,
-                                   fr_invwr);
-               mr = container_of(frwr, struct rpcrdma_mr, frwr);
-               bad_wr = bad_wr->next;
-
-               list_del_init(&mr->mr_list);
-               frwr_mr_recycle(mr);
-       }
 }
 
 /**
@@ -589,20 +568,24 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
-       struct rpcrdma_frwr *frwr =
-               container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-       struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
-       struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
+       struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
+       struct rpcrdma_rep *rep;
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
-       trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid);
-       frwr_mr_done(wc, mr);
+       trace_xprtrdma_wc_li_done(wc, &mr->mr_cid);
 
-       /* Ensure @rep is generated before frwr_mr_done */
+       /* Ensure that @rep is generated before the MR is released */
+       rep = mr->mr_req->rl_reply;
        smp_rmb();
-       rpcrdma_complete_rqst(rep);
 
-       rpcrdma_flush_disconnect(cq->cq_context, wc);
+       if (wc->status != IB_WC_SUCCESS) {
+               if (rep)
+                       rpcrdma_unpin_rqst(rep);
+               rpcrdma_flush_disconnect(cq->cq_context, wc);
+               return;
+       }
+       frwr_mr_put(mr);
+       rpcrdma_complete_rqst(rep);
 }
 
 /**
@@ -619,33 +602,29 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
        struct ib_send_wr *first, *last, **prev;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       const struct ib_send_wr *bad_wr;
-       struct rpcrdma_frwr *frwr;
        struct rpcrdma_mr *mr;
        int rc;
 
        /* Chain the LOCAL_INV Work Requests and post them with
         * a single ib_post_send() call.
         */
-       frwr = NULL;
        prev = &first;
        while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
 
                trace_xprtrdma_mr_localinv(mr);
                r_xprt->rx_stats.local_inv_needed++;
 
-               frwr = &mr->frwr;
-               frwr->fr_cqe.done = frwr_wc_localinv;
-               frwr_cid_init(ep, frwr);
-               last = &frwr->fr_invwr;
+               last = &mr->mr_invwr;
                last->next = NULL;
-               last->wr_cqe = &frwr->fr_cqe;
+               last->wr_cqe = &mr->mr_cqe;
                last->sg_list = NULL;
                last->num_sge = 0;
                last->opcode = IB_WR_LOCAL_INV;
                last->send_flags = IB_SEND_SIGNALED;
                last->ex.invalidate_rkey = mr->mr_handle;
 
+               last->wr_cqe->done = frwr_wc_localinv;
+
                *prev = last;
                prev = &last->next;
        }
@@ -655,31 +634,23 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * are complete. The last completion will wake up the
         * RPC waiter.
         */
-       frwr->fr_cqe.done = frwr_wc_localinv_done;
+       last->wr_cqe->done = frwr_wc_localinv_done;
 
        /* Transport disconnect drains the receive CQ before it
         * replaces the QP. The RPC reply handler won't call us
         * unless re_id->qp is a valid pointer.
         */
-       bad_wr = NULL;
-       rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
+       rc = ib_post_send(ep->re_id->qp, first, NULL);
        if (!rc)
                return;
 
-       /* Recycle MRs in the LOCAL_INV chain that did not get posted.
-        */
+       /* On error, the MRs get destroyed once the QP has drained. */
        trace_xprtrdma_post_linv_err(req, rc);
-       while (bad_wr) {
-               frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
-               mr = container_of(frwr, struct rpcrdma_mr, frwr);
-               bad_wr = bad_wr->next;
-
-               frwr_mr_recycle(mr);
-       }
 
        /* The final LOCAL_INV WR in the chain is supposed to
-        * do the wake. If it was never posted, the wake will
-        * not happen, so wake here in that case.
+        * do the wake. If it was never posted, the wake does
+        * not happen. Unpin the rqst in preparation for its
+        * retransmission.
         */
-       rpcrdma_complete_rqst(req->rl_reply);
+       rpcrdma_unpin_rqst(req->rl_reply);
 }
index 292f066..649f7d8 100644 (file)
@@ -1326,9 +1326,35 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
        return -EIO;
 }
 
-/* Perform XID lookup, reconstruction of the RPC reply, and
- * RPC completion while holding the transport lock to ensure
- * the rep, rqst, and rq_task pointers remain stable.
+/**
+ * rpcrdma_unpin_rqst - Release rqst without completing it
+ * @rep: RPC/RDMA Receive context
+ *
+ * This is done when a connection is lost so that a Reply
+ * can be dropped and its matching Call can be subsequently
+ * retransmitted on a new connection.
+ */
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
+{
+       struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
+       struct rpc_rqst *rqst = rep->rr_rqst;
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+       req->rl_reply = NULL;
+       rep->rr_rqst = NULL;
+
+       spin_lock(&xprt->queue_lock);
+       xprt_unpin_rqst(rqst);
+       spin_unlock(&xprt->queue_lock);
+}
+
+/**
+ * rpcrdma_complete_rqst - Pass completed rqst back to RPC
+ * @rep: RPC/RDMA Receive context
+ *
+ * Reconstruct the RPC reply and complete the transaction
+ * while @rqst is still pinned to ensure the rep, rqst, and
+ * rq_task pointers remain stable.
  */
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 {
@@ -1430,13 +1456,14 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                credits = 1;    /* don't deadlock */
        else if (credits > r_xprt->rx_ep->re_max_requests)
                credits = r_xprt->rx_ep->re_max_requests;
+       rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
+                          false);
        if (buf->rb_credits != credits)
                rpcrdma_update_cwnd(r_xprt, credits);
-       rpcrdma_post_recvs(r_xprt, false);
 
        req = rpcr_to_rdmar(rqst);
        if (unlikely(req->rl_reply))
-               rpcrdma_recv_buffer_put(req->rl_reply);
+               rpcrdma_rep_put(buf, req->rl_reply);
        req->rl_reply = rep;
        rep->rr_rqst = rqst;
 
@@ -1464,5 +1491,5 @@ out_shortreply:
        trace_xprtrdma_reply_short_err(rep);
 
 out:
-       rpcrdma_recv_buffer_put(rep);
+       rpcrdma_rep_put(buf, rep);
 }
index 056452c..d6bbafb 100644 (file)
@@ -921,42 +921,48 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
        __be32 *rdma_argp = rctxt->rc_recv_buf;
        struct svc_rdma_send_ctxt *sctxt;
+       unsigned int rc_size;
        __be32 *p;
        int ret;
 
        ret = -ENOTCONN;
        if (svc_xprt_is_dead(xprt))
-               goto err0;
+               goto drop_connection;
 
        ret = -ENOMEM;
        sctxt = svc_rdma_send_ctxt_get(rdma);
        if (!sctxt)
-               goto err0;
+               goto drop_connection;
 
+       ret = -EMSGSIZE;
        p = xdr_reserve_space(&sctxt->sc_stream,
                              rpcrdma_fixed_maxsz * sizeof(*p));
        if (!p)
-               goto err0;
+               goto put_ctxt;
 
        ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
        if (ret < 0)
-               goto err2;
+               goto reply_chunk;
+       rc_size = ret;
 
        *p++ = *rdma_argp;
        *p++ = *(rdma_argp + 1);
        *p++ = rdma->sc_fc_credits;
        *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg;
 
-       if (svc_rdma_encode_read_list(sctxt) < 0)
-               goto err0;
-       if (svc_rdma_encode_write_list(rctxt, sctxt) < 0)
-               goto err0;
-       if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
-               goto err0;
+       ret = svc_rdma_encode_read_list(sctxt);
+       if (ret < 0)
+               goto put_ctxt;
+       ret = svc_rdma_encode_write_list(rctxt, sctxt);
+       if (ret < 0)
+               goto put_ctxt;
+       ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size);
+       if (ret < 0)
+               goto put_ctxt;
 
        ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
        if (ret < 0)
-               goto err1;
+               goto put_ctxt;
 
        /* Prevent svc_xprt_release() from releasing the page backing
         * rq_res.head[0].iov_base. It's no longer being accessed by
@@ -964,16 +970,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        rqstp->rq_respages++;
        return 0;
 
- err2:
+reply_chunk:
        if (ret != -E2BIG && ret != -EINVAL)
-               goto err1;
+               goto put_ctxt;
 
        svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
        return 0;
 
- err1:
+put_ctxt:
        svc_rdma_send_ctxt_put(rdma, sctxt);
- err0:
+drop_connection:
        trace_svcrdma_send_err(rqstp, ret);
        svc_xprt_deferred_close(&rdma->sc_xprt);
        return -ENOTCONN;
index 78d29d1..0995359 100644 (file)
@@ -262,8 +262,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
  * xprt_rdma_inject_disconnect - inject a connection fault
  * @xprt: transport context
  *
- * If @xprt is connected, disconnect it to simulate spurious connection
- * loss.
+ * If @xprt is connected, disconnect it to simulate spurious
+ * connection loss. Caller must hold @xprt's send lock to
+ * ensure that data structures and hardware resources are
+ * stable during the rdma_disconnect() call.
  */
 static void
 xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
index ec912cf..1e965a3 100644 (file)
@@ -101,6 +101,12 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct rdma_cm_id *id = ep->re_id;
 
+       /* Wait for rpcrdma_post_recvs() to leave its critical
+        * section.
+        */
+       if (atomic_inc_return(&ep->re_receiving) > 1)
+               wait_for_completion(&ep->re_done);
+
        /* Flush Receives, then wait for deferred Reply work
         * to complete.
         */
@@ -114,22 +120,6 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
        rpcrdma_ep_put(ep);
 }
 
-/**
- * rpcrdma_qp_event_handler - Handle one QP event (error notification)
- * @event: details of the event
- * @context: ep that owns QP where event occurred
- *
- * Called from the RDMA provider (device driver) possibly in an interrupt
- * context. The QP is always destroyed before the ID, so the ID will be
- * reliably available when this handler is invoked.
- */
-static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
-{
-       struct rpcrdma_ep *ep = context;
-
-       trace_xprtrdma_qp_event(ep, event);
-}
-
 /* Ensure xprt_force_disconnect() is invoked exactly once when a
  * connection is closed or lost. (The important thing is it needs
  * to be invoked "at least" once).
@@ -205,7 +195,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 
 out_flushed:
        rpcrdma_flush_disconnect(r_xprt, wc);
-       rpcrdma_rep_destroy(rep);
+       rpcrdma_rep_put(&r_xprt->rx_buf, rep);
 }
 
 static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
@@ -414,6 +404,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
        __module_get(THIS_MODULE);
        device = id->device;
        ep->re_id = id;
+       reinit_completion(&ep->re_done);
 
        ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
        ep->re_inline_send = xprt_rdma_max_inline_write;
@@ -424,8 +415,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 
        r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
 
-       ep->re_attr.event_handler = rpcrdma_qp_event_handler;
-       ep->re_attr.qp_context = ep;
        ep->re_attr.srq = NULL;
        ep->re_attr.cap.max_inline_data = 0;
        ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -535,7 +524,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
         * outstanding Receives.
         */
        rpcrdma_ep_get(ep);
-       rpcrdma_post_recvs(r_xprt, true);
+       rpcrdma_post_recvs(r_xprt, 1, true);
 
        rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
        if (rc)
@@ -954,13 +943,11 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
                rpcrdma_req_reset(req);
 }
 
-/* No locking needed here. This function is called only by the
- * Receive completion handler.
- */
 static noinline
 struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
                                       bool temp)
 {
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct rpcrdma_rep *rep;
 
        rep = kzalloc(sizeof(*rep), GFP_KERNEL);
@@ -987,7 +974,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
        rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
        rep->rr_recv_wr.num_sge = 1;
        rep->rr_temp = temp;
-       list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
+
+       spin_lock(&buf->rb_lock);
+       list_add(&rep->rr_all, &buf->rb_all_reps);
+       spin_unlock(&buf->rb_lock);
        return rep;
 
 out_free_regbuf:
@@ -998,16 +988,23 @@ out:
        return NULL;
 }
 
-/* No locking needed here. This function is invoked only by the
- * Receive completion handler, or during transport shutdown.
- */
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+static void rpcrdma_rep_free(struct rpcrdma_rep *rep)
 {
-       list_del(&rep->rr_all);
        rpcrdma_regbuf_free(rep->rr_rdmabuf);
        kfree(rep);
 }
 
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+       struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf;
+
+       spin_lock(&buf->rb_lock);
+       list_del(&rep->rr_all);
+       spin_unlock(&buf->rb_lock);
+
+       rpcrdma_rep_free(rep);
+}
+
 static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
 {
        struct llist_node *node;
@@ -1019,12 +1016,21 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
        return llist_entry(node, struct rpcrdma_rep, rr_node);
 }
 
-static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
-                           struct rpcrdma_rep *rep)
+/**
+ * rpcrdma_rep_put - Release rpcrdma_rep back to free list
+ * @buf: buffer pool
+ * @rep: rep to release
+ *
+ */
+void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep)
 {
        llist_add(&rep->rr_node, &buf->rb_free_reps);
 }
 
+/* Caller must ensure the QP is quiescent (RQ is drained) before
+ * invoking this function, to guarantee rb_all_reps is not
+ * changing.
+ */
 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1032,7 +1038,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
 
        list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
                rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
-               rep->rr_temp = true;
+               rep->rr_temp = true;    /* Mark this rep for destruction */
        }
 }
 
@@ -1040,8 +1046,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
 {
        struct rpcrdma_rep *rep;
 
-       while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
-               rpcrdma_rep_destroy(rep);
+       spin_lock(&buf->rb_lock);
+       while ((rep = list_first_entry_or_null(&buf->rb_all_reps,
+                                              struct rpcrdma_rep,
+                                              rr_all)) != NULL) {
+               list_del(&rep->rr_all);
+               spin_unlock(&buf->rb_lock);
+
+               rpcrdma_rep_free(rep);
+
+               spin_lock(&buf->rb_lock);
+       }
+       spin_unlock(&buf->rb_lock);
 }
 
 /**
@@ -1104,7 +1120,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)
                list_del(&mr->mr_all);
                spin_unlock(&buf->rb_lock);
 
-               frwr_release_mr(mr);
+               frwr_mr_release(mr);
        }
 
        rpcrdma_regbuf_free(req->rl_recvbuf);
@@ -1135,7 +1151,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
                list_del(&mr->mr_all);
                spin_unlock(&buf->rb_lock);
 
-               frwr_release_mr(mr);
+               frwr_mr_release(mr);
 
                spin_lock(&buf->rb_lock);
        }
@@ -1221,17 +1237,6 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
        spin_unlock(&buffers->rb_lock);
 }
 
-/**
- * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
- * @rep: rep to release
- *
- * Used after error conditions.
- */
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
-{
-       rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
-}
-
 /* Returns a pointer to a rpcrdma_regbuf object, or NULL.
  *
  * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
@@ -1342,21 +1347,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
  */
 int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-       struct ib_send_wr *send_wr = &req->rl_wr;
-       struct rpcrdma_ep *ep = r_xprt->rx_ep;
-       int rc;
-
-       if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
-               send_wr->send_flags |= IB_SEND_SIGNALED;
-               ep->re_send_count = ep->re_send_batch;
-       } else {
-               send_wr->send_flags &= ~IB_SEND_SIGNALED;
-               --ep->re_send_count;
-       }
-
-       trace_xprtrdma_post_send(req);
-       rc = frwr_send(r_xprt, req);
-       if (rc)
+       if (frwr_send(r_xprt, req))
                return -ENOTCONN;
        return 0;
 }
@@ -1364,27 +1355,30 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 /**
  * rpcrdma_post_recvs - Refill the Receive Queue
  * @r_xprt: controlling transport instance
- * @temp: mark Receive buffers to be deleted after use
+ * @needed: current credit grant
+ * @temp: mark Receive buffers to be deleted after one use
  *
  */
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
        struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct ib_recv_wr *wr, *bad_wr;
        struct rpcrdma_rep *rep;
-       int needed, count, rc;
+       int count, rc;
 
        rc = 0;
        count = 0;
 
-       needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
        if (likely(ep->re_receive_count > needed))
                goto out;
        needed -= ep->re_receive_count;
        if (!temp)
                needed += RPCRDMA_MAX_RECV_BATCH;
 
+       if (atomic_inc_return(&ep->re_receiving) > 1)
+               goto out;
+
        /* fast path: all needed reps can be found on the free list */
        wr = NULL;
        while (needed) {
@@ -1410,6 +1404,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 
        rc = ib_post_recv(ep->re_id->qp, wr,
                          (const struct ib_recv_wr **)&bad_wr);
+       if (atomic_dec_return(&ep->re_receiving) > 0)
+               complete(&ep->re_done);
+
 out:
        trace_xprtrdma_post_recvs(r_xprt, count, rc);
        if (rc) {
@@ -1418,7 +1415,7 @@ out:
 
                        rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
                        wr = wr->next;
-                       rpcrdma_recv_buffer_put(rep);
+                       rpcrdma_rep_put(buf, rep);
                        --count;
                }
        }
index fe3be98..436ad73 100644 (file)
@@ -83,6 +83,7 @@ struct rpcrdma_ep {
        unsigned int            re_max_inline_recv;
        int                     re_async_rc;
        int                     re_connect_status;
+       atomic_t                re_receiving;
        atomic_t                re_force_disconnect;
        struct ib_qp_init_attr  re_attr;
        wait_queue_head_t       re_connect_wait;
@@ -228,31 +229,28 @@ struct rpcrdma_sendctx {
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  */
-struct rpcrdma_frwr {
-       struct ib_mr                    *fr_mr;
-       struct ib_cqe                   fr_cqe;
-       struct rpc_rdma_cid             fr_cid;
-       struct completion               fr_linv_done;
-       union {
-               struct ib_reg_wr        fr_regwr;
-               struct ib_send_wr       fr_invwr;
-       };
-};
-
 struct rpcrdma_req;
 struct rpcrdma_mr {
        struct list_head        mr_list;
        struct rpcrdma_req      *mr_req;
+
+       struct ib_mr            *mr_ibmr;
        struct ib_device        *mr_device;
        struct scatterlist      *mr_sg;
        int                     mr_nents;
        enum dma_data_direction mr_dir;
-       struct rpcrdma_frwr     frwr;
+       struct ib_cqe           mr_cqe;
+       struct completion       mr_linv_done;
+       union {
+               struct ib_reg_wr        mr_regwr;
+               struct ib_send_wr       mr_invwr;
+       };
        struct rpcrdma_xprt     *mr_xprt;
        u32                     mr_handle;
        u32                     mr_length;
        u64                     mr_offset;
        struct list_head        mr_all;
+       struct rpc_rdma_cid     mr_cid;
 };
 
 /*
@@ -461,7 +459,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
 
 int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
-void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
 
 /*
  * Buffer calls - xprtrdma/verbs.c
@@ -480,7 +478,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
                        struct rpcrdma_req *req);
-void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep);
 
 bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
                            gfp_t flags);
@@ -527,7 +525,7 @@ rpcrdma_data_dir(bool writing)
 void frwr_reset(struct rpcrdma_req *req);
 int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
 int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
-void frwr_release_mr(struct rpcrdma_mr *mr);
+void frwr_mr_release(struct rpcrdma_mr *mr);
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
                                struct rpcrdma_mr_seg *seg,
                                int nsegs, bool writing, __be32 xid,
@@ -560,6 +558,7 @@ int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
 void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
+void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
 
 static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
index e35760f..47aa47a 100644 (file)
@@ -558,6 +558,10 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
        struct rpc_rqst *req;
        ssize_t ret;
 
+       /* Is this transport associated with the backchannel? */
+       if (!xprt->bc_serv)
+               return -ESHUTDOWN;
+
        /* Look up and lock the request corresponding to the given XID */
        req = xprt_lookup_bc_request(xprt, transport->recv.xid);
        if (!req) {
@@ -1018,6 +1022,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
         * to cope with writespace callbacks arriving _after_ we have
         * called sendmsg(). */
        req->rq_xtime = ktime_get();
+       tcp_sock_set_cork(transport->inet, true);
        while (1) {
                status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
                                           transport->xmit.offset, rm, &sent);
@@ -1032,6 +1037,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
                if (likely(req->rq_bytes_sent >= msglen)) {
                        req->rq_xmit_bytes_sent += transport->xmit.offset;
                        transport->xmit.offset = 0;
+                       if (atomic_long_read(&xprt->xmit_queuelen) == 1)
+                               tcp_sock_set_cork(transport->inet, false);
                        return 0;
                }
 
@@ -2163,6 +2170,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                }
 
                xs_tcp_set_socket_timeouts(xprt, sock);
+               tcp_sock_set_nodelay(sk);
 
                write_lock_bh(&sk->sk_callback_lock);
 
@@ -2177,7 +2185,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
                /* socket options */
                sock_reset_flag(sk, SOCK_LINGER);
-               tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
                xprt_clear_connected(xprt);
 
index 1c9ecb1..c99bc4c 100644 (file)
@@ -944,8 +944,6 @@ static int vmci_transport_recv_listen(struct sock *sk,
        bool old_request = false;
        bool old_pkt_proto = false;
 
-       err = 0;
-
        /* Because we are in the listen state, we could be receiving a packet
         * for ourself or any previous connection requests that we received.
         * If it's the latter, we try to find a socket in our list of pending
index 2ac3802..9d2a89d 100644 (file)
@@ -128,13 +128,12 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
                                            struct xdp_desc *desc)
 {
-       u64 chunk, chunk_end;
+       u64 chunk;
 
-       chunk = xp_aligned_extract_addr(pool, desc->addr);
-       chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len);
-       if (chunk != chunk_end)
+       if (desc->len > pool->chunk_size)
                return false;
 
+       chunk = xp_aligned_extract_addr(pool, desc->addr);
        if (chunk >= pool->addrs_cnt)
                return false;
 
index e76cdfc..b5a1a7a 100644 (file)
@@ -124,6 +124,13 @@ config SAMPLE_HIDRAW
        bool "hidraw sample"
        depends on CC_CAN_LINK && HEADERS_INSTALL
 
+config SAMPLE_LANDLOCK
+       bool "Landlock example"
+       depends on CC_CAN_LINK && HEADERS_INSTALL
+       help
+         Build a simple Landlock sandbox manager able to start a process
+         restricted by a user-defined filesystem access control policy.
+
 config SAMPLE_PIDFD
        bool "pidfd sample"
        depends on CC_CAN_LINK && HEADERS_INSTALL
index c3392a5..087e098 100644 (file)
@@ -11,6 +11,7 @@ obj-$(CONFIG_SAMPLE_KDB)              += kdb/
 obj-$(CONFIG_SAMPLE_KFIFO)             += kfifo/
 obj-$(CONFIG_SAMPLE_KOBJECT)           += kobject/
 obj-$(CONFIG_SAMPLE_KPROBES)           += kprobes/
+subdir-$(CONFIG_SAMPLE_LANDLOCK)       += landlock
 obj-$(CONFIG_SAMPLE_LIVEPATCH)         += livepatch/
 subdir-$(CONFIG_SAMPLE_PIDFD)          += pidfd
 obj-$(CONFIG_SAMPLE_QMI_CLIENT)                += qmi/
index f9008be..37a657b 100644 (file)
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * vim: noexpandtab ts=8 sts=0 sw=8:
- *
  * configfs_example_macros.c - This file is a demonstration module
  *      containing a number of configfs subsystems.  It uses the helper
  *      macros defined by configfs.h
index 331dcf1..c495664 100644 (file)
@@ -47,6 +47,10 @@ static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
        pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n",
                p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr);
 #endif
+#ifdef CONFIG_RISCV
+       pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, status = 0x%lx\n",
+               p->symbol_name, p->addr, regs->epc, regs->status);
+#endif
 #ifdef CONFIG_S390
        pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
                p->symbol_name, p->addr, regs->psw.addr, regs->flags);
@@ -80,6 +84,10 @@ static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
        pr_info("<%s> post_handler: p->addr = 0x%p, cpsr = 0x%lx\n",
                p->symbol_name, p->addr, (long)regs->ARM_cpsr);
 #endif
+#ifdef CONFIG_RISCV
+       pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
+               p->symbol_name, p->addr, regs->status);
+#endif
 #ifdef CONFIG_S390
        pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
                p->symbol_name, p->addr, regs->flags);
diff --git a/samples/landlock/.gitignore b/samples/landlock/.gitignore
new file mode 100644 (file)
index 0000000..f43668b
--- /dev/null
@@ -0,0 +1 @@
+/sandboxer
diff --git a/samples/landlock/Makefile b/samples/landlock/Makefile
new file mode 100644 (file)
index 0000000..5d601e5
--- /dev/null
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: BSD-3-Clause
+
+userprogs-always-y := sandboxer
+
+userccflags += -I usr/include
+
+.PHONY: all clean
+
+all:
+       $(MAKE) -C ../.. samples/landlock/
+
+clean:
+       $(MAKE) -C ../.. M=samples/landlock/ clean
diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
new file mode 100644 (file)
index 0000000..7a15910
--- /dev/null
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * Simple Landlock sandbox manager able to launch a process restricted by a
+ * user-defined filesystem access control policy.
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2020 ANSSI
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/landlock.h>
+#include <linux/prctl.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#ifndef landlock_create_ruleset
+/*
+ * Thin wrapper that issues the landlock_create_ruleset system call through
+ * syscall(), forwarding @attr, @size and @flags unchanged.  It is compiled
+ * only when no macro named landlock_create_ruleset is already defined.
+ *
+ * Returns the syscall() result converted to int; main() uses it as a file
+ * descriptor and treats a negative value as failure.
+ */
+static inline int landlock_create_ruleset(
+               const struct landlock_ruleset_attr *const attr,
+               const size_t size, const __u32 flags)
+{
+       return syscall(__NR_landlock_create_ruleset, attr, size, flags);
+}
+#endif
+
+#ifndef landlock_add_rule
+/*
+ * Thin wrapper that issues the landlock_add_rule system call through
+ * syscall(), forwarding all four arguments unchanged.  @rule_attr is passed
+ * as an untyped pointer.  It is compiled only when no macro named
+ * landlock_add_rule is already defined.
+ *
+ * Returns the syscall() result converted to int; the caller in this file
+ * treats any non-zero value as failure.
+ */
+static inline int landlock_add_rule(const int ruleset_fd,
+               const enum landlock_rule_type rule_type,
+               const void *const rule_attr, const __u32 flags)
+{
+       return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type,
+                       rule_attr, flags);
+}
+#endif
+
+#ifndef landlock_restrict_self
+/*
+ * Thin wrapper that issues the landlock_restrict_self system call through
+ * syscall(), forwarding @ruleset_fd and @flags unchanged.  It is compiled
+ * only when no macro named landlock_restrict_self is already defined.
+ *
+ * Returns the syscall() result converted to int; main() treats any non-zero
+ * value as failure.
+ */
+static inline int landlock_restrict_self(const int ruleset_fd,
+               const __u32 flags)
+{
+       return syscall(__NR_landlock_restrict_self, ruleset_fd, flags);
+}
+#endif
+
+#define ENV_FS_RO_NAME "LL_FS_RO"
+#define ENV_FS_RW_NAME "LL_FS_RW"
+#define ENV_PATH_TOKEN ":"
+
+/*
+ * Splits @env_path in place on ENV_PATH_TOKEN and stores the resulting
+ * pointers in a newly allocated array returned through @path_list.
+ *
+ * The entries point into @env_path, so that buffer must outlive the array.
+ * The caller owns the array and must free() it.
+ *
+ * Returns the number of entries (0 when @env_path is NULL), or -1 if the
+ * array cannot be allocated, in which case *@path_list is NULL.
+ */
+static int parse_path(char *env_path, const char ***const path_list)
+{
+       int i, num_paths = 0;
+
+       if (env_path) {
+               num_paths++;
+               for (i = 0; env_path[i]; i++) {
+                       if (env_path[i] == ENV_PATH_TOKEN[0])
+                               num_paths++;
+               }
+       }
+       *path_list = malloc(num_paths * sizeof(**path_list));
+       /* malloc(0) may legitimately return NULL: only fail if room is needed. */
+       if (!*path_list && num_paths > 0)
+               return -1;
+
+       for (i = 0; i < num_paths; i++)
+               (*path_list)[i] = strsep(&env_path, ENV_PATH_TOKEN);
+
+       return num_paths;
+}
+
+#define ACCESS_FILE ( \
+       LANDLOCK_ACCESS_FS_EXECUTE | \
+       LANDLOCK_ACCESS_FS_WRITE_FILE | \
+       LANDLOCK_ACCESS_FS_READ_FILE)
+
+/*
+ * Adds one LANDLOCK_RULE_PATH_BENEATH rule to @ruleset_fd for every path
+ * listed in the environment variable @env_var, granting @allowed_access
+ * (masked with ACCESS_FILE for paths that are not directories).
+ *
+ * The variable is removed from the environment once its value is copied.  A
+ * variable set to the empty string is accepted and adds no rule.
+ *
+ * Returns 0 on success, 1 on error (after printing a message to stderr).
+ */
+static int populate_ruleset(
+               const char *const env_var, const int ruleset_fd,
+               const __u64 allowed_access)
+{
+       int num_paths, i, ret = 1;
+       char *env_path_name;
+       const char **path_list = NULL;
+       struct landlock_path_beneath_attr path_beneath = {
+               .parent_fd = -1,
+       };
+
+       env_path_name = getenv(env_var);
+       if (!env_path_name) {
+               /* Prevents users from forgetting a setting. */
+               fprintf(stderr, "Missing environment variable %s\n", env_var);
+               return 1;
+       }
+       /* Work on a private copy: parse_path() modifies the string in place. */
+       env_path_name = strdup(env_path_name);
+       if (!env_path_name) {
+               fprintf(stderr, "Failed to duplicate the value of %s: %s\n",
+                               env_var, strerror(errno));
+               return 1;
+       }
+       unsetenv(env_var);
+       num_paths = parse_path(env_path_name, &path_list);
+       if (num_paths < 0) {
+               fprintf(stderr, "Failed to allocate memory\n");
+               goto out_free_name;
+       }
+       if (num_paths == 1 && path_list[0][0] == '\0') {
+               /*
+                * Allows not using all possible restrictions (e.g. using
+                * LL_FS_RO without LL_FS_RW).
+                */
+               ret = 0;
+               goto out_free_name;
+       }
+
+       for (i = 0; i < num_paths; i++) {
+               struct stat statbuf;
+
+               path_beneath.parent_fd = open(path_list[i], O_PATH |
+                               O_CLOEXEC);
+               if (path_beneath.parent_fd < 0) {
+                       fprintf(stderr, "Failed to open \"%s\": %s\n",
+                                       path_list[i],
+                                       strerror(errno));
+                       goto out_free_name;
+               }
+               if (fstat(path_beneath.parent_fd, &statbuf)) {
+                       /* Report before close(), which may overwrite errno. */
+                       fprintf(stderr, "Failed to stat \"%s\": %s\n",
+                                       path_list[i], strerror(errno));
+                       close(path_beneath.parent_fd);
+                       goto out_free_name;
+               }
+               path_beneath.allowed_access = allowed_access;
+               /* Only the ACCESS_FILE subset is requested for non-directories. */
+               if (!S_ISDIR(statbuf.st_mode))
+                       path_beneath.allowed_access &= ACCESS_FILE;
+               if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                                       &path_beneath, 0)) {
+                       fprintf(stderr, "Failed to update the ruleset with \"%s\": %s\n",
+                                       path_list[i], strerror(errno));
+                       close(path_beneath.parent_fd);
+                       goto out_free_name;
+               }
+               close(path_beneath.parent_fd);
+       }
+       ret = 0;
+
+out_free_name:
+       /* free(NULL) is a no-op, so this is safe when parse_path() failed. */
+       free(path_list);
+       free(env_path_name);
+       return ret;
+}
+
+#define ACCESS_FS_ROUGHLY_READ ( \
+       LANDLOCK_ACCESS_FS_EXECUTE | \
+       LANDLOCK_ACCESS_FS_READ_FILE | \
+       LANDLOCK_ACCESS_FS_READ_DIR)
+
+#define ACCESS_FS_ROUGHLY_WRITE ( \
+       LANDLOCK_ACCESS_FS_WRITE_FILE | \
+       LANDLOCK_ACCESS_FS_REMOVE_DIR | \
+       LANDLOCK_ACCESS_FS_REMOVE_FILE | \
+       LANDLOCK_ACCESS_FS_MAKE_CHAR | \
+       LANDLOCK_ACCESS_FS_MAKE_DIR | \
+       LANDLOCK_ACCESS_FS_MAKE_REG | \
+       LANDLOCK_ACCESS_FS_MAKE_SOCK | \
+       LANDLOCK_ACCESS_FS_MAKE_FIFO | \
+       LANDLOCK_ACCESS_FS_MAKE_BLOCK | \
+       LANDLOCK_ACCESS_FS_MAKE_SYM)
+
+/*
+ * Entry point: runs the command given in argv[1] (with the remaining
+ * arguments) under a Landlock ruleset built from the LL_FS_RO and LL_FS_RW
+ * environment variables.
+ *
+ * Prints a usage message when no command is given.  Returns 1 on every
+ * failure path, including a failed exec; on success execvpe() does not return.
+ */
+int main(const int argc, char *const argv[], char *const *const envp)
+{
+       const char *cmd_path;
+       char *const *cmd_argv;
+       int ruleset_fd;
+       /* The ruleset handles every access right this sample knows about. */
+       struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = ACCESS_FS_ROUGHLY_READ |
+                       ACCESS_FS_ROUGHLY_WRITE,
+       };
+
+       if (argc < 2) {
+               fprintf(stderr, "usage: %s=\"...\" %s=\"...\" %s <cmd> [args]...\n\n",
+                               ENV_FS_RO_NAME, ENV_FS_RW_NAME, argv[0]);
+               fprintf(stderr, "Launch a command in a restricted environment.\n\n");
+               fprintf(stderr, "Environment variables containing paths, "
+                               "each separated by a colon:\n");
+               fprintf(stderr, "* %s: list of paths allowed to be used in a read-only way.\n",
+                               ENV_FS_RO_NAME);
+               fprintf(stderr, "* %s: list of paths allowed to be used in a read-write way.\n",
+                               ENV_FS_RW_NAME);
+               fprintf(stderr, "\nexample:\n"
+                               "%s=\"/bin:/lib:/usr:/proc:/etc:/dev/urandom\" "
+                               "%s=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
+                               "%s bash -i\n",
+                               ENV_FS_RO_NAME, ENV_FS_RW_NAME, argv[0]);
+               return 1;
+       }
+
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+       if (ruleset_fd < 0) {
+               /* Saved first: perror() and fprintf() may overwrite errno. */
+               const int err = errno;
+
+               perror("Failed to create a ruleset");
+               switch (err) {
+               case ENOSYS:
+                       fprintf(stderr, "Hint: Landlock is not supported by the current kernel. "
+                                       "To support it, build the kernel with "
+                                       "CONFIG_SECURITY_LANDLOCK=y and prepend "
+                                       "\"landlock,\" to the content of CONFIG_LSM.\n");
+                       break;
+               case EOPNOTSUPP:
+                       fprintf(stderr, "Hint: Landlock is currently disabled. "
+                                       "It can be enabled in the kernel configuration by "
+                                       "prepending \"landlock,\" to the content of CONFIG_LSM, "
+                                       "or at boot time by setting the same content to the "
+                                       "\"lsm\" kernel parameter.\n");
+                       break;
+               }
+               return 1;
+       }
+       /* Read-only paths get the read rights only. */
+       if (populate_ruleset(ENV_FS_RO_NAME, ruleset_fd,
+                               ACCESS_FS_ROUGHLY_READ)) {
+               goto err_close_ruleset;
+       }
+       /* Read-write paths get both the read and the write rights. */
+       if (populate_ruleset(ENV_FS_RW_NAME, ruleset_fd,
+                               ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_WRITE)) {
+               goto err_close_ruleset;
+       }
+       /*
+        * NOTE(review): presumably no_new_privs must be set before
+        * landlock_restrict_self() for an unprivileged caller -- confirm
+        * against the Landlock documentation before reordering.
+        */
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+               perror("Failed to restrict privileges");
+               goto err_close_ruleset;
+       }
+       if (landlock_restrict_self(ruleset_fd, 0)) {
+               perror("Failed to enforce ruleset");
+               goto err_close_ruleset;
+       }
+       /* The ruleset file descriptor is not used past this point. */
+       close(ruleset_fd);
+
+       cmd_path = argv[1];
+       cmd_argv = argv + 1;
+       execvpe(cmd_path, cmd_argv, envp);
+       /* Only reached when execvpe() failed. */
+       fprintf(stderr, "Failed to execute \"%s\": %s\n", cmd_path,
+                       strerror(errno));
+       fprintf(stderr, "Hint: access to the binary, the interpreter or "
+                       "shared libraries may be denied.\n");
+       return 1;
+
+err_close_ruleset:
+       close(ruleset_fd);
+       return 1;
+}
index 861c769..881ef9a 100644 (file)
@@ -513,8 +513,6 @@ static int mbochs_create(struct mdev_device *mdev)
        struct device *dev = mdev_dev(mdev);
        struct mdev_state *mdev_state;
 
-       if (!type)
-               type = &mbochs_types[0];
        if (type->mbytes + mbochs_used_mbytes > max_mbytes)
                return -ENOMEM;
 
index f0c0e72..e889c1c 100644 (file)
@@ -667,8 +667,7 @@ static ssize_t description_show(struct mdev_type *mtype,
                &mdpy_types[mtype_get_type_group_id(mtype)];
 
        return sprintf(buf, "virtual display, %dx%d framebuffer\n",
-                      type ? type->width  : 0,
-                      type ? type->height : 0);
+                      type->width, type->height);
 }
 static MDEV_TYPE_ATTR_RO(description);
 
index ccb412a..23697a6 100755 (executable)
@@ -5829,7 +5829,7 @@ sub process {
                                next if ($arg =~ /\.\.\./);
                                next if ($arg =~ /^type$/i);
                                my $tmp_stmt = $define_stmt;
-                               $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
+                               $tmp_stmt =~ s/\b(__must_be_array|offsetof|sizeof|sizeof_field|__stringify|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
                                $tmp_stmt =~ s/\#+\s*$arg\b//g;
                                $tmp_stmt =~ s/\b$arg\s*\#\#//g;
                                my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g;
@@ -7006,7 +7006,7 @@ sub process {
                }
 
 # check for alloc argument mismatch
-               if ($line =~ /\b(kcalloc|kmalloc_array)\s*\(\s*sizeof\b/) {
+               if ($line =~ /\b((?:devm_)?(?:kcalloc|kmalloc_array))\s*\(\s*sizeof\b/) {
                        WARN("ALLOC_ARRAY_ARGS",
                             "$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr);
                }
@@ -7198,6 +7198,17 @@ sub process {
                             "Using $1 should generally have parentheses around the comparison\n" . $herecurr);
                }
 
+# return sysfs_emit(foo, fmt, ...) fmt without newline
+               if ($line =~ /\breturn\s+sysfs_emit\s*\(\s*$FuncArg\s*,\s*($String)/ &&
+                   substr($rawline, $-[6], $+[6] - $-[6]) !~ /\\n"$/) {
+                       my $offset = $+[6] - 1;
+                       if (WARN("SYSFS_EMIT",
+                                "return sysfs_emit(...) formats should include a terminating newline\n" . $herecurr) &&
+                           $fix) {
+                               substr($fixed[$fixlinenr], $offset, 0) = '\\n';
+                       }
+               }
+
 # nested likely/unlikely calls
                if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) {
                        WARN("LIKELY_MISUSE",
index 008e62f..15fc462 100644 (file)
@@ -16,6 +16,9 @@ import gdb
 from linux import tasks, utils
 
 
+task_type = utils.CachedType("struct task_struct")
+
+
 MAX_CPUS = 4096
 
 
@@ -156,6 +159,23 @@ Note that VAR has to be quoted as string."""
 
 PerCpu()
 
+def get_current_task(cpu):
+    """Return the current task (struct task_struct) as a gdb value.
+
+    On x86 the per-CPU variable current_task is read for CPU `cpu`.  On
+    aarch64 $SP_EL0 is evaluated instead and `cpu` is not used.  Any other
+    target architecture raises gdb.GdbError.
+    """
+    task_ptr_type = task_type.get_type().pointer()
+
+    if utils.is_target_arch("x86"):
+        current_task_var = gdb.parse_and_eval("&current_task")
+        return per_cpu(current_task_var, cpu).dereference()
+
+    if utils.is_target_arch("aarch64"):
+        sp_el0 = gdb.parse_and_eval("$SP_EL0")
+        # A clear top bit is rejected as a userspace (EL0) value.
+        if (sp_el0 >> 63) == 0:
+            raise gdb.GdbError("Sorry, obtaining the current task is not allowed "
+                               "while running in userspace(EL0)")
+        return sp_el0.cast(task_ptr_type).dereference()
+
+    raise gdb.GdbError("Sorry, obtaining the current task is not yet "
+                       "supported with this arch")
 
 class LxCurrentFunc(gdb.Function):
     """Return current task.
@@ -167,8 +187,7 @@ number. If CPU is omitted, the CPU of the current context is used."""
         super(LxCurrentFunc, self).__init__("lx_current")
 
     def invoke(self, cpu=-1):
-        var_ptr = gdb.parse_and_eval("&current_task")
-        return per_cpu(var_ptr, cpu).dereference()
+        return get_current_task(cpu)
 
 
 LxCurrentFunc()
index 1be9763..08d264a 100644 (file)
@@ -164,7 +164,8 @@ lx-symbols command."""
             saved_state['breakpoint'].enabled = saved_state['enabled']
 
     def invoke(self, arg, from_tty):
-        self.module_paths = [os.path.expanduser(p) for p in arg.split()]
+        self.module_paths = [os.path.abspath(os.path.expanduser(p))
+                             for p in arg.split()]
         self.module_paths.append(os.getcwd())
 
         # enforce update
index 2a85d34..4840e74 100755 (executable)
@@ -1777,6 +1777,7 @@ sub dump_function($$) {
     $prototype =~ s/^noinline +//;
     $prototype =~ s/__init +//;
     $prototype =~ s/__init_or_module +//;
+    $prototype =~ s/__deprecated +//;
     $prototype =~ s/__flatten +//;
     $prototype =~ s/__meminit +//;
     $prototype =~ s/__must_check +//;
index 936198a..221aa7d 100755 (executable)
@@ -125,6 +125,14 @@ case "${ARCH}" in
                        fi
                done
                ;;
+       riscv)
+               for i in Image.bz2 Image.gz Image; do
+                       if [ -f "${objtree}/arch/riscv/boot/${i}" ] ; then
+                               cp -v -- "${objtree}/arch/riscv/boot/${i}" "${tmpdir}/boot/vmlinux-${KERNELRELEASE}"
+                               break
+                       fi
+               done
+               ;;
        *)
                [ -f "${KBUILD_IMAGE}" ] && cp -v -- "${KBUILD_IMAGE}" "${tmpdir}/boot/vmlinux-kbuild-${KERNELRELEASE}"
                echo "" >&2
index 0a7fc95..c17e480 100755 (executable)
@@ -266,9 +266,9 @@ if ($arch eq "x86_64") {
     # force flags for this arch
     $ld .= " -m shlelf_linux";
     if ($endian eq "big") {
-        $objcopy .= " -O elf32-shbig-linux";
+       $objcopy .= " -O elf32-shbig-linux";
     } else {
-        $objcopy .= " -O elf32-sh-linux";
+       $objcopy .= " -O elf32-sh-linux";
     }
 
 } elsif ($arch eq "powerpc") {
@@ -289,12 +289,12 @@ if ($arch eq "x86_64") {
            $ldemulation = "lppc"
     }
     if ($bits == 64) {
-        $type = ".quad";
-        $cc .= " -m64 ";
-        $ld .= " -m elf64".$ldemulation." ";
+       $type = ".quad";
+       $cc .= " -m64 ";
+       $ld .= " -m elf64".$ldemulation." ";
     } else {
-        $cc .= " -m32 ";
-        $ld .= " -m elf32".$ldemulation." ";
+       $cc .= " -m32 ";
+       $ld .= " -m elf32".$ldemulation." ";
     }
 
 } elsif ($arch eq "arm") {
@@ -313,7 +313,7 @@ if ($arch eq "x86_64") {
     $type = "data8";
 
     if ($is_module eq "0") {
-        $cc .= " -mconstant-gp";
+       $cc .= " -mconstant-gp";
     }
 } elsif ($arch eq "sparc64") {
     # In the objdump output there are giblets like:
@@ -392,7 +392,7 @@ if ($arch eq "x86_64") {
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
 } elsif ($arch eq "riscv") {
     $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:";
-    $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL\\s_mcount\$";
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_?mcount\$";
     $type = ".quad";
     $alignment = 2;
 } elsif ($arch eq "nds32") {
@@ -530,10 +530,10 @@ while (<IN>) {
        $read_function = defined($text_sections{$1});
        if (!$read_function) {
            foreach my $prefix (keys %text_section_prefixes) {
-               if (substr($1, 0, length $prefix) eq $prefix) {
-                   $read_function = 1;
-                   last;
-               }
+               if (substr($1, 0, length $prefix) eq $prefix) {
+                   $read_function = 1;
+                   last;
+               }
            }
        }
        # print out any recorded offsets
@@ -642,3 +642,5 @@ if ($#converts >= 0) {
 `$rm $mcount_o $mcount_s`;
 
 exit(0);
+
+# vim: softtabstop=4
index 7beb426..7b6a012 100644 (file)
@@ -480,6 +480,7 @@ devided||divided
 deviece||device
 devision||division
 diable||disable
+diabled||disabled
 dicline||decline
 dictionnary||dictionary
 didnt||didn't
@@ -1027,6 +1028,8 @@ oustanding||outstanding
 overaall||overall
 overhread||overhead
 overlaping||overlapping
+overflw||overflow
+overlfow||overflow
 overide||override
 overrided||overridden
 overriden||overridden
index a92acc7..1a8ee4f 100755 (executable)
@@ -47,7 +47,6 @@ BEGIN {
        printversion("Net-tools", version("ifconfig --version"))
        printversion("Kbd", version("loadkeys -V"))
        printversion("Console-tools", version("loadkeys -V"))
-       printversion("Oprofile", version("oprofiled --version"))
        printversion("Sh-utils", version("expr --v"))
        printversion("Udev", version("udevadm --version"))
        printversion("Wireless-tools", version("iwconfig --version"))
index 7561f6f..0ced7fd 100644 (file)
@@ -238,6 +238,7 @@ source "security/loadpin/Kconfig"
 source "security/yama/Kconfig"
 source "security/safesetid/Kconfig"
 source "security/lockdown/Kconfig"
+source "security/landlock/Kconfig"
 
 source "security/integrity/Kconfig"
 
@@ -277,11 +278,11 @@ endchoice
 
 config LSM
        string "Ordered list of enabled LSMs"
-       default "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK
-       default "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR
-       default "lockdown,yama,loadpin,safesetid,integrity,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO
-       default "lockdown,yama,loadpin,safesetid,integrity,bpf" if DEFAULT_SECURITY_DAC
-       default "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor,bpf"
+       default "landlock,lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor,bpf" if DEFAULT_SECURITY_SMACK
+       default "landlock,lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo,bpf" if DEFAULT_SECURITY_APPARMOR
+       default "landlock,lockdown,yama,loadpin,safesetid,integrity,tomoyo,bpf" if DEFAULT_SECURITY_TOMOYO
+       default "landlock,lockdown,yama,loadpin,safesetid,integrity,bpf" if DEFAULT_SECURITY_DAC
+       default "landlock,lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor,bpf"
        help
          A comma-separated list of LSMs, in initialization order.
          Any LSMs left off this list will be ignored. This can be
index 3baf435..47e4329 100644 (file)
@@ -13,6 +13,7 @@ subdir-$(CONFIG_SECURITY_LOADPIN)     += loadpin
 subdir-$(CONFIG_SECURITY_SAFESETID)    += safesetid
 subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown
 subdir-$(CONFIG_BPF_LSM)               += bpf
+subdir-$(CONFIG_SECURITY_LANDLOCK)     += landlock
 
 # always enable default capabilities
 obj-y                                  += commoncap.o
@@ -32,6 +33,7 @@ obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
 obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)    += lockdown/
 obj-$(CONFIG_CGROUPS)                  += device_cgroup.o
 obj-$(CONFIG_BPF_LSM)                  += bpf/
+obj-$(CONFIG_SECURITY_LANDLOCK)                += landlock/
 
 # Object integrity file lists
 subdir-$(CONFIG_INTEGRITY)             += integrity
index e0828ee..aa6fcfd 100644 (file)
@@ -370,7 +370,7 @@ audit:
  * Returns: 0 on success else error
  */
 static int match_mnt(struct aa_profile *profile, const struct path *path,
-                    char *buffer, struct path *devpath, char *devbuffer,
+                    char *buffer, const struct path *devpath, char *devbuffer,
                     const char *type, unsigned long flags, void *data,
                     bool binary)
 {
@@ -579,7 +579,7 @@ out:
        return error;
 }
 
-static int profile_umount(struct aa_profile *profile, struct path *path,
+static int profile_umount(struct aa_profile *profile, const struct path *path,
                          char *buffer)
 {
        struct aa_perms perms = { };
index 250fb08..3b06a01 100644 (file)
@@ -111,6 +111,8 @@ static int __init __integrity_init_keyring(const unsigned int id,
        } else {
                if (id == INTEGRITY_KEYRING_PLATFORM)
                        set_platform_trusted_keys(keyring[id]);
+               if (id == INTEGRITY_KEYRING_IMA)
+                       load_module_cert(keyring[id]);
        }
 
        return err;
index 0ba0184..fca8a94 100644 (file)
@@ -160,7 +160,7 @@ void integrity_inode_free(struct inode *inode)
 
 static void init_once(void *foo)
 {
-       struct integrity_iint_cache *iint = foo;
+       struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo;
 
        memset(iint, 0, sizeof(*iint));
        iint->ima_file_status = INTEGRITY_UNKNOWN;
index b85d9e4..906c1d8 100644 (file)
@@ -482,7 +482,7 @@ int ima_bprm_check(struct linux_binprm *bprm)
 }
 
 /**
- * ima_path_check - based on policy, collect/store measurement.
+ * ima_file_check - based on policy, collect/store measurement.
  * @file: pointer to the file to be measured
  * @mask: contains MAY_READ, MAY_WRITE, MAY_EXEC or MAY_APPEND
  *
@@ -606,6 +606,9 @@ void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
        struct integrity_iint_cache *iint;
        int must_appraise;
 
+       if (!ima_policy_flag || !S_ISREG(inode->i_mode))
+               return;
+
        must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
                                          FILE_CHECK);
        if (!must_appraise)
@@ -636,6 +639,9 @@ void ima_post_path_mknod(struct user_namespace *mnt_userns,
        struct inode *inode = dentry->d_inode;
        int must_appraise;
 
+       if (!ima_policy_flag || !S_ISREG(inode->i_mode))
+               return;
+
        must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
                                          FILE_CHECK);
        if (!must_appraise)
@@ -780,6 +786,7 @@ int ima_load_data(enum kernel_load_data_id id, bool contents)
                        pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n");
                        return -EACCES; /* INTEGRITY_UNKNOWN */
                }
+               break;
        default:
                break;
        }
index 4f8cb15..fd5d46e 100644 (file)
@@ -599,6 +599,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule,
                        rc = ima_filter_rule_match(secid, rule->lsm[i].type,
                                                   Audit_equal,
                                                   rule->lsm[i].rule);
+                       break;
                default:
                        break;
                }
@@ -836,6 +837,7 @@ void __init ima_init_policy(void)
                add_rules(default_measurement_rules,
                          ARRAY_SIZE(default_measurement_rules),
                          IMA_DEFAULT_POLICY);
+               break;
        default:
                break;
        }
index e22e510..4e081e6 100644 (file)
@@ -494,8 +494,8 @@ int ima_restore_measurement_list(loff_t size, void *buf)
                        }
                }
 
-               entry->pcr = !ima_canonical_fmt ? *(hdr[HDR_PCR].data) :
-                            le32_to_cpu(*(hdr[HDR_PCR].data));
+               entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) :
+                            le32_to_cpu(*(u32 *)(hdr[HDR_PCR].data));
                ret = ima_restore_measurement_entry(entry);
                if (ret < 0)
                        break;
diff --git a/security/landlock/Kconfig b/security/landlock/Kconfig
new file mode 100644 (file)
index 0000000..8e33c4e
--- /dev/null
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config SECURITY_LANDLOCK
+       bool "Landlock support"
+       depends on SECURITY && !ARCH_EPHEMERAL_INODES
+       select SECURITY_PATH
+       help
+         Landlock is a sandboxing mechanism that enables processes to restrict
+         themselves (and their future children) by gradually enforcing
+         tailored access control policies.  A Landlock security policy is a
+         set of access rights (e.g. open a file in read-only, make a
+         directory, etc.) tied to a file hierarchy.  Such policy can be
+         configured and enforced by any processes for themselves using the
+         dedicated system calls: landlock_create_ruleset(),
+         landlock_add_rule(), and landlock_restrict_self().
+
+         See Documentation/userspace-api/landlock.rst for further information.
+
+         If you are unsure how to answer this question, answer N.  Otherwise,
+         you should also prepend "landlock," to the content of CONFIG_LSM to
+         enable Landlock at boot time.
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
new file mode 100644 (file)
index 0000000..7bbd2f4
--- /dev/null
@@ -0,0 +1,4 @@
+obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
+
+landlock-y := setup.o syscalls.o object.o ruleset.o \
+       cred.o ptrace.o fs.o
diff --git a/security/landlock/common.h b/security/landlock/common.h
new file mode 100644 (file)
index 0000000..5dc0fe1
--- /dev/null
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Common constants and helpers
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_COMMON_H
+#define _SECURITY_LANDLOCK_COMMON_H
+
+#define LANDLOCK_NAME "landlock"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) LANDLOCK_NAME ": " fmt
+
+#endif /* _SECURITY_LANDLOCK_COMMON_H */
diff --git a/security/landlock/cred.c b/security/landlock/cred.c
new file mode 100644 (file)
index 0000000..6725af2
--- /dev/null
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Credential hooks
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <linux/cred.h>
+#include <linux/lsm_hooks.h>
+
+#include "common.h"
+#include "cred.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/*
+ * Makes @new share the Landlock domain of @old, if any, after calling
+ * landlock_get_ruleset() on it.  @gfp is not used.  Always returns 0.
+ */
+static int hook_cred_prepare(struct cred *const new,
+               const struct cred *const old, const gfp_t gfp)
+{
+       struct landlock_ruleset *const old_dom = landlock_cred(old)->domain;
+
+       if (old_dom) {
+               landlock_get_ruleset(old_dom);
+               landlock_cred(new)->domain = old_dom;
+       }
+       return 0;
+}
+
+/*
+ * Passes the domain attached to @cred, if any, to
+ * landlock_put_ruleset_deferred().
+ */
+static void hook_cred_free(struct cred *const cred)
+{
+       struct landlock_ruleset *const dom = landlock_cred(cred)->domain;
+
+       if (dom)
+               landlock_put_ruleset_deferred(dom);
+}
+
+/* Credential hooks, registered by landlock_add_cred_hooks(). */
+static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+       LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
+       LSM_HOOK_INIT(cred_free, hook_cred_free),
+};
+
+/*
+ * Registers landlock_hooks through security_add_hooks(), using
+ * LANDLOCK_NAME.
+ */
+__init void landlock_add_cred_hooks(void)
+{
+       security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+                       LANDLOCK_NAME);
+}
diff --git a/security/landlock/cred.h b/security/landlock/cred.h
new file mode 100644 (file)
index 0000000..5f99d3d
--- /dev/null
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Credential hooks
+ *
+ * Copyright © 2019-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_CRED_H
+#define _SECURITY_LANDLOCK_CRED_H
+
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+
+#include "ruleset.h"
+#include "setup.h"
+
+/*
+ * Landlock data attached to a credential; located in the credential security
+ * blob by landlock_cred().
+ */
+struct landlock_cred_security {
+       /* Domain of the credential; NULL when there is none (cf. landlocked()). */
+       struct landlock_ruleset *domain;
+};
+
+/*
+ * Returns the Landlock part of @cred's security blob, i.e. cred->security
+ * offset by landlock_blob_sizes.lbs_cred.
+ */
+static inline struct landlock_cred_security *landlock_cred(
+               const struct cred *cred)
+{
+       return cred->security + landlock_blob_sizes.lbs_cred;
+}
+
+/* Returns the domain stored in current_cred(), which may be NULL. */
+static inline const struct landlock_ruleset *landlock_get_current_domain(void)
+{
+       return landlock_cred(current_cred())->domain;
+}
+
+/*
+ * Returns the domain stored in the credentials of @task, as read with
+ * __task_cred(); it may be NULL.
+ *
+ * The call needs to come from an RCU read-side critical section.
+ */
+static inline const struct landlock_ruleset *landlock_get_task_domain(
+               const struct task_struct *const task)
+{
+       return landlock_cred(__task_cred(task))->domain;
+}
+
+/*
+ * Returns true if @task has a Landlock domain, false otherwise.
+ *
+ * For the current task, the domain is read from current_cred().  For any
+ * other task, landlock_get_task_domain() is called between rcu_read_lock()
+ * and rcu_read_unlock(), as it requires.
+ */
+static inline bool landlocked(const struct task_struct *const task)
+{
+       bool has_dom;
+
+       if (task == current)
+               return !!landlock_get_current_domain();
+
+       rcu_read_lock();
+       has_dom = !!landlock_get_task_domain(task);
+       rcu_read_unlock();
+       return has_dom;
+}
+
+__init void landlock_add_cred_hooks(void);
+
+#endif /* _SECURITY_LANDLOCK_CRED_H */
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
new file mode 100644 (file)
index 0000000..97b8e42
--- /dev/null
@@ -0,0 +1,692 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Filesystem management and hooks
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/compiler_types.h>
+#include <linux/dcache.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/lsm_hooks.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/landlock.h>
+
+#include "common.h"
+#include "cred.h"
+#include "fs.h"
+#include "limits.h"
+#include "object.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/* Underlying object management */
+
+/*
+ * Release callback of landlock_fs_underops: unties @object from its inode.
+ *
+ * Expects @object->lock to be held by the caller and drops it on every path
+ * (cf. the __releases() annotation).  If @object->underobj still points to an
+ * inode, its reference is dropped with iput(); the superblock's inode_refs
+ * counter is raised around that call and its waiter is woken up when the
+ * counter goes back to zero.
+ */
+static void release_inode(struct landlock_object *const object)
+       __releases(object->lock)
+{
+       struct inode *const inode = object->underobj;
+       struct super_block *sb;
+
+       /* No inode left to release: only drops the lock. */
+       if (!inode) {
+               spin_unlock(&object->lock);
+               return;
+       }
+
+       /*
+        * Protects against concurrent use by hook_sb_delete() of the reference
+        * to the underlying inode.
+        */
+       object->underobj = NULL;
+       /*
+        * Makes sure that if the filesystem is concurrently unmounted,
+        * hook_sb_delete() will wait for us to finish iput().
+        */
+       sb = inode->i_sb;
+       atomic_long_inc(&landlock_superblock(sb)->inode_refs);
+       spin_unlock(&object->lock);
+       /*
+        * Because object->underobj was not NULL, hook_sb_delete() and
+        * get_inode_object() guarantee that it is safe to reset
+        * landlock_inode(inode)->object while it is not NULL.  It is therefore
+        * not necessary to lock inode->i_lock.
+        */
+       rcu_assign_pointer(landlock_inode(inode)->object, NULL);
+       /*
+        * Now, new rules can safely be tied to @inode with get_inode_object().
+        */
+
+       iput(inode);
+       if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
+               wake_up_var(&landlock_superblock(sb)->inode_refs);
+}
+
+/*
+ * Operations given to landlock_create_object() by get_inode_object() for
+ * objects tied to an inode.
+ */
+static const struct landlock_object_underops landlock_fs_underops = {
+       .release = release_inode
+};
+
+/* Ruleset management */
+
+/*
+ * Returns the Landlock object tied to @inode, creating and attaching a new one
+ * if there is none.
+ *
+ * For an already existing object, a usage reference is taken with
+ * refcount_inc_not_zero().  When a new object is attached, a reference to
+ * @inode is taken with ihold().  If landlock_create_object() fails, its
+ * error pointer is returned as is (check with IS_ERR()).
+ */
+static struct landlock_object *get_inode_object(struct inode *const inode)
+{
+       struct landlock_object *object, *new_object;
+       struct landlock_inode_security *inode_sec = landlock_inode(inode);
+
+       rcu_read_lock();
+retry:
+       object = rcu_dereference(inode_sec->object);
+       if (object) {
+               if (likely(refcount_inc_not_zero(&object->usage))) {
+                       rcu_read_unlock();
+                       return object;
+               }
+               /*
+                * We are racing with release_inode(), the object is going
+                * away.  Wait for release_inode(), then retry.
+                */
+               spin_lock(&object->lock);
+               spin_unlock(&object->lock);
+               goto retry;
+       }
+       rcu_read_unlock();
+
+       /*
+        * If there is no object tied to @inode, then create a new one (without
+        * holding any locks).
+        */
+       new_object = landlock_create_object(&landlock_fs_underops, inode);
+       if (IS_ERR(new_object))
+               return new_object;
+
+       /*
+        * Protects against concurrent calls to get_inode_object() or
+        * hook_sb_delete().
+        */
+       spin_lock(&inode->i_lock);
+       if (unlikely(rcu_access_pointer(inode_sec->object))) {
+               /* Someone else just created the object, bail out and retry. */
+               spin_unlock(&inode->i_lock);
+               kfree(new_object);
+
+               /* The retry label expects the RCU read lock to be held. */
+               rcu_read_lock();
+               goto retry;
+       }
+
+       /*
+        * @inode will be released by hook_sb_delete() on its superblock
+        * shutdown, or by release_inode() when no more ruleset references the
+        * related object.
+        */
+       ihold(inode);
+       rcu_assign_pointer(inode_sec->object, new_object);
+       spin_unlock(&inode->i_lock);
+       return new_object;
+}
+
+/* All access rights that can be tied to files. */
+#define ACCESS_FILE ( \
+       LANDLOCK_ACCESS_FS_EXECUTE | \
+       LANDLOCK_ACCESS_FS_WRITE_FILE | \
+       LANDLOCK_ACCESS_FS_READ_FILE)
+
+/*
+ * Inserts into @ruleset a rule tying @access_rights to the inode backing
+ * @path.
+ *
+ * @path: Should have been checked by get_path_from_fd().
+ *
+ * Returns -EINVAL if @path is not a directory and @access_rights contains
+ * rights outside of ACCESS_FILE, or if @ruleset does not have exactly one
+ * layer.  Otherwise returns the error from get_inode_object(), or the result
+ * of landlock_insert_rule().
+ */
+int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
+               const struct path *const path, u32 access_rights)
+{
+       int err;
+       struct landlock_object *object;
+
+       /* Files only get access rights that make sense. */
+       if (!d_is_dir(path->dentry) && (access_rights | ACCESS_FILE) !=
+                       ACCESS_FILE)
+               return -EINVAL;
+       if (WARN_ON_ONCE(ruleset->num_layers != 1))
+               return -EINVAL;
+
+       /*
+        * Transforms relative access rights to absolute ones: every right of
+        * LANDLOCK_MASK_ACCESS_FS that is not set in the ruleset's first (and
+        * only) access mask is added to the rule.
+        */
+       access_rights |= LANDLOCK_MASK_ACCESS_FS & ~ruleset->fs_access_masks[0];
+       object = get_inode_object(d_backing_inode(path->dentry));
+       if (IS_ERR(object))
+               return PTR_ERR(object);
+       mutex_lock(&ruleset->lock);
+       err = landlock_insert_rule(ruleset, object, access_rights);
+       mutex_unlock(&ruleset->lock);
+       /*
+        * No need to check for an error because landlock_insert_rule()
+        * increments the refcount for the new object if needed.
+        */
+       landlock_put_object(object);
+       return err;
+}
+
+/* Access-control management */
+
+/*
+ * Clears from @layer_mask the bit of each layer for which the rule tied to
+ * @path's inode in @domain grants the whole @access_request.
+ *
+ * Returns the updated mask.  @layer_mask is returned unchanged if @path's
+ * dentry is negative or if landlock_find_rule() finds no rule.
+ */
+static inline u64 unmask_layers(
+               const struct landlock_ruleset *const domain,
+               const struct path *const path, const u32 access_request,
+               u64 layer_mask)
+{
+       const struct landlock_rule *rule;
+       const struct inode *inode;
+       size_t i;
+
+       if (d_is_negative(path->dentry))
+               /* Ignore nonexistent leafs. */
+               return layer_mask;
+       inode = d_backing_inode(path->dentry);
+       rcu_read_lock();
+       rule = landlock_find_rule(domain,
+                       rcu_dereference(landlock_inode(inode)->object));
+       rcu_read_unlock();
+       if (!rule)
+               return layer_mask;
+
+       /*
+        * An access is granted if, for each policy layer, at least one rule
+        * encountered on the pathwalk grants the requested accesses,
+        * regardless of their position in the layer stack.  We must then check
+        * the remaining layers for each inode, from the first added layer to
+        * the last one.
+        */
+       for (i = 0; i < rule->num_layers; i++) {
+               const struct landlock_layer *const layer = &rule->layers[i];
+               /*
+                * Bit (level - 1) of the mask stands for this layer; cf. the
+                * BIT_ULL(i) initialization in check_access_path().
+                */
+               const u64 layer_level = BIT_ULL(layer->level - 1);
+
+               /* Checks that the layer grants access to the full request. */
+               if ((layer->access & access_request) == access_request) {
+                       layer_mask &= ~layer_level;
+
+                       if (layer_mask == 0)
+                               return layer_mask;
+               }
+       }
+       return layer_mask;
+}
+
+/*
+ * Checks whether @domain allows @access_request on @path.
+ *
+ * Builds a mask of the layers of @domain that handle at least one of the
+ * requested accesses, then walks from @path up to the root, across mount
+ * points, letting unmask_layers() clear the layers satisfied at each step.
+ *
+ * Returns 0 if the access is allowed (including the early exits below for an
+ * empty request, ignored filesystems and unhandled accesses), -EACCES
+ * otherwise.
+ */
+static int check_access_path(const struct landlock_ruleset *const domain,
+               const struct path *const path, u32 access_request)
+{
+       bool allowed = false;
+       struct path walker_path;
+       u64 layer_mask;
+       size_t i;
+
+       /* Make sure all layers can be checked. */
+       BUILD_BUG_ON(BITS_PER_TYPE(layer_mask) < LANDLOCK_MAX_NUM_LAYERS);
+
+       if (!access_request)
+               return 0;
+       if (WARN_ON_ONCE(!domain || !path))
+               return 0;
+       /*
+        * Allows access to pseudo filesystems that will never be mountable
+        * (e.g. sockfs, pipefs), but can still be reachable through
+        * /proc/<pid>/fd/<file-descriptor> .
+        */
+       if ((path->dentry->d_sb->s_flags & SB_NOUSER) ||
+                       (d_is_positive(path->dentry) &&
+                        unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))))
+               return 0;
+       if (WARN_ON_ONCE(domain->num_layers < 1))
+               return -EACCES;
+
+       /* Saves all layers handling a subset of requested accesses. */
+       layer_mask = 0;
+       for (i = 0; i < domain->num_layers; i++) {
+               if (domain->fs_access_masks[i] & access_request)
+                       layer_mask |= BIT_ULL(i);
+       }
+       /* An access request not handled by the domain is allowed. */
+       if (layer_mask == 0)
+               return 0;
+
+       /* Works on a private, referenced copy of @path during the walk. */
+       walker_path = *path;
+       path_get(&walker_path);
+       /*
+        * We need to walk through all the hierarchy to not miss any relevant
+        * restriction.
+        */
+       while (true) {
+               struct dentry *parent_dentry;
+
+               layer_mask = unmask_layers(domain, &walker_path,
+                               access_request, layer_mask);
+               if (layer_mask == 0) {
+                       /* Stops when a rule from each layer grants access. */
+                       allowed = true;
+                       break;
+               }
+
+jump_up:
+               if (walker_path.dentry == walker_path.mnt->mnt_root) {
+                       if (follow_up(&walker_path)) {
+                               /* Ignores hidden mount points. */
+                               goto jump_up;
+                       } else {
+                               /*
+                                * Stops at the real root.  Denies access
+                                * because not all layers have granted access.
+                                */
+                               allowed = false;
+                               break;
+                       }
+               }
+               if (unlikely(IS_ROOT(walker_path.dentry))) {
+                       /*
+                        * Stops at disconnected root directories.  Only allows
+                        * access to internal filesystems (e.g. nsfs, which is
+                        * reachable through /proc/<pid>/ns/<namespace>).
+                        */
+                       allowed = !!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
+                       break;
+               }
+               /* Moves up to the parent, swapping the held dentry reference. */
+               parent_dentry = dget_parent(walker_path.dentry);
+               dput(walker_path.dentry);
+               walker_path.dentry = parent_dentry;
+       }
+       path_put(&walker_path);
+       return allowed ? 0 : -EACCES;
+}
+
+/*
+ * Same as check_access_path(), using the domain of the current task.  Returns
+ * 0 if the current task has no domain.
+ */
+static inline int current_check_access_path(const struct path *const path,
+               const u32 access_request)
+{
+       const struct landlock_ruleset *const dom =
+               landlock_get_current_domain();
+
+       if (!dom)
+               return 0;
+       return check_access_path(dom, path, access_request);
+}
+
+/* Inode hooks */
+
+/*
+ * Does not free anything: only warns (once) if an object is still tied to
+ * @inode.
+ */
+static void hook_inode_free_security(struct inode *const inode)
+{
+       /*
+        * All inodes must already have been untied from their object by
+        * release_inode() or hook_sb_delete().
+        */
+       WARN_ON_ONCE(landlock_inode(inode)->object);
+}
+
+/* Super-block hooks */
+
+/*
+ * Release the inodes used in a security policy.
+ *
+ * Does nothing if Landlock is not initialized.  Otherwise, walks the inode
+ * list of @sb to untie each inode from its Landlock object, then waits for
+ * the pending iput() calls of release_inode() (cf. inode_refs).
+ *
+ * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
+ */
+static void hook_sb_delete(struct super_block *const sb)
+{
+       struct inode *inode, *prev_inode = NULL;
+
+       if (!landlock_initialized)
+               return;
+
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               struct landlock_object *object;
+
+               /* Only handles referenced inodes. */
+               if (!atomic_read(&inode->i_count))
+                       continue;
+
+               /*
+                * Protects against concurrent modification of inode (e.g.
+                * from get_inode_object()).
+                */
+               spin_lock(&inode->i_lock);
+               /*
+                * Checks I_FREEING and I_WILL_FREE to protect against a race
+                * condition when release_inode() just called iput(), which
+                * could lead to a NULL dereference of inode->security or a
+                * second call to iput() for the same Landlock object.  Also
+                * checks I_NEW because such inode cannot be tied to an object.
+                */
+               if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+                       spin_unlock(&inode->i_lock);
+                       continue;
+               }
+
+               rcu_read_lock();
+               object = rcu_dereference(landlock_inode(inode)->object);
+               if (!object) {
+                       rcu_read_unlock();
+                       spin_unlock(&inode->i_lock);
+                       continue;
+               }
+               /* Keeps a reference to this inode until the next loop walk. */
+               __iget(inode);
+               spin_unlock(&inode->i_lock);
+
+               /*
+                * If there is no concurrent release_inode() ongoing, then we
+                * are in charge of calling iput() on this inode, otherwise we
+                * will just wait for it to finish.
+                */
+               spin_lock(&object->lock);
+               if (object->underobj == inode) {
+                       object->underobj = NULL;
+                       spin_unlock(&object->lock);
+                       rcu_read_unlock();
+
+                       /*
+                        * Because object->underobj was not NULL,
+                        * release_inode() and get_inode_object() guarantee
+                        * that it is safe to reset
+                        * landlock_inode(inode)->object while it is not NULL.
+                        * It is therefore not necessary to lock inode->i_lock.
+                        */
+                       rcu_assign_pointer(landlock_inode(inode)->object, NULL);
+                       /*
+                        * At this point, we own the ihold() reference that was
+                        * originally set up by get_inode_object() and the
+                        * __iget() reference that we just set in this loop
+                        * walk.  Therefore the following call to iput() will
+                        * not sleep nor drop the inode because there are now
+                        * at least two references to it.
+                        */
+                       iput(inode);
+               } else {
+                       spin_unlock(&object->lock);
+                       rcu_read_unlock();
+               }
+
+               if (prev_inode) {
+                       /*
+                        * At this point, we still own the __iget() reference
+                        * that we just set in this loop walk.  Therefore we
+                        * can drop the list lock and know that the inode won't
+                        * disappear from under us until the next loop walk.
+                        */
+                       spin_unlock(&sb->s_inode_list_lock);
+                       /*
+                        * We can now actually put the inode reference from the
+                        * previous loop walk, which is not needed anymore.
+                        */
+                       iput(prev_inode);
+                       cond_resched();
+                       spin_lock(&sb->s_inode_list_lock);
+               }
+               prev_inode = inode;
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+
+       /* Puts the inode reference from the last loop walk, if any. */
+       if (prev_inode)
+               iput(prev_inode);
+       /* Waits for pending iput() in release_inode(). */
+       wait_var_event(&landlock_superblock(sb)->inode_refs, !atomic_long_read(
+                               &landlock_superblock(sb)->inode_refs));
+}
+
+/*
+ * Because a Landlock security policy is defined according to the filesystem
+ * topology (i.e. the mount namespace), changing it may grant access to files
+ * not previously allowed.
+ *
+ * To make it simple, deny any filesystem topology modification by landlocked
+ * processes.  Non-landlocked processes may still change the namespace of a
+ * landlocked process, but this kind of threat must be handled by a system-wide
+ * access-control security policy.
+ *
+ * This could be lifted in the future if Landlock can safely handle mount
+ * namespace updates requested by a landlocked process.  Indeed, we could
+ * update the current domain (which is currently read-only) by taking into
+ * account the accesses of the source and the destination of a new mount point.
+ * However, it would also require making all the child domains dynamically
+ * inherit these new constraints.  Anyway, for backward compatibility reasons,
+ * a dedicated user space option would be required (e.g. as a ruleset flag).
+ *
+ * Returns 0 if the current task has no domain, -EPERM otherwise; none of the
+ * arguments is inspected.
+ */
+static int hook_sb_mount(const char *const dev_name,
+               const struct path *const path, const char *const type,
+               const unsigned long flags, void *const data)
+{
+       if (!landlock_get_current_domain())
+               return 0;
+       return -EPERM;
+}
+
+/*
+ * Denies moving a mount to a landlocked task: returns 0 if the current task
+ * has no domain, -EPERM otherwise.  @from_path and @to_path are not inspected.
+ */
+static int hook_move_mount(const struct path *const from_path,
+               const struct path *const to_path)
+{
+       if (!landlock_get_current_domain())
+               return 0;
+       return -EPERM;
+}
+
+/*
+ * Removing a mount point may reveal a previously hidden file hierarchy, which
+ * may then grant access to files, which may have previously been forbidden.
+ *
+ * Returns 0 if the current task has no domain, -EPERM otherwise.
+ */
+static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
+{
+       if (!landlock_get_current_domain())
+               return 0;
+       return -EPERM;
+}
+
+/*
+ * Denies remounting to a landlocked task: returns 0 if the current task has no
+ * domain, -EPERM otherwise.  @sb and @mnt_opts are not inspected.
+ */
+static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
+{
+       if (!landlock_get_current_domain())
+               return 0;
+       return -EPERM;
+}
+
+/*
+ * pivot_root(2), like mount(2), changes the current mount namespace.  It must
+ * then be forbidden for a landlocked process.
+ *
+ * However, chroot(2) may be allowed because it only changes the relative root
+ * directory of the current process.  Moreover, it can be used to restrict the
+ * view of the filesystem.
+ *
+ * Returns 0 if the current task has no domain, -EPERM otherwise.
+ */
+static int hook_sb_pivotroot(const struct path *const old_path,
+               const struct path *const new_path)
+{
+       if (!landlock_get_current_domain())
+               return 0;
+       return -EPERM;
+}
+
+/* Path hooks */
+
+/*
+ * Maps the file type bits of @mode (mode & S_IFMT) to the matching
+ * LANDLOCK_ACCESS_FS_MAKE_* right.  Warns once and returns 0 for a type that
+ * is not listed.
+ */
+static inline u32 get_mode_access(const umode_t mode)
+{
+       switch (mode & S_IFMT) {
+       case S_IFLNK:
+               return LANDLOCK_ACCESS_FS_MAKE_SYM;
+       case 0:
+               /* A zero mode translates to S_IFREG. */
+       case S_IFREG:
+               return LANDLOCK_ACCESS_FS_MAKE_REG;
+       case S_IFDIR:
+               return LANDLOCK_ACCESS_FS_MAKE_DIR;
+       case S_IFCHR:
+               return LANDLOCK_ACCESS_FS_MAKE_CHAR;
+       case S_IFBLK:
+               return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
+       case S_IFIFO:
+               return LANDLOCK_ACCESS_FS_MAKE_FIFO;
+       case S_IFSOCK:
+               return LANDLOCK_ACCESS_FS_MAKE_SOCK;
+       default:
+               WARN_ON_ONCE(1);
+               return 0;
+       }
+}
+
+/*
+ * Creating multiple links or renaming may lead to privilege escalations if not
+ * handled properly.  Indeed, we must be sure that the source doesn't gain more
+ * privileges by being accessible from the destination.  This is getting more
+ * complex when dealing with multiple layers.  The whole picture can be seen as
+ * a multilayer partial ordering problem.  A future version of Landlock will
+ * deal with that.
+ *
+ * Returns 0 if the current task has no domain, -EXDEV if @new_dir is not the
+ * parent of @old_dentry, and -ENOENT if @old_dentry is negative.  Otherwise
+ * returns the result of check_access_path() on @new_dir for the MAKE_* right
+ * matching the type of the linked file.  @new_dentry is not inspected.
+ */
+static int hook_path_link(struct dentry *const old_dentry,
+               const struct path *const new_dir,
+               struct dentry *const new_dentry)
+{
+       const struct landlock_ruleset *const dom =
+               landlock_get_current_domain();
+
+       if (!dom)
+               return 0;
+       /* The mount points are the same for old and new paths, cf. EXDEV. */
+       if (old_dentry->d_parent != new_dir->dentry)
+               /* Gracefully forbids reparenting. */
+               return -EXDEV;
+       if (unlikely(d_is_negative(old_dentry)))
+               return -ENOENT;
+       return check_access_path(dom, new_dir,
+                       get_mode_access(d_backing_inode(old_dentry)->i_mode));
+}
+
+/*
+ * Returns the right needed to remove @dentry: REMOVE_DIR for a directory,
+ * REMOVE_FILE for anything else, or 0 if @dentry is negative (i.e. there is
+ * nothing to remove).
+ */
+static inline u32 maybe_remove(const struct dentry *const dentry)
+{
+       if (d_is_negative(dentry))
+               return 0;
+       return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR :
+               LANDLOCK_ACCESS_FS_REMOVE_FILE;
+}
+
+/*
+ * Only allows renames within a single directory: returns -EXDEV if the
+ * dentries of @old_dir and @new_dir differ, and -ENOENT if @old_dentry is
+ * negative.  Otherwise requires on @old_dir the rights to remove each
+ * existing entry (cf. maybe_remove()) and to create a file of @old_dentry's
+ * type.  Returns 0 if the current task has no domain.
+ */
+static int hook_path_rename(const struct path *const old_dir,
+               struct dentry *const old_dentry,
+               const struct path *const new_dir,
+               struct dentry *const new_dentry)
+{
+       const struct landlock_ruleset *const dom =
+               landlock_get_current_domain();
+
+       if (!dom)
+               return 0;
+       /* The mount points are the same for old and new paths, cf. EXDEV. */
+       if (old_dir->dentry != new_dir->dentry)
+               /* Gracefully forbids reparenting. */
+               return -EXDEV;
+       if (unlikely(d_is_negative(old_dentry)))
+               return -ENOENT;
+       /* RENAME_EXCHANGE is handled because directories are the same. */
+       return check_access_path(dom, old_dir, maybe_remove(old_dentry) |
+                       maybe_remove(new_dentry) |
+                       get_mode_access(d_backing_inode(old_dentry)->i_mode));
+}
+
+/*
+ * Requires LANDLOCK_ACCESS_FS_MAKE_DIR on @dir for the current task.  @dentry
+ * and @mode are not inspected.
+ */
+static int hook_path_mkdir(const struct path *const dir,
+               struct dentry *const dentry, const umode_t mode)
+{
+       return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
+}
+
+/*
+ * Requires on @dir the MAKE_* right matching the file type of @mode (cf.
+ * get_mode_access()).  Returns 0 if the current task has no domain.  @dentry
+ * and @dev are not inspected.
+ */
+static int hook_path_mknod(const struct path *const dir,
+               struct dentry *const dentry, const umode_t mode,
+               const unsigned int dev)
+{
+       const struct landlock_ruleset *const dom =
+               landlock_get_current_domain();
+
+       if (!dom)
+               return 0;
+       return check_access_path(dom, dir, get_mode_access(mode));
+}
+
+/*
+ * Requires LANDLOCK_ACCESS_FS_MAKE_SYM on @dir for the current task.  @dentry
+ * and @old_name are not inspected.
+ */
+static int hook_path_symlink(const struct path *const dir,
+               struct dentry *const dentry, const char *const old_name)
+{
+       return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
+}
+
+/*
+ * Requires LANDLOCK_ACCESS_FS_REMOVE_FILE on @dir for the current task.
+ * @dentry is not inspected.
+ */
+static int hook_path_unlink(const struct path *const dir,
+               struct dentry *const dentry)
+{
+       return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
+}
+
+/*
+ * Requires LANDLOCK_ACCESS_FS_REMOVE_DIR on @dir for the current task.
+ * @dentry is not inspected.
+ */
+static int hook_path_rmdir(const struct path *const dir,
+               struct dentry *const dentry)
+{
+       return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
+}
+
+/* File hooks */
+
+/*
+ * Translates the open mode of @file into LANDLOCK_ACCESS_FS_* rights.
+ *
+ * A directory opened with FMODE_READ yields READ_DIR only.  Otherwise,
+ * READ_FILE, WRITE_FILE and EXECUTE are combined according to FMODE_READ,
+ * FMODE_WRITE and __FMODE_EXEC.  Returns 0 if none of them is set.
+ */
+static inline u32 get_file_access(const struct file *const file)
+{
+       u32 access = 0;
+
+       if (file->f_mode & FMODE_READ) {
+               /* A directory can only be opened in read mode. */
+               if (S_ISDIR(file_inode(file)->i_mode))
+                       return LANDLOCK_ACCESS_FS_READ_DIR;
+               access = LANDLOCK_ACCESS_FS_READ_FILE;
+       }
+       if (file->f_mode & FMODE_WRITE)
+               access |= LANDLOCK_ACCESS_FS_WRITE_FILE;
+       /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
+       if (file->f_flags & __FMODE_EXEC)
+               access |= LANDLOCK_ACCESS_FS_EXECUTE;
+       return access;
+}
+
+/*
+ * Checks the rights computed by get_file_access() against the current domain
+ * for the path of @file.  Returns 0 if the current task has no domain,
+ * otherwise the result of check_access_path().
+ */
+static int hook_file_open(struct file *const file)
+{
+       const struct landlock_ruleset *const dom =
+               landlock_get_current_domain();
+
+       if (!dom)
+               return 0;
+       /*
+        * Because a file may be opened with O_PATH, get_file_access() may
+        * return 0.  This case will be handled with a future Landlock
+        * evolution.
+        */
+       return check_access_path(dom, &file->f_path, get_file_access(file));
+}
+
+/*
+ * Filesystem hooks, registered by landlock_add_fs_hooks(): inode, superblock
+ * and mount, path, then file hooks.
+ */
+static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+       LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
+
+       LSM_HOOK_INIT(sb_delete, hook_sb_delete),
+       LSM_HOOK_INIT(sb_mount, hook_sb_mount),
+       LSM_HOOK_INIT(move_mount, hook_move_mount),
+       LSM_HOOK_INIT(sb_umount, hook_sb_umount),
+       LSM_HOOK_INIT(sb_remount, hook_sb_remount),
+       LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),
+
+       LSM_HOOK_INIT(path_link, hook_path_link),
+       LSM_HOOK_INIT(path_rename, hook_path_rename),
+       LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
+       LSM_HOOK_INIT(path_mknod, hook_path_mknod),
+       LSM_HOOK_INIT(path_symlink, hook_path_symlink),
+       LSM_HOOK_INIT(path_unlink, hook_path_unlink),
+       LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
+
+       LSM_HOOK_INIT(file_open, hook_file_open),
+};
+
+/*
+ * Registers landlock_hooks through security_add_hooks(), using
+ * LANDLOCK_NAME.
+ */
+__init void landlock_add_fs_hooks(void)
+{
+       security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+                       LANDLOCK_NAME);
+}
diff --git a/security/landlock/fs.h b/security/landlock/fs.h
new file mode 100644 (file)
index 0000000..187284b
--- /dev/null
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Filesystem management and hooks
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_FS_H
+#define _SECURITY_LANDLOCK_FS_H
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * struct landlock_inode_security - Inode security blob
+ *
+ * Enables referencing a &struct landlock_object tied to an inode (i.e. the
+ * underlying object).
+ */
+struct landlock_inode_security {
+       /**
+        * @object: Weak pointer to an allocated object.  All assignments of a
+        * new object are protected by the underlying inode->i_lock.  However,
+        * atomically disassociating @object from the inode is only protected
+        * by @object->lock, from the time @object's usage refcount drops to
+        * zero to the time this pointer is nulled out (cf. release_inode() and
+        * hook_sb_delete()).  Indeed, such disassociation doesn't require
+        * inode->i_lock thanks to the careful rcu_access_pointer() check
+        * performed by get_inode_object().
+        */
+       struct landlock_object __rcu *object;
+};
+
+/**
+ * struct landlock_superblock_security - Superblock security blob
+ *
+ * Enables hook_sb_delete() to wait for concurrent calls to release_inode().
+ */
+struct landlock_superblock_security {
+       /**
+        * @inode_refs: Number of pending inodes (from this superblock) that
+        * are being released by release_inode().
+        * Cf. struct super_block->s_fsnotify_inode_refs.
+        */
+       atomic_long_t inode_refs;
+};
+
+/*
+ * Returns the Landlock area of @inode's security blob, located
+ * landlock_blob_sizes.lbs_inode bytes into inode->i_security.
+ */
+static inline struct landlock_inode_security *landlock_inode(
+               const struct inode *const inode)
+{
+       void *const blob = inode->i_security;
+
+       return blob + landlock_blob_sizes.lbs_inode;
+}
+
+/*
+ * Returns the Landlock area of @superblock's security blob, located
+ * landlock_blob_sizes.lbs_superblock bytes into superblock->s_security.
+ */
+static inline struct landlock_superblock_security *landlock_superblock(
+               const struct super_block *const superblock)
+{
+       void *const blob = superblock->s_security;
+
+       return blob + landlock_blob_sizes.lbs_superblock;
+}
+
+__init void landlock_add_fs_hooks(void);
+
+int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
+               const struct path *const path, u32 access_hierarchy);
+
+#endif /* _SECURITY_LANDLOCK_FS_H */
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
new file mode 100644 (file)
index 0000000..2a0a109
--- /dev/null
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Limits for different components
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_LIMITS_H
+#define _SECURITY_LANDLOCK_LIMITS_H
+
+#include <linux/limits.h>
+#include <uapi/linux/landlock.h>
+
+/* Upper bound on the number of layers stacked in a ruleset. */
+#define LANDLOCK_MAX_NUM_LAYERS                64
+/* Upper bound on the number of rules held by a ruleset. */
+#define LANDLOCK_MAX_NUM_RULES         U32_MAX
+
+/* Last filesystem access right (presumably the highest bit; confirm in uapi). */
+#define LANDLOCK_LAST_ACCESS_FS                LANDLOCK_ACCESS_FS_MAKE_SYM
+/*
+ * Mask covering every bit up to and including LANDLOCK_LAST_ACCESS_FS,
+ * assuming the latter is a single bit.
+ */
+#define LANDLOCK_MASK_ACCESS_FS                ((LANDLOCK_LAST_ACCESS_FS << 1) - 1)
+
+#endif /* _SECURITY_LANDLOCK_LIMITS_H */
diff --git a/security/landlock/object.c b/security/landlock/object.c
new file mode 100644 (file)
index 0000000..d674fdf
--- /dev/null
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Object management
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler_types.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "object.h"
+
+/*
+ * Allocates a zeroed object tied to @underobj and @underops, with an initial
+ * usage count of one.
+ *
+ * Returns the new object, ERR_PTR(-ENOENT) if @underops or @underobj is NULL,
+ * or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct landlock_object *landlock_create_object(
+               const struct landlock_object_underops *const underops,
+               void *const underobj)
+{
+       struct landlock_object *obj;
+
+       if (WARN_ON_ONCE(!underops || !underobj))
+               return ERR_PTR(-ENOENT);
+
+       obj = kzalloc(sizeof(*obj), GFP_KERNEL_ACCOUNT);
+       if (obj == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       obj->underobj = underobj;
+       obj->underops = underops;
+       spin_lock_init(&obj->lock);
+       refcount_set(&obj->usage, 1);
+       return obj;
+}
+
+/*
+ * Drops one usage reference on @object; a NULL @object is ignored.  When the
+ * last reference is dropped, @object->underops->release() is called and
+ * @object is then freed with kfree_rcu().
+ *
+ * The caller must own the object (i.e. thanks to object->usage) to safely put
+ * it.
+ */
+void landlock_put_object(struct landlock_object *const object)
+{
+       /*
+        * The call to @object->underops->release(object) might sleep, e.g.
+        * because of iput().
+        */
+       might_sleep();
+       if (!object)
+               return;
+
+       /*
+        * If the @object's refcount cannot drop to zero, we can just decrement
+        * the refcount without holding a lock. Otherwise, the decrement must
+        * happen under @object->lock for synchronization with things like
+        * get_inode_object().
+        */
+       if (refcount_dec_and_lock(&object->usage, &object->lock)) {
+               /* Lock-checking annotation: @object->lock is held here. */
+               __acquire(&object->lock);
+               /*
+                * With @object->lock initially held, remove the reference from
+                * @object->underobj to @object (if it still exists).
+                */
+               object->underops->release(object);
+               kfree_rcu(object, rcu_free);
+       }
+}
diff --git a/security/landlock/object.h b/security/landlock/object.h
new file mode 100644 (file)
index 0000000..3f80674
--- /dev/null
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Object management
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_OBJECT_H
+#define _SECURITY_LANDLOCK_OBJECT_H
+
+#include <linux/compiler_types.h>
+#include <linux/refcount.h>
+#include <linux/spinlock.h>
+
+struct landlock_object;
+
+/**
+ * struct landlock_object_underops - Operations on an underlying object
+ */
+struct landlock_object_underops {
+       /**
+        * @release: Releases the underlying object (e.g. iput() for an inode).
+        * Annotated as releasing @object->lock, which landlock_put_object()
+        * holds when calling it.
+        */
+       void (*release)(struct landlock_object *const object)
+               __releases(object->lock);
+};
+
+/**
+ * struct landlock_object - Security blob tied to a kernel object
+ *
+ * The goal of this structure is to tie a set of ephemeral access rights
+ * (pertaining to different domains) to a kernel object (e.g. an inode) in a
+ * safe way.  This implies handling concurrent use and modification.
+ *
+ * The lifetime of a &struct landlock_object depends on the rules referring to
+ * it.
+ */
+struct landlock_object {
+       /**
+        * @usage: This counter is used to tie an object to the rules matching
+        * it or to keep it alive while adding a new rule.  If this counter
+        * reaches zero, this struct must not be modified, but this counter can
+        * still be read from within an RCU read-side critical section.  When
+        * adding a new rule to an object with a usage counter of zero, we must
+        * wait until the pointer to this object is set to NULL (or recycled).
+        */
+       refcount_t usage;
+       /**
+        * @lock: Protects against concurrent modifications.  This lock must be
+        * held from the time @usage drops to zero until any weak references
+        * from @underobj to this object have been cleaned up.
+        *
+        * Lock ordering: inode->i_lock nests inside this.
+        */
+       spinlock_t lock;
+       /**
+        * @underobj: Used when cleaning up an object and to mark an object as
+        * tied to its underlying kernel structure.  This pointer is protected
+        * by @lock.  Cf. landlock_release_inodes() and release_inode().
+        */
+       void *underobj;
+       union {
+               /**
+                * @rcu_free: Enables lockless use of @usage, @lock and
+                * @underobj from within an RCU read-side critical section.
+                * @rcu_free and @underops are only used by
+                * landlock_put_object().
+                */
+               struct rcu_head rcu_free;
+               /**
+                * @underops: Enables landlock_put_object() to release the
+                * underlying object (e.g. inode).
+                */
+               const struct landlock_object_underops *underops;
+       };
+};
+
+struct landlock_object *landlock_create_object(
+               const struct landlock_object_underops *const underops,
+               void *const underobj);
+
+void landlock_put_object(struct landlock_object *const object);
+
+/* Takes one usage reference on @object; a NULL @object is ignored. */
+static inline void landlock_get_object(struct landlock_object *const object)
+{
+       if (!object)
+               return;
+       refcount_inc(&object->usage);
+}
+
+#endif /* _SECURITY_LANDLOCK_OBJECT_H */
diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c
new file mode 100644 (file)
index 0000000..f55b824
--- /dev/null
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Ptrace hooks
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019-2020 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/lsm_hooks.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+
+#include "common.h"
+#include "cred.h"
+#include "ptrace.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * domain_scope_le - Checks domain ordering for scoped ptrace
+ *
+ * @parent: Parent domain.
+ * @child: Potential child of @parent.
+ *
+ * Checks if the @parent domain is less or equal to (i.e. an ancestor, which
+ * means a subset of) the @child domain.  A NULL @parent is an ancestor of
+ * every domain; a NULL @child only has a NULL @parent as ancestor.
+ */
+static bool domain_scope_le(const struct landlock_ruleset *const parent,
+               const struct landlock_ruleset *const child)
+{
+       const struct landlock_hierarchy *node;
+
+       if (!parent)
+               return true;
+       if (!child)
+               return false;
+
+       /* Walks up from @child's hierarchy node looking for @parent's. */
+       node = child->hierarchy;
+       while (node) {
+               /* @parent is in the scoped hierarchy of @child. */
+               if (node == parent->hierarchy)
+                       return true;
+               node = node->parent;
+       }
+       /* There is no relationship between @parent and @child. */
+       return false;
+}
+
+/*
+ * Checks, within an RCU read-side critical section, whether the domain of
+ * @parent is less or equal to the domain of @child (cf. domain_scope_le()).
+ */
+static bool task_is_scoped(const struct task_struct *const parent,
+               const struct task_struct *const child)
+{
+       const struct landlock_ruleset *parent_dom, *child_dom;
+       bool scoped;
+
+       rcu_read_lock();
+       parent_dom = landlock_get_task_domain(parent);
+       child_dom = landlock_get_task_domain(child);
+       scoped = domain_scope_le(parent_dom, child_dom);
+       rcu_read_unlock();
+       return scoped;
+}
+
+/*
+ * Returns 0 if @parent is not landlocked or if @child is within the scope of
+ * @parent's domain, -EPERM otherwise.
+ */
+static int task_ptrace(const struct task_struct *const parent,
+               const struct task_struct *const child)
+{
+       /* The landlocked() test is a quick exit for non-landlocked tasks. */
+       if (!landlocked(parent) || task_is_scoped(parent, child))
+               return 0;
+       return -EPERM;
+}
+
+/**
+ * hook_ptrace_access_check - Determines whether the current process may access
+ *                           another
+ *
+ * @child: Process to be accessed.
+ * @mode: Mode of attachment (not used by this hook).
+ *
+ * If the current task has Landlock rules, then the child must have at least
+ * the same rules.  Else denied.
+ *
+ * Determines whether a process may access another, returning 0 if permission
+ * granted, -errno if denied.
+ */
+static int hook_ptrace_access_check(struct task_struct *const child,
+               const unsigned int mode)
+{
+       /* The current task acts as the tracer (parent). */
+       return task_ptrace(current, child);
+}
+
+/**
+ * hook_ptrace_traceme - Determines whether another process may trace the
+ *                      current one
+ *
+ * @parent: Task proposed to be the tracer.
+ *
+ * If the parent has Landlock rules, then the current task must have the same
+ * or more rules.  Else denied.
+ *
+ * Determines whether the nominated task is permitted to trace the current
+ * process, returning 0 if permission is granted, -errno if denied.
+ */
+static int hook_ptrace_traceme(struct task_struct *const parent)
+{
+       /* The current task acts as the tracee (child). */
+       return task_ptrace(parent, current);
+}
+
+/* Ptrace LSM hooks, registered by landlock_add_ptrace_hooks(). */
+static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+       LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check),
+       LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme),
+};
+
+/* Registers every entry of landlock_hooks[] under the LANDLOCK_NAME LSM. */
+__init void landlock_add_ptrace_hooks(void)
+{
+       security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+                       LANDLOCK_NAME);
+}
diff --git a/security/landlock/ptrace.h b/security/landlock/ptrace.h
new file mode 100644 (file)
index 0000000..265b220
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Ptrace hooks
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_PTRACE_H
+#define _SECURITY_LANDLOCK_PTRACE_H
+
+__init void landlock_add_ptrace_hooks(void);
+
+#endif /* _SECURITY_LANDLOCK_PTRACE_H */
diff --git a/security/landlock/ruleset.c b/security/landlock/ruleset.c
new file mode 100644 (file)
index 0000000..ec72b92
--- /dev/null
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Ruleset management
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <linux/bits.h>
+#include <linux/bug.h>
+#include <linux/compiler_types.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/overflow.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#include "limits.h"
+#include "object.h"
+#include "ruleset.h"
+
+/*
+ * Allocates a zeroed ruleset with room for @num_layers access masks, an empty
+ * rule tree and a usage count of one.
+ *
+ * Returns the new ruleset, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct landlock_ruleset *create_ruleset(const u32 num_layers)
+{
+       struct landlock_ruleset *ruleset;
+
+       ruleset = kzalloc(struct_size(ruleset, fs_access_masks, num_layers),
+                       GFP_KERNEL_ACCOUNT);
+       if (ruleset == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Thanks to kzalloc(), the remaining fields are already set:
+        * hierarchy = NULL
+        * num_rules = 0
+        * fs_access_masks[] = 0
+        */
+       ruleset->num_layers = num_layers;
+       ruleset->root = RB_ROOT;
+       mutex_init(&ruleset->lock);
+       refcount_set(&ruleset->usage, 1);
+       return ruleset;
+}
+
+/*
+ * Creates a single-layer ruleset handling the accesses of @fs_access_mask.
+ *
+ * Returns the new ruleset, ERR_PTR(-ENOMSG) if @fs_access_mask is empty, or
+ * the error returned by create_ruleset().
+ */
+struct landlock_ruleset *landlock_create_ruleset(const u32 fs_access_mask)
+{
+       struct landlock_ruleset *ruleset;
+
+       /* Informs about useless ruleset. */
+       if (fs_access_mask == 0)
+               return ERR_PTR(-ENOMSG);
+
+       ruleset = create_ruleset(1);
+       if (IS_ERR(ruleset))
+               return ruleset;
+       ruleset->fs_access_masks[0] = fs_access_mask;
+       return ruleset;
+}
+
+/*
+ * Build-time check that the num_layers field of struct landlock_rule can hold
+ * LANDLOCK_MAX_NUM_LAYERS: the field is set to all ones and compared with it.
+ */
+static void build_check_rule(void)
+{
+       const struct landlock_rule rule = {
+               .num_layers = ~0,
+       };
+
+       BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS);
+}
+
+/*
+ * Allocates a rule for @object holding a copy of the @num_layers entries of
+ * @layers, followed by a copy of @new_layer when it is not NULL.  A usage
+ * reference is taken on @object.
+ *
+ * Returns the new rule, ERR_PTR(-E2BIG) if @new_layer is given while
+ * @num_layers already reached LANDLOCK_MAX_NUM_LAYERS, or ERR_PTR(-ENOMEM) if
+ * the allocation fails.
+ */
+static struct landlock_rule *create_rule(
+               struct landlock_object *const object,
+               const struct landlock_layer (*const layers)[],
+               const u32 num_layers,
+               const struct landlock_layer *const new_layer)
+{
+       struct landlock_rule *rule;
+       u32 total_layers = num_layers;
+
+       build_check_rule();
+       if (new_layer) {
+               /* Should already be checked by landlock_merge_ruleset(). */
+               if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS))
+                       return ERR_PTR(-E2BIG);
+               total_layers++;
+       }
+
+       rule = kzalloc(struct_size(rule, layers, total_layers),
+                       GFP_KERNEL_ACCOUNT);
+       if (rule == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       RB_CLEAR_NODE(&rule->node);
+       rule->num_layers = total_layers;
+       landlock_get_object(object);
+       rule->object = object;
+
+       /* Copies the original layer stack. */
+       memcpy(rule->layers, layers,
+                       flex_array_size(rule, layers, num_layers));
+       /* Adds a copy of @new_layer on top of the layer stack. */
+       if (new_layer)
+               rule->layers[total_layers - 1] = *new_layer;
+       return rule;
+}
+
+/*
+ * Drops the object reference held by @rule and frees @rule; a NULL @rule is
+ * ignored.  May sleep.
+ */
+static void free_rule(struct landlock_rule *const rule)
+{
+       might_sleep();
+       if (rule) {
+               landlock_put_object(rule->object);
+               kfree(rule);
+       }
+}
+
+/*
+ * Build-time checks that the num_rules, num_layers and fs_access_masks[]
+ * fields of struct landlock_ruleset can hold LANDLOCK_MAX_NUM_RULES,
+ * LANDLOCK_MAX_NUM_LAYERS and LANDLOCK_MASK_ACCESS_FS respectively: each one
+ * is set to all ones and compared with its limit.
+ */
+static void build_check_ruleset(void)
+{
+       const struct landlock_ruleset ruleset = {
+               .num_rules = ~0,
+               .num_layers = ~0,
+       };
+       typeof(ruleset.fs_access_masks[0]) fs_access_mask = ~0;
+
+       BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES);
+       BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS);
+       BUILD_BUG_ON(fs_access_mask < LANDLOCK_MASK_ACCESS_FS);
+}
+
+/**
+ * insert_rule - Create and insert a rule in a ruleset
+ *
+ * @ruleset: The ruleset to be updated.  Its lock must be held by the caller.
+ * @object: The object to build the new rule with.  The underlying kernel
+ *          object must be held by the caller.
+ * @layers: One or multiple layers to be copied into the new rule.
+ * @num_layers: The number of @layers entries.
+ *
+ * When user space requests to add a new rule to a ruleset, @layers only
+ * contains one entry and this entry is not assigned to any level.  In this
+ * case, the new rule will extend @ruleset, similarly to a boolean OR between
+ * access rights.
+ *
+ * When merging a ruleset in a domain, or copying a domain, @layers will be
+ * added to @ruleset as new constraints, similarly to a boolean AND between
+ * access rights.
+ *
+ * Returns 0 on success, -ENOENT if @object or @layers is NULL, -EINVAL if the
+ * layers of the request or of the matching rule are inconsistent, -E2BIG if
+ * @ruleset already holds LANDLOCK_MAX_NUM_RULES rules, or the error returned
+ * by create_rule().
+ */
+static int insert_rule(struct landlock_ruleset *const ruleset,
+               struct landlock_object *const object,
+               const struct landlock_layer (*const layers)[],
+               size_t num_layers)
+{
+       struct rb_node **walker_node;
+       struct rb_node *parent_node = NULL;
+       struct landlock_rule *new_rule;
+
+       might_sleep();
+       lockdep_assert_held(&ruleset->lock);
+       if (WARN_ON_ONCE(!object || !layers))
+               return -ENOENT;
+       /* Looks for a rule keyed by @object, ordering by pointer value. */
+       walker_node = &(ruleset->root.rb_node);
+       while (*walker_node) {
+               struct landlock_rule *const this = rb_entry(*walker_node,
+                               struct landlock_rule, node);
+
+               if (this->object != object) {
+                       parent_node = *walker_node;
+                       if (this->object < object)
+                               walker_node = &((*walker_node)->rb_right);
+                       else
+                               walker_node = &((*walker_node)->rb_left);
+                       continue;
+               }
+
+               /* Only a single-level layer should match an existing rule. */
+               if (WARN_ON_ONCE(num_layers != 1))
+                       return -EINVAL;
+
+               /* If there is a matching rule, updates it. */
+               if ((*layers)[0].level == 0) {
+                       /*
+                        * Extends access rights when the request comes from
+                        * landlock_add_rule(2), i.e. @ruleset is not a domain.
+                        */
+                       if (WARN_ON_ONCE(this->num_layers != 1))
+                               return -EINVAL;
+                       if (WARN_ON_ONCE(this->layers[0].level != 0))
+                               return -EINVAL;
+                       this->layers[0].access |= (*layers)[0].access;
+                       return 0;
+               }
+
+               if (WARN_ON_ONCE(this->layers[0].level == 0))
+                       return -EINVAL;
+
+               /*
+                * Intersects access rights when it is a merge between a
+                * ruleset and a domain.
+                */
+               new_rule = create_rule(object, &this->layers, this->num_layers,
+                               &(*layers)[0]);
+               if (IS_ERR(new_rule))
+                       return PTR_ERR(new_rule);
+               /* The extended copy takes the place of the matching rule. */
+               rb_replace_node(&this->node, &new_rule->node, &ruleset->root);
+               free_rule(this);
+               return 0;
+       }
+
+       /* There is no match for @object. */
+       build_check_ruleset();
+       if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES)
+               return -E2BIG;
+       new_rule = create_rule(object, layers, num_layers, NULL);
+       if (IS_ERR(new_rule))
+               return PTR_ERR(new_rule);
+       /* Links the new rule at the leaf position found by the walk above. */
+       rb_link_node(&new_rule->node, parent_node, walker_node);
+       rb_insert_color(&new_rule->node, &ruleset->root);
+       ruleset->num_rules++;
+       return 0;
+}
+
+/*
+ * Build-time checks that the level and access fields of struct landlock_layer
+ * can hold LANDLOCK_MAX_NUM_LAYERS and LANDLOCK_MASK_ACCESS_FS respectively:
+ * each one is set to all ones and compared with its limit.
+ */
+static void build_check_layer(void)
+{
+       const struct landlock_layer layer = {
+               .level = ~0,
+               .access = ~0,
+       };
+
+       BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS);
+       BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS);
+}
+
+/*
+ * Adds @access for @object to @ruleset through insert_rule(), using a single
+ * layer that is not assigned to any level.
+ *
+ * @ruleset must be locked by the caller.
+ */
+int landlock_insert_rule(struct landlock_ruleset *const ruleset,
+               struct landlock_object *const object, const u32 access)
+{
+       struct landlock_layer unleveled[] = {{
+               /* When @level is zero, insert_rule() extends @ruleset. */
+               .level = 0,
+               .access = access,
+       }};
+
+       build_check_layer();
+       return insert_rule(ruleset, object, &unleveled, ARRAY_SIZE(unleveled));
+}
+
+/* Takes one usage reference on @hierarchy; a NULL @hierarchy is ignored. */
+static inline void get_hierarchy(struct landlock_hierarchy *const hierarchy)
+{
+       if (!hierarchy)
+               return;
+       refcount_inc(&hierarchy->usage);
+}
+
+/*
+ * Drops one usage reference on @hierarchy.  Each node whose count reaches zero
+ * is freed and the reference it held on its parent is dropped in turn.
+ */
+static void put_hierarchy(struct landlock_hierarchy *hierarchy)
+{
+       struct landlock_hierarchy *parent;
+
+       while (hierarchy) {
+               if (!refcount_dec_and_test(&hierarchy->usage))
+                       break;
+               parent = hierarchy->parent;
+               kfree(hierarchy);
+               hierarchy = parent;
+       }
+}
+
+/*
+ * Stacks @src as the newest layer of the domain @dst: stores @src's access
+ * mask in the last slot of @dst->fs_access_masks, then inserts every rule of
+ * @src into @dst with its level set to @dst->num_layers.
+ *
+ * Returns 0 on success or if @src is NULL, -EINVAL if @dst is not a domain or
+ * if a layer count or level is unexpected, or the error returned by
+ * insert_rule().
+ */
+static int merge_ruleset(struct landlock_ruleset *const dst,
+               struct landlock_ruleset *const src)
+{
+       struct landlock_rule *walker_rule, *next_rule;
+       int err = 0;
+
+       might_sleep();
+       /* Should already be checked by landlock_merge_ruleset() */
+       if (WARN_ON_ONCE(!src))
+               return 0;
+       /* Only merge into a domain. */
+       if (WARN_ON_ONCE(!dst || !dst->hierarchy))
+               return -EINVAL;
+
+       /* Locks @dst first because we are its only owner. */
+       mutex_lock(&dst->lock);
+       mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING);
+
+       /* Stacks the new layer. */
+       if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) {
+               err = -EINVAL;
+               goto out_unlock;
+       }
+       dst->fs_access_masks[dst->num_layers - 1] = src->fs_access_masks[0];
+
+       /* Merges the @src tree. */
+       rbtree_postorder_for_each_entry_safe(walker_rule, next_rule,
+                       &src->root, node) {
+               /* The merged rule gets the level of the new (last) layer. */
+               struct landlock_layer layers[] = {{
+                       .level = dst->num_layers,
+               }};
+
+               if (WARN_ON_ONCE(walker_rule->num_layers != 1)) {
+                       err = -EINVAL;
+                       goto out_unlock;
+               }
+               if (WARN_ON_ONCE(walker_rule->layers[0].level != 0)) {
+                       err = -EINVAL;
+                       goto out_unlock;
+               }
+               layers[0].access = walker_rule->layers[0].access;
+               err = insert_rule(dst, walker_rule->object, &layers,
+                               ARRAY_SIZE(layers));
+               if (err)
+                       goto out_unlock;
+       }
+
+out_unlock:
+       mutex_unlock(&src->lock);
+       mutex_unlock(&dst->lock);
+       return err;
+}
+
+/*
+ * Copies every rule and the layer access masks of @parent into @child, then
+ * links @child's hierarchy node to @parent's one (taking a reference on it).
+ * @child->hierarchy must already be allocated by the caller.
+ *
+ * Returns 0 on success or if @parent is NULL, -EINVAL if @child does not have
+ * more layers than @parent or if @parent has no hierarchy, or the error
+ * returned by insert_rule().
+ */
+static int inherit_ruleset(struct landlock_ruleset *const parent,
+               struct landlock_ruleset *const child)
+{
+       struct landlock_rule *walker_rule, *next_rule;
+       int err = 0;
+
+       might_sleep();
+       if (!parent)
+               return 0;
+
+       /* Locks @child first because we are its only owner. */
+       mutex_lock(&child->lock);
+       mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
+
+       /* Copies the @parent tree. */
+       rbtree_postorder_for_each_entry_safe(walker_rule, next_rule,
+                       &parent->root, node) {
+               err = insert_rule(child, walker_rule->object,
+                               &walker_rule->layers, walker_rule->num_layers);
+               if (err)
+                       goto out_unlock;
+       }
+
+       if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) {
+               err = -EINVAL;
+               goto out_unlock;
+       }
+       /* Copies the parent layer stack and leaves a space for the new layer. */
+       memcpy(child->fs_access_masks, parent->fs_access_masks,
+                       flex_array_size(parent, fs_access_masks, parent->num_layers));
+
+       if (WARN_ON_ONCE(!parent->hierarchy)) {
+               err = -EINVAL;
+               goto out_unlock;
+       }
+       /* @child's hierarchy node keeps @parent's one alive. */
+       get_hierarchy(parent->hierarchy);
+       child->hierarchy->parent = parent->hierarchy;
+
+out_unlock:
+       mutex_unlock(&parent->lock);
+       mutex_unlock(&child->lock);
+       return err;
+}
+
+/*
+ * Frees every rule of @ruleset, drops its reference on its hierarchy node,
+ * then frees @ruleset itself.  May sleep.
+ */
+static void free_ruleset(struct landlock_ruleset *const ruleset)
+{
+       struct landlock_rule *freeme, *next;
+
+       might_sleep();
+       rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root,
+                       node)
+               free_rule(freeme);
+       put_hierarchy(ruleset->hierarchy);
+       kfree(ruleset);
+}
+
+/*
+ * Drops one usage reference on @ruleset and frees it when that was the last
+ * one; a NULL @ruleset is ignored.  May sleep.
+ */
+void landlock_put_ruleset(struct landlock_ruleset *const ruleset)
+{
+       might_sleep();
+       if (!ruleset)
+               return;
+       if (refcount_dec_and_test(&ruleset->usage))
+               free_ruleset(ruleset);
+}
+
+/* Work callback: frees the ruleset embedding @work as its work_free field. */
+static void free_ruleset_work(struct work_struct *const work)
+{
+       free_ruleset(container_of(work, struct landlock_ruleset, work_free));
+}
+
+/*
+ * Drops one usage reference on @ruleset; a NULL @ruleset is ignored.  When the
+ * last reference is dropped, the freeing is handed to free_ruleset_work()
+ * through schedule_work() instead of being done by the caller.
+ */
+void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset)
+{
+       if (!ruleset || !refcount_dec_and_test(&ruleset->usage))
+               return;
+       INIT_WORK(&ruleset->work_free, free_ruleset_work);
+       schedule_work(&ruleset->work_free);
+}
+
+/**
+ * landlock_merge_ruleset - Merge a ruleset with a domain
+ *
+ * @parent: Parent domain, or NULL if there is none.
+ * @ruleset: New ruleset to be merged.
+ *
+ * Creates a new domain which inherits the rules and layers of @parent (if
+ * any) and stacks @ruleset on top of them as a new layer.  Neither @parent nor
+ * @ruleset is returned as is: a new ruleset is always allocated.
+ *
+ * Returns the new domain on success, ERR_PTR(-EINVAL) if @ruleset is NULL or
+ * is the same as @parent, ERR_PTR(-E2BIG) if @parent already has
+ * LANDLOCK_MAX_NUM_LAYERS layers, ERR_PTR(-ENOMEM) if an allocation fails, or
+ * the error reported by inherit_ruleset() or merge_ruleset().
+ */
+struct landlock_ruleset *landlock_merge_ruleset(
+               struct landlock_ruleset *const parent,
+               struct landlock_ruleset *const ruleset)
+{
+       struct landlock_ruleset *new_dom;
+       u32 num_layers;
+       int err;
+
+       might_sleep();
+       if (WARN_ON_ONCE(!ruleset || parent == ruleset))
+               return ERR_PTR(-EINVAL);
+
+       if (parent) {
+               if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS)
+                       return ERR_PTR(-E2BIG);
+               num_layers = parent->num_layers + 1;
+       } else {
+               num_layers = 1;
+       }
+
+       /* Creates a new domain... */
+       new_dom = create_ruleset(num_layers);
+       if (IS_ERR(new_dom))
+               return new_dom;
+       new_dom->hierarchy = kzalloc(sizeof(*new_dom->hierarchy),
+                       GFP_KERNEL_ACCOUNT);
+       if (!new_dom->hierarchy) {
+               err = -ENOMEM;
+               goto out_put_dom;
+       }
+       refcount_set(&new_dom->hierarchy->usage, 1);
+
+       /* ...as a child of @parent... */
+       err = inherit_ruleset(parent, new_dom);
+       if (err)
+               goto out_put_dom;
+
+       /* ...and including @ruleset. */
+       err = merge_ruleset(new_dom, ruleset);
+       if (err)
+               goto out_put_dom;
+
+       return new_dom;
+
+out_put_dom:
+       /* Drops the only reference, which frees the partially built domain. */
+       landlock_put_ruleset(new_dom);
+       return ERR_PTR(err);
+}
+
+/*
+ * Looks up the rule of @ruleset keyed by @object.  Returns NULL if @object is
+ * NULL or if there is no such rule.
+ *
+ * The returned rule has the same lifetime as @ruleset.
+ */
+const struct landlock_rule *landlock_find_rule(
+               const struct landlock_ruleset *const ruleset,
+               const struct landlock_object *const object)
+{
+       const struct rb_node *cursor;
+
+       if (!object)
+               return NULL;
+
+       cursor = ruleset->root.rb_node;
+       while (cursor) {
+               struct landlock_rule *const rule = rb_entry(cursor,
+                               struct landlock_rule, node);
+
+               if (rule->object == object)
+                       return rule;
+               /* Rules are ordered by the address of their object. */
+               cursor = (rule->object < object) ?
+                       cursor->rb_right : cursor->rb_left;
+       }
+       return NULL;
+}
diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h
new file mode 100644 (file)
index 0000000..2d3ed7e
--- /dev/null
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Ruleset management
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_RULESET_H
+#define _SECURITY_LANDLOCK_RULESET_H
+
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/workqueue.h>
+
+#include "object.h"
+
+/**
+ * struct landlock_layer - Access rights for a given layer
+ */
+struct landlock_layer {
+       /**
+        * @level: Position of this layer in the layer stack.  A level of zero
+        * means that the layer is not assigned to any level (i.e. it comes
+        * from a ruleset which is not a domain).
+        */
+       u16 level;
+       /**
+        * @access: Bitfield of allowed actions on the kernel object.  They are
+        * relative to the object type (e.g. %LANDLOCK_ACCESS_FS_READ_FILE).
+        */
+       u16 access;
+};
+
+/**
+ * struct landlock_rule - Access rights tied to an object
+ */
+struct landlock_rule {
+       /**
+        * @node: Node in the ruleset's red-black tree.
+        */
+       struct rb_node node;
+       /**
+        * @object: Pointer to identify a kernel object (e.g. an inode).  This
+        * is used as a key for this ruleset element.  This pointer is set once
+        * and never modified.  It always points to an allocated object because
+        * each rule increments the refcount of its object.
+        */
+       struct landlock_object *object;
+       /**
+        * @num_layers: Number of entries in @layers.
+        */
+       u32 num_layers;
+       /**
+        * @layers: Stack of layers, from the oldest to the newest, implemented
+        * as a flexible array member (FAM).
+        */
+       struct landlock_layer layers[];
+};
+
+/**
+ * struct landlock_hierarchy - Node in a ruleset hierarchy
+ */
+struct landlock_hierarchy {
+       /**
+        * @parent: Pointer to the parent node, or NULL if it is a root
+        * Landlock domain.  A node holds a reference on its parent.
+        */
+       struct landlock_hierarchy *parent;
+       /**
+        * @usage: Number of potential children domains plus their parent
+        * domain.  The node is freed when this counter drops to zero.
+        */
+       refcount_t usage;
+};
+
+/**
+ * struct landlock_ruleset - Landlock ruleset
+ *
+ * This data structure must contain unique entries, be updatable, and quick to
+ * match an object.
+ */
+struct landlock_ruleset {
+       /**
+        * @root: Root of a red-black tree containing &struct landlock_rule
+        * nodes.  Once a ruleset is tied to a process (i.e. as a domain), this
+        * tree is immutable until @usage reaches zero.
+        */
+       struct rb_root root;
+       /**
+        * @hierarchy: Enables hierarchy identification even when a parent
+        * domain vanishes.  This is needed for the ptrace protection.  It is
+        * NULL for a ruleset which is not a domain.
+        */
+       struct landlock_hierarchy *hierarchy;
+       union {
+               /**
+                * @work_free: Enables freeing a ruleset from within a lockless
+                * section.  This is only used by
+                * landlock_put_ruleset_deferred() when @usage reaches zero.
+                * The fields @lock, @usage, @num_rules, @num_layers and
+                * @fs_access_masks are then unused.
+                */
+               struct work_struct work_free;
+               struct {
+                       /**
+                        * @lock: Protects against concurrent modifications of
+                        * @root, if @usage is greater than zero.
+                        */
+                       struct mutex lock;
+                       /**
+                        * @usage: Number of processes (i.e. domains) or file
+                        * descriptors referencing this ruleset.
+                        */
+                       refcount_t usage;
+                       /**
+                        * @num_rules: Number of non-overlapping (i.e. not for
+                        * the same object) rules in this ruleset.
+                        */
+                       u32 num_rules;
+                       /**
+                        * @num_layers: Number of layers that are used in this
+                        * ruleset.  This enables checking that all the layers
+                        * allow an access request.  A ruleset created by
+                        * landlock_create_ruleset() (i.e. not a domain) has
+                        * exactly one layer.
+                        */
+                       u32 num_layers;
+                       /**
+                        * @fs_access_masks: Contains the subset of filesystem
+                        * actions that are restricted by a ruleset.  A domain
+                        * saves all layers of merged rulesets in a stack
+                        * (FAM), starting from the first layer to the last
+                        * one.  These layers are used when merging rulesets,
+                        * for user space backward compatibility (i.e.
+                        * future-proof), and to properly handle merged
+                        * rulesets without overlapping access rights.  These
+                        * layers are set once and never changed for the
+                        * lifetime of the ruleset.
+                        */
+                       u16 fs_access_masks[];
+               };
+       };
+};
+
+struct landlock_ruleset *landlock_create_ruleset(const u32 fs_access_mask);
+
+void landlock_put_ruleset(struct landlock_ruleset *const ruleset);
+void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset);
+
+int landlock_insert_rule(struct landlock_ruleset *const ruleset,
+               struct landlock_object *const object, const u32 access);
+
+struct landlock_ruleset *landlock_merge_ruleset(
+               struct landlock_ruleset *const parent,
+               struct landlock_ruleset *const ruleset);
+
+const struct landlock_rule *landlock_find_rule(
+               const struct landlock_ruleset *const ruleset,
+               const struct landlock_object *const object);
+
+/* Takes one usage reference on @ruleset; a NULL @ruleset is ignored. */
+static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset)
+{
+       if (!ruleset)
+               return;
+       refcount_inc(&ruleset->usage);
+}
+
+#endif /* _SECURITY_LANDLOCK_RULESET_H */
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
new file mode 100644 (file)
index 0000000..f8e8e98
--- /dev/null
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Security framework setup
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <linux/init.h>
+#include <linux/lsm_hooks.h>
+
+#include "common.h"
+#include "cred.h"
+#include "fs.h"
+#include "ptrace.h"
+#include "setup.h"
+
+/*
+ * Starts as false; landlock_init() sets it to true after calling the hook
+ * setup helpers.
+ */
+bool landlock_initialized __lsm_ro_after_init = false;
+
+/* Sizes of the cred, inode and superblock security blobs used by Landlock. */
+struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = {
+       .lbs_cred = sizeof(struct landlock_cred_security),
+       .lbs_inode = sizeof(struct landlock_inode_security),
+       .lbs_superblock = sizeof(struct landlock_superblock_security),
+};
+
+/*
+ * LSM init callback: calls the credential, ptrace and filesystem hook setup
+ * helpers, then sets landlock_initialized and logs a message.  Always returns
+ * 0.
+ */
+static int __init landlock_init(void)
+{
+       landlock_add_cred_hooks();
+       landlock_add_ptrace_hooks();
+       landlock_add_fs_hooks();
+       /* Only set once all the hook setup helpers have returned. */
+       landlock_initialized = true;
+       pr_info("Up and running.\n");
+       return 0;
+}
+
+/*
+ * LSM descriptor: ties the Landlock name to its init callback and to the blob
+ * sizes it requests.
+ */
+DEFINE_LSM(LANDLOCK_NAME) = {
+       .name = LANDLOCK_NAME,
+       .init = landlock_init,
+       .blobs = &landlock_blob_sizes,
+};
diff --git a/security/landlock/setup.h b/security/landlock/setup.h
new file mode 100644 (file)
index 0000000..1daffab
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Security framework setup
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_SETUP_H
+#define _SECURITY_LANDLOCK_SETUP_H
+
+#include <linux/lsm_hooks.h>
+
+extern bool landlock_initialized;
+
+extern struct lsm_blob_sizes landlock_blob_sizes;
+
+#endif /* _SECURITY_LANDLOCK_SETUP_H */
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
new file mode 100644 (file)
index 0000000..3239696
--- /dev/null
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - System call implementations and user space interfaces
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/anon_inodes.h>
+#include <linux/build_bug.h>
+#include <linux/capability.h>
+#include <linux/compiler_types.h>
+#include <linux/dcache.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/stddef.h>
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/landlock.h>
+
+#include "cred.h"
+#include "fs.h"
+#include "limits.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * copy_min_struct_from_user - Safe future-proof argument copying
+ *
+ * Extend copy_struct_from_user() to check for consistent user buffer.
+ *
+ * @dst: Kernel space pointer (a NULL @dst is rejected at build time).
+ * @ksize: Actual size of the data pointed to by @dst.
+ * @ksize_min: Minimal required size to be copied.
+ * @src: User space pointer or NULL.
+ * @usize: (Alleged) size of the data pointed to by @src.
+ *
+ * Return: -EFAULT if @src is NULL, -EINVAL if @usize is smaller than
+ * @ksize_min, -E2BIG if @usize is greater than PAGE_SIZE, otherwise the
+ * return value of copy_struct_from_user().
+ */
+static __always_inline int copy_min_struct_from_user(void *const dst,
+               const size_t ksize, const size_t ksize_min,
+               const void __user *const src, const size_t usize)
+{
+       /* Checks buffer inconsistencies. */
+       BUILD_BUG_ON(!dst);
+       if (!src)
+               return -EFAULT;
+
+       /* Checks size ranges. */
+       BUILD_BUG_ON(ksize <= 0);
+       BUILD_BUG_ON(ksize < ksize_min);
+       if (usize < ksize_min)
+               return -EINVAL;
+       if (usize > PAGE_SIZE)
+               return -E2BIG;
+
+       /* Copies user buffer and fills with zeros. */
+       return copy_struct_from_user(dst, ksize, src, usize);
+}
+
+/*
+ * This function only contains arithmetic operations with constants, leading to
+ * BUILD_BUG_ON().  The related code is evaluated and checked at build time,
+ * but it is then ignored thanks to compiler optimizations.
+ */
+static void build_check_abi(void)
+{
+       struct landlock_ruleset_attr ruleset_attr;
+       struct landlock_path_beneath_attr path_beneath_attr;
+       size_t ruleset_size, path_beneath_size;
+
+       /*
+        * For each user space ABI structure, first checks that there is no
+        * hole in it, then checks that all architectures have the same
+        * struct size.
+        */
+       /* The struct must be exactly as large as its only member: 8 bytes. */
+       ruleset_size = sizeof(ruleset_attr.handled_access_fs);
+       BUILD_BUG_ON(sizeof(ruleset_attr) != ruleset_size);
+       BUILD_BUG_ON(sizeof(ruleset_attr) != 8);
+
+       /* The struct must be exactly the sum of its two members: 12 bytes. */
+       path_beneath_size = sizeof(path_beneath_attr.allowed_access);
+       path_beneath_size += sizeof(path_beneath_attr.parent_fd);
+       BUILD_BUG_ON(sizeof(path_beneath_attr) != path_beneath_size);
+       BUILD_BUG_ON(sizeof(path_beneath_attr) != 12);
+}
+
+/* Ruleset handling */
+
+/*
+ * Release handler of a ruleset file: puts the ruleset stored in the file's
+ * private data.  Always returns 0.
+ */
+static int fop_ruleset_release(struct inode *const inode,
+               struct file *const filp)
+{
+       landlock_put_ruleset(filp->private_data);
+       return 0;
+}
+
+/*
+ * Read handler of a ruleset file: never reads anything and always returns
+ * -EINVAL, whatever the arguments.
+ */
+static ssize_t fop_dummy_read(struct file *const filp, char __user *const buf,
+               const size_t size, loff_t *const ppos)
+{
+       /* Dummy handler to enable FMODE_CAN_READ. */
+       return -EINVAL;
+}
+
+/*
+ * Write handler of a ruleset file: never writes anything and always returns
+ * -EINVAL, whatever the arguments.
+ */
+static ssize_t fop_dummy_write(struct file *const filp,
+               const char __user *const buf, const size_t size,
+               loff_t *const ppos)
+{
+       /* Dummy handler to enable FMODE_CAN_WRITE. */
+       return -EINVAL;
+}
+
+/*
+ * A ruleset file descriptor makes it possible to build a ruleset by adding
+ * (i.e. writing) rule after rule, without relying on the task's context.
+ * This reentrant design is also used in a read way to enforce the ruleset on
+ * the current task.
+ *
+ * These operations also identify a ruleset file: get_ruleset_from_fd() and
+ * get_path_from_fd() compare a file's f_op against &ruleset_fops.
+ */
+static const struct file_operations ruleset_fops = {
+       .release = fop_ruleset_release,
+       .read = fop_dummy_read,
+       .write = fop_dummy_write,
+};
+
+/*
+ * Value returned by landlock_create_ruleset(2) when called with
+ * LANDLOCK_CREATE_RULESET_VERSION, a NULL attr and a size of 0.
+ */
+#define LANDLOCK_ABI_VERSION   1
+
+/**
+ * sys_landlock_create_ruleset - Create a new ruleset
+ *
+ * @attr: Pointer to a &struct landlock_ruleset_attr identifying the scope of
+ *        the new ruleset.
+ * @size: Size of the pointed &struct landlock_ruleset_attr (needed for
+ *        backward and forward compatibility).
+ * @flags: Supported value: %LANDLOCK_CREATE_RULESET_VERSION.
+ *
+ * This system call creates a new Landlock ruleset and returns the related
+ * file descriptor on success.
+ *
+ * If @flags is %LANDLOCK_CREATE_RULESET_VERSION and @attr is NULL and @size is
+ * 0, then the returned value is the highest supported Landlock ABI version
+ * (starting at 1).
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: unknown @flags, or unknown access, or too small @size;
+ * - E2BIG or EFAULT: @attr or @size inconsistencies;
+ * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs.
+ */
+SYSCALL_DEFINE3(landlock_create_ruleset,
+               const struct landlock_ruleset_attr __user *const, attr,
+               const size_t, size, const __u32, flags)
+{
+       struct landlock_ruleset_attr ruleset_attr;
+       struct landlock_ruleset *ruleset;
+       int err, ruleset_fd;
+
+       /* Build-time checks. */
+       build_check_abi();
+
+       if (!landlock_initialized)
+               return -EOPNOTSUPP;
+
+       /*
+        * The only accepted non-zero @flags is the version request, and only
+        * with a NULL @attr and a zero @size; anything else is invalid.
+        */
+       if (flags) {
+               if ((flags == LANDLOCK_CREATE_RULESET_VERSION)
+                               && !attr && !size)
+                       return LANDLOCK_ABI_VERSION;
+               return -EINVAL;
+       }
+
+       /* Copies raw user space buffer. */
+       err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr),
+                       offsetofend(typeof(ruleset_attr), handled_access_fs),
+                       attr, size);
+       if (err)
+               return err;
+
+       /* Checks content (and 32-bits cast). */
+       if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) !=
+                       LANDLOCK_MASK_ACCESS_FS)
+               return -EINVAL;
+
+       /* Checks arguments and transforms to kernel struct. */
+       ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs);
+       if (IS_ERR(ruleset))
+               return PTR_ERR(ruleset);
+
+       /* Creates anonymous FD referring to the ruleset. */
+       ruleset_fd = anon_inode_getfd("landlock-ruleset", &ruleset_fops,
+                       ruleset, O_RDWR | O_CLOEXEC);
+       /* Without a file to carry it, the new ruleset must be put here. */
+       if (ruleset_fd < 0)
+               landlock_put_ruleset(ruleset);
+       return ruleset_fd;
+}
+
+/*
+ * Returns an owned ruleset from a FD. It is thus needed to call
+ * landlock_put_ruleset() on the return value.
+ *
+ * @fd: File descriptor expected to refer to a ruleset file.
+ * @mode: File mode bit(s); at least one of them must be set in the file's
+ *        f_mode (the callers pass FMODE_CAN_WRITE or FMODE_CAN_READ).
+ *
+ * On error, returns ERR_PTR(-EBADF) if no file could be obtained from @fd,
+ * ERR_PTR(-EBADFD) if the file is not a ruleset file, ERR_PTR(-EPERM) if the
+ * file lacks @mode, or ERR_PTR(-EINVAL) (with a one-time warning) if the
+ * ruleset does not have exactly one layer.
+ */
+static struct landlock_ruleset *get_ruleset_from_fd(const int fd,
+               const fmode_t mode)
+{
+       struct fd ruleset_f;
+       struct landlock_ruleset *ruleset;
+
+       ruleset_f = fdget(fd);
+       if (!ruleset_f.file)
+               return ERR_PTR(-EBADF);
+
+       /* Checks FD type and access right. */
+       if (ruleset_f.file->f_op != &ruleset_fops) {
+               ruleset = ERR_PTR(-EBADFD);
+               goto out_fdput;
+       }
+       if (!(ruleset_f.file->f_mode & mode)) {
+               ruleset = ERR_PTR(-EPERM);
+               goto out_fdput;
+       }
+       ruleset = ruleset_f.file->private_data;
+       if (WARN_ON_ONCE(ruleset->num_layers != 1)) {
+               ruleset = ERR_PTR(-EINVAL);
+               goto out_fdput;
+       }
+       /* Takes the reference handed to the caller before the file is put. */
+       landlock_get_ruleset(ruleset);
+
+out_fdput:
+       fdput(ruleset_f);
+       return ruleset;
+}
+
+/* Path handling */
+
+/*
+ * Copies the path of the file referred to by @fd into @path and takes a
+ * reference on it.
+ *
+ * @fd: File descriptor; it may have been opened with O_PATH.
+ * @path: Must call path_put(@path) after the call if it succeeded.
+ *
+ * Returns 0 on success, -EBADF if no file could be obtained from @fd, or
+ * -EBADFD if the file is of a forbidden kind (see the check below).
+ */
+static int get_path_from_fd(const s32 fd, struct path *const path)
+{
+       struct fd f;
+       int err = 0;
+
+       /* @fd must have the same type as the user space parent_fd field. */
+       BUILD_BUG_ON(!__same_type(fd,
+               ((struct landlock_path_beneath_attr *)NULL)->parent_fd));
+
+       /* Handles O_PATH. */
+       f = fdget_raw(fd);
+       if (!f.file)
+               return -EBADF;
+       /*
+        * Forbids ruleset FDs, internal filesystems (e.g. nsfs), including
+        * pseudo filesystems that will never be mountable (e.g. sockfs,
+        * pipefs).
+        */
+       if ((f.file->f_op == &ruleset_fops) ||
+                       (f.file->f_path.mnt->mnt_flags & MNT_INTERNAL) ||
+                       (f.file->f_path.dentry->d_sb->s_flags & SB_NOUSER) ||
+                       d_is_negative(f.file->f_path.dentry) ||
+                       IS_PRIVATE(d_backing_inode(f.file->f_path.dentry))) {
+               err = -EBADFD;
+               goto out_fdput;
+       }
+       /* Takes the path reference before the file is put. */
+       *path = f.file->f_path;
+       path_get(path);
+
+out_fdput:
+       fdput(f);
+       return err;
+}
+
+/**
+ * sys_landlock_add_rule - Add a new rule to a ruleset
+ *
+ * @ruleset_fd: File descriptor tied to the ruleset that should be extended
+ *             with the new rule.
+ * @rule_type: Identify the structure type pointed to by @rule_attr (only
+ *             LANDLOCK_RULE_PATH_BENEATH for now).
+ * @rule_attr: Pointer to a rule (only of type &struct
+ *             landlock_path_beneath_attr for now).
+ * @flags: Must be 0.
+ *
+ * This system call defines a new rule and adds it to an existing ruleset.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0, or @rule_type is not
+ *   LANDLOCK_RULE_PATH_BENEATH, or inconsistent access in the rule (i.e.
+ *   &landlock_path_beneath_attr.allowed_access is not a subset of the
+ *   ruleset's handled accesses);
+ * - ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access);
+ * - EBADF: @ruleset_fd is not a file descriptor for the current thread, or a
+ *   member of @rule_attr is not a file descriptor as expected;
+ * - EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of
+ *   @rule_attr is not the expected file descriptor type (e.g. a ruleset file
+ *   descriptor, or a file from an internal filesystem);
+ * - EPERM: @ruleset_fd has no write access to the underlying ruleset;
+ * - EFAULT: @rule_attr inconsistency.
+ */
+SYSCALL_DEFINE4(landlock_add_rule,
+               const int, ruleset_fd, const enum landlock_rule_type, rule_type,
+               const void __user *const, rule_attr, const __u32, flags)
+{
+       struct landlock_path_beneath_attr path_beneath_attr;
+       struct path path;
+       struct landlock_ruleset *ruleset;
+       int res, err;
+
+       if (!landlock_initialized)
+               return -EOPNOTSUPP;
+
+       /* No flag for now. */
+       if (flags)
+               return -EINVAL;
+
+       if (rule_type != LANDLOCK_RULE_PATH_BENEATH)
+               return -EINVAL;
+
+       /* Copies raw user space buffer, only one type for now. */
+       res = copy_from_user(&path_beneath_attr, rule_attr,
+                       sizeof(path_beneath_attr));
+       if (res)
+               return -EFAULT;
+
+       /* Gets and checks the ruleset. */
+       ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_WRITE);
+       if (IS_ERR(ruleset))
+               return PTR_ERR(ruleset);
+
+       /*
+        * Informs about useless rule: empty allowed_access (i.e. deny rules)
+        * are ignored in path walks.
+        */
+       if (!path_beneath_attr.allowed_access) {
+               err = -ENOMSG;
+               goto out_put_ruleset;
+       }
+       /*
+        * Checks that allowed_access matches the @ruleset constraints
+        * (ruleset->fs_access_masks[0] is automatically upgraded to 64-bits).
+        */
+       if ((path_beneath_attr.allowed_access | ruleset->fs_access_masks[0]) !=
+                       ruleset->fs_access_masks[0]) {
+               err = -EINVAL;
+               goto out_put_ruleset;
+       }
+
+       /* Gets and checks the new rule. */
+       err = get_path_from_fd(path_beneath_attr.parent_fd, &path);
+       if (err)
+               goto out_put_ruleset;
+
+       /* Imports the new rule. */
+       err = landlock_append_fs_rule(ruleset, &path,
+                       path_beneath_attr.allowed_access);
+       /* The path reference is dropped whether or not the rule was added. */
+       path_put(&path);
+
+out_put_ruleset:
+       landlock_put_ruleset(ruleset);
+       return err;
+}
+
+/* Enforcement */
+
+/**
+ * sys_landlock_restrict_self - Enforce a ruleset on the calling thread
+ *
+ * @ruleset_fd: File descriptor tied to the ruleset to merge with the target.
+ * @flags: Must be 0.
+ *
+ * This system call enforces a Landlock ruleset on the current thread.
+ * Enforcing a ruleset requires that the task has CAP_SYS_ADMIN in its
+ * namespace or is running with no_new_privs.  This avoids scenarios where
+ * unprivileged tasks can affect the behavior of privileged children.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0;
+ * - EBADF: @ruleset_fd is not a file descriptor for the current thread;
+ * - EBADFD: @ruleset_fd is not a ruleset file descriptor;
+ * - EPERM: @ruleset_fd has no read access to the underlying ruleset, or the
+ *   current thread is not running with no_new_privs, or it doesn't have
+ *   CAP_SYS_ADMIN in its namespace;
+ * - ENOMEM: the new credentials could not be prepared;
+ * - E2BIG: The maximum number of stacked rulesets is reached for the current
+ *   thread.
+ */
+SYSCALL_DEFINE2(landlock_restrict_self,
+               const int, ruleset_fd, const __u32, flags)
+{
+       struct landlock_ruleset *new_dom, *ruleset;
+       struct cred *new_cred;
+       struct landlock_cred_security *new_llcred;
+       int err;
+
+       if (!landlock_initialized)
+               return -EOPNOTSUPP;
+
+       /* No flag for now. */
+       if (flags)
+               return -EINVAL;
+
+       /*
+        * Similar checks as for seccomp(2), except that an -EPERM may be
+        * returned.
+        */
+       if (!task_no_new_privs(current) &&
+                       !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Gets and checks the ruleset. */
+       ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_READ);
+       if (IS_ERR(ruleset))
+               return PTR_ERR(ruleset);
+
+       /* Prepares new credentials. */
+       new_cred = prepare_creds();
+       if (!new_cred) {
+               err = -ENOMEM;
+               goto out_put_ruleset;
+       }
+       new_llcred = landlock_cred(new_cred);
+
+       /*
+        * There is no possible race condition while copying and manipulating
+        * the current credentials because they are dedicated per thread.
+        */
+       new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
+       if (IS_ERR(new_dom)) {
+               err = PTR_ERR(new_dom);
+               goto out_put_creds;
+       }
+
+       /* Replaces the old (prepared) domain. */
+       landlock_put_ruleset(new_llcred->domain);
+       new_llcred->domain = new_dom;
+
+       /* Drops the reference taken by get_ruleset_from_fd(). */
+       landlock_put_ruleset(ruleset);
+       return commit_creds(new_cred);
+
+out_put_creds:
+       abort_creds(new_cred);
+
+out_put_ruleset:
+       landlock_put_ruleset(ruleset);
+       return err;
+}
index 8a176b6..1079c6d 100644 (file)
@@ -125,7 +125,6 @@ static int safesetid_security_capable(const struct cred *cred,
                pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n",
                        __kuid_val(cred->uid));
                return -EPERM;
-               break;
        case CAP_SETGID:
                /*
                * If no policy applies to this task, allow the use of CAP_SETGID for
@@ -140,11 +139,9 @@ static int safesetid_security_capable(const struct cred *cred,
                pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n",
                        __kuid_val(cred->uid));
                return -EPERM;
-               break;
        default:
                /* Error, the only capabilities were checking for is CAP_SETUID/GID */
                return 0;
-               break;
        }
        return 0;
 }
index 94383f8..b38155b 100644 (file)
@@ -203,6 +203,7 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
        lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
        lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
        lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+       lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock);
        lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
 }
 
@@ -333,12 +334,13 @@ static void __init ordered_lsm_init(void)
        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);
 
-       init_debug("cred blob size     = %d\n", blob_sizes.lbs_cred);
-       init_debug("file blob size     = %d\n", blob_sizes.lbs_file);
-       init_debug("inode blob size    = %d\n", blob_sizes.lbs_inode);
-       init_debug("ipc blob size      = %d\n", blob_sizes.lbs_ipc);
-       init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
-       init_debug("task blob size     = %d\n", blob_sizes.lbs_task);
+       init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
+       init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
+       init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
+       init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
+       init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
+       init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
+       init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
 
        /*
         * Create any kmem_caches needed for blobs
@@ -670,6 +672,27 @@ static void __init lsm_early_task(struct task_struct *task)
                panic("%s: Early task alloc failed.\n", __func__);
 }
 
+/**
+ * lsm_superblock_alloc - allocate a composite superblock blob
+ * @sb: the superblock that needs a blob
+ *
+ * Zero-allocate the superblock blob shared by all the modules; when no
+ * module asked for superblock space, @sb->s_security is simply set to NULL.
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_superblock_alloc(struct super_block *sb)
+{
+       const int blob_size = blob_sizes.lbs_superblock;
+
+       if (!blob_size) {
+               sb->s_security = NULL;
+               return 0;
+       }
+
+       sb->s_security = kzalloc(blob_size, GFP_KERNEL);
+       return sb->s_security ? 0 : -ENOMEM;
+}
+
 /*
  * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
  * can be accessed with:
@@ -867,12 +890,26 @@ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *
 
 int security_sb_alloc(struct super_block *sb)
 {
-       return call_int_hook(sb_alloc_security, 0, sb);
+       int rc = lsm_superblock_alloc(sb);
+
+       if (unlikely(rc))
+               return rc;
+       rc = call_int_hook(sb_alloc_security, 0, sb);
+       if (unlikely(rc))
+               security_sb_free(sb);
+       return rc;
+}
+
+/* Calls the sb_delete LSM hook for @sb; nothing is returned. */
+void security_sb_delete(struct super_block *sb)
+{
+       call_void_hook(sb_delete, sb);
+}
 
 void security_sb_free(struct super_block *sb)
 {
        call_void_hook(sb_free_security, sb);
+       /* Frees the blob set up by lsm_superblock_alloc() (may be NULL). */
+       kfree(sb->s_security);
+       sb->s_security = NULL;
 }
 
 void security_free_mnt_opts(void **mnt_opts)
index 92f909a..eaea837 100644 (file)
@@ -358,7 +358,7 @@ static void inode_free_security(struct inode *inode)
 
        if (!isec)
                return;
-       sbsec = inode->i_sb->s_security;
+       sbsec = selinux_superblock(inode->i_sb);
        /*
         * As not all inode security structures are in a list, we check for
         * empty list outside of the lock to make sure that we won't waste
@@ -376,13 +376,6 @@ static void inode_free_security(struct inode *inode)
        }
 }
 
-static void superblock_free_security(struct super_block *sb)
-{
-       struct superblock_security_struct *sbsec = sb->s_security;
-       sb->s_security = NULL;
-       kfree(sbsec);
-}
-
 struct selinux_mnt_opts {
        const char *fscontext, *context, *rootcontext, *defcontext;
 };
@@ -494,7 +487,7 @@ static int selinux_is_genfs_special_handling(struct super_block *sb)
 
 static int selinux_is_sblabel_mnt(struct super_block *sb)
 {
-       struct superblock_security_struct *sbsec = sb->s_security;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
 
        /*
         * IMPORTANT: Double-check logic in this function when adding a new
@@ -571,7 +564,7 @@ fallback:
 
 static int sb_finish_set_opts(struct super_block *sb)
 {
-       struct superblock_security_struct *sbsec = sb->s_security;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        int rc = 0;
@@ -662,7 +655,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
                                unsigned long *set_kern_flags)
 {
        const struct cred *cred = current_cred();
-       struct superblock_security_struct *sbsec = sb->s_security;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct selinux_mnt_opts *opts = mnt_opts;
        struct inode_security_struct *root_isec;
@@ -900,8 +893,8 @@ out_double_mount:
 static int selinux_cmp_sb_context(const struct super_block *oldsb,
                                    const struct super_block *newsb)
 {
-       struct superblock_security_struct *old = oldsb->s_security;
-       struct superblock_security_struct *new = newsb->s_security;
+       struct superblock_security_struct *old = selinux_superblock(oldsb);
+       struct superblock_security_struct *new = selinux_superblock(newsb);
        char oldflags = old->flags & SE_MNTMASK;
        char newflags = new->flags & SE_MNTMASK;
 
@@ -933,8 +926,9 @@ static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
                                        unsigned long *set_kern_flags)
 {
        int rc = 0;
-       const struct superblock_security_struct *oldsbsec = oldsb->s_security;
-       struct superblock_security_struct *newsbsec = newsb->s_security;
+       const struct superblock_security_struct *oldsbsec =
+                                               selinux_superblock(oldsb);
+       struct superblock_security_struct *newsbsec = selinux_superblock(newsb);
 
        int set_fscontext =     (oldsbsec->flags & FSCONTEXT_MNT);
        int set_context =       (oldsbsec->flags & CONTEXT_MNT);
@@ -1113,7 +1107,7 @@ static int show_sid(struct seq_file *m, u32 sid)
 
 static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
 {
-       struct superblock_security_struct *sbsec = sb->s_security;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
        int rc;
 
        if (!(sbsec->flags & SE_SBINITIALIZED))
@@ -1464,7 +1458,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
        if (isec->sclass == SECCLASS_FILE)
                isec->sclass = inode_mode_to_security_class(inode->i_mode);
 
-       sbsec = inode->i_sb->s_security;
+       sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SE_SBINITIALIZED)) {
                /* Defer initialization until selinux_complete_init,
                   after the initial policy is loaded and the security
@@ -1815,7 +1809,8 @@ selinux_determine_inode_label(const struct task_security_struct *tsec,
                                 const struct qstr *name, u16 tclass,
                                 u32 *_new_isid)
 {
-       const struct superblock_security_struct *sbsec = dir->i_sb->s_security;
+       const struct superblock_security_struct *sbsec =
+                                               selinux_superblock(dir->i_sb);
 
        if ((sbsec->flags & SE_SBINITIALIZED) &&
            (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
@@ -1846,7 +1841,7 @@ static int may_create(struct inode *dir,
        int rc;
 
        dsec = inode_security(dir);
-       sbsec = dir->i_sb->s_security;
+       sbsec = selinux_superblock(dir->i_sb);
 
        sid = tsec->sid;
 
@@ -1995,7 +1990,7 @@ static int superblock_has_perm(const struct cred *cred,
        struct superblock_security_struct *sbsec;
        u32 sid = cred_sid(cred);
 
-       sbsec = sb->s_security;
+       sbsec = selinux_superblock(sb);
        return avc_has_perm(&selinux_state,
                            sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad);
 }
@@ -2617,11 +2612,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 
 static int selinux_sb_alloc_security(struct super_block *sb)
 {
-       struct superblock_security_struct *sbsec;
-
-       sbsec = kzalloc(sizeof(struct superblock_security_struct), GFP_KERNEL);
-       if (!sbsec)
-               return -ENOMEM;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
 
        mutex_init(&sbsec->lock);
        INIT_LIST_HEAD(&sbsec->isec_head);
@@ -2629,16 +2620,10 @@ static int selinux_sb_alloc_security(struct super_block *sb)
        sbsec->sid = SECINITSID_UNLABELED;
        sbsec->def_sid = SECINITSID_FILE;
        sbsec->mntpoint_sid = SECINITSID_UNLABELED;
-       sb->s_security = sbsec;
 
        return 0;
 }
 
-static void selinux_sb_free_security(struct super_block *sb)
-{
-       superblock_free_security(sb);
-}
-
 static inline int opt_len(const char *s)
 {
        bool open_quote = false;
@@ -2772,7 +2757,7 @@ static int selinux_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts)
 static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
 {
        struct selinux_mnt_opts *opts = mnt_opts;
-       struct superblock_security_struct *sbsec = sb->s_security;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
        u32 sid;
        int rc;
 
@@ -3010,7 +2995,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
        int rc;
        char *context;
 
-       sbsec = dir->i_sb->s_security;
+       sbsec = selinux_superblock(dir->i_sb);
 
        newsid = tsec->create_sid;
 
@@ -3312,7 +3297,7 @@ static int selinux_inode_setxattr(struct user_namespace *mnt_userns,
        if (!selinux_initialized(&selinux_state))
                return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM);
 
-       sbsec = inode->i_sb->s_security;
+       sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;
 
@@ -3557,13 +3542,14 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
                                     const void *value, size_t size, int flags)
 {
        struct inode_security_struct *isec = inode_security_novalidate(inode);
-       struct superblock_security_struct *sbsec = inode->i_sb->s_security;
+       struct superblock_security_struct *sbsec;
        u32 newsid;
        int rc;
 
        if (strcmp(name, XATTR_SELINUX_SUFFIX))
                return -EOPNOTSUPP;
 
+       sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;
 
@@ -7065,6 +7051,7 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
        .lbs_inode = sizeof(struct inode_security_struct),
        .lbs_ipc = sizeof(struct ipc_security_struct),
        .lbs_msg_msg = sizeof(struct msg_security_struct),
+       .lbs_superblock = sizeof(struct superblock_security_struct),
 };
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7165,7 +7152,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
        LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),
 
-       LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
        LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
        LSM_HOOK_INIT(sb_mnt_opts_compat, selinux_sb_mnt_opts_compat),
        LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
index ca4d7ab..2953132 100644 (file)
@@ -188,4 +188,10 @@ static inline u32 current_sid(void)
        return tsec->sid;
 }
 
+/*
+ * Returns the address located selinux_blob_sizes.lbs_superblock bytes into
+ * @superblock's security blob, typed as the SELinux superblock structure.
+ */
+static inline struct superblock_security_struct *selinux_superblock(
+                                       const struct super_block *superblock)
+{
+       void *const blob = superblock->s_security;
+
+       return blob + selinux_blob_sizes.lbs_superblock;
+}
+
 #endif /* _SELINUX_OBJSEC_H_ */
index f0ba826..0a5ce00 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/audit.h>
 #include <linux/vmalloc.h>
+#include <linux/lsm_hooks.h>
 #include <net/netlabel.h>
 
 #include "flask.h"
@@ -2955,7 +2956,7 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
        struct sidtab *sidtab;
        int rc;
        struct ocontext *c;
-       struct superblock_security_struct *sbsec = sb->s_security;
+       struct superblock_security_struct *sbsec = selinux_superblock(sb);
        const char *fstype = sb->s_type->name;
 
        if (!selinux_initialized(state)) {
index 08f9cb8..c3cfbdf 100644 (file)
@@ -357,6 +357,12 @@ static inline struct smack_known **smack_ipc(const struct kern_ipc_perm *ipc)
        return ipc->security + smack_blob_sizes.lbs_ipc;
 }
 
+/*
+ * Returns the address located smack_blob_sizes.lbs_superblock bytes into
+ * @superblock's security blob, typed as the Smack superblock structure.
+ */
+static inline struct superblock_smack *smack_superblock(
+                                       const struct super_block *superblock)
+{
+       void *const blob = superblock->s_security;
+
+       return blob + smack_blob_sizes.lbs_superblock;
+}
+
 /*
  * Is the directory transmuting?
  */
index cd14bec..223a6da 100644 (file)
@@ -535,12 +535,7 @@ static int smack_syslog(int typefrom_file)
  */
 static int smack_sb_alloc_security(struct super_block *sb)
 {
-       struct superblock_smack *sbsp;
-
-       sbsp = kzalloc(sizeof(struct superblock_smack), GFP_KERNEL);
-
-       if (sbsp == NULL)
-               return -ENOMEM;
+       struct superblock_smack *sbsp = smack_superblock(sb);
 
        sbsp->smk_root = &smack_known_floor;
        sbsp->smk_default = &smack_known_floor;
@@ -549,22 +544,10 @@ static int smack_sb_alloc_security(struct super_block *sb)
        /*
         * SMK_SB_INITIALIZED will be zero from kzalloc.
         */
-       sb->s_security = sbsp;
 
        return 0;
 }
 
-/**
- * smack_sb_free_security - free a superblock blob
- * @sb: the superblock getting the blob
- *
- */
-static void smack_sb_free_security(struct super_block *sb)
-{
-       kfree(sb->s_security);
-       sb->s_security = NULL;
-}
-
 struct smack_mnt_opts {
        const char *fsdefault, *fsfloor, *fshat, *fsroot, *fstransmute;
 };
@@ -772,7 +755,7 @@ static int smack_set_mnt_opts(struct super_block *sb,
 {
        struct dentry *root = sb->s_root;
        struct inode *inode = d_backing_inode(root);
-       struct superblock_smack *sp = sb->s_security;
+       struct superblock_smack *sp = smack_superblock(sb);
        struct inode_smack *isp;
        struct smack_known *skp;
        struct smack_mnt_opts *opts = mnt_opts;
@@ -871,7 +854,7 @@ static int smack_set_mnt_opts(struct super_block *sb,
  */
 static int smack_sb_statfs(struct dentry *dentry)
 {
-       struct superblock_smack *sbp = dentry->d_sb->s_security;
+       struct superblock_smack *sbp = smack_superblock(dentry->d_sb);
        int rc;
        struct smk_audit_info ad;
 
@@ -905,7 +888,7 @@ static int smack_bprm_creds_for_exec(struct linux_binprm *bprm)
        if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
                return 0;
 
-       sbsp = inode->i_sb->s_security;
+       sbsp = smack_superblock(inode->i_sb);
        if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
            isp->smk_task != sbsp->smk_root)
                return 0;
@@ -1157,7 +1140,7 @@ static int smack_inode_rename(struct inode *old_inode,
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
-       struct superblock_smack *sbsp = inode->i_sb->s_security;
+       struct superblock_smack *sbsp = smack_superblock(inode->i_sb);
        struct smk_audit_info ad;
        int no_block = mask & MAY_NOT_BLOCK;
        int rc;
@@ -1400,7 +1383,7 @@ static int smack_inode_removexattr(struct user_namespace *mnt_userns,
         */
        if (strcmp(name, XATTR_NAME_SMACK) == 0) {
                struct super_block *sbp = dentry->d_sb;
-               struct superblock_smack *sbsp = sbp->s_security;
+               struct superblock_smack *sbsp = smack_superblock(sbp);
 
                isp->smk_inode = sbsp->smk_default;
        } else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0)
@@ -1670,7 +1653,7 @@ static int smack_mmap_file(struct file *file,
        isp = smack_inode(file_inode(file));
        if (isp->smk_mmap == NULL)
                return 0;
-       sbsp = file_inode(file)->i_sb->s_security;
+       sbsp = smack_superblock(file_inode(file)->i_sb);
        if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
            isp->smk_mmap != sbsp->smk_root)
                return -EACCES;
@@ -3299,7 +3282,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
                return;
 
        sbp = inode->i_sb;
-       sbsp = sbp->s_security;
+       sbsp = smack_superblock(sbp);
        /*
         * We're going to use the superblock default label
         * if there's no label on the file.
@@ -4714,6 +4697,7 @@ struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
        .lbs_inode = sizeof(struct inode_smack),
        .lbs_ipc = sizeof(struct smack_known *),
        .lbs_msg_msg = sizeof(struct smack_known *),
+       .lbs_superblock = sizeof(struct superblock_smack),
 };
 
 static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
@@ -4725,7 +4709,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param),
 
        LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security),
-       LSM_HOOK_INIT(sb_free_security, smack_sb_free_security),
        LSM_HOOK_INIT(sb_free_mnt_opts, smack_free_mnt_opts),
        LSM_HOOK_INIT(sb_eat_lsm_opts, smack_sb_eat_lsm_opts),
        LSM_HOOK_INIT(sb_statfs, smack_sb_statfs),
index 3998e17..b638fc2 100644 (file)
@@ -1204,11 +1204,17 @@ static const char *get_line_out_pfx(struct hda_codec *codec, int ch,
                *index = ch;
                return "Headphone";
        case AUTO_PIN_LINE_OUT:
-               /* This deals with the case where we have two DACs and
-                * one LO, one HP and one Speaker */
-               if (!ch && cfg->speaker_outs && cfg->hp_outs) {
-                       bool hp_lo_shared = !path_has_mixer(codec, spec->hp_paths[0], ctl_type);
-                       bool spk_lo_shared = !path_has_mixer(codec, spec->speaker_paths[0], ctl_type);
+               /* This handles the cases where the HP output, the Speaker
+                * output, or both need to share the DAC with LO
+                */
+               if (!ch) {
+                       bool hp_lo_shared = false, spk_lo_shared = false;
+
+                       if (cfg->speaker_outs)
+                               spk_lo_shared = !path_has_mixer(codec,
+                                                               spec->speaker_paths[0], ctl_type);
+                       if (cfg->hp_outs)
+                               hp_lo_shared = !path_has_mixer(codec, spec->hp_paths[0], ctl_type);
                        if (hp_lo_shared && spk_lo_shared)
                                return spec->vmaster_mute.hook ? "PCM" : "Master";
                        if (hp_lo_shared)
index bd7bfd7..6d58f24 100644 (file)
@@ -4338,6 +4338,35 @@ static void alc245_fixup_hp_x360_amp(struct hda_codec *codec,
        }
 }
 
+/* Toggle GPIO2 each time a stream is started; the PREPARE action is used as the trigger instead (CLEANUP clears it again) */
+static void alc274_hp_envy_pcm_hook(struct hda_pcm_stream *hinfo,
+                                   struct hda_codec *codec,
+                                   struct snd_pcm_substream *substream,
+                                   int action)
+{
+       switch (action) {
+       case HDA_GEN_PCM_ACT_PREPARE:
+               alc_update_gpio_data(codec, 0x04, true);
+               break;
+       case HDA_GEN_PCM_ACT_CLEANUP:
+               alc_update_gpio_data(codec, 0x04, false);
+               break;
+       }
+}
+
+static void alc274_fixup_hp_envy_gpio(struct hda_codec *codec,
+                                     const struct hda_fixup *fix,
+                                     int action)
+{
+       struct alc_spec *spec = codec->spec;
+
+       if (action == HDA_FIXUP_ACT_PROBE) {
+               spec->gpio_mask |= 0x04;
+               spec->gpio_dir |= 0x04;
+               spec->gen.pcm_playback_hook = alc274_hp_envy_pcm_hook;
+       }
+}
+
 static void alc_update_coef_led(struct hda_codec *codec,
                                struct alc_coef_led *led,
                                bool polarity, bool on)
@@ -5695,6 +5724,18 @@ static void alc_fixup_tpt470_dacs(struct hda_codec *codec,
                spec->gen.preferred_dacs = preferred_pairs;
 }
 
+static void alc295_fixup_asus_dacs(struct hda_codec *codec,
+                                  const struct hda_fixup *fix, int action)
+{
+       static const hda_nid_t preferred_pairs[] = {
+               0x17, 0x02, 0x21, 0x03, 0
+       };
+       struct alc_spec *spec = codec->spec;
+
+       if (action == HDA_FIXUP_ACT_PRE_PROBE)
+               spec->gen.preferred_dacs = preferred_pairs;
+}
+
 static void alc_shutup_dell_xps13(struct hda_codec *codec)
 {
        struct alc_spec *spec = codec->spec;
@@ -6453,6 +6494,7 @@ enum {
        ALC255_FIXUP_XIAOMI_HEADSET_MIC,
        ALC274_FIXUP_HP_MIC,
        ALC274_FIXUP_HP_HEADSET_MIC,
+       ALC274_FIXUP_HP_ENVY_GPIO,
        ALC256_FIXUP_ASUS_HPE,
        ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK,
        ALC287_FIXUP_HP_GPIO_LED,
@@ -6463,6 +6505,8 @@ enum {
        ALC256_FIXUP_ACER_HEADSET_MIC,
        ALC285_FIXUP_IDEAPAD_S740_COEF,
        ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST,
+       ALC295_FIXUP_ASUS_DACS,
+       ALC295_FIXUP_HP_OMEN,
 };
 
 static const struct hda_fixup alc269_fixups[] = {
@@ -7894,6 +7938,10 @@ static const struct hda_fixup alc269_fixups[] = {
                .chained = true,
                .chain_id = ALC274_FIXUP_HP_MIC
        },
+       [ALC274_FIXUP_HP_ENVY_GPIO] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc274_fixup_hp_envy_gpio,
+       },
        [ALC256_FIXUP_ASUS_HPE] = {
                .type = HDA_FIXUP_VERBS,
                .v.verbs = (const struct hda_verb[]) {
@@ -7963,6 +8011,30 @@ static const struct hda_fixup alc269_fixups[] = {
                .chained = true,
                .chain_id = ALC285_FIXUP_HP_MUTE_LED,
        },
+       [ALC295_FIXUP_ASUS_DACS] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc295_fixup_asus_dacs,
+       },
+       [ALC295_FIXUP_HP_OMEN] = {
+               .type = HDA_FIXUP_PINS,
+               .v.pins = (const struct hda_pintbl[]) {
+                       { 0x12, 0xb7a60130 },
+                       { 0x13, 0x40000000 },
+                       { 0x14, 0x411111f0 },
+                       { 0x16, 0x411111f0 },
+                       { 0x17, 0x90170110 },
+                       { 0x18, 0x411111f0 },
+                       { 0x19, 0x02a11030 },
+                       { 0x1a, 0x411111f0 },
+                       { 0x1b, 0x04a19030 },
+                       { 0x1d, 0x40600001 },
+                       { 0x1e, 0x411111f0 },
+                       { 0x21, 0x03211020 },
+                       {}
+               },
+               .chained = true,
+               .chain_id = ALC269_FIXUP_HP_LINE1_MIC1_LED,
+       },
 };
 
 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -8121,8 +8193,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x82c0, "HP G3 mini premium", ALC221_FIXUP_HP_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x83b9, "HP Spectre x360", ALC269_FIXUP_HP_MUTE_LED_MIC3),
        SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", ALC269_FIXUP_HP_MUTE_LED_MIC3),
+       SND_PCI_QUIRK(0x103c, 0x84da, "HP OMEN dc0019-ur", ALC295_FIXUP_HP_OMEN),
        SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3),
        SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED),
+       SND_PCI_QUIRK(0x103c, 0x86c7, "HP Envy AiO 32", ALC274_FIXUP_HP_ENVY_GPIO),
        SND_PCI_QUIRK(0x103c, 0x8724, "HP EliteBook 850 G7", ALC285_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8729, "HP", ALC285_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8730, "HP ProBook 445 G7", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
@@ -8161,6 +8235,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK),
        SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A),
        SND_PCI_QUIRK(0x1043, 0x16e3, "ASUS UX50", ALC269_FIXUP_STEREO_DMIC),
+       SND_PCI_QUIRK(0x1043, 0x1740, "ASUS UX430UA", ALC295_FIXUP_ASUS_DACS),
        SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK),
        SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS),
        SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC),
@@ -8524,6 +8599,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
        {.id = ALC255_FIXUP_XIAOMI_HEADSET_MIC, .name = "alc255-xiaomi-headset"},
        {.id = ALC274_FIXUP_HP_MIC, .name = "alc274-hp-mic-detect"},
        {.id = ALC245_FIXUP_HP_X360_AMP, .name = "alc245-hp-x360-amp"},
+       {.id = ALC295_FIXUP_HP_OMEN, .name = "alc295-hp-omen"},
        {}
 };
 #define ALC225_STANDARD_PINS \
@@ -8801,6 +8877,16 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
                {0x19, 0x03a11020},
                {0x21, 0x0321101f}),
        SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE,
+               {0x12, 0x90a60130},
+               {0x14, 0x90170110},
+               {0x19, 0x04a11040},
+               {0x21, 0x04211020}),
+       SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE,
+               {0x14, 0x90170110},
+               {0x19, 0x04a11040},
+               {0x1d, 0x40600001},
+               {0x21, 0x04211020}),
+       SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK,
                {0x14, 0x90170110},
                {0x19, 0x04a11040},
                {0x21, 0x04211020}),
@@ -8971,10 +9057,6 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = {
        SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB,
                {0x19, 0x40000000},
                {0x1a, 0x40000000}),
-       SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK,
-               {0x14, 0x90170110},
-               {0x19, 0x04a11040},
-               {0x21, 0x04211020}),
        {}
 };
 
index 646deb6..c5794e8 100644 (file)
@@ -337,6 +337,13 @@ static const struct usbmix_name_map bose_companion5_map[] = {
        { 0 }   /* terminator */
 };
 
+/* Sennheiser Communications Headset [PC 8]: the device reports a negative maximum dB value (-6), so the dB range is overridden here */
+static const struct usbmix_dB_map sennheiser_pc8_dB = {-9500, 0};
+static const struct usbmix_name_map sennheiser_pc8_map[] = {
+       { 9, NULL, .dB = &sennheiser_pc8_dB },
+       { 0 }   /* terminator */
+};
+
 /*
  * Dell usb dock with ALC4020 codec had a firmware problem where it got
  * screwed up when zero volume is passed; just skip it as a workaround
@@ -593,6 +600,11 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = {
                .id = USB_ID(0x17aa, 0x1046),
                .map = lenovo_p620_rear_map,
        },
+       {
+               /* Sennheiser Communications Headset [PC 8] */
+               .id = USB_ID(0x1395, 0x0025),
+               .map = sennheiser_pc8_map,
+       },
        { 0 } /* terminator */
 };
 
index 74e255d..04a8e3d 100644 (file)
@@ -52,6 +52,7 @@ FEATURE_TESTS_BASIC :=                  \
         libpython-version               \
         libslang                        \
         libslang-include-subdir         \
+        libtraceevent                   \
         libcrypto                       \
         libunwind                       \
         pthread-attr-setaffinity-np     \
@@ -239,17 +240,24 @@ ifeq ($(VF),1)
   feature_verbose := 1
 endif
 
-ifeq ($(feature_display),1)
-  $(info )
-  $(info Auto-detecting system features:)
-  $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
-  ifneq ($(feature_verbose),1)
+feature_display_entries = $(eval $(feature_display_entries_code))
+define feature_display_entries_code
+  ifeq ($(feature_display),1)
     $(info )
+    $(info Auto-detecting system features:)
+    $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
+    ifneq ($(feature_verbose),1)
+      $(info )
+    endif
   endif
-endif
 
-ifeq ($(feature_verbose),1)
-  TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))
-  $(foreach feat,$(TMP),$(call feature_print_status,$(feat),))
-  $(info )
+  ifeq ($(feature_verbose),1)
+    TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))
+    $(foreach feat,$(TMP),$(call feature_print_status,$(feat),))
+    $(info )
+  endif
+endef
+
+ifeq ($(FEATURE_DISPLAY_DEFERRED),)
+  $(call feature_display_entries)
 endif
index 3e55edb..ec203e2 100644 (file)
@@ -36,6 +36,7 @@ FILES=                                          \
          test-libpython-version.bin             \
          test-libslang.bin                      \
          test-libslang-include-subdir.bin       \
+         test-libtraceevent.bin                 \
          test-libcrypto.bin                     \
          test-libunwind.bin                     \
          test-libunwind-debug-frame.bin         \
@@ -196,6 +197,9 @@ $(OUTPUT)test-libslang.bin:
 $(OUTPUT)test-libslang-include-subdir.bin:
        $(BUILD) -lslang
 
+$(OUTPUT)test-libtraceevent.bin:
+       $(BUILD) -ltraceevent
+
 $(OUTPUT)test-libcrypto.bin:
        $(BUILD) -lcrypto
 
diff --git a/tools/build/feature/test-libtraceevent.c b/tools/build/feature/test-libtraceevent.c
new file mode 100644 (file)
index 0000000..416b11f
--- /dev/null
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <traceevent/trace-seq.h>
+
+int main(void)
+{
+       int rv = 0;
+       struct trace_seq s;
+       trace_seq_init(&s);
+       rv += !(s.state == TRACE_SEQ__GOOD);
+       trace_seq_destroy(&s);
+       return rv;
+}
index 1639b4d..4096bcd 100644 (file)
@@ -20,7 +20,7 @@
 #define CONSUMER "gpio-utils"
 
 /**
- * doc: Operation of gpio
+ * DOC: Operation of gpio
  *
  * Provide the api of gpiochip for chardev interface. There are two
  * types of api.  The first one provide as same function as each
@@ -100,7 +100,7 @@ exit_free_name:
 }
 
 /**
- * gpiotools_set_values(): Set the value of gpio(s)
+ * gpiotools_set_values() - Set the value of gpio(s)
  * @fd:                        The fd returned by
  *                     gpiotools_request_line().
  * @values:            The array of values want to set.
@@ -124,7 +124,7 @@ int gpiotools_set_values(const int fd, struct gpio_v2_line_values *values)
 }
 
 /**
- * gpiotools_get_values(): Get the value of gpio(s)
+ * gpiotools_get_values() - Get the value of gpio(s)
  * @fd:                        The fd returned by
  *                     gpiotools_request_line().
  * @values:            The array of values get from hardware.
@@ -148,7 +148,7 @@ int gpiotools_get_values(const int fd, struct gpio_v2_line_values *values)
 }
 
 /**
- * gpiotools_release_line(): Release the line(s) of gpiochip
+ * gpiotools_release_line() - Release the line(s) of gpiochip
  * @fd:                        The fd returned by
  *                     gpiotools_request_line().
  *
@@ -169,7 +169,7 @@ int gpiotools_release_line(const int fd)
 }
 
 /**
- * gpiotools_get(): Get value from specific line
+ * gpiotools_get() - Get value from specific line
  * @device_name:       The name of gpiochip without prefix "/dev/",
  *                     such as "gpiochip0"
  * @line:              number of line, such as 2.
@@ -191,7 +191,7 @@ int gpiotools_get(const char *device_name, unsigned int line)
 
 
 /**
- * gpiotools_gets(): Get values from specific lines.
+ * gpiotools_gets() - Get values from specific lines.
  * @device_name:       The name of gpiochip without prefix "/dev/",
  *                     such as "gpiochip0".
  * @lines:             An array desired lines, specified by offset
@@ -230,7 +230,7 @@ int gpiotools_gets(const char *device_name, unsigned int *lines,
 }
 
 /**
- * gpiotools_set(): Set value to specific line
+ * gpiotools_set() - Set value to specific line
  * @device_name:       The name of gpiochip without prefix "/dev/",
  *                     such as "gpiochip0"
  * @line:              number of line, such as 2.
@@ -248,13 +248,13 @@ int gpiotools_set(const char *device_name, unsigned int line,
 }
 
 /**
- * gpiotools_sets(): Set values to specific lines.
+ * gpiotools_sets() - Set values to specific lines.
  * @device_name:       The name of gpiochip without prefix "/dev/",
  *                     such as "gpiochip0".
  * @lines:             An array desired lines, specified by offset
  *                     index for the associated GPIO device.
  * @num_lines:         The number of lines to request.
- * @value            The array of values set to gpiochip, must be
+ * @values:            The array of values set to gpiochip, must be
  *                     0(low) or 1(high).
  *
  * Return:             On success return 0;
index 16ed198..6481fd1 100644 (file)
@@ -2,6 +2,13 @@
 #ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_
 #define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_
 
+extern unsigned long _find_next_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long nbits,
+               unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
+
 #ifndef find_next_bit
 /**
  * find_next_bit - find the next set bit in a memory region
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
-               size, unsigned long offset);
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+                           unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_and_bit
@@ -27,13 +48,26 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-extern unsigned long find_next_and_bit(const unsigned long *addr1,
+static inline
+unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
-               unsigned long offset);
+               unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr1 & *addr2 & GENMASK(size - 1, offset);
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+}
 #endif
 
 #ifndef find_next_zero_bit
-
 /**
  * find_next_zero_bit - find the next cleared bit in a memory region
  * @addr: The address to base the search on
@@ -43,8 +77,22 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1,
  * Returns the bit number of the next zero bit
  * If no bits are zero, returns @size.
  */
+static inline
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-                                unsigned long offset);
+                                unsigned long offset)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val;
+
+               if (unlikely(offset >= size))
+                       return size;
+
+               val = *addr | ~GENMASK(size - 1, offset);
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+}
 #endif
 
 #ifndef find_first_bit
@@ -57,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
  * Returns the bit number of the first set bit.
  * If no bits are set, returns @size.
  */
-extern unsigned long find_first_bit(const unsigned long *addr,
-                                   unsigned long size);
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr & GENMASK(size - 1, 0);
+
+               return val ? __ffs(val) : size;
+       }
+
+       return _find_first_bit(addr, size);
+}
 
 #endif /* find_first_bit */
 
@@ -72,7 +129,17 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  * Returns the bit number of the first cleared bit.
  * If no bits are zero, returns @size.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size);
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+       if (small_const_nbits(size)) {
+               unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+               return val == ~0UL ? size : ffz(val);
+       }
+
+       return _find_first_zero_bit(addr, size);
+}
 #endif
 
 #endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */
index 8f22830..2093d56 100644 (file)
@@ -18,4 +18,7 @@
 #define BITS_PER_LONG_LONG 64
 #endif
 
+#define small_const_nbits(nbits) \
+       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
index e4732d3..4f3d5aa 100644 (file)
 #define HUGETLB_FLAG_ENCODE_SHIFT      26
 #define HUGETLB_FLAG_ENCODE_MASK       0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB       (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB       (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB      (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB                (20 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB                (21 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB                (23 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB       (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB       (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB      (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB      (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB                (30 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB                (31 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB       (34 << HUGETLB_FLAG_ENCODE_SHIFT)
index 477a1ca..330dbf7 100644 (file)
@@ -20,17 +20,9 @@ int __bitmap_equal(const unsigned long *bitmap1,
 void bitmap_clear(unsigned long *map, unsigned int start, int len);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-#define BITMAP_LAST_WORD_MASK(nbits)                                   \
-(                                                                      \
-       ((nbits) % BITS_PER_LONG) ?                                     \
-               (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL               \
-)
-
-#define small_const_nbits(nbits) \
-       (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
-
-static inline void bitmap_zero(unsigned long *dst, int nbits)
+static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
        if (small_const_nbits(nbits))
                *dst = 0UL;
@@ -66,7 +58,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
        return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_weight(const unsigned long *src, int nbits)
+static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
        if (small_const_nbits(nbits))
                return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
@@ -74,7 +66,7 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
 }
 
 static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
-                            const unsigned long *src2, int nbits)
+                            const unsigned long *src2, unsigned int nbits)
 {
        if (small_const_nbits(nbits))
                *dst = *src1 | *src2;
@@ -141,7 +133,7 @@ static inline void bitmap_free(unsigned long *bitmap)
  * @buf: buffer to store output
  * @size: size of @buf
  */
-size_t bitmap_scnprintf(unsigned long *bitmap, int nbits,
+size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
                        char *buf, size_t size);
 
 /**
diff --git a/tools/include/linux/math64.h b/tools/include/linux/math64.h
new file mode 100644 (file)
index 0000000..4ad45d5
--- /dev/null
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MATH64_H
+#define _LINUX_MATH64_H
+
+#include <linux/types.h>
+
+#ifdef __x86_64__
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+       u64 q;
+
+       asm ("mulq %2; divq %3" : "=a" (q)
+                               : "a" (a), "rm" (b), "rm" (c)
+                               : "rdx");
+
+       return q;
+}
+#define mul_u64_u64_div64 mul_u64_u64_div64
+#endif
+
+#ifdef __SIZEOF_INT128__
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+       return (u64)(((unsigned __int128)a * b) >> shift);
+}
+
+#else
+
+#ifdef __i386__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+       u32 high, low;
+
+       asm ("mull %[b]" : "=a" (low), "=d" (high)
+                        : [a] "a" (a), [b] "rm" (b) );
+
+       return low | ((u64)high) << 32;
+}
+#else
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+       return (u64)a * b;
+}
+#endif
+
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+       u32 ah, al;
+       u64 ret;
+
+       al = a;
+       ah = a >> 32;
+
+       ret = mul_u32_u32(al, b) >> shift;
+       if (ah)
+               ret += mul_u32_u32(ah, b) << (32 - shift);
+
+       return ret;
+}
+
+#endif /* __SIZEOF_INT128__ */
+
+#ifndef mul_u64_u64_div64
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+       u64 quot, rem;
+
+       quot = a / c;
+       rem = a % c;
+
+       return quot * b + (rem * b) / c;
+}
+#endif
+
+#endif /* _LINUX_MATH64_H */
index e9c5a21..6e14a53 100644 (file)
@@ -61,6 +61,9 @@ typedef __u32 __bitwise __be32;
 typedef __u64 __bitwise __le64;
 typedef __u64 __bitwise __be64;
 
+typedef __u16 __bitwise __sum16;
+typedef __u32 __bitwise __wsum;
+
 typedef struct {
        int counter;
 } atomic_t;
index ad15e40..14332f4 100644 (file)
@@ -37,6 +37,21 @@ enum perf_type_id {
        PERF_TYPE_MAX,                          /* non-ABI */
 };
 
+/*
+ * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * PERF_TYPE_HARDWARE:                 0xEEEEEEEE000000AA
+ *                                     AA: hardware event ID
+ *                                     EEEEEEEE: PMU type ID
+ * PERF_TYPE_HW_CACHE:                 0xEEEEEEEE00DDCCBB
+ *                                     BB: hardware cache ID
+ *                                     CC: hardware cache op ID
+ *                                     DD: hardware cache op result ID
+ *                                     EEEEEEEE: PMU type ID
+ * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
+ */
+#define PERF_PMU_TYPE_SHIFT            32
+#define PERF_HW_EVENT_MASK             0xffffffff
+
 /*
  * Generalized performance event event_id types, used by the
  * attr.event_id parameter of the sys_perf_event_open()
index 5043747..f4e9147 100644 (file)
@@ -28,11 +28,11 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                dst[k] = bitmap1[k] | bitmap2[k];
 }
 
-size_t bitmap_scnprintf(unsigned long *bitmap, int nbits,
+size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
                        char *buf, size_t size)
 {
        /* current bit is 'cur', most recently seen range is [rbot, rtop] */
-       int cur, rbot, rtop;
+       unsigned int cur, rbot, rtop;
        bool first = true;
        size_t ret = 0;
 
index e7a8d84..1d80ad4 100644 (file)
@@ -202,9 +202,11 @@ static inline int roundup_len(__u32 len)
        return (len + 7) / 8 * 8;
 }
 
-static int ringbuf_process_ring(struct ring* r)
+static int64_t ringbuf_process_ring(struct ring* r)
 {
-       int *len_ptr, len, err, cnt = 0;
+       int *len_ptr, len, err;
+       /* 64-bit to avoid overflow in case of extreme application behavior */
+       int64_t cnt = 0;
        unsigned long cons_pos, prod_pos;
        bool got_new_data;
        void *sample;
@@ -244,12 +246,14 @@ done:
 }
 
 /* Consume available ring buffer(s) data without event polling.
- * Returns number of records consumed across all registered ring buffers, or
- * negative number if any of the callbacks return error.
+ * Returns number of records consumed across all registered ring buffers (or
+ * INT_MAX, whichever is less), or negative number if any of the callbacks
+ * return error.
  */
 int ring_buffer__consume(struct ring_buffer *rb)
 {
-       int i, err, res = 0;
+       int64_t err, res = 0;
+       int i;
 
        for (i = 0; i < rb->ring_cnt; i++) {
                struct ring *ring = &rb->rings[i];
@@ -259,18 +263,24 @@ int ring_buffer__consume(struct ring_buffer *rb)
                        return err;
                res += err;
        }
+       if (res > INT_MAX)
+               return INT_MAX;
        return res;
 }
 
 /* Poll for available data and consume records, if any are available.
- * Returns number of records consumed, or negative number, if any of the
- * registered callbacks returned error.
+ * Returns number of records consumed (or INT_MAX, whichever is less), or
+ * negative number, if any of the registered callbacks returned error.
  */
 int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
 {
-       int i, cnt, err, res = 0;
+       int i, cnt;
+       int64_t err, res = 0;
 
        cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
+       if (cnt < 0)
+               return -errno;
+
        for (i = 0; i < cnt; i++) {
                __u32 ring_id = rb->events[i].data.fd;
                struct ring *ring = &rb->rings[ring_id];
@@ -280,7 +290,9 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
                        return err;
                res += err;
        }
-       return cnt < 0 ? -errno : res;
+       if (res > INT_MAX)
+               return INT_MAX;
+       return res;
 }
 
 /* Get an fd that can be used to sleep until data is available in the ring(s) */
index ac37022..109aa7f 100644 (file)
  *    searching it for one bits.
  *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static inline unsigned long _find_next_bit(const unsigned long *addr1,
+unsigned long _find_next_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long nbits,
-               unsigned long start, unsigned long invert)
+               unsigned long start, unsigned long invert, unsigned long le)
 {
-       unsigned long tmp;
+       unsigned long tmp, mask;
+       (void) le;
 
        if (unlikely(start >= nbits))
                return nbits;
@@ -43,7 +44,19 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1,
        tmp ^= invert;
 
        /* Handle 1st word. */
-       tmp &= BITMAP_FIRST_WORD_MASK(start);
+       mask = BITMAP_FIRST_WORD_MASK(start);
+
+       /*
+        * Due to the lack of swab() in tools, and the fact that it doesn't
+        * need little-endian support, just comment it out
+        */
+#if (0)
+       if (le)
+               mask = swab(mask);
+#endif
+
+       tmp &= mask;
+
        start = round_down(start, BITS_PER_LONG);
 
        while (!tmp) {
@@ -57,18 +70,12 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1,
                tmp ^= invert;
        }
 
-       return min(start + __ffs(tmp), nbits);
-}
+#if (0)
+       if (le)
+               tmp = swab(tmp);
 #endif
 
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-                           unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, 0UL);
+       return min(start + __ffs(tmp), nbits);
 }
 #endif
 
@@ -76,7 +83,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 /*
  * Find the first set bit in a memory region.
  */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -93,7 +100,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 /*
  * Find the first cleared bit in a memory region.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
        unsigned long idx;
 
@@ -105,20 +112,3 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
        return size;
 }
 #endif
-
-#ifndef find_next_zero_bit
-unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
-                                unsigned long offset)
-{
-       return _find_next_bit(addr, NULL, size, offset, ~0UL);
-}
-#endif
-
-#ifndef find_next_and_bit
-unsigned long find_next_and_bit(const unsigned long *addr1,
-               const unsigned long *addr2, unsigned long size,
-               unsigned long offset)
-{
-       return _find_next_bit(addr1, addr2, size, offset, 0UL);
-}
-#endif
index 0c74c30..63ae5e0 100644 (file)
@@ -136,6 +136,9 @@ SYNOPSIS
                        struct perf_thread_map *threads);
   void perf_evsel__close(struct perf_evsel *evsel);
   void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
+  int perf_evsel__mmap(struct perf_evsel *evsel, int pages);
+  void perf_evsel__munmap(struct perf_evsel *evsel);
+  void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread);
   int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
                        struct perf_counts_values *count);
   int perf_evsel__enable(struct perf_evsel *evsel);
index 4dc0628..bd8c2f1 100644 (file)
 #include <stdlib.h>
 #include <internal/xyarray.h>
 #include <internal/cpumap.h>
+#include <internal/mmap.h>
 #include <internal/threadmap.h>
 #include <internal/lib.h>
 #include <linux/string.h>
 #include <sys/ioctl.h>
+#include <sys/mman.h>
 
 void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr)
 {
@@ -38,6 +40,7 @@ void perf_evsel__delete(struct perf_evsel *evsel)
 }
 
 #define FD(e, x, y) (*(int *) xyarray__entry(e->fd, x, y))
+#define MMAP(e, x, y) (e->mmap ? ((struct perf_mmap *) xyarray__entry(e->mmap, x, y)) : NULL)
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
@@ -55,6 +58,13 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
        return evsel->fd != NULL ? 0 : -ENOMEM;
 }
 
+static int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+       evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap));
+
+       return evsel->mmap != NULL ? 0 : -ENOMEM;
+}
+
 static int
 sys_perf_event_open(struct perf_event_attr *attr,
                    pid_t pid, int cpu, int group_fd,
@@ -156,6 +166,72 @@ void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu)
        perf_evsel__close_fd_cpu(evsel, cpu);
 }
 
+void perf_evsel__munmap(struct perf_evsel *evsel)
+{
+       int cpu, thread;
+
+       if (evsel->fd == NULL || evsel->mmap == NULL)
+               return;
+
+       for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+               for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
+                       int fd = FD(evsel, cpu, thread);
+                       struct perf_mmap *map = MMAP(evsel, cpu, thread);
+
+                       if (fd < 0)
+                               continue;
+
+                       perf_mmap__munmap(map);
+               }
+       }
+
+       xyarray__delete(evsel->mmap);
+       evsel->mmap = NULL;
+}
+
+int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
+{
+       int ret, cpu, thread;
+       struct perf_mmap_param mp = {
+               .prot = PROT_READ | PROT_WRITE,
+               .mask = (pages * page_size) - 1,
+       };
+
+       if (evsel->fd == NULL || evsel->mmap)
+               return -EINVAL;
+
+       if (perf_evsel__alloc_mmap(evsel, xyarray__max_x(evsel->fd), xyarray__max_y(evsel->fd)) < 0)
+               return -ENOMEM;
+
+       for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+               for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
+                       int fd = FD(evsel, cpu, thread);
+                       struct perf_mmap *map = MMAP(evsel, cpu, thread);
+
+                       if (fd < 0)
+                               continue;
+
+                       perf_mmap__init(map, NULL, false, NULL);
+
+                       ret = perf_mmap__mmap(map, &mp, fd, cpu);
+                       if (ret) {
+                               perf_evsel__munmap(evsel);
+                               return ret;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread)
+{
+       if (FD(evsel, cpu, thread) < 0 || MMAP(evsel, cpu, thread) == NULL)
+               return NULL;
+
+       return MMAP(evsel, cpu, thread)->base;
+}
+
 int perf_evsel__read_size(struct perf_evsel *evsel)
 {
        u64 read_format = evsel->attr.read_format;
@@ -191,6 +267,10 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
        if (FD(evsel, cpu, thread) < 0)
                return -EINVAL;
 
+       if (MMAP(evsel, cpu, thread) &&
+           !perf_mmap__read_self(MMAP(evsel, cpu, thread), count))
+               return 0;
+
        if (readn(FD(evsel, cpu, thread), count->values, size) <= 0)
                return -errno;
 
index 1ffd083..1c067d0 100644 (file)
@@ -41,6 +41,7 @@ struct perf_evsel {
        struct perf_cpu_map     *own_cpus;
        struct perf_thread_map  *threads;
        struct xyarray          *fd;
+       struct xyarray          *mmap;
        struct xyarray          *sample_id;
        u64                     *id;
        u32                      ids;
index be7556e..5e3422f 100644 (file)
@@ -11,6 +11,7 @@
 #define PERF_SAMPLE_MAX_SIZE (1 << 16)
 
 struct perf_mmap;
+struct perf_counts_values;
 
 typedef void (*libperf_unmap_cb_t)(struct perf_mmap *map);
 
@@ -52,4 +53,6 @@ void perf_mmap__put(struct perf_mmap *map);
 
 u64 perf_mmap__read_head(struct perf_mmap *map);
 
+int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count);
+
 #endif /* __LIBPERF_INTERNAL_MMAP_H */
index 2093e88..29425c2 100644 (file)
@@ -3,11 +3,32 @@
 #define __LIBPERF_INTERNAL_TESTS_H
 
 #include <stdio.h>
+#include <unistd.h>
 
 int tests_failed;
+int tests_verbose;
+
+static inline int get_verbose(char **argv, int argc)
+{
+       int c;
+       int verbose = 0;
+
+       while ((c = getopt(argc, argv, "v")) != -1) {
+               switch (c)
+               {
+               case 'v':
+                       verbose = 1;
+                       break;
+               default:
+                       break;
+               }
+       }
+       return verbose;
+}
 
 #define __T_START                                      \
 do {                                                   \
+       tests_verbose = get_verbose(argv, argc);        \
        fprintf(stdout, "- running %s...", __FILE__);   \
        fflush(NULL);                                   \
        tests_failed = 0;                               \
@@ -30,4 +51,15 @@ do {
        }                                                                        \
 } while (0)
 
+#define __T_VERBOSE(...)                                               \
+do {                                                                   \
+       if (tests_verbose) {                                            \
+               if (tests_verbose == 1) {                               \
+                       fputc('\n', stderr);                            \
+                       tests_verbose++;                                \
+               }                                                       \
+               fprintf(stderr, ##__VA_ARGS__);                         \
+       }                                                               \
+} while (0)
+
 #endif /* __LIBPERF_INTERNAL_TESTS_H */
index 51e35d6..f10af3d 100644 (file)
@@ -18,11 +18,18 @@ struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size);
 void xyarray__delete(struct xyarray *xy);
 void xyarray__reset(struct xyarray *xy);
 
-static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
+static inline void *__xyarray__entry(struct xyarray *xy, int x, int y)
 {
        return &xy->contents[x * xy->row_size + y * xy->entry_size];
 }
 
+static inline void *xyarray__entry(struct xyarray *xy, size_t x, size_t y)
+{
+       if (x >= xy->max_x || y >= xy->max_y)
+               return NULL;
+       return __xyarray__entry(xy, x, y);
+}
+
 static inline int xyarray__max_y(struct xyarray *xy)
 {
        return xy->max_y;
diff --git a/tools/lib/perf/include/perf/bpf_perf.h b/tools/lib/perf/include/perf/bpf_perf.h
new file mode 100644 (file)
index 0000000..e7cf6ba
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __LIBPERF_BPF_PERF_H
+#define __LIBPERF_BPF_PERF_H
+
+#include <linux/types.h>  /* for __u32 */
+
+/*
+ * bpf_perf uses a hashmap, the attr_map, to track all the leader programs.
+ * The hashmap is pinned in bpffs. flock() on this file is used to ensure
+ * no concurrent access to the attr_map.  The key of attr_map is struct
+ * perf_event_attr, and the value is struct perf_event_attr_map_entry.
+ *
+ * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the
+ * leader prog, and the diff_map. Each perf-stat session holds a reference
+ * to the bpf_link to make sure the leader prog is attached to sched_switch
+ * tracepoint.
+ *
+ * Since the hashmap only contains IDs of the bpf_link and diff_map, it
+ * does not hold any references to the leader program. Once all perf-stat
+ * sessions of these events exit, the leader prog, its maps, and the
+ * perf_events will be freed.
+ */
+struct perf_event_attr_map_entry {
+       __u32 link_id;
+       __u32 diff_map_id;
+};
+
+/* default attr_map name */
+#define BPF_PERF_DEFAULT_ATTR_MAP_PATH "perf_attr_map"
+
+#endif /* __LIBPERF_BPF_PERF_H */
index d820542..4d0c02b 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/bpf.h>
 #include <sys/types.h> /* pid_t */
 
+#define event_contains(obj, mem) ((obj).header.size > offsetof(typeof(obj), mem))
+
 struct perf_record_mmap {
        struct perf_event_header header;
        __u32                    pid, tid;
@@ -346,8 +348,9 @@ struct perf_record_time_conv {
        __u64                    time_zero;
        __u64                    time_cycles;
        __u64                    time_mask;
-       bool                     cap_user_time_zero;
-       bool                     cap_user_time_short;
+       __u8                     cap_user_time_zero;
+       __u8                     cap_user_time_short;
+       __u8                     reserved[6];   /* For alignment */
 };
 
 struct perf_record_header_feature {
index c82ec39..60eae25 100644 (file)
@@ -27,6 +27,9 @@ LIBPERF_API int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *
                                 struct perf_thread_map *threads);
 LIBPERF_API void perf_evsel__close(struct perf_evsel *evsel);
 LIBPERF_API void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
+LIBPERF_API int perf_evsel__mmap(struct perf_evsel *evsel, int pages);
+LIBPERF_API void perf_evsel__munmap(struct perf_evsel *evsel);
+LIBPERF_API void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread);
 LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
                                 struct perf_counts_values *count);
 LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel);
index 7be1af8..c0c7ceb 100644 (file)
@@ -23,6 +23,9 @@ LIBPERF_0.0.1 {
                perf_evsel__disable;
                perf_evsel__open;
                perf_evsel__close;
+               perf_evsel__mmap;
+               perf_evsel__munmap;
+               perf_evsel__mmap_base;
                perf_evsel__read;
                perf_evsel__cpus;
                perf_evsel__threads;
index 79d5ed6..c89dfa5 100644 (file)
@@ -8,9 +8,11 @@
 #include <linux/perf_event.h>
 #include <perf/mmap.h>
 #include <perf/event.h>
+#include <perf/evsel.h>
 #include <internal/mmap.h>
 #include <internal/lib.h>
 #include <linux/kernel.h>
+#include <linux/math64.h>
 #include "internal.h"
 
 void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev,
@@ -273,3 +275,89 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map)
 
        return event;
 }
+
+#if defined(__i386__) || defined(__x86_64__)
+static u64 read_perf_counter(unsigned int counter)
+{
+       unsigned int low, high;
+
+       asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
+
+       return low | ((u64)high) << 32;
+}
+
+static u64 read_timestamp(void)
+{
+       unsigned int low, high;
+
+       asm volatile("rdtsc" : "=a" (low), "=d" (high));
+
+       return low | ((u64)high) << 32;
+}
+#else
+static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
+static u64 read_timestamp(void) { return 0; }
+#endif
+
+int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count)
+{
+       struct perf_event_mmap_page *pc = map->base;
+       u32 seq, idx, time_mult = 0, time_shift = 0;
+       u64 cnt, cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL;
+
+       if (!pc || !pc->cap_user_rdpmc)
+               return -1;
+
+       do {
+               seq = READ_ONCE(pc->lock);
+               barrier();
+
+               count->ena = READ_ONCE(pc->time_enabled);
+               count->run = READ_ONCE(pc->time_running);
+
+               if (pc->cap_user_time && count->ena != count->run) {
+                       cyc = read_timestamp();
+                       time_mult = READ_ONCE(pc->time_mult);
+                       time_shift = READ_ONCE(pc->time_shift);
+                       time_offset = READ_ONCE(pc->time_offset);
+
+                       if (pc->cap_user_time_short) {
+                               time_cycles = READ_ONCE(pc->time_cycles);
+                               time_mask = READ_ONCE(pc->time_mask);
+                       }
+               }
+
+               idx = READ_ONCE(pc->index);
+               cnt = READ_ONCE(pc->offset);
+               if (pc->cap_user_rdpmc && idx) {
+                       s64 evcnt = read_perf_counter(idx - 1);
+                       u16 width = READ_ONCE(pc->pmc_width);
+
+                       evcnt <<= 64 - width;
+                       evcnt >>= 64 - width;
+                       cnt += evcnt;
+               } else
+                       return -1;
+
+               barrier();
+       } while (READ_ONCE(pc->lock) != seq);
+
+       if (count->ena != count->run) {
+               u64 delta;
+
+               /* Adjust for cap_user_time_short, a nop if not */
+               cyc = time_cycles + ((cyc - time_cycles) & time_mask);
+
+               delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
+
+               count->ena += delta;
+               if (idx)
+                       count->run += delta;
+
+               cnt = mul_u64_u64_div64(cnt, count->ena, count->run);
+       }
+
+       count->val = cnt;
+
+       return 0;
+}
index 9684177..b536cc9 100644 (file)
@@ -5,6 +5,8 @@ TESTS = test-cpumap test-threadmap test-evlist test-evsel
 TESTS_SO := $(addsuffix -so,$(TESTS))
 TESTS_A  := $(addsuffix -a,$(TESTS))
 
+TEST_ARGS := $(if $(V),-v)
+
 # Set compile option CFLAGS
 ifdef EXTRA_CFLAGS
   CFLAGS := $(EXTRA_CFLAGS)
@@ -28,9 +30,9 @@ all: $(TESTS_A) $(TESTS_SO)
 
 run:
        @echo "running static:"
-       @for i in $(TESTS_A); do ./$$i; done
+       @for i in $(TESTS_A); do ./$$i $(TEST_ARGS); done
        @echo "running dynamic:"
-       @for i in $(TESTS_SO); do LD_LIBRARY_PATH=../ ./$$i; done
+       @for i in $(TESTS_SO); do LD_LIBRARY_PATH=../ ./$$i $(TEST_ARGS); done
 
 clean:
        $(call QUIET_CLEAN, tests)$(RM) $(TESTS_A) $(TESTS_SO)
index 0ad82d7..288b5fe 100644 (file)
@@ -120,6 +120,70 @@ static int test_stat_thread_enable(void)
        return 0;
 }
 
+static int test_stat_user_read(int event)
+{
+       struct perf_counts_values counts = { .val = 0 };
+       struct perf_thread_map *threads;
+       struct perf_evsel *evsel;
+       struct perf_event_mmap_page *pc;
+       struct perf_event_attr attr = {
+               .type   = PERF_TYPE_HARDWARE,
+               .config = event,
+       };
+       int err, i;
+
+       threads = perf_thread_map__new_dummy();
+       __T("failed to create threads", threads);
+
+       perf_thread_map__set_pid(threads, 0, 0);
+
+       evsel = perf_evsel__new(&attr);
+       __T("failed to create evsel", evsel);
+
+       err = perf_evsel__open(evsel, NULL, threads);
+       __T("failed to open evsel", err == 0);
+
+       err = perf_evsel__mmap(evsel, 0);
+       __T("failed to mmap evsel", err == 0);
+
+       pc = perf_evsel__mmap_base(evsel, 0, 0);
+
+#if defined(__i386__) || defined(__x86_64__)
+       __T("userspace counter access not supported", pc->cap_user_rdpmc);
+       __T("userspace counter access not enabled", pc->index);
+       __T("userspace counter width not set", pc->pmc_width >= 32);
+#endif
+
+       perf_evsel__read(evsel, 0, 0, &counts);
+       __T("failed to read value for evsel", counts.val != 0);
+
+       for (i = 0; i < 5; i++) {
+               volatile int count = 0x10000 << i;
+               __u64 start, end, last = 0;
+
+               __T_VERBOSE("\tloop = %u, ", count);
+
+               perf_evsel__read(evsel, 0, 0, &counts);
+               start = counts.val;
+
+               while (count--) ;
+
+               perf_evsel__read(evsel, 0, 0, &counts);
+               end = counts.val;
+
+               __T("invalid counter data", (end - start) > last);
+               last = end - start;
+               __T_VERBOSE("count = %llu\n", end - start);
+       }
+
+       perf_evsel__munmap(evsel);
+       perf_evsel__close(evsel);
+       perf_evsel__delete(evsel);
+
+       perf_thread_map__put(threads);
+       return 0;
+}
+
 int main(int argc, char **argv)
 {
        __T_START;
@@ -129,6 +193,8 @@ int main(int argc, char **argv)
        test_stat_cpu();
        test_stat_thread();
        test_stat_thread_enable();
+       test_stat_user_read(PERF_COUNT_HW_INSTRUCTIONS);
+       test_stat_user_read(PERF_COUNT_HW_CPU_CYCLES);
 
        __T_END;
        return tests_failed == 0 ? 0 : -1;
index f3f8478..e555e97 100644 (file)
@@ -20,6 +20,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iostat
 tags
 TAGS
 cscope*
diff --git a/tools/perf/Documentation/intel-hybrid.txt b/tools/perf/Documentation/intel-hybrid.txt
new file mode 100644 (file)
index 0000000..07f0aa3
--- /dev/null
@@ -0,0 +1,214 @@
+Intel hybrid support
+--------------------
+Support for Intel hybrid events within perf tools.
+
+Some Intel platforms, such as AlderLake, are hybrid platforms that
+consist of atom cpus and core cpus. Each cpu type has a dedicated event list.
+Some events are available only on core cpus, some are available only
+on atom cpus, and some are available on both.
+
+Kernel exports two new cpu pmus via sysfs:
+/sys/devices/cpu_core
+/sys/devices/cpu_atom
+
+The 'cpus' files are created under the directories. For example,
+
+cat /sys/devices/cpu_core/cpus
+0-15
+
+cat /sys/devices/cpu_atom/cpus
+16-23
+
+It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus.
+
+Quickstart
+
+List hybrid event
+-----------------
+
+As before, use perf-list to list the symbolic event.
+
+perf list
+
+inst_retired.any
+       [Fixed Counter: Counts the number of instructions retired. Unit: cpu_atom]
+inst_retired.any
+       [Number of instructions retired. Fixed Counter - architectural event. Unit: cpu_core]
+
+The 'Unit: xxx' is added to the brief description to indicate which pmu
+the event belongs to. The same event name can be supported by
+different pmus.
+
+Enable hybrid event with a specific pmu
+---------------------------------------
+
+To enable a core-only or atom-only event, the following syntax is supported:
+
+       cpu_core/<event name>/
+or
+       cpu_atom/<event name>/
+
+For example, count the 'cycles' event on core cpus.
+
+       perf stat -e cpu_core/cycles/
+
+Create two events for one hardware event automatically
+------------------------------------------------------
+
+When an event is created and it is available on both atom and core,
+two events are created automatically. One is for atom, the other is for
+core. Most hardware events and cache events are available on both
+cpu_core and cpu_atom.
+
+Hardware events have pre-defined configs (e.g. 0 for cycles).
+But on a hybrid platform, the kernel needs to know where the event comes from
+(from atom or from core). The original perf event type PERF_TYPE_HARDWARE
+can't carry pmu information, so this type is now extended to be a PMU-aware
+type. The PMU type ID is stored at attr.config[63:32].
+
+PMU type ID is retrieved from sysfs.
+/sys/devices/cpu_atom/type
+/sys/devices/cpu_core/type
+
+The new attr.config layout for PERF_TYPE_HARDWARE:
+
+PERF_TYPE_HARDWARE:                 0xEEEEEEEE000000AA
+                                    AA: hardware event ID
+                                    EEEEEEEE: PMU type ID
+
+Cache events are similar. The type PERF_TYPE_HW_CACHE is extended to be
+a PMU-aware type. The PMU type ID is stored at attr.config[63:32].
+
+The new attr.config layout for PERF_TYPE_HW_CACHE:
+
+PERF_TYPE_HW_CACHE:                 0xEEEEEEEE00DDCCBB
+                                    BB: hardware cache ID
+                                    CC: hardware cache op ID
+                                    DD: hardware cache op result ID
+                                    EEEEEEEE: PMU type ID
+
+When enabling a hardware event without a specified pmu, such as
+perf stat -e cycles -a (system-wide in this example), two events
+are created automatically.
+
+  ------------------------------------------------------------
+  perf_event_attr:
+    size                             120
+    config                           0x400000000
+    sample_type                      IDENTIFIER
+    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+    disabled                         1
+    inherit                          1
+    exclude_guest                    1
+  ------------------------------------------------------------
+
+and
+
+  ------------------------------------------------------------
+  perf_event_attr:
+    size                             120
+    config                           0x800000000
+    sample_type                      IDENTIFIER
+    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+    disabled                         1
+    inherit                          1
+    exclude_guest                    1
+  ------------------------------------------------------------
+
+type 0 is PERF_TYPE_HARDWARE.
+0x4 in 0x400000000 indicates it's cpu_core pmu.
+0x8 in 0x800000000 indicates it's cpu_atom pmu (atom pmu type id is random).
+
+The kernel creates 'cycles' (0x400000000) on cpu0-cpu15 (core cpus),
+and creates 'cycles' (0x800000000) on cpu16-cpu23 (atom cpus).
+
+For perf-stat result, it displays two events:
+
+ Performance counter stats for 'system wide':
+
+           6,744,979      cpu_core/cycles/
+           1,965,552      cpu_atom/cycles/
+
+The first 'cycles' is core event, the second 'cycles' is atom event.
+
+Thread mode example:
+--------------------
+
+perf-stat reports the scaled counts for hybrid events, with a percentage
+displayed. The percentage is the event's running time / enabled time.
+
+For example, 'triad_loop' runs on cpu16 (an atom cpu), and we can see that the
+scaled value for core cycles is 233,066,666 and the percentage is 0.43%.
+
+perf stat -e cycles -- taskset -c 16 ./triad_loop
+
+As before, two events are created.
+
+------------------------------------------------------------
+perf_event_attr:
+  size                             120
+  config                           0x400000000
+  sample_type                      IDENTIFIER
+  read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+  disabled                         1
+  inherit                          1
+  enable_on_exec                   1
+  exclude_guest                    1
+------------------------------------------------------------
+
+and
+
+------------------------------------------------------------
+perf_event_attr:
+  size                             120
+  config                           0x800000000
+  sample_type                      IDENTIFIER
+  read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+  disabled                         1
+  inherit                          1
+  enable_on_exec                   1
+  exclude_guest                    1
+------------------------------------------------------------
+
+ Performance counter stats for 'taskset -c 16 ./triad_loop':
+
+       233,066,666      cpu_core/cycles/                                              (0.43%)
+       604,097,080      cpu_atom/cycles/                                              (99.57%)
+
+perf-record:
+------------
+
+If no '-e' is specified in perf record on a hybrid platform,
+it creates two default 'cycles' events and adds them to the event list.
+One is for core, the other is for atom.
+
+perf-stat:
+----------
+
+If no '-e' is specified in perf stat on a hybrid platform,
+besides software events, the following events are created and
+added to the event list in order.
+
+cpu_core/cycles/,
+cpu_atom/cycles/,
+cpu_core/instructions/,
+cpu_atom/instructions/,
+cpu_core/branches/,
+cpu_atom/branches/,
+cpu_core/branch-misses/,
+cpu_atom/branch-misses/
+
+Of course, both perf-stat and perf-record support enabling
+hybrid events with a specific pmu.
+
+e.g.
+perf stat -e cpu_core/cycles/
+perf stat -e cpu_atom/cycles/
+perf stat -e cpu_core/r1a/
+perf stat -e cpu_atom/L1-icache-loads/
+perf stat -e cpu_core/cycles/,cpu_atom/instructions/
+perf stat -e '{cpu_core/cycles/,cpu_core/instructions/}'
+
+But '{cpu_core/cycles/,cpu_atom/instructions/}' will produce a
+warning and disable grouping, because the pmus in the group do
+not match (cpu_core vs. cpu_atom).
index 1b5042f..80c1be5 100644 (file)
@@ -124,6 +124,13 @@ OPTIONS
 --group::
        Show event group information together
 
+--demangle::
+       Demangle symbol names to human readable form. It's enabled by default,
+       disable with --no-demangle.
+
+--demangle-kernel::
+       Demangle kernel symbol names to human readable form (for C++ kernels).
+
 --percent-type::
        Set annotation percent type from following choices:
          global-period, local-period, global-hits, local-hits
index bb167e3..cd8ce6e 100644 (file)
@@ -57,7 +57,7 @@ OPTIONS
 -u::
 --update=::
        Update specified file of the cache. Note that this doesn't remove
-       older entires since those may be still needed for annotating old
+       older entries since those may be still needed for annotating old
        (or remote) perf.data. Only if there is already a cache which has
        exactly same build-id, that is replaced by new one. It can be used
        to update kallsyms and kernel dso to vmlinux in order to support
index 153bde1..b0872c8 100644 (file)
@@ -123,6 +123,7 @@ Given a $HOME/.perfconfig like this:
                queue-size = 0
                children = true
                group = true
+               skip-empty = true
 
        [llvm]
                dump-obj = true
@@ -393,6 +394,12 @@ annotate.*::
 
                This option works with tui, stdio2 browsers.
 
+       annotate.demangle::
+               Demangle symbol names to human readable form. Default is 'true'.
+
+       annotate.demangle_kernel::
+               Demangle kernel symbol names to human readable form. Default is 'true'.
+
 hist.*::
        hist.percentage::
                This option control the way to calculate overhead of filtered entries -
@@ -525,6 +532,10 @@ report.*::
                     0.07%   0.00%  noploop  ld-2.15.so         [.] strcmp
                     0.03%   0.00%  noploop  [kernel.kallsyms]  [k] timerqueue_del
 
+       report.skip-empty::
+               This option can change default stat behavior with empty results.
+               If it's set true, 'perf report --stat' will not show 0 stats.
+
 top.*::
        top.children::
                Same as 'report.children'. So if it is enabled, the output of 'top'
index 726b9bc..417bf17 100644 (file)
@@ -17,7 +17,7 @@ Data file related processing.
 COMMANDS
 --------
 convert::
-       Converts perf data file into another format (only CTF [1] format is support by now).
+       Converts perf data file into another format.
        It's possible to set data-convert debug variable to get debug messages from conversion,
        like:
          perf --debug data-convert data convert ...
@@ -27,6 +27,9 @@ OPTIONS for 'convert'
 --to-ctf::
        Triggers the CTF conversion, specify the path of CTF data directory.
 
+--to-json::
+       Triggers JSON conversion. Specify the JSON filename to output.
+
 --tod::
        Convert time to wall clock time.
 
diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644 (file)
index 0000000..1651769
--- /dev/null
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===============
+
+NAME
+----
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+--------
+[verse]
+'perf iostat' list
+'perf iostat' <ports> -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This mode provides four I/O performance metrics for each PCIe root port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+-------
+<command>...::
+       Any command you can specify in a shell.
+
+list::
+       List all PCIe root ports.
+
+<ports>::
+       Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+--------
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<0000:00>
+   S1-uncore_iio_0<0000:80>
+   S0-uncore_iio_1<0000:17>
+   S1-uncore_iio_1<0000:85>
+   S0-uncore_iio_2<0000:3a>
+   S1-uncore_iio_2<0000:ae>
+   S0-uncore_iio_3<0000:5d>
+   S1-uncore_iio_3<0000:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+    Performance counter stats for 'system wide':
+
+      port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
+   0000:00                    1                    0                    2                    3
+   0000:80                    0                    0                    0                    0
+   0000:17               352552                   43                    0                   21
+   0000:85                    0                    0                    0                    0
+   0000:3a                    3                    0                    0                    0
+   0000:ae                    0                    0                    0                    0
+   0000:5d                    0                    0                    0                    0
+   0000:d7                    0                    0                    0                    0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+    Performance counter stats for 'system wide':
+
+      port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
+   0000:17               358559                   44                    0                   22
+   0000:3a                    3                    2                    0                    0
+
+        197.081983474 seconds time elapsed
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
\ No newline at end of file
index f3161c9..d71bac8 100644 (file)
@@ -695,6 +695,7 @@ measurements:
  wait -n ${perf_pid}
  exit $?
 
+include::intel-hybrid.txt[]
 
 SEE ALSO
 --------
index f546b5e..24efc05 100644 (file)
@@ -112,6 +112,8 @@ OPTIONS
        - ins_lat: Instruction latency in core cycles. This is the global instruction
          latency
        - local_ins_lat: Local instruction latency version
+       - p_stage_cyc: On powerpc, this presents the number of cycles spent in a
+         pipeline stage. It is currently supported only on powerpc.
 
        By default, comm, dso and symbol keys are used.
        (i.e. --sort comm,dso,symbol)
@@ -224,6 +226,9 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
+--disable-order::
+       Disable raw trace ordering.
+
 -g::
 --call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>::
         Display call chains using type, min percent threshold, print limit,
@@ -472,7 +477,7 @@ OPTIONS
        but probably we'll make the default not to show the switch-on/off events
         on the --group mode and if there is only one event besides the off/on ones,
        go straight to the histogram browser, just like 'perf report' with no events
-       explicitely specified does.
+       explicitly specified does.
 
 --itrace::
        Options for decoding instruction tracing data. The options are:
@@ -566,6 +571,9 @@ include::itrace.txt[]
                            sampled cycles
        'Avg Cycles'      - block average sampled cycles
 
+--skip-empty::
+       Do not print 0 results in the --stat output.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
index 08a1714..45c2467 100644 (file)
@@ -93,6 +93,19 @@ report::
 
         1.102235068 seconds time elapsed
 
+--bpf-counters::
+       Use BPF programs to aggregate readings from perf_events.  This
+       allows multiple perf-stat sessions that are counting the same metric (cycles,
+       instructions, etc.) to share hardware counters.
+       To use BPF programs on common events by default, use
+       "perf config stat.bpf-counter-events=<list_of_events>".
+
+--bpf-attr-map::
+       With option "--bpf-counters", different perf-stat sessions share
+       information about shared BPF programs and maps via a pinned hashmap.
+       Use "--bpf-attr-map" to specify the path of this pinned hashmap.
+       The default path is /sys/fs/bpf/perf_attr_map.
+
 ifdef::HAVE_LIBPFM[]
 --pfm-events events::
 Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
@@ -142,7 +155,10 @@ Do not aggregate counts across all monitored CPUs.
 
 -n::
 --null::
-        null run - don't start any counters
+null run - Don't start any counters.
+
+This can be useful to measure just elapsed wall-clock time - or to assess the
+raw overhead of perf stat itself, without running any counters.
 
 -v::
 --verbose::
@@ -468,6 +484,15 @@ convenient for post processing.
 --summary::
 Print summary for interval mode (-I).
 
+--no-csv-summary::
+Don't print 'summary' at the first column for CSV summary output.
+This option must be used with -x and --summary.
+
+This option can be enabled in perf config by setting the variable
+'stat.no-csv-summary'.
+
+$ perf config stat.no-csv-summary=true
+
 EXAMPLES
 --------
 
@@ -527,6 +552,8 @@ The fields are in this order:
 
 Additional metrics may be printed with all earlier fields being empty.
 
+include::intel-hybrid.txt[]
+
 SEE ALSO
 --------
 linkperf:perf-top[1], linkperf:perf-list[1]
index ee20246..bba5ffb 100644 (file)
@@ -317,7 +317,7 @@ Default is to monitor all CPUS.
        but probably we'll make the default not to show the switch-on/off events
         on the --group mode and if there is only one event besides the off/on ones,
        go straight to the histogram browser, just like 'perf top' with no events
-       explicitely specified does.
+       explicitly specified does.
 
 --stitch-lbr::
        Show callgraph with stitched LBRs, which may have more complete
index c130a3c..9c330cd 100644 (file)
@@ -76,3 +76,15 @@ SEE ALSO
 linkperf:perf-stat[1], linkperf:perf-top[1],
 linkperf:perf-record[1], linkperf:perf-report[1],
 linkperf:perf-list[1]
+
+linkperf:perf-annotate[1],linkperf:perf-archive[1],
+linkperf:perf-bench[1], linkperf:perf-buildid-cache[1],
+linkperf:perf-buildid-list[1], linkperf:perf-c2c[1],
+linkperf:perf-config[1], linkperf:perf-data[1], linkperf:perf-diff[1],
+linkperf:perf-evlist[1], linkperf:perf-ftrace[1],
+linkperf:perf-help[1], linkperf:perf-inject[1],
+linkperf:perf-intel-pt[1], linkperf:perf-kallsyms[1],
+linkperf:perf-kmem[1], linkperf:perf-kvm[1], linkperf:perf-lock[1],
+linkperf:perf-mem[1], linkperf:perf-probe[1], linkperf:perf-sched[1],
+linkperf:perf-script[1], linkperf:perf-test[1],
+linkperf:perf-trace[1], linkperf:perf-version[1]
index 10f07f9..c6302df 100644 (file)
@@ -72,6 +72,7 @@ For example, the perf_event_attr structure can be initialized with
 The Fixed counter 3 must be the leader of the group.
 
 #include <linux/perf_event.h>
+#include <sys/mman.h>
 #include <sys/syscall.h>
 #include <unistd.h>
 
@@ -95,6 +96,11 @@ int slots_fd = perf_event_open(&slots, 0, -1, -1, 0);
 if (slots_fd < 0)
        ... error ...
 
+/* Memory mapping the fd permits _rdpmc calls from userspace */
+void *slots_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, slots_fd, 0);
+if (slots_p == MAP_FAILED)
+       ... error ...
+
 /*
  * Open metrics event file descriptor for current task.
  * Set slots event as the leader of the group.
@@ -110,6 +116,14 @@ int metrics_fd = perf_event_open(&metrics, 0, -1, slots_fd, 0);
 if (metrics_fd < 0)
        ... error ...
 
+/* Memory mapping the fd permits _rdpmc calls from userspace */
+void *metrics_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, metrics_fd, 0);
+if (metrics_p == MAP_FAILED)
+       ... error ...
+
+Note: the file descriptors returned by the perf_event_open calls must be memory
+mapped to permit calls to the _rdpmc intrinsic. Permission may also be granted
+by writing the /sys/devices/cpu/rdpmc sysfs node.
 
 The RDPMC instruction (or _rdpmc compiler intrinsic) can now be used
 to read slots and the topdown metrics at different points of the program:
@@ -141,6 +155,10 @@ as the parallelism and overlap in the CPU program execution will
 cause too much measurement inaccuracy. For example instrumenting
 individual basic blocks is definitely too fine grained.
 
+_rdpmc calls should not be mixed with reading the metrics and slots counters
+through system calls, as the kernel will reset these counters after each system
+call.
+
 Decoding metrics values
 =======================
 
index b8fc7d9..f3fe360 100644 (file)
@@ -100,7 +100,10 @@ clean:
 # make -C tools/perf -f tests/make
 #
 build-test:
-       @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out
+       @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg make_static make_with_gtk2 out
+
+build-test-tarball:
+       @$(MAKE) -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory out
 
 #
 # All other targets get passed through:
index d8e59d3..0d66190 100644 (file)
@@ -32,7 +32,7 @@ ifneq ($(NO_SYSCALL_TABLE),1)
       NO_SYSCALL_TABLE := 0
     endif
   else
-    ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390))
+    ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips))
       NO_SYSCALL_TABLE := 0
     endif
   endif
@@ -87,6 +87,13 @@ ifeq ($(ARCH),s390)
   CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated
 endif
 
+ifeq ($(ARCH),mips)
+  NO_PERF_REGS := 0
+  CFLAGS += -I$(OUTPUT)arch/mips/include/generated
+  CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi
+  LIBUNWIND_LIBS = -lunwind -lunwind-mips
+endif
+
 ifeq ($(NO_PERF_REGS),0)
   $(call detected,CONFIG_PERF_REGS)
 endif
@@ -292,6 +299,9 @@ ifneq ($(TCMALLOC),)
 endif
 
 ifeq ($(FEATURES_DUMP),)
+# We will display at the end of this Makefile.config, using $(call feature_display_entries)
+# As we may retry some feature detection here, see the disassembler-four-args case, for instance
+  FEATURE_DISPLAY_DEFERRED := 1
 include $(srctree)/tools/build/Makefile.feature
 else
 include $(FEATURES_DUMP)
@@ -1072,6 +1082,15 @@ ifdef LIBPFM4
   endif
 endif
 
+ifdef LIBTRACEEVENT_DYNAMIC
+  $(call feature_check,libtraceevent)
+  ifeq ($(feature-libtraceevent), 1)
+    EXTLIBS += -ltraceevent
+  else
+    dummy := $(error Error: No libtraceevent devel library found, please install libtraceevent-devel);
+  endif
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   perf_include_dir
@@ -1208,3 +1227,13 @@ $(call detected_var,LIBDIR)
 $(call detected_var,GTK_CFLAGS)
 $(call detected_var,PERL_EMBED_CCOPTS)
 $(call detected_var,PYTHON_EMBED_CCOPTS)
+
+# re-generate FEATURE-DUMP as we may have called feature_check, found out
+# extra libraries to add to LDFLAGS of some other test and then redo those
+# tests, see the block about libbfd, disassembler-four-args, for instance.
+$(shell rm -f $(FEATURE_DUMP_FILENAME))
+$(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME)))
+
+ifeq ($(feature_display),1)
+  $(call feature_display_entries)
+endif
index f6e6096..e47f04e 100644 (file)
@@ -128,6 +128,8 @@ include ../scripts/utilities.mak
 #
 # Define BUILD_BPF_SKEL to enable BPF skeletons
 #
+# Define LIBTRACEEVENT_DYNAMIC to enable libtraceevent dynamic linking
+#
 
 # As per kernel Makefile, avoid funny character set dependencies
 unexport LC_ALL
@@ -283,6 +285,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH += perf-iostat.sh
 
 grep-libs = $(filter -l%,$(1))
 strip-libs = $(filter-out -l%,$(1))
@@ -309,7 +312,6 @@ endif
 
 LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
 export LIBTRACEEVENT
-
 LIBTRACEEVENT_DYNAMIC_LIST = $(PLUGINS_PATH)libtraceevent-dynamic-list
 
 #
@@ -374,12 +376,15 @@ endif
 
 export PERL_PATH
 
-PERFLIBS = $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD) $(LIBPERF)
+PERFLIBS = $(LIBAPI) $(LIBSUBCMD) $(LIBPERF)
 ifndef NO_LIBBPF
   ifndef LIBBPF_DYNAMIC
     PERFLIBS += $(LIBBPF)
   endif
 endif
+ifndef LIBTRACEEVENT_DYNAMIC
+  PERFLIBS += $(LIBTRACEEVENT)
+endif
 
 # We choose to avoid "if .. else if .. else .. endif endif"
 # because maintaining the nesting to match is a pain.  If
@@ -948,6 +953,8 @@ endif
                $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
        $(call QUIET_INSTALL, perf-with-kcore) \
                $(INSTALL) $(OUTPUT)perf-with-kcore -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+       $(call QUIET_INSTALL, perf-iostat) \
+               $(INSTALL) $(OUTPUT)perf-iostat -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifndef NO_LIBAUDIT
        $(call QUIET_INSTALL, strace/groups) \
                $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(STRACE_GROUPS_INSTDIR_SQ)'; \
@@ -1007,6 +1014,7 @@ python-clean:
 SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
 SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
+SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 
 ifdef BUILD_BPF_SKEL
 BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
@@ -1021,7 +1029,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT)
                OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
 
 $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
-       $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf $(BPF_INCLUDE) \
+       $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
          -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
 
 $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
@@ -1041,7 +1049,7 @@ bpf-skel-clean:
        $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS)
 
 clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean
-       $(call QUIET_CLEAN, core-objs)  $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
+       $(call QUIET_CLEAN, core-objs)  $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(OUTPUT)perf-iostat $(LANG_BINDINGS)
        $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
        $(Q)$(RM) $(OUTPUT).config-detected
        $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so
index c25c878..d942f11 100644 (file)
@@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
        char path[PATH_MAX];
        int err = -EINVAL;
        u32 val;
+       u64 contextid;
 
        ptr = container_of(itr, struct cs_etm_recording, itr);
        cs_etm_pmu = ptr->cs_etm_pmu;
@@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
                goto out;
        }
 
+       /* If the user has configured PID tracing, respect it. */
+       contextid = evsel->core.attr.config &
+                       (BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2));
+
        /*
-        * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing
-        * is supported:
-        *  0b00000 Context ID tracing is not supported.
-        *  0b00100 Maximum of 32-bit Context ID size.
-        *  All other values are reserved.
+        * If user doesn't configure the contextid format, parse PMU format and
+        * enable PID tracing according to the "contextid" format bits:
+        *
+        *   If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1;
+        *   If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2.
         */
-       val = BMVAL(val, 5, 9);
-       if (!val || val != 0x4) {
-               err = -EINVAL;
-               goto out;
+       if (!contextid)
+               contextid = perf_pmu__format_bits(&cs_etm_pmu->format,
+                                                 "contextid");
+
+       if (contextid & BIT(ETM_OPT_CTXTID)) {
+               /*
+                * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
+                * tracing is supported:
+                *  0b00000 Context ID tracing is not supported.
+                *  0b00100 Maximum of 32-bit Context ID size.
+                *  All other values are reserved.
+                */
+               val = BMVAL(val, 5, 9);
+               if (!val || val != 0x4) {
+                       pr_err("%s: CONTEXTIDR_EL1 isn't supported\n",
+                              CORESIGHT_ETM_PMU_NAME);
+                       err = -EINVAL;
+                       goto out;
+               }
+       }
+
+       if (contextid & BIT(ETM_OPT_CTXTID2)) {
+               /*
+                * TRCIDR2.VMIDOPT[30:29] != 0 and
+                * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
+                * We can't support CONTEXTIDR in VMID if the size of the
+                * virtual context id is < 32bit.
+                * Any value of VMIDSIZE >= 4 (i.e., >= 32bit) is fine for us.
+                */
+               if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+                       pr_err("%s: CONTEXTIDR_EL2 isn't supported\n",
+                              CORESIGHT_ETM_PMU_NAME);
+                       err = -EINVAL;
+                       goto out;
+               }
        }
 
        /* All good, let the kernel know */
-       evsel->core.attr.config |= (1 << ETM_OPT_CTXTID);
+       evsel->core.attr.config |= contextid;
        err = 0;
 
 out:
-
        return err;
 }
 
@@ -173,17 +208,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
                    !cpu_map__has(online_cpus, i))
                        continue;
 
-               if (option & ETM_SET_OPT_CTXTID) {
+               if (option & BIT(ETM_OPT_CTXTID)) {
                        err = cs_etm_set_context_id(itr, evsel, i);
                        if (err)
                                goto out;
                }
-               if (option & ETM_SET_OPT_TS) {
+               if (option & BIT(ETM_OPT_TS)) {
                        err = cs_etm_set_timestamp(itr, evsel, i);
                        if (err)
                                goto out;
                }
-               if (option & ~(ETM_SET_OPT_MASK))
+               if (option & ~(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS)))
                        /* Nothing else is currently supported */
                        goto out;
        }
@@ -343,7 +378,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
                        opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
                }
 
-               /* Snapshost size can't be bigger than the auxtrace area */
+               /* Snapshot size can't be bigger than the auxtrace area */
                if (opts->auxtrace_snapshot_size >
                                opts->auxtrace_mmap_pages * (size_t)page_size) {
                        pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n",
@@ -410,7 +445,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
                evsel__set_sample_bit(cs_etm_evsel, CPU);
 
                err = cs_etm_set_option(itr, cs_etm_evsel,
-                                       ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS);
+                                       BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS));
                if (err)
                        goto out;
        }
@@ -489,7 +524,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
                config |= BIT(ETM4_CFG_BIT_TS);
        if (config_opts & BIT(ETM_OPT_RETSTK))
                config |= BIT(ETM4_CFG_BIT_RETSTK);
-
+       if (config_opts & BIT(ETM_OPT_CTXTID2))
+               config |= BIT(ETM4_CFG_BIT_VMID) |
+                         BIT(ETM4_CFG_BIT_VMID_OPT);
        return config;
 }
 
@@ -576,7 +613,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
                                struct auxtrace_record *itr,
                                struct perf_record_auxtrace_info *info)
 {
-       u32 increment;
+       u32 increment, nr_trc_params;
        u64 magic;
        struct cs_etm_recording *ptr =
                        container_of(itr, struct cs_etm_recording, itr);
@@ -611,6 +648,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 
                /* How much space was used */
                increment = CS_ETMV4_PRIV_MAX;
+               nr_trc_params = CS_ETMV4_PRIV_MAX - CS_ETMV4_TRCCONFIGR;
        } else {
                magic = __perf_cs_etmv3_magic;
                /* Get configuration register */
@@ -628,11 +666,13 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 
                /* How much space was used */
                increment = CS_ETM_PRIV_MAX;
+               nr_trc_params = CS_ETM_PRIV_MAX - CS_ETM_ETMCR;
        }
 
        /* Build generic header portion */
        info->priv[*offset + CS_ETM_MAGIC] = magic;
        info->priv[*offset + CS_ETM_CPU] = cpu;
+       info->priv[*offset + CS_ETM_NR_TRC_PARAMS] = nr_trc_params;
        /* Where the next CPU entry should start from */
        *offset += increment;
 }
@@ -678,7 +718,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 
        /* First fill out the session header */
        info->type = PERF_AUXTRACE_CS_ETM;
-       info->priv[CS_HEADER_VERSION_0] = 0;
+       info->priv[CS_HEADER_VERSION] = CS_HEADER_CURRENT_VERSION;
        info->priv[CS_PMU_TYPE_CPUS] = type << 32;
        info->priv[CS_PMU_TYPE_CPUS] |= nr_cpu;
        info->priv[CS_ETM_SNAPSHOT] = ptr->snapshot_mode;
index ead2f22..9fcb4e6 100644 (file)
@@ -2,6 +2,7 @@ perf-y += header.o
 perf-y += machine.o
 perf-y += perf_regs.o
 perf-y += tsc.o
+perf-y += pmu.o
 perf-y += kvm-stat.o
 perf-$(CONFIG_DWARF)     += dwarf-regs.o
 perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
index 50376b9..2303256 100644 (file)
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <memory.h>
-#include "../../util/evsel.h"
-#include "../../util/kvm-stat.h"
+#include "../../../util/evsel.h"
+#include "../../../util/kvm-stat.h"
 #include "arm64_exception_types.h"
 #include "debug.h"
 
index 40c5e0b..7e77142 100644 (file)
@@ -6,11 +6,11 @@
 #include "debug.h"
 #include "symbol.h"
 
-/* On arm64, kernel text segment start at high memory address,
+/* On arm64, kernel text segment starts at high memory address,
  * for example 0xffff 0000 8xxx xxxx. Modules start at a low memory
- * address, like 0xffff 0000 00ax xxxx. When only samll amount of
+ * address, like 0xffff 0000 00ax xxxx. When only small amount of
  * memory is used by modules, gap between end of module's text segment
- * and start of kernel text segment may be reach 2G.
+ * and start of kernel text segment may reach 2G.
  * Therefore do not fill this gap and do not assign it to the kernel dso map.
  */
 
index 2518cde..476b037 100644 (file)
@@ -108,7 +108,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
                /* [sp], [sp, NUM] or [sp,NUM] */
                new_len = 7;    /* + ( % s p ) NULL */
 
-               /* If the arugment is [sp], need to fill offset '0' */
+               /* If the argument is [sp], need to fill offset '0' */
                if (rm[2].rm_so == -1)
                        new_len += 1;
                else
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
new file mode 100644 (file)
index 0000000..2234fbd
--- /dev/null
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../../../util/cpumap.h"
+#include "../../../util/pmu.h"
+
+struct pmu_events_map *pmu_events_map__find(void)
+{
+       struct perf_pmu *pmu = NULL;
+
+       while ((pmu = perf_pmu__scan(pmu))) {
+               if (!is_pmu_core(pmu->name))
+                       continue;
+
+               /*
+                * The cpumap should cover all CPUs. Otherwise, some CPUs may
+                * not support some events or have different event IDs.
+                */
+               if (pmu->cpus->nr != cpu__max_cpu())
+                       return NULL;
+
+               return perf_pmu__find_map(pmu);
+       }
+
+       return NULL;
+}
index 1495a95..5aecf88 100644 (file)
@@ -4,9 +4,9 @@
 #ifndef REMOTE_UNWIND_LIBUNWIND
 #include <libunwind.h>
 #include "perf_regs.h"
-#include "../../util/unwind.h"
+#include "../../../util/unwind.h"
 #endif
-#include "../../util/debug.h"
+#include "../../../util/debug.h"
 
 int LIBUNWIND__ARCH_REG_ID(int regnum)
 {
diff --git a/tools/perf/arch/mips/Makefile b/tools/perf/arch/mips/Makefile
new file mode 100644 (file)
index 0000000..8bc0907
--- /dev/null
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+endif
+
+# Syscall table generation for perf
+out    := $(OUTPUT)arch/mips/include/generated/asm
+header := $(out)/syscalls_n64.c
+sysprf := $(srctree)/tools/perf/arch/mips/entry/syscalls
+sysdef := $(sysprf)/syscall_n64.tbl
+systbl := $(sysprf)/mksyscalltbl
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+$(header): $(sysdef) $(systbl)
+       $(Q)$(SHELL) '$(systbl)' $(sysdef) > $@
+
+clean::
+       $(call QUIET_CLEAN, mips) $(RM) $(header)
+
+archheaders: $(header)
diff --git a/tools/perf/arch/mips/entry/syscalls/mksyscalltbl b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl
new file mode 100644 (file)
index 0000000..fb1f494
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate system call table for perf. Derived from
+# s390 script.
+#
+# Author(s):  Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+# Changed by: Tiezhu Yang <yangtiezhu@loongson.cn>
+
+SYSCALL_TBL=$1
+
+if ! test -r $SYSCALL_TBL; then
+       echo "Could not read input file" >&2
+       exit 1
+fi
+
+create_table()
+{
+       local max_nr nr abi sc discard
+
+       echo 'static const char *syscalltbl_mips_n64[] = {'
+       while read nr abi sc discard; do
+               printf '\t[%d] = "%s",\n' $nr $sc
+               max_nr=$nr
+       done
+       echo '};'
+       echo "#define SYSCALLTBL_MIPS_N64_MAX_ID $max_nr"
+}
+
+grep -E "^[[:digit:]]+[[:space:]]+(n64)" $SYSCALL_TBL  \
+       |sort -k1 -n                                    \
+       |create_table
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
new file mode 100644 (file)
index 0000000..9164969
--- /dev/null
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# system call numbers and entry vectors for mips
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The <abi> is always "n64" for this file.
+#
+0      n64     read                            sys_read
+1      n64     write                           sys_write
+2      n64     open                            sys_open
+3      n64     close                           sys_close
+4      n64     stat                            sys_newstat
+5      n64     fstat                           sys_newfstat
+6      n64     lstat                           sys_newlstat
+7      n64     poll                            sys_poll
+8      n64     lseek                           sys_lseek
+9      n64     mmap                            sys_mips_mmap
+10     n64     mprotect                        sys_mprotect
+11     n64     munmap                          sys_munmap
+12     n64     brk                             sys_brk
+13     n64     rt_sigaction                    sys_rt_sigaction
+14     n64     rt_sigprocmask                  sys_rt_sigprocmask
+15     n64     ioctl                           sys_ioctl
+16     n64     pread64                         sys_pread64
+17     n64     pwrite64                        sys_pwrite64
+18     n64     readv                           sys_readv
+19     n64     writev                          sys_writev
+20     n64     access                          sys_access
+21     n64     pipe                            sysm_pipe
+22     n64     _newselect                      sys_select
+23     n64     sched_yield                     sys_sched_yield
+24     n64     mremap                          sys_mremap
+25     n64     msync                           sys_msync
+26     n64     mincore                         sys_mincore
+27     n64     madvise                         sys_madvise
+28     n64     shmget                          sys_shmget
+29     n64     shmat                           sys_shmat
+30     n64     shmctl                          sys_old_shmctl
+31     n64     dup                             sys_dup
+32     n64     dup2                            sys_dup2
+33     n64     pause                           sys_pause
+34     n64     nanosleep                       sys_nanosleep
+35     n64     getitimer                       sys_getitimer
+36     n64     setitimer                       sys_setitimer
+37     n64     alarm                           sys_alarm
+38     n64     getpid                          sys_getpid
+39     n64     sendfile                        sys_sendfile64
+40     n64     socket                          sys_socket
+41     n64     connect                         sys_connect
+42     n64     accept                          sys_accept
+43     n64     sendto                          sys_sendto
+44     n64     recvfrom                        sys_recvfrom
+45     n64     sendmsg                         sys_sendmsg
+46     n64     recvmsg                         sys_recvmsg
+47     n64     shutdown                        sys_shutdown
+48     n64     bind                            sys_bind
+49     n64     listen                          sys_listen
+50     n64     getsockname                     sys_getsockname
+51     n64     getpeername                     sys_getpeername
+52     n64     socketpair                      sys_socketpair
+53     n64     setsockopt                      sys_setsockopt
+54     n64     getsockopt                      sys_getsockopt
+55     n64     clone                           __sys_clone
+56     n64     fork                            __sys_fork
+57     n64     execve                          sys_execve
+58     n64     exit                            sys_exit
+59     n64     wait4                           sys_wait4
+60     n64     kill                            sys_kill
+61     n64     uname                           sys_newuname
+62     n64     semget                          sys_semget
+63     n64     semop                           sys_semop
+64     n64     semctl                          sys_old_semctl
+65     n64     shmdt                           sys_shmdt
+66     n64     msgget                          sys_msgget
+67     n64     msgsnd                          sys_msgsnd
+68     n64     msgrcv                          sys_msgrcv
+69     n64     msgctl                          sys_old_msgctl
+70     n64     fcntl                           sys_fcntl
+71     n64     flock                           sys_flock
+72     n64     fsync                           sys_fsync
+73     n64     fdatasync                       sys_fdatasync
+74     n64     truncate                        sys_truncate
+75     n64     ftruncate                       sys_ftruncate
+76     n64     getdents                        sys_getdents
+77     n64     getcwd                          sys_getcwd
+78     n64     chdir                           sys_chdir
+79     n64     fchdir                          sys_fchdir
+80     n64     rename                          sys_rename
+81     n64     mkdir                           sys_mkdir
+82     n64     rmdir                           sys_rmdir
+83     n64     creat                           sys_creat
+84     n64     link                            sys_link
+85     n64     unlink                          sys_unlink
+86     n64     symlink                         sys_symlink
+87     n64     readlink                        sys_readlink
+88     n64     chmod                           sys_chmod
+89     n64     fchmod                          sys_fchmod
+90     n64     chown                           sys_chown
+91     n64     fchown                          sys_fchown
+92     n64     lchown                          sys_lchown
+93     n64     umask                           sys_umask
+94     n64     gettimeofday                    sys_gettimeofday
+95     n64     getrlimit                       sys_getrlimit
+96     n64     getrusage                       sys_getrusage
+97     n64     sysinfo                         sys_sysinfo
+98     n64     times                           sys_times
+99     n64     ptrace                          sys_ptrace
+100    n64     getuid                          sys_getuid
+101    n64     syslog                          sys_syslog
+102    n64     getgid                          sys_getgid
+103    n64     setuid                          sys_setuid
+104    n64     setgid                          sys_setgid
+105    n64     geteuid                         sys_geteuid
+106    n64     getegid                         sys_getegid
+107    n64     setpgid                         sys_setpgid
+108    n64     getppid                         sys_getppid
+109    n64     getpgrp                         sys_getpgrp
+110    n64     setsid                          sys_setsid
+111    n64     setreuid                        sys_setreuid
+112    n64     setregid                        sys_setregid
+113    n64     getgroups                       sys_getgroups
+114    n64     setgroups                       sys_setgroups
+115    n64     setresuid                       sys_setresuid
+116    n64     getresuid                       sys_getresuid
+117    n64     setresgid                       sys_setresgid
+118    n64     getresgid                       sys_getresgid
+119    n64     getpgid                         sys_getpgid
+120    n64     setfsuid                        sys_setfsuid
+121    n64     setfsgid                        sys_setfsgid
+122    n64     getsid                          sys_getsid
+123    n64     capget                          sys_capget
+124    n64     capset                          sys_capset
+125    n64     rt_sigpending                   sys_rt_sigpending
+126    n64     rt_sigtimedwait                 sys_rt_sigtimedwait
+127    n64     rt_sigqueueinfo                 sys_rt_sigqueueinfo
+128    n64     rt_sigsuspend                   sys_rt_sigsuspend
+129    n64     sigaltstack                     sys_sigaltstack
+130    n64     utime                           sys_utime
+131    n64     mknod                           sys_mknod
+132    n64     personality                     sys_personality
+133    n64     ustat                           sys_ustat
+134    n64     statfs                          sys_statfs
+135    n64     fstatfs                         sys_fstatfs
+136    n64     sysfs                           sys_sysfs
+137    n64     getpriority                     sys_getpriority
+138    n64     setpriority                     sys_setpriority
+139    n64     sched_setparam                  sys_sched_setparam
+140    n64     sched_getparam                  sys_sched_getparam
+141    n64     sched_setscheduler              sys_sched_setscheduler
+142    n64     sched_getscheduler              sys_sched_getscheduler
+143    n64     sched_get_priority_max          sys_sched_get_priority_max
+144    n64     sched_get_priority_min          sys_sched_get_priority_min
+145    n64     sched_rr_get_interval           sys_sched_rr_get_interval
+146    n64     mlock                           sys_mlock
+147    n64     munlock                         sys_munlock
+148    n64     mlockall                        sys_mlockall
+149    n64     munlockall                      sys_munlockall
+150    n64     vhangup                         sys_vhangup
+151    n64     pivot_root                      sys_pivot_root
+152    n64     _sysctl                         sys_ni_syscall
+153    n64     prctl                           sys_prctl
+154    n64     adjtimex                        sys_adjtimex
+155    n64     setrlimit                       sys_setrlimit
+156    n64     chroot                          sys_chroot
+157    n64     sync                            sys_sync
+158    n64     acct                            sys_acct
+159    n64     settimeofday                    sys_settimeofday
+160    n64     mount                           sys_mount
+161    n64     umount2                         sys_umount
+162    n64     swapon                          sys_swapon
+163    n64     swapoff                         sys_swapoff
+164    n64     reboot                          sys_reboot
+165    n64     sethostname                     sys_sethostname
+166    n64     setdomainname                   sys_setdomainname
+167    n64     create_module                   sys_ni_syscall
+168    n64     init_module                     sys_init_module
+169    n64     delete_module                   sys_delete_module
+170    n64     get_kernel_syms                 sys_ni_syscall
+171    n64     query_module                    sys_ni_syscall
+172    n64     quotactl                        sys_quotactl
+173    n64     nfsservctl                      sys_ni_syscall
+174    n64     getpmsg                         sys_ni_syscall
+175    n64     putpmsg                         sys_ni_syscall
+176    n64     afs_syscall                     sys_ni_syscall
+# 177 reserved for security
+177    n64     reserved177                     sys_ni_syscall
+178    n64     gettid                          sys_gettid
+179    n64     readahead                       sys_readahead
+180    n64     setxattr                        sys_setxattr
+181    n64     lsetxattr                       sys_lsetxattr
+182    n64     fsetxattr                       sys_fsetxattr
+183    n64     getxattr                        sys_getxattr
+184    n64     lgetxattr                       sys_lgetxattr
+185    n64     fgetxattr                       sys_fgetxattr
+186    n64     listxattr                       sys_listxattr
+187    n64     llistxattr                      sys_llistxattr
+188    n64     flistxattr                      sys_flistxattr
+189    n64     removexattr                     sys_removexattr
+190    n64     lremovexattr                    sys_lremovexattr
+191    n64     fremovexattr                    sys_fremovexattr
+192    n64     tkill                           sys_tkill
+193    n64     reserved193                     sys_ni_syscall
+194    n64     futex                           sys_futex
+195    n64     sched_setaffinity               sys_sched_setaffinity
+196    n64     sched_getaffinity               sys_sched_getaffinity
+197    n64     cacheflush                      sys_cacheflush
+198    n64     cachectl                        sys_cachectl
+199    n64     sysmips                         __sys_sysmips
+200    n64     io_setup                        sys_io_setup
+201    n64     io_destroy                      sys_io_destroy
+202    n64     io_getevents                    sys_io_getevents
+203    n64     io_submit                       sys_io_submit
+204    n64     io_cancel                       sys_io_cancel
+205    n64     exit_group                      sys_exit_group
+206    n64     lookup_dcookie                  sys_lookup_dcookie
+207    n64     epoll_create                    sys_epoll_create
+208    n64     epoll_ctl                       sys_epoll_ctl
+209    n64     epoll_wait                      sys_epoll_wait
+210    n64     remap_file_pages                sys_remap_file_pages
+211    n64     rt_sigreturn                    sys_rt_sigreturn
+212    n64     set_tid_address                 sys_set_tid_address
+213    n64     restart_syscall                 sys_restart_syscall
+214    n64     semtimedop                      sys_semtimedop
+215    n64     fadvise64                       sys_fadvise64_64
+216    n64     timer_create                    sys_timer_create
+217    n64     timer_settime                   sys_timer_settime
+218    n64     timer_gettime                   sys_timer_gettime
+219    n64     timer_getoverrun                sys_timer_getoverrun
+220    n64     timer_delete                    sys_timer_delete
+221    n64     clock_settime                   sys_clock_settime
+222    n64     clock_gettime                   sys_clock_gettime
+223    n64     clock_getres                    sys_clock_getres
+224    n64     clock_nanosleep                 sys_clock_nanosleep
+225    n64     tgkill                          sys_tgkill
+226    n64     utimes                          sys_utimes
+227    n64     mbind                           sys_mbind
+228    n64     get_mempolicy                   sys_get_mempolicy
+229    n64     set_mempolicy                   sys_set_mempolicy
+230    n64     mq_open                         sys_mq_open
+231    n64     mq_unlink                       sys_mq_unlink
+232    n64     mq_timedsend                    sys_mq_timedsend
+233    n64     mq_timedreceive                 sys_mq_timedreceive
+234    n64     mq_notify                       sys_mq_notify
+235    n64     mq_getsetattr                   sys_mq_getsetattr
+236    n64     vserver                         sys_ni_syscall
+237    n64     waitid                          sys_waitid
+# 238 was sys_setaltroot
+239    n64     add_key                         sys_add_key
+240    n64     request_key                     sys_request_key
+241    n64     keyctl                          sys_keyctl
+242    n64     set_thread_area                 sys_set_thread_area
+243    n64     inotify_init                    sys_inotify_init
+244    n64     inotify_add_watch               sys_inotify_add_watch
+245    n64     inotify_rm_watch                sys_inotify_rm_watch
+246    n64     migrate_pages                   sys_migrate_pages
+247    n64     openat                          sys_openat
+248    n64     mkdirat                         sys_mkdirat
+249    n64     mknodat                         sys_mknodat
+250    n64     fchownat                        sys_fchownat
+251    n64     futimesat                       sys_futimesat
+252    n64     newfstatat                      sys_newfstatat
+253    n64     unlinkat                        sys_unlinkat
+254    n64     renameat                        sys_renameat
+255    n64     linkat                          sys_linkat
+256    n64     symlinkat                       sys_symlinkat
+257    n64     readlinkat                      sys_readlinkat
+258    n64     fchmodat                        sys_fchmodat
+259    n64     faccessat                       sys_faccessat
+260    n64     pselect6                        sys_pselect6
+261    n64     ppoll                           sys_ppoll
+262    n64     unshare                         sys_unshare
+263    n64     splice                          sys_splice
+264    n64     sync_file_range                 sys_sync_file_range
+265    n64     tee                             sys_tee
+266    n64     vmsplice                        sys_vmsplice
+267    n64     move_pages                      sys_move_pages
+268    n64     set_robust_list                 sys_set_robust_list
+269    n64     get_robust_list                 sys_get_robust_list
+270    n64     kexec_load                      sys_kexec_load
+271    n64     getcpu                          sys_getcpu
+272    n64     epoll_pwait                     sys_epoll_pwait
+273    n64     ioprio_set                      sys_ioprio_set
+274    n64     ioprio_get                      sys_ioprio_get
+275    n64     utimensat                       sys_utimensat
+276    n64     signalfd                        sys_signalfd
+277    n64     timerfd                         sys_ni_syscall
+278    n64     eventfd                         sys_eventfd
+279    n64     fallocate                       sys_fallocate
+280    n64     timerfd_create                  sys_timerfd_create
+281    n64     timerfd_gettime                 sys_timerfd_gettime
+282    n64     timerfd_settime                 sys_timerfd_settime
+283    n64     signalfd4                       sys_signalfd4
+284    n64     eventfd2                        sys_eventfd2
+285    n64     epoll_create1                   sys_epoll_create1
+286    n64     dup3                            sys_dup3
+287    n64     pipe2                           sys_pipe2
+288    n64     inotify_init1                   sys_inotify_init1
+289    n64     preadv                          sys_preadv
+290    n64     pwritev                         sys_pwritev
+291    n64     rt_tgsigqueueinfo               sys_rt_tgsigqueueinfo
+292    n64     perf_event_open                 sys_perf_event_open
+293    n64     accept4                         sys_accept4
+294    n64     recvmmsg                        sys_recvmmsg
+295    n64     fanotify_init                   sys_fanotify_init
+296    n64     fanotify_mark                   sys_fanotify_mark
+297    n64     prlimit64                       sys_prlimit64
+298    n64     name_to_handle_at               sys_name_to_handle_at
+299    n64     open_by_handle_at               sys_open_by_handle_at
+300    n64     clock_adjtime                   sys_clock_adjtime
+301    n64     syncfs                          sys_syncfs
+302    n64     sendmmsg                        sys_sendmmsg
+303    n64     setns                           sys_setns
+304    n64     process_vm_readv                sys_process_vm_readv
+305    n64     process_vm_writev               sys_process_vm_writev
+306    n64     kcmp                            sys_kcmp
+307    n64     finit_module                    sys_finit_module
+308    n64     getdents64                      sys_getdents64
+309    n64     sched_setattr                   sys_sched_setattr
+310    n64     sched_getattr                   sys_sched_getattr
+311    n64     renameat2                       sys_renameat2
+312    n64     seccomp                         sys_seccomp
+313    n64     getrandom                       sys_getrandom
+314    n64     memfd_create                    sys_memfd_create
+315    n64     bpf                             sys_bpf
+316    n64     execveat                        sys_execveat
+317    n64     userfaultfd                     sys_userfaultfd
+318    n64     membarrier                      sys_membarrier
+319    n64     mlock2                          sys_mlock2
+320    n64     copy_file_range                 sys_copy_file_range
+321    n64     preadv2                         sys_preadv2
+322    n64     pwritev2                        sys_pwritev2
+323    n64     pkey_mprotect                   sys_pkey_mprotect
+324    n64     pkey_alloc                      sys_pkey_alloc
+325    n64     pkey_free                       sys_pkey_free
+326    n64     statx                           sys_statx
+327    n64     rseq                            sys_rseq
+328    n64     io_pgetevents                   sys_io_pgetevents
+# 329 through 423 are reserved to sync up with other architectures
+424    n64     pidfd_send_signal               sys_pidfd_send_signal
+425    n64     io_uring_setup                  sys_io_uring_setup
+426    n64     io_uring_enter                  sys_io_uring_enter
+427    n64     io_uring_register               sys_io_uring_register
+428    n64     open_tree                       sys_open_tree
+429    n64     move_mount                      sys_move_mount
+430    n64     fsopen                          sys_fsopen
+431    n64     fsconfig                        sys_fsconfig
+432    n64     fsmount                         sys_fsmount
+433    n64     fspick                          sys_fspick
+434    n64     pidfd_open                      sys_pidfd_open
+435    n64     clone3                          __sys_clone3
+436    n64     close_range                     sys_close_range
+437    n64     openat2                         sys_openat2
+438    n64     pidfd_getfd                     sys_pidfd_getfd
+439    n64     faccessat2                      sys_faccessat2
+440    n64     process_madvise                 sys_process_madvise
+441    n64     epoll_pwait2                    sys_epoll_pwait2
diff --git a/tools/perf/arch/mips/include/dwarf-regs-table.h b/tools/perf/arch/mips/include/dwarf-regs-table.h
new file mode 100644 (file)
index 0000000..5badbcd
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * dwarf-regs-table.h : Mapping of DWARF debug register numbers into
+ * register names.
+ *
+ * Copyright (C) 2013 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+#undef REG_DWARFNUM_NAME
+/* Designated initializer that stores "$<reg>" at index idx */
+#define REG_DWARFNUM_NAME(reg, idx)    [idx] = "$" #reg
+/*
+ * Register names indexed by DWARF register number: entries 0-31 are
+ * listed positionally, hi and lo are placed at indexes 64 and 65.
+ * Indexes 32-63 are left unset (NULL).
+ */
+static const char * const mips_regstr_tbl[] = {
+       "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9",
+       "$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19",
+       "$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "$29",
+       "$30", "$31",
+       REG_DWARFNUM_NAME(hi, 64),
+       REG_DWARFNUM_NAME(lo, 65),
+};
+#endif
diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h
new file mode 100644 (file)
index 0000000..ee73b36
--- /dev/null
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+/* Generic perf register names mapped onto their MIPS enumerators */
+#define PERF_REGS_MAX PERF_REG_MIPS_MAX
+#define PERF_REG_IP PERF_REG_MIPS_PC
+#define PERF_REG_SP PERF_REG_MIPS_R29
+
+/* Bit mask with the low PERF_REG_MIPS_MAX bits set */
+#define PERF_REGS_MASK ((1ULL << PERF_REG_MIPS_MAX) - 1)
+
+/*
+ * Translate a PERF_REG_MIPS_* id into its printable name.
+ *
+ * Returns NULL for any id that has no entry in the table below
+ * (including ids outside [0, PERF_REG_MIPS_MAX)).
+ */
+static inline const char *__perf_reg_name(int id)
+{
+       static const char * const reg_names[PERF_REG_MIPS_MAX] = {
+               [PERF_REG_MIPS_PC]  = "PC",
+               [PERF_REG_MIPS_R1]  = "$1",
+               [PERF_REG_MIPS_R2]  = "$2",
+               [PERF_REG_MIPS_R3]  = "$3",
+               [PERF_REG_MIPS_R4]  = "$4",
+               [PERF_REG_MIPS_R5]  = "$5",
+               [PERF_REG_MIPS_R6]  = "$6",
+               [PERF_REG_MIPS_R7]  = "$7",
+               [PERF_REG_MIPS_R8]  = "$8",
+               [PERF_REG_MIPS_R9]  = "$9",
+               [PERF_REG_MIPS_R10] = "$10",
+               [PERF_REG_MIPS_R11] = "$11",
+               [PERF_REG_MIPS_R12] = "$12",
+               [PERF_REG_MIPS_R13] = "$13",
+               [PERF_REG_MIPS_R14] = "$14",
+               [PERF_REG_MIPS_R15] = "$15",
+               [PERF_REG_MIPS_R16] = "$16",
+               [PERF_REG_MIPS_R17] = "$17",
+               [PERF_REG_MIPS_R18] = "$18",
+               [PERF_REG_MIPS_R19] = "$19",
+               [PERF_REG_MIPS_R20] = "$20",
+               [PERF_REG_MIPS_R21] = "$21",
+               [PERF_REG_MIPS_R22] = "$22",
+               [PERF_REG_MIPS_R23] = "$23",
+               [PERF_REG_MIPS_R24] = "$24",
+               [PERF_REG_MIPS_R25] = "$25",
+               /* no names are provided for R26 and R27 */
+               [PERF_REG_MIPS_R28] = "$28",
+               [PERF_REG_MIPS_R29] = "$29",
+               [PERF_REG_MIPS_R30] = "$30",
+               [PERF_REG_MIPS_R31] = "$31",
+       };
+
+       if (id < 0 || id >= PERF_REG_MIPS_MAX)
+               return NULL;
+
+       /* slots without an initializer are NULL */
+       return reg_names[id];
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/mips/util/Build b/tools/perf/arch/mips/util/Build
new file mode 100644 (file)
index 0000000..51c8900
--- /dev/null
@@ -0,0 +1,3 @@
+# perf_regs.o is added unconditionally; dwarf-regs.o and unwind-libunwind.o
+# are keyed on CONFIG_DWARF and CONFIG_LOCAL_LIBUNWIND respectively.
+perf-y += perf_regs.o
+perf-$(CONFIG_DWARF) += dwarf-regs.o
+perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/mips/util/dwarf-regs.c b/tools/perf/arch/mips/util/dwarf-regs.c
new file mode 100644 (file)
index 0000000..25c13a9
--- /dev/null
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2013 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <dwarf-regs.h>
+
+/* Names of registers 0-31, indexed by register number */
+static const char *mips_gpr_names[32] = {
+       "$0",  "$1",  "$2",  "$3",  "$4",  "$5",  "$6",  "$7",
+       "$8",  "$9",  "$10", "$11", "$12", "$13", "$14", "$15",
+       "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23",
+       "$24", "$25", "$26", "$27", "$28", "$29", "$30", "$31"
+};
+
+/*
+ * Return the register name for DWARF register number n:
+ * "$0".."$31" for 0-31, "hi" for 64, "lo" for 65, NULL otherwise.
+ */
+const char *get_arch_regstr(unsigned int n)
+{
+       switch (n) {
+       case 64:
+               return "hi";
+       case 65:
+               return "lo";
+       default:
+               break;
+       }
+
+       return n < 32 ? mips_gpr_names[n] : NULL;
+}
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
new file mode 100644 (file)
index 0000000..2864e2e
--- /dev/null
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../util/perf_regs.h"
+
+/* Table consisting solely of the SMPL_REG_END entry */
+const struct sample_reg sample_reg_masks[] = {
+       SMPL_REG_END
+};
diff --git a/tools/perf/arch/mips/util/unwind-libunwind.c b/tools/perf/arch/mips/util/unwind-libunwind.c
new file mode 100644 (file)
index 0000000..0d8c99c
--- /dev/null
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+#include "util/debug.h"
+
+/*
+ * Map a libunwind MIPS register number onto the matching PERF_REG_MIPS_* id.
+ *
+ * UNW_MIPS_R1..R25, UNW_MIPS_R28..R31 and UNW_MIPS_PC are translated;
+ * any other value is reported through pr_err() and -EINVAL is returned.
+ */
+int libunwind__arch_reg_id(int regnum)
+{
+       if (regnum >= UNW_MIPS_R1 && regnum <= UNW_MIPS_R25)
+               return regnum - UNW_MIPS_R1 + PERF_REG_MIPS_R1;
+
+       if (regnum >= UNW_MIPS_R28 && regnum <= UNW_MIPS_R31)
+               return regnum - UNW_MIPS_R28 + PERF_REG_MIPS_R28;
+
+       if (regnum == UNW_MIPS_PC)
+               return PERF_REG_MIPS_PC;
+
+       pr_err("unwind: invalid reg id %d\n", regnum);
+       return -EINVAL;
+}
index b7945e5..8a79c41 100644 (file)
@@ -4,6 +4,8 @@ perf-y += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += mem-events.o
 perf-y += sym-handling.o
+perf-y += evsel.o
+perf-y += event.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
new file mode 100644 (file)
index 0000000..3bf4412
--- /dev/null
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#include "../../../util/event.h"
+#include "../../../util/synthetic-events.h"
+#include "../../../util/machine.h"
+#include "../../../util/tool.h"
+#include "../../../util/map.h"
+#include "../../../util/debug.h"
+
+/*
+ * Decode the sample weight word at *array into data.
+ *
+ * When PERF_SAMPLE_WEIGHT is set in type the whole 64-bit word is the
+ * weight; otherwise the word is split into weight (var1_dw),
+ * ins_lat (var2_w) and p_stage_cyc (var3_w).
+ */
+void arch_perf_parse_sample_weight(struct perf_sample *data,
+                                  const __u64 *array, u64 type)
+{
+       union perf_sample_weight sample_weight;
+
+       sample_weight.full = *array;
+
+       if (type & PERF_SAMPLE_WEIGHT) {
+               data->weight = sample_weight.full;
+               return;
+       }
+
+       data->weight = sample_weight.var1_dw;
+       data->ins_lat = sample_weight.var2_w;
+       data->p_stage_cyc = sample_weight.var3_w;
+}
+
+/*
+ * Encode the sample weight fields of data into the word at *array.
+ *
+ * Without PERF_SAMPLE_WEIGHT_STRUCT in type the full 64-bit weight is
+ * stored. With it, the word is packed as the counterpart of
+ * arch_perf_parse_sample_weight(): weight in the low 32 bits, ins_lat
+ * above it, and p_stage_cyc in the top 16 bits (the var3_w slot of
+ * union perf_sample_weight), so that a synthesize/parse round trip
+ * preserves all three fields.
+ */
+void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
+                                       __u64 *array, u64 type)
+{
+       *array = data->weight;
+
+       if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
+               *array &= 0xffffffff;
+               *array |= ((u64)data->ins_lat << 32);
+               /* previously dropped, although the parse side reads it back */
+               *array |= ((u64)data->p_stage_cyc << 48);
+       }
+}
+
+/*
+ * Return the replacement column header for se_header:
+ * "Local INSTR Latency" becomes "Finish Cyc", "Pipeline Stage Cycle"
+ * becomes "Dispatch Cyc"; any other header is returned unchanged.
+ */
+const char *arch_perf_header_entry(const char *se_header)
+{
+       if (strcmp(se_header, "Local INSTR Latency") == 0)
+               return "Finish Cyc";
+
+       if (strcmp(se_header, "Pipeline Stage Cycle") == 0)
+               return "Dispatch Cyc";
+
+       return se_header;
+}
+
+/* Return 1 when sort_key is "p_stage_cyc", 0 for any other key */
+int arch_support_sort_key(const char *sort_key)
+{
+       return strcmp(sort_key, "p_stage_cyc") == 0;
+}
diff --git a/tools/perf/arch/powerpc/util/evsel.c b/tools/perf/arch/powerpc/util/evsel.c
new file mode 100644 (file)
index 0000000..2f733cd
--- /dev/null
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "util/evsel.h"
+
+/* Set the WEIGHT_STRUCT sample bit on evsel */
+void arch_evsel__set_sample_weight(struct evsel *evsel)
+{
+       evsel__set_sample_bit(evsel, WEIGHT_STRUCT);
+}
index eed9e5a..1651068 100644 (file)
@@ -176,7 +176,7 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 }
 
 /*
- * Incase of powerpc architecture, pmu registers are programmable
+ * In case of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
  * valid samples with default 'cycles' event. It is better to use
  * 'trace_imc/trace_cycles' event for guest profiling, since it
index 5788eb1..2baeb1c 100644 (file)
@@ -10,6 +10,6 @@
 
 #define SPRN_PVR        0x11F   /* Processor Version Register */
 #define PVR_VER(pvr)    (((pvr) >>  16) & 0xFFFF) /* Version field */
-#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revison field */
+#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revision field */
 
 #endif /* __PERF_UTIL_HEADER_H */
index adcacf1..dffcf9b 100644 (file)
@@ -73,7 +73,7 @@ static int bp_modify1(void)
        /*
         * The parent does following steps:
         *  - creates a new breakpoint (id 0) for bp_2 function
-        *  - changes that breakponit to bp_1 function
+        *  - changes that breakpoint to bp_1 function
         *  - waits for the breakpoint to hit and checks
         *    it has proper rip of bp_1 function
         *  - detaches the child
index 0c72d41..dbeb04c 100644 (file)
@@ -9,6 +9,7 @@ perf-y += event.o
 perf-y += evlist.o
 perf-y += mem-events.o
 perf-y += evsel.o
+perf-y += iostat.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c
new file mode 100644 (file)
index 0000000..d63acb7
--- /dev/null
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov <alexander.antonov@linux.intel.com>
+ */
+
+#include <api/fs/fs.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <regex.h>
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+#ifndef MAX_PATH
+#define MAX_PATH 1024
+#endif
+
+#define UNCORE_IIO_PMU_PATH    "devices/uncore_iio_%d"        /* %d: IIO PMU index */
+#define SYSFS_UNCORE_PMU_PATH  "%s/"UNCORE_IIO_PMU_PATH       /* %s: sysfs mountpoint */
+#define PLATFORM_MAPPING_PATH  UNCORE_IIO_PMU_PATH"/die%d"    /* trailing %d: die index */
+
+/*
+ * Each metric requires one IIO event which increments at every 4B transfer
+ * in the corresponding direction. The formulas to compute metrics are generic:
+ *     #EventCount * 4B / (1024 * 1024)
+ */
+static const char * const iostat_metrics[] = {
+       "Inbound Read(MB)",
+       "Inbound Write(MB)",
+       "Outbound Read(MB)",
+       "Outbound Write(MB)",
+};
+
+/* Number of entries in the iostat_metrics table. */
+static inline int iostat_metrics_count(void)
+{
+       return sizeof(iostat_metrics) / sizeof(iostat_metrics[0]);
+}
+
+/* Metric name for event index @idx; the index wraps around the metric table. */
+static const char *iostat_metric_by_idx(int idx)
+{
+       return iostat_metrics[idx % iostat_metrics_count()];
+}
+
+struct iio_root_port {
+       u32 domain;     /* domain part of the "domain:bus" notation */
+       u8 bus;         /* bus part of the "domain:bus" notation */
+       u8 die;         /* die index the mapping was read for */
+       u8 pmu_idx;     /* N of the uncore_iio_N PMU serving this port */
+       int idx;        /* slot in the owning list's rps[] (set on insert) */
+};
+
+struct iio_root_ports_list {
+       struct iio_root_port **rps;     /* array of nr_entries pointers; slots may be NULL */
+       int nr_entries;                 /* number of slots in rps[] */
+};
+
+static struct iio_root_ports_list *root_ports; /* filled by iostat_parse(), consumed by iostat_prepare() */
+
+/*
+ * Print one root port as "S<die>-uncore_iio_<pmu_idx><domain:bus>".
+ * Does nothing when either argument is NULL.
+ */
+static void iio_root_port_show(FILE *output,
+                              const struct iio_root_port * const rp)
+{
+       if (!output || !rp)
+               return;
+
+       fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+               rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+/*
+ * Allocate a zeroed root port and fill in its identity.
+ * Returns NULL when the allocation fails; the caller owns the result.
+ */
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+                                              u8 die, u8 pmu_idx)
+{
+       struct iio_root_port *rp = calloc(1, sizeof(*rp));
+
+       if (!rp)
+               return NULL;
+
+       rp->domain = domain;
+       rp->bus = bus;
+       rp->die = die;
+       rp->pmu_idx = pmu_idx;
+
+       return rp;
+}
+
+/*
+ * Free every root port held in @list, then the pointer array and the list
+ * itself. A NULL @list is accepted.
+ */
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+       if (!list)
+               return;
+
+       for (int i = 0; i < list->nr_entries; i++)
+               free(list->rps[i]);
+
+       free(list->rps);
+       free(list);
+}
+
+/*
+ * Look up the first root port in @list matching @domain and @bus.
+ * Empty (NULL) slots are skipped. Returns NULL when @list is NULL or no
+ * entry matches.
+ */
+static struct iio_root_port *iio_root_port_find_by_notation(
+       const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+       if (!list)
+               return NULL;
+
+       for (int i = 0; i < list->nr_entries; i++) {
+               struct iio_root_port *candidate = list->rps[i];
+
+               if (!candidate)
+                       continue;
+               if (candidate->domain == domain && candidate->bus == bus)
+                       return candidate;
+       }
+       return NULL;
+}
+
+/*
+ * Append @rp to @list and record its slot in rp->idx.
+ *
+ * The entry count is only bumped once the array has actually grown, so a
+ * failed realloc() leaves @list exactly as it was and it can still be freed
+ * safely. Ownership of @rp stays with the caller on failure.
+ *
+ * Returns 0 on success (or when either argument is NULL, which is a no-op)
+ * and -ENOMEM when the array cannot be grown.
+ */
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+                                     struct iio_root_port * const rp)
+{
+       struct iio_root_port **tmp_buf;
+
+       if (list && rp) {
+               tmp_buf = realloc(list->rps,
+                                 (list->nr_entries + 1) * sizeof(*list->rps));
+               if (!tmp_buf) {
+                       pr_err("Failed to realloc memory\n");
+                       return -ENOMEM;
+               }
+               rp->idx = list->nr_entries;
+               tmp_buf[rp->idx] = rp;
+               list->rps = tmp_buf;
+               list->nr_entries++;
+       }
+       return 0;
+}
+
+/*
+ * Read the per-die mapping files of IIO PMU @pmu_idx and append one root
+ * port per die to @list.
+ *
+ * A missing mapping file ends the scan quietly for any PMU other than
+ * index 0; for PMU 0 it means the mode is unsupported and -1 is returned.
+ * Returns 0 on success, -1 on unsupported/invalid mapping data and -ENOMEM
+ * when a root port cannot be created or stored.
+ */
+static int iio_mapping(u8 pmu_idx, struct iio_root_ports_list * const list)
+{
+       char path[MAX_PATH];
+
+       for (int die = 0; die < cpu__max_node(); die++) {
+               struct iio_root_port *rp;
+               char *buf;
+               size_t size;
+               u32 domain;
+               u8 bus;
+               int nr_parsed;
+
+               scnprintf(path, MAX_PATH, PLATFORM_MAPPING_PATH, pmu_idx, die);
+               if (sysfs__read_str(path, &buf, &size) < 0) {
+                       if (pmu_idx != 0)
+                               break;
+                       pr_err("Mode iostat is not supported\n");
+                       return -1;
+               }
+
+               nr_parsed = sscanf(buf, "%04x:%02hhx", &domain, &bus);
+               free(buf);
+               if (nr_parsed != 2) {
+                       pr_err("Invalid mapping data: iio_%d; die%d\n",
+                              pmu_idx, die);
+                       return -1;
+               }
+
+               rp = iio_root_port_new(domain, bus, die, pmu_idx);
+               if (!rp)
+                       return -ENOMEM;
+
+               if (iio_root_ports_list_insert(list, rp)) {
+                       free(rp);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Count consecutive uncore_iio_<N> PMU directories under sysfs, starting
+ * at N = 0 and stopping at the first one that does not exist.
+ * Returns 0 when sysfs has no mountpoint.
+ */
+static u8 iio_pmu_count(void)
+{
+       char path[MAX_PATH];
+       const char *sysfs = sysfs__mountpoint();
+       u8 count = 0;
+
+       if (!sysfs)
+               return 0;
+
+       while (1) {
+               snprintf(path, sizeof(path), SYSFS_UNCORE_PMU_PATH,
+                        sysfs, count);
+               if (access(path, F_OK) != 0)
+                       break;
+               count++;
+       }
+
+       return count;
+}
+
+/*
+ * Build the list of root ports for every IIO PMU found in sysfs.
+ *
+ * On success *@list receives the new list and 0 is returned. On failure
+ * *@list is left untouched, any partial list is freed and a non-zero
+ * error is returned.
+ */
+static int iio_root_ports_scan(struct iio_root_ports_list **list)
+{
+       struct iio_root_ports_list *scanned;
+       u8 nr_pmus = iio_pmu_count();
+       int ret = -ENOMEM;
+
+       if (nr_pmus == 0) {
+               pr_err("Unsupported uncore pmu configuration\n");
+               return -1;
+       }
+
+       scanned = calloc(1, sizeof(*scanned));
+       if (!scanned)
+               return ret;
+
+       for (u8 pmu = 0; pmu < nr_pmus; pmu++) {
+               ret = iio_mapping(pmu, scanned);
+               if (ret)
+                       break;
+       }
+
+       if (ret) {
+               iio_root_ports_list_free(scanned);
+               return ret;
+       }
+
+       *list = scanned;
+       return 0;
+}
+
+/*
+ * Parse one "domain:bus" token into *@domain and *@bus.
+ *
+ * Returns 0 only when both values were stored. Any mismatch prints a
+ * warning and returns non-zero, so callers never see a "success" with
+ * unset outputs (e.g. a domain longer than eight hex digits passes the
+ * regex but cannot be scanned).
+ */
+static int iio_root_port_parse_str(u32 *domain, u8 *bus, char *str)
+{
+       int ret;
+       regex_t regex;
+       /*
+        * Expected format domain:bus:
+        * Valid domain range [0:ffff]
+        * Valid bus range [0:ff]
+        * Example: 0000:af, 0:3d, 01:7
+        */
+       ret = regcomp(&regex, "^([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})",
+                     REG_EXTENDED);
+       if (ret) {
+               /* regex is not valid here: neither regexec() nor regfree() may run. */
+               pr_err("Failed to compile root port pattern\n");
+               return -1;
+       }
+
+       ret = regexec(&regex, str, 0, NULL, 0);
+       if (!ret && sscanf(str, "%08x:%02hhx", domain, bus) != 2)
+               ret = -EINVAL;
+
+       if (ret)
+               pr_warning("Unrecognized root port format: %s\n"
+                          "Please use the following format:\n"
+                          "\t [domain]:[bus]\n"
+                          "\t for example: 0000:3d\n", str);
+
+       regfree(&regex);
+       return ret;
+}
+
+/*
+ * Replace *@list with the subset of root ports named in @filter, a
+ * comma-separated list of "domain:bus" tokens.
+ *
+ * The original list is always freed. On success *@list points at the
+ * filtered list; on failure *@list is set to NULL so the caller is not
+ * left holding a pointer to freed memory.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure and -EINVAL when
+ * none of the requested root ports exist.
+ */
+static int iio_root_ports_list_filter(struct iio_root_ports_list **list,
+                                     const char *filter)
+{
+       char *tok, *tmp, *filter_copy = NULL;
+       struct iio_root_port *rp;
+       u32 domain;
+       u8 bus;
+       int ret = -ENOMEM;
+       struct iio_root_ports_list *tmp_list = calloc(1, sizeof(*tmp_list));
+
+       if (!tmp_list)
+               goto err;
+
+       /* strtok_r() writes into its input, so tokenize a private copy. */
+       filter_copy = strdup(filter);
+       if (!filter_copy)
+               goto err;
+
+       for (tok = strtok_r(filter_copy, ",", &tmp); tok;
+            tok = strtok_r(NULL, ",", &tmp)) {
+               if (!iio_root_port_parse_str(&domain, &bus, tok)) {
+                       rp = iio_root_port_find_by_notation(*list, domain, bus);
+                       if (rp) {
+                               /* Move the port: clear its old slot so it is not freed twice. */
+                               (*list)->rps[rp->idx] = NULL;
+                               ret = iio_root_ports_list_insert(tmp_list, rp);
+                               if (ret) {
+                                       free(rp);
+                                       goto err;
+                               }
+                       } else if (!iio_root_port_find_by_notation(tmp_list,
+                                                                  domain, bus))
+                               /* Not already moved by an earlier duplicate token. */
+                               pr_warning("Root port %04x:%02x were not found\n",
+                                          domain, bus);
+               }
+       }
+
+       if (tmp_list->nr_entries == 0) {
+               pr_err("Requested root ports were not found\n");
+               ret = -EINVAL;
+       }
+err:
+       iio_root_ports_list_free(*list);
+       if (ret) {
+               iio_root_ports_list_free(tmp_list);
+               /* The old list is gone: do not leave a dangling pointer behind. */
+               *list = NULL;
+       } else {
+               *list = tmp_list;
+       }
+
+       free(filter_copy);
+       return ret;
+}
+
+/*
+ * Add one group of four uncore_iio events per root port in @list to @evl
+ * and attach each root port to its events through evsel->priv.
+ *
+ * Once the command buffer is allocated, @list is released on every path:
+ * on success only the container is freed (nr_entries is zeroed first)
+ * because the root ports are then referenced from evsel->priv; on a
+ * parse failure the root ports are freed as well.
+ *
+ * Returns 0 on success (including an empty @list), -ENOMEM when the
+ * command buffer cannot be allocated, or the parse_events() error.
+ */
+static int iostat_event_group(struct evlist *evl,
+                             struct iio_root_ports_list *list)
+{
+       /* Start at 0 so an empty list gives a defined result. */
+       int ret = 0;
+       int idx;
+       const char *iostat_cmd_template =
+       "{uncore_iio_%x/event=0x83,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\
+         uncore_iio_%x/event=0x83,umask=0x01,ch_mask=0xF,fc_mask=0x07/,\
+         uncore_iio_%x/event=0xc0,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\
+         uncore_iio_%x/event=0xc0,umask=0x01,ch_mask=0xF,fc_mask=0x07/}";
+       /*
+        * Each "%x" is two characters and a u8 prints as at most two hex
+        * digits, so the expanded command never outgrows the template.
+        */
+       const int len_template = strlen(iostat_cmd_template) + 1;
+       struct evsel *evsel = NULL;
+       int metrics_count = iostat_metrics_count();
+       char *iostat_cmd = calloc(len_template, 1);
+
+       if (!iostat_cmd)
+               return -ENOMEM;
+
+       for (idx = 0; idx < list->nr_entries; idx++) {
+               sprintf(iostat_cmd, iostat_cmd_template,
+                       list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx,
+                       list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx);
+               ret = parse_events(evl, iostat_cmd, NULL);
+               if (ret)
+                       goto err;
+       }
+
+       /* Events were added in groups of metrics_count per root port. */
+       evlist__for_each_entry(evl, evsel) {
+               evsel->priv = list->rps[evsel->idx / metrics_count];
+       }
+       /* Root ports now belong to the evsels: free only the container below. */
+       list->nr_entries = 0;
+err:
+       iio_root_ports_list_free(list);
+       free(iostat_cmd);
+       return ret;
+}
+
+/*
+ * Set up an iostat run: drop any user-selected events, force metric-only
+ * global aggregation and add the IIO event groups for the scanned root
+ * ports.
+ *
+ * Returns 0 on success, -ENOMEM when a fresh evlist cannot be allocated,
+ * or the error from iostat_event_group().
+ */
+int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config)
+{
+       if (evlist->core.nr_entries > 0) {
+               pr_warning("The -e and -M options are not supported. "
+                          "All chosen events/metrics will be dropped\n");
+               /*
+                * NOTE(review): the replacement evlist is only stored in the
+                * local parameter, so the caller's pointer presumably still
+                * refers to the deleted list. Fixing that needs an interface
+                * change; verify against the caller.
+                */
+               evlist__delete(evlist);
+               evlist = evlist__new();
+               if (!evlist)
+                       return -ENOMEM;
+       }
+
+       config->metric_only = true;
+       config->aggr_mode = AGGR_GLOBAL;
+
+       return iostat_event_group(evlist, root_ports);
+}
+
+/*
+ * Option callback for iostat mode.
+ *
+ * Scans the platform's root ports, marks the config as an iostat run and
+ * selects the mode: no argument runs on all ports, "list" only lists
+ * them, and anything else is treated as a root port filter.
+ * Returns 0 on success or the scan/filter error.
+ */
+int iostat_parse(const struct option *opt, const char *str,
+                int unset __maybe_unused)
+{
+       struct perf_stat_config *config = (struct perf_stat_config *)opt->data;
+       int ret = iio_root_ports_scan(&root_ports);
+
+       if (ret)
+               return ret;
+
+       config->iostat_run = true;
+
+       if (str && !strcmp(str, "list")) {
+               iostat_mode = IOSTAT_LIST;
+               return 0;
+       }
+
+       iostat_mode = IOSTAT_RUN;
+       if (str)
+               ret = iio_root_ports_list_filter(&root_ports, str);
+
+       return ret;
+}
+
+/*
+ * Print each root port attached to the events in @evlist, once per run
+ * of consecutive events sharing the same port.
+ */
+void iostat_list(struct evlist *evlist, struct perf_stat_config *config)
+{
+       struct iio_root_port *last = NULL;
+       struct evsel *evsel;
+
+       evlist__for_each_entry(evlist, evsel) {
+               if (evsel->priv == last)
+                       continue;
+
+               last = evsel->priv;
+               iio_root_port_show(config->output, last);
+       }
+}
+
+/*
+ * Free the root ports attached to the events in @evlist. Consecutive
+ * events sharing the same port trigger a single free().
+ */
+void iostat_release(struct evlist *evlist)
+{
+       struct iio_root_port *last = NULL;
+       struct evsel *evsel;
+
+       evlist__for_each_entry(evlist, evsel) {
+               if (evsel->priv == last)
+                       continue;
+
+               last = evsel->priv;
+               free(last);
+       }
+}
+
+/*
+ * Format the line prefix for the currently selected event into @prefix:
+ * an optional timestamp followed by the root port's "domain:bus", each
+ * terminated by the configured separator. @prefix is left untouched when
+ * the selected event has no root port attached.
+ */
+void iostat_prefix(struct evlist *evlist,
+                  struct perf_stat_config *config,
+                  char *prefix, struct timespec *ts)
+{
+       struct iio_root_port *rp = evlist->selected->priv;
+
+       if (!rp)
+               return;
+
+       if (!ts) {
+               sprintf(prefix, "%04x:%02x%s", rp->domain, rp->bus,
+                       config->csv_sep);
+               return;
+       }
+
+       sprintf(prefix, "%6lu.%09lu%s%04x:%02x%s",
+               ts->tv_sec, ts->tv_nsec,
+               config->csv_sep, rp->domain, rp->bus,
+               config->csv_sep);
+}
+
+/*
+ * Print the leading header column(s): CSV output wins over interval
+ * mode, which in turn wins over the plain layout.
+ */
+void iostat_print_header_prefix(struct perf_stat_config *config)
+{
+       if (config->csv_output) {
+               fputs("port,", config->output);
+               return;
+       }
+
+       if (config->interval) {
+               fprintf(config->output, "#          time    port         ");
+               return;
+       }
+
+       fprintf(config->output, "   port         ");
+}
+
+/*
+ * Print one iostat metric for @evsel.
+ *
+ * The raw count (minus the previous reading when interval deltas are
+ * tracked) is scaled by run/ena and divided by 256 * 1024, i.e.
+ * count * 4B / (1024 * 1024), to give megabytes. A missing or
+ * never-scheduled count prints as 0.
+ */
+void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+                        struct perf_stat_output_ctx *out)
+{
+       double iostat_value = 0;
+       u64 prev_count_val = 0;
+       const char *iostat_metric = iostat_metric_by_idx(evsel->idx);
+       u8 die = ((struct iio_root_port *)evsel->priv)->die;
+       struct perf_counts_values *count = perf_counts(evsel->counts, die, 0);
+
+       /*
+        * Guard against a NULL entry (presumably returned when the die index
+        * has no slot in the counts array) instead of dereferencing it.
+        */
+       if (count && count->run && count->ena) {
+               if (evsel->prev_raw_counts && !out->force_header) {
+                       struct perf_counts_values *prev_count =
+                               perf_counts(evsel->prev_raw_counts, die, 0);
+
+                       /* Report the delta and remember this reading for next time. */
+                       prev_count_val = prev_count->val;
+                       prev_count->val = count->val;
+               }
+               iostat_value = (count->val - prev_count_val) /
+                              ((double) count->run / count->ena);
+       }
+       out->print_metric(config, out->ctx, NULL, "%8.0f", iostat_metric,
+                         iostat_value / (256 * 1024));
+}
+
+/*
+ * Print all counters, one output line per root port. A new line with a
+ * fresh prefix starts whenever the next counter belongs to a different
+ * root port than the currently selected one.
+ */
+void iostat_print_counters(struct evlist *evlist,
+                          struct perf_stat_config *config, struct timespec *ts,
+                          char *prefix, iostat_print_counter_t print_cnt_cb)
+{
+       struct evsel *counter = evlist__first(evlist);
+
+       evlist__set_selected(evlist, counter);
+       iostat_prefix(evlist, config, prefix, ts);
+       fprintf(config->output, "%s", prefix);
+
+       evlist__for_each_entry(evlist, counter) {
+               void *current_port = evlist->selected->priv;
+
+               if (current_port && current_port != counter->priv) {
+                       evlist__set_selected(evlist, counter);
+                       iostat_prefix(evlist, config, prefix, ts);
+                       fprintf(config->output, "\n%s", prefix);
+               }
+               print_cnt_cb(config, counter, prefix);
+       }
+
+       fputc('\n', config->output);
+}
index fca81b3..207c568 100644 (file)
@@ -165,7 +165,7 @@ static int sdt_init_op_regex(void)
 /*
  * Max x86 register name length is 5(ex: %r15d). So, 6th char
  * should always contain NULL. This helps to find register name
- * length using strlen, insted of maintaing one more variable.
+ * length using strlen, instead of maintaining one more variable.
  */
 #define SDT_REG_NAME_SIZE  6
 
@@ -207,7 +207,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
         * and displacement 0 (Both sign and displacement 0 are
         * optional so it may be empty). Use one more character
         * to hold last NULL so that strlen can be used to find
-        * prefix length, instead of maintaing one more variable.
+        * prefix length, instead of maintaining one more variable.
         */
        char prefix[3] = {0};
 
index 0a0ff12..79d13db 100644 (file)
@@ -17,7 +17,7 @@
  * While the second model, enabled via --multiq option, uses multiple
  * queueing (which refers to one epoll instance per worker). For example,
  * short lived tcp connections in a high throughput httpd server will
- * ditribute the accept()'ing  connections across CPUs. In this case each
+ * distribute the accept()'ing  connections across CPUs. In this case each
  * worker does a limited  amount of processing.
  *
  *             [queue A]  ---> [worker]
@@ -198,7 +198,7 @@ static void *workerfn(void *arg)
 
        do {
                /*
-                * Block undefinitely waiting for the IN event.
+                * Block indefinitely waiting for the IN event.
                 * In order to stress the epoll_wait(2) syscall,
                 * call it event per event, instead of a larger
                 * batch (max)limit.
index 280227e..55d373b 100644 (file)
@@ -372,7 +372,7 @@ static int inject_build_id(struct bench_data *data, u64 *max_rss)
                        len += synthesize_flush(data);
        }
 
-       /* tihs makes the child to finish */
+       /* this makes the child to finish */
        close(data->input_pipe[1]);
 
        wait4(data->pid, &status, 0, &rusage);
index 20b87e2..f264017 100644 (file)
@@ -42,7 +42,7 @@
 #endif
 
 /*
- * Regular printout to the terminal, supressed if -q is specified:
+ * Regular printout to the terminal, suppressed if -q is specified:
  */
 #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
 
index a23ba6b..49627a7 100644 (file)
@@ -239,7 +239,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample,
        }
 
        /*
-        * XXX filtered samples can still have branch entires pointing into our
+        * XXX filtered samples can still have branch entries pointing into our
         * symbol and are missed.
         */
        process_branch_stack(sample->branch_stack, al, sample);
@@ -374,13 +374,6 @@ find_next:
                } else {
                        hist_entry__tty_annotate(he, evsel, ann);
                        nd = rb_next(nd);
-                       /*
-                        * Since we have a hist_entry per IP for the same
-                        * symbol, free he->ms.sym->src to signal we already
-                        * processed this symbol.
-                        */
-                       zfree(&notes->src->cycles_hist);
-                       zfree(&notes->src);
                }
        }
 }
@@ -411,8 +404,8 @@ static int __cmd_annotate(struct perf_annotate *ann)
                goto out;
 
        if (dump_trace) {
-               perf_session__fprintf_nr_events(session, stdout);
-               evlist__fprintf_nr_events(session->evlist, stdout);
+               perf_session__fprintf_nr_events(session, stdout, false);
+               evlist__fprintf_nr_events(session->evlist, stdout, false);
                goto out;
        }
 
@@ -425,7 +418,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
        total_nr_samples = 0;
        evlist__for_each_entry(session->evlist, pos) {
                struct hists *hists = evsel__hists(pos);
-               u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+               u32 nr_samples = hists->stats.nr_samples;
 
                if (nr_samples > 0) {
                        total_nr_samples += nr_samples;
@@ -538,6 +531,10 @@ int cmd_annotate(int argc, const char **argv)
                    "Strip first N entries of source file path name in programs (with --prefix)"),
        OPT_STRING(0, "objdump", &annotate.opts.objdump_path, "path",
                   "objdump binary to use for disassembly and annotations"),
+       OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
+                   "Enable symbol demangling"),
+       OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
+                   "Enable kernel symbol demangling"),
        OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
                    "Show event group information together"),
        OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
@@ -619,14 +616,22 @@ int cmd_annotate(int argc, const char **argv)
 
        setup_browser(true);
 
-       if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) {
+       /*
+        * Events of different processes may correspond to the same
+        * symbol, we do not care about the processes in annotate,
+        * set sort order to avoid repeated output.
+        */
+       sort_order = "dso,symbol";
+
+       /*
+        * Set SORT_MODE__BRANCH so that annotate display IPC/Cycle
+        * if branch info is in perf data in TUI mode.
+        */
+       if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack)
                sort__mode = SORT_MODE__BRANCH;
-               if (setup_sorting(annotate.session->evlist) < 0)
-                       usage_with_options(annotate_usage, options);
-       } else {
-               if (setup_sorting(NULL) < 0)
-                       usage_with_options(annotate_usage, options);
-       }
+
+       if (setup_sorting(NULL) < 0)
+               usage_with_options(annotate_usage, options);
 
        ret = __cmd_annotate(&annotate);
 
index 7c4a9d4..61929f6 100644 (file)
@@ -6,7 +6,6 @@
 #include <linux/zalloc.h>
 #include <linux/string.h>
 #include <linux/limits.h>
-#include <linux/string.h>
 #include <string.h>
 #include <sys/file.h>
 #include <signal.h>
@@ -24,8 +23,6 @@
 #include <sys/signalfd.h>
 #include <sys/wait.h>
 #include <poll.h>
-#include <sys/stat.h>
-#include <time.h>
 #include "builtin.h"
 #include "perf.h"
 #include "debug.h"
index 8d23b8d..15ca236 100644 (file)
@@ -7,7 +7,6 @@
 #include "debug.h"
 #include <subcmd/parse-options.h>
 #include "data-convert.h"
-#include "data-convert-bt.h"
 
 typedef int (*data_cmd_fn_t)(int argc, const char **argv);
 
@@ -55,7 +54,8 @@ static const char * const data_convert_usage[] = {
 
 static int cmd_data_convert(int argc, const char **argv)
 {
-       const char *to_ctf     = NULL;
+       const char *to_json = NULL;
+       const char *to_ctf = NULL;
        struct perf_data_convert_opts opts = {
                .force = false,
                .all = false,
@@ -63,6 +63,7 @@ static int cmd_data_convert(int argc, const char **argv)
        const struct option options[] = {
                OPT_INCR('v', "verbose", &verbose, "be more verbose"),
                OPT_STRING('i', "input", &input_name, "file", "input file name"),
+               OPT_STRING(0, "to-json", &to_json, NULL, "Convert to JSON format"),
 #ifdef HAVE_LIBBABELTRACE_SUPPORT
                OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"),
                OPT_BOOLEAN(0, "tod", &opts.tod, "Convert time to wall clock time"),
@@ -72,11 +73,6 @@ static int cmd_data_convert(int argc, const char **argv)
                OPT_END()
        };
 
-#ifndef HAVE_LIBBABELTRACE_SUPPORT
-       pr_err("No conversion support compiled in. perf should be compiled with environment variables LIBBABELTRACE=1 and LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n");
-       return -1;
-#endif
-
        argc = parse_options(argc, argv, options,
                             data_convert_usage, 0);
        if (argc) {
@@ -84,11 +80,25 @@ static int cmd_data_convert(int argc, const char **argv)
                return -1;
        }
 
+       if (to_json && to_ctf) {
+               pr_err("You cannot specify both --to-ctf and --to-json.\n");
+               return -1;
+       }
+       if (!to_json && !to_ctf) {
+               pr_err("You must specify one of --to-ctf or --to-json.\n");
+               return -1;
+       }
+
+       if (to_json)
+               return bt_convert__perf2json(input_name, to_json, &opts);
+
        if (to_ctf) {
 #ifdef HAVE_LIBBABELTRACE_SUPPORT
                return bt_convert__perf2ctf(input_name, to_ctf, &opts);
 #else
-               pr_err("The libbabeltrace support is not compiled in.\n");
+               pr_err("The libbabeltrace support is not compiled in. perf should be "
+                      "compiled with environment variables LIBBABELTRACE=1 and "
+                      "LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n");
                return -1;
 #endif
        }
index 878e04b..f52b3a7 100644 (file)
@@ -1796,7 +1796,7 @@ static int ui_init(void)
        data__for_each_file(i, d) {
 
                /*
-                * Baseline or compute realted columns:
+                * Baseline or compute related columns:
                 *
                 *   PERF_HPP_DIFF__BASELINE
                 *   PERF_HPP_DIFF__DELTA
index a2f1e53..01326e3 100644 (file)
@@ -49,7 +49,7 @@ struct lock_stat {
 
        /*
         * FIXME: evsel__intval() returns u64,
-        * so address of lockdep_map should be dealed as 64bit.
+        * so address of lockdep_map should be treated as 64bit.
         * Is there more better solution?
         */
        void                    *addr;          /* address of lockdep_map, used as ID */
index 35465d1..3337b5f 100644 (file)
@@ -47,6 +47,8 @@
 #include "util/util.h"
 #include "util/pfm.h"
 #include "util/clockid.h"
+#include "util/pmu-hybrid.h"
+#include "util/evlist-hybrid.h"
 #include "asm/bug.h"
 #include "perf.h"
 
@@ -1603,6 +1605,32 @@ static void hit_auxtrace_snapshot_trigger(struct record *rec)
        }
 }
 
+/*
+ * On hybrid PMU systems, rename each hybrid event that is not already
+ * written in "pmu/event/" form to "<pmu_name>/<name>/".
+ *
+ * An event keeps its current name when the new one cannot be allocated.
+ */
+static void record__uniquify_name(struct record *rec)
+{
+       struct evsel *pos;
+       struct evlist *evlist = rec->evlist;
+       char *new_name;
+
+       if (!perf_pmu__has_hybrid())
+               return;
+
+       evlist__for_each_entry(evlist, pos) {
+               if (!evsel__is_hybrid(pos))
+                       continue;
+
+               /* A '/' means the name already carries its PMU prefix. */
+               if (strchr(pos->name, '/'))
+                       continue;
+
+               /*
+                * asprintf() returns -1 on failure and leaves new_name
+                * undefined, so only a non-negative result may replace
+                * the old name.
+                */
+               if (asprintf(&new_name, "%s/%s/",
+                            pos->pmu_name, pos->name) < 0)
+                       continue;
+
+               free(pos->name);
+               pos->name = new_name;
+       }
+}
+
 static int __cmd_record(struct record *rec, int argc, const char **argv)
 {
        int err;
@@ -1707,6 +1735,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
        if (data->is_pipe && rec->evlist->core.nr_entries == 1)
                rec->opts.sample_id = true;
 
+       record__uniquify_name(rec);
+
        if (record__open(rec) != 0) {
                err = -1;
                goto out_child;
@@ -1977,9 +2007,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                record__auxtrace_snapshot_exit(rec);
 
        if (forks && workload_exec_errno) {
-               char msg[STRERR_BUFSIZE];
+               char msg[STRERR_BUFSIZE], strevsels[2048];
                const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
-               pr_err("Workload failed: %s\n", emsg);
+
+               evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
+
+               pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
+                       strevsels, argv[0], emsg);
                err = -1;
                goto out_child;
        }
@@ -2786,10 +2820,19 @@ int cmd_record(int argc, const char **argv)
        if (record.opts.overwrite)
                record.opts.tail_synthesize = true;
 
-       if (rec->evlist->core.nr_entries == 0 &&
-           __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
-               pr_err("Not enough memory for event selector list\n");
-               goto out;
+       if (rec->evlist->core.nr_entries == 0) {
+               if (perf_pmu__has_hybrid()) {
+                       err = evlist__add_default_hybrid(rec->evlist,
+                                                        !record.opts.no_samples);
+               } else {
+                       err = __evlist__add_default(rec->evlist,
+                                                   !record.opts.no_samples);
+               }
+
+               if (err < 0) {
+                       pr_err("Not enough memory for event selector list\n");
+                       goto out;
+               }
        }
 
        if (rec->opts.target.tid && !rec->opts.no_inherit_set)
index 2a845d6..36f9ccf 100644 (file)
@@ -84,6 +84,8 @@ struct report {
        bool                    nonany_branch_mode;
        bool                    group_set;
        bool                    stitch_lbr;
+       bool                    disable_order;
+       bool                    skip_empty;
        int                     max_stack;
        struct perf_read_values show_threads_values;
        struct annotation_options annotation_opts;
@@ -134,6 +136,11 @@ static int report__config(const char *var, const char *value, void *cb)
                return 0;
        }
 
+       if (!strcmp(var, "report.skip-empty")) {
+               rep->skip_empty = perf_config_bool(var, value);
+               return 0;
+       }
+
        return 0;
 }
 
@@ -435,7 +442,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
 {
        size_t ret;
        char unit;
-       unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+       unsigned long nr_samples = hists->stats.nr_samples;
        u64 nr_events = hists->stats.total_period;
        struct evsel *evsel = hists_to_evsel(hists);
        char buf[512];
@@ -463,7 +470,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
                                nr_samples += pos_hists->stats.nr_non_filtered_samples;
                                nr_events += pos_hists->stats.total_non_filtered_period;
                        } else {
-                               nr_samples += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE];
+                               nr_samples += pos_hists->stats.nr_samples;
                                nr_events += pos_hists->stats.total_period;
                        }
                }
@@ -529,6 +536,9 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c
                if (symbol_conf.event_group && !evsel__is_group_leader(pos))
                        continue;
 
+               if (rep->skip_empty && !hists->stats.nr_samples)
+                       continue;
+
                hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
 
                if (rep->total_cycles_mode) {
@@ -707,9 +717,22 @@ static void report__output_resort(struct report *rep)
        ui_progress__finish();
 }
 
+/*
+ * Sample callback for stats mode: bump the event counter of the hists
+ * belonging to @evsel. All other arguments are ignored. Always returns 0.
+ */
+static int count_sample_event(struct perf_tool *tool __maybe_unused,
+                             union perf_event *event __maybe_unused,
+                             struct perf_sample *sample __maybe_unused,
+                             struct evsel *evsel,
+                             struct machine *machine __maybe_unused)
+{
+       hists__inc_nr_events(evsel__hists(evsel));
+
+       return 0;
+}
+
 static void stats_setup(struct report *rep)
 {
        memset(&rep->tool, 0, sizeof(rep->tool));
+       rep->tool.sample = count_sample_event;
        rep->tool.no_warn = true;
 }
 
@@ -717,7 +740,8 @@ static int stats_print(struct report *rep)
 {
        struct perf_session *session = rep->session;
 
-       perf_session__fprintf_nr_events(session, stdout);
+       perf_session__fprintf_nr_events(session, stdout, rep->skip_empty);
+       evlist__fprintf_nr_events(session->evlist, stdout, rep->skip_empty);
        return 0;
 }
 
@@ -929,8 +953,10 @@ static int __cmd_report(struct report *rep)
                        perf_session__fprintf_dsos(session, stdout);
 
                if (dump_trace) {
-                       perf_session__fprintf_nr_events(session, stdout);
-                       evlist__fprintf_nr_events(session->evlist, stdout);
+                       perf_session__fprintf_nr_events(session, stdout,
+                                                       rep->skip_empty);
+                       evlist__fprintf_nr_events(session->evlist, stdout,
+                                                 rep->skip_empty);
                        return 0;
                }
        }
@@ -1139,6 +1165,7 @@ int cmd_report(int argc, const char **argv)
                .pretty_printing_style   = "normal",
                .socket_filter           = -1,
                .annotation_opts         = annotation__default_options,
+               .skip_empty              = true,
        };
        const struct option options[] = {
        OPT_STRING('i', "input", &input_name, "file",
@@ -1296,6 +1323,10 @@ int cmd_report(int argc, const char **argv)
        OPTS_EVSWITCH(&report.evswitch),
        OPT_BOOLEAN(0, "total-cycles", &report.total_cycles_mode,
                    "Sort all blocks by 'Sampled Cycles%'"),
+       OPT_BOOLEAN(0, "disable-order", &report.disable_order,
+                   "Disable raw trace ordering"),
+       OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
+                   "Do not display empty (or dummy) events in the output"),
        OPT_END()
        };
        struct perf_data data = {
@@ -1329,7 +1360,7 @@ int cmd_report(int argc, const char **argv)
        if (report.mmaps_mode)
                report.tasks_mode = true;
 
-       if (dump_trace)
+       if (dump_trace && report.disable_order)
                report.tool.ordered_events = false;
 
        if (quiet)
index 69c769b..954ce2f 100644 (file)
@@ -1712,7 +1712,7 @@ static int perf_sched__process_fork_event(struct perf_tool *tool,
 {
        struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
 
-       /* run the fork event through the perf machineruy */
+       /* run the fork event through the perf machinery */
        perf_event__process_fork(tool, event, sample, machine);
 
        /* and then run additional processing needed for this command */
index 5915f19..1280cbf 100644 (file)
@@ -314,8 +314,7 @@ static inline struct evsel_script *evsel_script(struct evsel *evsel)
        return (struct evsel_script *)evsel->priv;
 }
 
-static struct evsel_script *perf_evsel_script__new(struct evsel *evsel,
-                                                       struct perf_data *data)
+static struct evsel_script *evsel_script__new(struct evsel *evsel, struct perf_data *data)
 {
        struct evsel_script *es = zalloc(sizeof(*es));
 
@@ -335,7 +334,7 @@ out_free:
        return NULL;
 }
 
-static void perf_evsel_script__delete(struct evsel_script *es)
+static void evsel_script__delete(struct evsel_script *es)
 {
        zfree(&es->filename);
        fclose(es->fp);
@@ -343,7 +342,7 @@ static void perf_evsel_script__delete(struct evsel_script *es)
        free(es);
 }
 
-static int perf_evsel_script__fprintf(struct evsel_script *es, FILE *fp)
+static int evsel_script__fprintf(struct evsel_script *es, FILE *fp)
 {
        struct stat st;
 
@@ -2219,8 +2218,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event,
 
        if (!evsel->priv) {
                if (scr->per_event_dump) {
-                       evsel->priv = perf_evsel_script__new(evsel,
-                                               scr->session->data);
+                       evsel->priv = evsel_script__new(evsel, scr->session->data);
                } else {
                        es = zalloc(sizeof(*es));
                        if (!es)
@@ -2475,7 +2473,7 @@ static void perf_script__fclose_per_event_dump(struct perf_script *script)
        evlist__for_each_entry(evlist, evsel) {
                if (!evsel->priv)
                        break;
-               perf_evsel_script__delete(evsel->priv);
+               evsel_script__delete(evsel->priv);
                evsel->priv = NULL;
        }
 }
@@ -2488,14 +2486,14 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script)
                /*
                 * Already setup? I.e. we may be called twice in cases like
                 * Intel PT, one for the intel_pt// and dummy events, then
-                * for the evsels syntheized from the auxtrace info.
+                * for the evsels synthesized from the auxtrace info.
                 *
                 * See perf_script__process_auxtrace_info.
                 */
                if (evsel->priv != NULL)
                        continue;
 
-               evsel->priv = perf_evsel_script__new(evsel, script->session->data);
+               evsel->priv = evsel_script__new(evsel, script->session->data);
                if (evsel->priv == NULL)
                        goto out_err_fclose;
        }
@@ -2530,8 +2528,8 @@ static void perf_script__exit_per_event_dump_stats(struct perf_script *script)
        evlist__for_each_entry(script->session->evlist, evsel) {
                struct evsel_script *es = evsel->priv;
 
-               perf_evsel_script__fprintf(es, stdout);
-               perf_evsel_script__delete(es);
+               evsel_script__fprintf(es, stdout);
+               evsel_script__delete(es);
                evsel->priv = NULL;
        }
 }
@@ -3085,7 +3083,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused,
  *
  * Fixme: All existing "xxx-record" are all in good formats "-e event ",
  * which is covered well now. And new parsing code should be added to
- * cover the future complexing formats like event groups etc.
+ * cover the future complex formats like event groups etc.
  */
 static int check_ev_match(char *dir_name, char *scriptname,
                        struct perf_session *session)
index 2e2e4a8..5a830ae 100644 (file)
@@ -48,6 +48,7 @@
 #include "util/pmu.h"
 #include "util/event.h"
 #include "util/evlist.h"
+#include "util/evlist-hybrid.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/color.h"
@@ -68,6 +69,8 @@
 #include "util/affinity.h"
 #include "util/pfm.h"
 #include "util/bpf_counter.h"
+#include "util/iostat.h"
+#include "util/pmu-hybrid.h"
 #include "asm/bug.h"
 
 #include <linux/time64.h>
@@ -160,6 +163,7 @@ static const char *smi_cost_attrs = {
 };
 
 static struct evlist   *evsel_list;
+static bool all_counters_use_bpf = true;
 
 static struct target target = {
        .uid    = UINT_MAX,
@@ -212,7 +216,8 @@ static struct perf_stat_config stat_config = {
        .walltime_nsecs_stats   = &walltime_nsecs_stats,
        .big_num                = true,
        .ctl_fd                 = -1,
-       .ctl_fd_ack             = -1
+       .ctl_fd_ack             = -1,
+       .iostat_run             = false,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -239,6 +244,9 @@ static void evlist__check_cpu_maps(struct evlist *evlist)
        struct evsel *evsel, *pos, *leader;
        char buf[1024];
 
+       if (evlist__has_hybrid(evlist))
+               evlist__warn_hybrid_group(evlist);
+
        evlist__for_each_entry(evlist, evsel) {
                leader = evsel->leader;
 
@@ -399,6 +407,9 @@ static int read_affinity_counters(struct timespec *rs)
        struct affinity affinity;
        int i, ncpus, cpu;
 
+       if (all_counters_use_bpf)
+               return 0;
+
        if (affinity__setup(&affinity) < 0)
                return -1;
 
@@ -413,6 +424,8 @@ static int read_affinity_counters(struct timespec *rs)
                evlist__for_each_entry(evsel_list, counter) {
                        if (evsel__cpu_iter_skip(counter, cpu))
                                continue;
+                       if (evsel__is_bpf(counter))
+                               continue;
                        if (!counter->err) {
                                counter->err = read_counter_cpu(counter, rs,
                                                                counter->cpu_iter - 1);
@@ -429,6 +442,9 @@ static int read_bpf_map_counters(void)
        int err;
 
        evlist__for_each_entry(evsel_list, counter) {
+               if (!evsel__is_bpf(counter))
+                       continue;
+
                err = bpf_counter__read(counter);
                if (err)
                        return err;
@@ -439,14 +455,10 @@ static int read_bpf_map_counters(void)
 static void read_counters(struct timespec *rs)
 {
        struct evsel *counter;
-       int err;
 
        if (!stat_config.stop_read_counter) {
-               if (target__has_bpf(&target))
-                       err = read_bpf_map_counters();
-               else
-                       err = read_affinity_counters(rs);
-               if (err < 0)
+               if (read_bpf_map_counters() ||
+                   read_affinity_counters(rs))
                        return;
        }
 
@@ -535,12 +547,13 @@ static int enable_counters(void)
        struct evsel *evsel;
        int err;
 
-       if (target__has_bpf(&target)) {
-               evlist__for_each_entry(evsel_list, evsel) {
-                       err = bpf_counter__enable(evsel);
-                       if (err)
-                               return err;
-               }
+       evlist__for_each_entry(evsel_list, evsel) {
+               if (!evsel__is_bpf(evsel))
+                       continue;
+
+               err = bpf_counter__enable(evsel);
+               if (err)
+                       return err;
        }
 
        if (stat_config.initial_delay < 0) {
@@ -784,14 +797,20 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
        if (affinity__setup(&affinity) < 0)
                return -1;
 
-       if (target__has_bpf(&target)) {
-               evlist__for_each_entry(evsel_list, counter) {
-                       if (bpf_counter__load(counter, &target))
-                               return -1;
-               }
+       evlist__for_each_entry(evsel_list, counter) {
+               if (bpf_counter__load(counter, &target))
+                       return -1;
+               if (!evsel__is_bpf(counter))
+                       all_counters_use_bpf = false;
        }
 
        evlist__for_each_cpu (evsel_list, i, cpu) {
+               /*
+                * bperf calls evsel__open_per_cpu() in bperf__load(), so
+                * no need to call it again here.
+                */
+               if (target.use_bpf)
+                       break;
                affinity__set(&affinity, cpu);
 
                evlist__for_each_entry(evsel_list, counter) {
@@ -799,6 +818,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
                                continue;
                        if (counter->reset_group || counter->errored)
                                continue;
+                       if (evsel__is_bpf(counter))
+                               continue;
 try_again:
                        if (create_perf_stat_counter(counter, &stat_config, &target,
                                                     counter->cpu_iter - 1) < 0) {
@@ -925,15 +946,15 @@ try_again_reset:
        /*
         * Enable counters and exec the command:
         */
-       t0 = rdclock();
-       clock_gettime(CLOCK_MONOTONIC, &ref_time);
-
        if (forks) {
                evlist__start_workload(evsel_list);
                err = enable_counters();
                if (err)
                        return -1;
 
+               t0 = rdclock();
+               clock_gettime(CLOCK_MONOTONIC, &ref_time);
+
                if (interval || timeout || evlist__ctlfd_initialized(evsel_list))
                        status = dispatch_events(forks, timeout, interval, &times);
                if (child_pid != -1) {
@@ -954,6 +975,10 @@ try_again_reset:
                err = enable_counters();
                if (err)
                        return -1;
+
+               t0 = rdclock();
+               clock_gettime(CLOCK_MONOTONIC, &ref_time);
+
                status = dispatch_events(forks, timeout, interval, &times);
        }
 
@@ -1083,6 +1108,11 @@ void perf_stat__set_big_num(int set)
        stat_config.big_num = (set != 0);
 }
 
+void perf_stat__set_no_csv_summary(int set)
+{
+       stat_config.no_csv_summary = (set != 0);
+}
+
 static int stat__set_big_num(const struct option *opt __maybe_unused,
                             const char *s __maybe_unused, int unset)
 {
@@ -1146,6 +1176,10 @@ static struct option stat_options[] = {
 #ifdef HAVE_BPF_SKEL
        OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id",
                   "stat events on existing bpf program id"),
+       OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf,
+                   "use bpf program to count events"),
+       OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path",
+                  "path to perf_event_attr map"),
 #endif
        OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
                    "system-wide collection from all CPUs"),
@@ -1235,6 +1269,8 @@ static struct option stat_options[] = {
                    "threads of same physical core"),
        OPT_BOOLEAN(0, "summary", &stat_config.summary,
                       "print summary for interval mode"),
+       OPT_BOOLEAN(0, "no-csv-summary", &stat_config.no_csv_summary,
+                      "don't print 'summary' for CSV summary output"),
        OPT_BOOLEAN(0, "quiet", &stat_config.quiet,
                        "don't print output (useful with record)"),
 #ifdef HAVE_LIBPFM
@@ -1247,6 +1283,9 @@ static struct option stat_options[] = {
                     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
                     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
                      parse_control_option),
+       OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default",
+                           "measure I/O performance metrics provided by arch/platform",
+                           iostat_parse),
        OPT_END()
 };
 
@@ -1604,6 +1643,12 @@ static int add_default_attributes(void)
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS    },
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES          },
 
+};
+       struct perf_event_attr default_sw_attrs[] = {
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES       },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS         },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS            },
 };
 
 /*
@@ -1705,7 +1750,7 @@ static int add_default_attributes(void)
        bzero(&errinfo, sizeof(errinfo));
        if (transaction_run) {
                /* Handle -T as -M transaction. Once platform specific metrics
-                * support has been added to the json files, all archictures
+                * support has been added to the json files, all architectures
                 * will use this approach. To determine transaction support
                 * on an architecture test for such a metric name.
                 */
@@ -1841,6 +1886,28 @@ setup_metrics:
        }
 
        if (!evsel_list->core.nr_entries) {
+               if (perf_pmu__has_hybrid()) {
+                       const char *hybrid_str = "cycles,instructions,branches,branch-misses";
+
+                       if (target__has_cpu(&target))
+                               default_sw_attrs[0].config = PERF_COUNT_SW_CPU_CLOCK;
+
+                       if (evlist__add_default_attrs(evsel_list,
+                                                     default_sw_attrs) < 0) {
+                               return -1;
+                       }
+
+                       err = parse_events(evsel_list, hybrid_str, &errinfo);
+                       if (err) {
+                               fprintf(stderr,
+                                       "Cannot set up hybrid events %s: %d\n",
+                                       hybrid_str, err);
+                               parse_events_print_error(&errinfo, hybrid_str);
+                               return -1;
+                       }
+                       return err;
+               }
+
                if (target__has_cpu(&target))
                        default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
 
@@ -2320,6 +2387,17 @@ int cmd_stat(int argc, const char **argv)
                goto out;
        }
 
+       if (stat_config.iostat_run) {
+               status = iostat_prepare(evsel_list, &stat_config);
+               if (status)
+                       goto out;
+               if (iostat_mode == IOSTAT_LIST) {
+                       iostat_list(evsel_list, &stat_config);
+                       goto out;
+               } else if (verbose)
+                       iostat_list(evsel_list, &stat_config);
+       }
+
        if (add_default_attributes())
                goto out;
 
@@ -2357,6 +2435,9 @@ int cmd_stat(int argc, const char **argv)
 
        evlist__check_cpu_maps(evsel_list);
 
+       if (perf_pmu__has_hybrid())
+               stat_config.no_merge = true;
+
        /*
         * Initialize thread_map with comm names,
         * so we could print it out on output.
@@ -2459,7 +2540,7 @@ int cmd_stat(int argc, const char **argv)
                /*
                 * We synthesize the kernel mmap record just so that older tools
                 * don't emit warnings about not being able to resolve symbols
-                * due to /proc/sys/kernel/kptr_restrict settings and instear provide
+                * due to /proc/sys/kernel/kptr_restrict settings and instead provide
                 * a saner message about no samples being in the perf.data file.
                 *
                 * This also serves to suppress a warning about f_header.data.size == 0
@@ -2495,6 +2576,9 @@ int cmd_stat(int argc, const char **argv)
        perf_stat__exit_aggr_mode();
        evlist__free_stats(evsel_list);
 out:
+       if (stat_config.iostat_run)
+               iostat_release(evsel_list);
+
        zfree(&stat_config.walltime_run);
 
        if (smi_cost && smi_reset)
index 3673c04..69cb363 100644 (file)
@@ -328,13 +328,13 @@ static void perf_top__print_sym_table(struct perf_top *top)
        printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
        if (!top->record_opts.overwrite &&
-           (hists->stats.nr_lost_warned !=
-           hists->stats.nr_events[PERF_RECORD_LOST])) {
-               hists->stats.nr_lost_warned =
-                             hists->stats.nr_events[PERF_RECORD_LOST];
+           (top->evlist->stats.nr_lost_warned !=
+            top->evlist->stats.nr_events[PERF_RECORD_LOST])) {
+               top->evlist->stats.nr_lost_warned =
+                             top->evlist->stats.nr_events[PERF_RECORD_LOST];
                color_fprintf(stdout, PERF_COLOR_RED,
                              "WARNING: LOST %d chunks, Check IO/CPU overload",
-                             hists->stats.nr_lost_warned);
+                             top->evlist->stats.nr_lost_warned);
                ++printed;
        }
 
@@ -852,11 +852,9 @@ static void
 perf_top__process_lost(struct perf_top *top, union perf_event *event,
                       struct evsel *evsel)
 {
-       struct hists *hists = evsel__hists(evsel);
-
        top->lost += event->lost.lost;
        top->lost_total += event->lost.lost;
-       hists->stats.total_lost += event->lost.lost;
+       evsel->evlist->stats.total_lost += event->lost.lost;
 }
 
 static void
@@ -864,11 +862,9 @@ perf_top__process_lost_samples(struct perf_top *top,
                               union perf_event *event,
                               struct evsel *evsel)
 {
-       struct hists *hists = evsel__hists(evsel);
-
        top->lost += event->lost_samples.lost;
        top->lost_total += event->lost_samples.lost;
-       hists->stats.total_lost_samples += event->lost_samples.lost;
+       evsel->evlist->stats.total_lost_samples += event->lost_samples.lost;
 }
 
 static u64 last_timestamp;
@@ -1205,7 +1201,7 @@ static int deliver_event(struct ordered_events *qe,
        } else if (event->header.type == PERF_RECORD_LOST_SAMPLES) {
                perf_top__process_lost_samples(top, event, evsel);
        } else if (event->header.type < PERF_RECORD_MAX) {
-               hists__inc_nr_events(evsel__hists(evsel), event->header.type);
+               events_stats__inc(&session->evlist->stats, event->header.type);
                machine__process_event(machine, event, &sample);
        } else
                ++session->evlist->stats.nr_unknown_events;
@@ -1607,7 +1603,7 @@ int cmd_top(int argc, const char **argv)
        if (status) {
                /*
                 * Some arches do not provide a get_cpuid(), so just use pr_debug, otherwise
-                * warn the user explicitely.
+                * warn the user explicitly.
                 */
                eprintf(status == ENOSYS ? 1 : 0, verbose,
                        "Couldn't read the cpuid for this machine: %s\n",
index 07857df..dd8ff28 100755 (executable)
@@ -153,6 +153,7 @@ check lib/ctype.c                 '-I "^EXPORT_SYMBOL" -I "^#include <linux/export.h>" -B
 check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl
 check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl
 check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl
+check_2 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl
 
 for i in $BEAUTY_FILES; do
   beauty_check $i -B
index 825a12e..4aa034a 100644 (file)
@@ -14,6 +14,7 @@ perf-config                   mainporcelain common
 perf-evlist                    mainporcelain common
 perf-ftrace                    mainporcelain common
 perf-inject                    mainporcelain common
+perf-iostat                    mainporcelain common
 perf-kallsyms                  mainporcelain common
 perf-kmem                      mainporcelain common
 perf-kvm                       mainporcelain common
index b804379..a262dcd 100644 (file)
@@ -262,7 +262,7 @@ int sys_enter(struct syscall_enter_args *args)
        /*
         * Jump to syscall specific augmenter, even if the default one,
         * "!raw_syscalls:unaugmented" that will just return 1 to return the
-        * unagmented tracepoint payload.
+        * unaugmented tracepoint payload.
         */
        bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
 
@@ -282,7 +282,7 @@ int sys_exit(struct syscall_exit_args *args)
        /*
         * Jump to syscall specific return augmenter, even if the default one,
         * "!raw_syscalls:unaugmented" that will just return 1 to return the
-        * unagmented tracepoint payload.
+        * unaugmented tracepoint payload.
         */
        bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
        /*
index 8810859..526dcaf 100644 (file)
@@ -390,7 +390,7 @@ jvmti_write_code(void *agent, char const *sym,
                rec.p.total_size += size;
 
        /*
-        * If JVM is multi-threaded, nultiple concurrent calls to agent
+        * If JVM is multi-threaded, multiple concurrent calls to agent
         * may be possible, so protect file writes
         */
        flockfile(fp);
@@ -457,7 +457,7 @@ jvmti_write_debug_info(void *agent, uint64_t code,
        rec.p.total_size = size;
 
        /*
-        * If JVM is multi-threaded, nultiple concurrent calls to agent
+        * If JVM is multi-threaded, multiple concurrent calls to agent
         * may be possible, so protect file writes
         */
        flockfile(fp);
diff --git a/tools/perf/perf-iostat.sh b/tools/perf/perf-iostat.sh
new file mode 100644 (file)
index 0000000..e562f25
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# perf iostat
+# Alexander Antonov <alexander.antonov@linux.intel.com>
+
+if [[ "$1" == "list" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? ]]; then
+        DELIMITER="="
+else
+        DELIMITER=" "
+fi
+
+perf stat --iostat$DELIMITER$*
index 75376c7..913fb20 100644 (file)
         "EventName": "L2D_TLB_REFILL",
         "BriefDescription": "Attributable Level 2 data TLB refill"
     },
+    {
+        "PublicDescription": "Attributable Level 2 instruction TLB refill.",
+        "EventCode": "0x2E",
+        "EventName": "L2I_TLB_REFILL",
+        "BriefDescription": "Attributable Level 2 instruction TLB refill."
+    },
     {
         "PublicDescription": "Attributable Level 2 data or unified TLB access",
         "EventCode": "0x2F",
         "EventName": "L2D_TLB",
         "BriefDescription": "Attributable Level 2 data or unified TLB access"
     },
+    {
+        "PublicDescription": "Attributable Level 2 instruction TLB access.",
+        "EventCode": "0x30",
+        "EventName": "L2I_TLB",
+        "BriefDescription": "Attributable Level 2 instruction TLB access."
+    },
     {
         "PublicDescription": "Access to another socket in a multi-socket system",
         "EventCode": "0x31",
         "EventCode": "0x37",
         "EventName": "LL_CACHE_MISS_RD",
         "BriefDescription": "Last level cache miss, read"
+    },
+    {
+        "PublicDescription": "SIMD Instruction architecturally executed.",
+        "EventCode": "0x8000",
+        "EventName": "SIMD_INST_RETIRED",
+        "BriefDescription": "SIMD Instruction architecturally executed."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, SVE.",
+        "EventCode": "0x8002",
+        "EventName": "SVE_INST_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, SVE."
+    },
+    {
+        "PublicDescription": "Microarchitectural operation, Operations speculatively executed.",
+        "EventCode": "0x8008",
+        "EventName": "UOP_SPEC",
+        "BriefDescription": "Microarchitectural operation, Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE Math accelerator Operations speculatively executed.",
+        "EventCode": "0x800E",
+        "EventName": "SVE_MATH_SPEC",
+        "BriefDescription": "SVE Math accelerator Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point Operations speculatively executed.",
+        "EventCode": "0x8010",
+        "EventName": "FP_SPEC",
+        "BriefDescription": "Floating-point Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point FMA Operations speculatively executed.",
+        "EventCode": "0x8028",
+        "EventName": "FP_FMA_SPEC",
+        "BriefDescription": "Floating-point FMA Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point reciprocal estimate Operations speculatively executed.",
+        "EventCode": "0x8034",
+        "EventName": "FP_RECPE_SPEC",
+        "BriefDescription": "Floating-point reciprocal estimate Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point convert Operations speculatively executed.",
+        "EventCode": "0x8038",
+        "EventName": "FP_CVT_SPEC",
+        "BriefDescription": "Floating-point convert Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE integer Operations speculatively executed.",
+        "EventCode": "0x8043",
+        "EventName": "ASE_SVE_INT_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE integer Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE predicated Operations speculatively executed.",
+        "EventCode": "0x8074",
+        "EventName": "SVE_PRED_SPEC",
+        "BriefDescription": "SVE predicated Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX Operations speculatively executed.",
+        "EventCode": "0x807C",
+        "EventName": "SVE_MOVPRFX_SPEC",
+        "BriefDescription": "SVE MOVPRFX Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX unfused Operations speculatively executed.",
+        "EventCode": "0x807F",
+        "EventName": "SVE_MOVPRFX_U_SPEC",
+        "BriefDescription": "SVE MOVPRFX unfused Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE load Operations speculatively executed.",
+        "EventCode": "0x8085",
+        "EventName": "ASE_SVE_LD_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE store Operations speculatively executed.",
+        "EventCode": "0x8086",
+        "EventName": "ASE_SVE_ST_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Prefetch Operations speculatively executed.",
+        "EventCode": "0x8087",
+        "EventName": "PRF_SPEC",
+        "BriefDescription": "Prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register load Operations speculatively executed.",
+        "EventCode": "0x8089",
+        "EventName": "BASE_LD_REG_SPEC",
+        "BriefDescription": "General-purpose register load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register store Operations speculatively executed.",
+        "EventCode": "0x808A",
+        "EventName": "BASE_ST_REG_SPEC",
+        "BriefDescription": "General-purpose register store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated load register Operations speculatively executed.",
+        "EventCode": "0x8091",
+        "EventName": "SVE_LDR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated load register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated store register Operations speculatively executed.",
+        "EventCode": "0x8092",
+        "EventName": "SVE_STR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated store register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE load predicate register Operations speculatively executed.",
+        "EventCode": "0x8095",
+        "EventName": "SVE_LDR_PREG_SPEC",
+        "BriefDescription": "SVE load predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE store predicate register Operations speculatively executed.",
+        "EventCode": "0x8096",
+        "EventName": "SVE_STR_PREG_SPEC",
+        "BriefDescription": "SVE store predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE contiguous prefetch element Operations speculatively executed.",
+        "EventCode": "0x809F",
+        "EventName": "SVE_PRF_CONTIG_SPEC",
+        "BriefDescription": "SVE contiguous prefetch element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A5",
+        "EventName": "ASE_SVE_LD_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A6",
+        "EventName": "ASE_SVE_ST_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-load Operations speculatively executed.",
+        "EventCode": "0x80AD",
+        "EventName": "SVE_LD_GATHER_SPEC",
+        "BriefDescription": "SVE gather-load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE scatter-store Operations speculatively executed.",
+        "EventCode": "0x80AE",
+        "EventName": "SVE_ST_SCATTER_SPEC",
+        "BriefDescription": "SVE scatter-store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-prefetch Operations speculatively executed.",
+        "EventCode": "0x80AF",
+        "EventName": "SVE_PRF_GATHER_SPEC",
+        "BriefDescription": "SVE gather-prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE First-fault load Operations speculatively executed.",
+        "EventCode": "0x80BC",
+        "EventName": "SVE_LDFF_SPEC",
+        "BriefDescription": "SVE First-fault load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C0",
+        "EventName": "FP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C1",
+        "EventName": "FP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C2",
+        "EventName": "FP_HP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C3",
+        "EventName": "FP_HP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C4",
+        "EventName": "FP_SP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C5",
+        "EventName": "FP_SP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C6",
+        "EventName": "FP_DP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable double-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C7",
+        "EventName": "FP_DP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed."
     }
 ]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
new file mode 100644 (file)
index 0000000..b011af1
--- /dev/null
@@ -0,0 +1,8 @@
+[
+  {
+    "ArchStdEvent": "BR_MIS_PRED"
+  },
+  {
+    "ArchStdEvent": "BR_PRED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
new file mode 100644 (file)
index 0000000..084e88d
--- /dev/null
@@ -0,0 +1,62 @@
+[
+  {
+    "PublicDescription": "This event counts read transactions from tofu controller to measured CMG.",
+    "EventCode": "0x314",
+    "EventName": "BUS_READ_TOTAL_TOFU",
+    "BriefDescription": "This event counts read transactions from tofu controller to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts read transactions from PCI controller to measured CMG.",
+    "EventCode": "0x315",
+    "EventName": "BUS_READ_TOTAL_PCI",
+    "BriefDescription": "This event counts read transactions from PCI controller to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts read transactions from measured CMG local memory to measured CMG.",
+    "EventCode": "0x316",
+    "EventName": "BUS_READ_TOTAL_MEM",
+    "BriefDescription": "This event counts read transactions from measured CMG local memory to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0.",
+    "EventCode": "0x318",
+    "EventName": "BUS_WRITE_TOTAL_CMG0",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1.",
+    "EventCode": "0x319",
+    "EventName": "BUS_WRITE_TOTAL_CMG1",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2.",
+    "EventCode": "0x31A",
+    "EventName": "BUS_WRITE_TOTAL_CMG2",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3.",
+    "EventCode": "0x31B",
+    "EventName": "BUS_WRITE_TOTAL_CMG3",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to tofu controller.",
+    "EventCode": "0x31C",
+    "EventName": "BUS_WRITE_TOTAL_TOFU",
+    "BriefDescription": "This event counts write transactions from measured CMG to tofu controller."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to PCI controller.",
+    "EventCode": "0x31D",
+    "EventName": "BUS_WRITE_TOTAL_PCI",
+    "BriefDescription": "This event counts write transactions from measured CMG to PCI controller."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to measured CMG local memory.",
+    "EventCode": "0x31E",
+    "EventName": "BUS_WRITE_TOTAL_MEM",
+    "BriefDescription": "This event counts write transactions from measured CMG to measured CMG local memory."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json
new file mode 100644 (file)
index 0000000..2e341a9
--- /dev/null
@@ -0,0 +1,128 @@
+[
+  {
+    "ArchStdEvent": "L1I_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1I_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE"
+  },
+  {
+    "ArchStdEvent": "L1D_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1I_CACHE"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE_WB"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE_WB"
+  },
+  {
+    "ArchStdEvent": "L2D_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2I_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2D_TLB"
+  },
+  {
+    "ArchStdEvent": "L2I_TLB"
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch.",
+    "EventCode": "0x49",
+    "EventName": "L1D_CACHE_REFILL_PRF",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch.",
+    "EventCode": "0x59",
+    "EventName": "L2D_CACHE_REFILL_PRF",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by demand access.",
+    "EventCode": "0x200",
+    "EventName": "L1D_CACHE_REFILL_DM",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by demand access."
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch.",
+    "EventCode": "0x202",
+    "EventName": "L1D_CACHE_REFILL_HWPRF",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L1D cache miss requests per cycle.",
+    "EventCode": "0x208",
+    "EventName": "L1_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L1D cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L1I cache miss requests per cycle.",
+    "EventCode": "0x209",
+    "EventName": "L1I_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L1I cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by demand access.",
+    "EventCode": "0x300",
+    "EventName": "L2D_CACHE_REFILL_DM",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by demand access."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch.",
+    "EventCode": "0x302",
+    "EventName": "L2D_CACHE_REFILL_HWPRF",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L2 cache miss requests per cycle.",
+    "EventCode": "0x308",
+    "EventName": "L2_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L2 cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts the number of L2 cache misses.",
+    "EventCode": "0x309",
+    "EventName": "L2_MISS_COUNT",
+    "BriefDescription": "This event counts the number of L2 cache misses."
+  },
+  {
+    "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.",
+    "EventCode": "0x325",
+    "EventName": "L2D_SWAP_DM",
+    "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.",
+    "EventCode": "0x326",
+    "EventName": "L2D_CACHE_MIBMCH_PRF",
+    "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access."
+  },
+  {
+    "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.",
+    "EventCode": "0x396",
+    "EventName": "L2D_CACHE_SWAP_LOCAL",
+    "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of L2 cache.",
+    "EventCode": "0x3E0",
+    "EventName": "EA_L2",
+    "BriefDescription": "This event counts energy consumption per cycle of L2 cache."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json
new file mode 100644 (file)
index 0000000..b164846
--- /dev/null
@@ -0,0 +1,5 @@
+[
+  {
+    "ArchStdEvent": "CPU_CYCLES"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json
new file mode 100644 (file)
index 0000000..348749c
--- /dev/null
@@ -0,0 +1,29 @@
+[
+  {
+    "ArchStdEvent": "EXC_TAKEN"
+  },
+  {
+    "ArchStdEvent": "EXC_UNDEF"
+  },
+  {
+    "ArchStdEvent": "EXC_SVC"
+  },
+  {
+    "ArchStdEvent": "EXC_PABORT"
+  },
+  {
+    "ArchStdEvent": "EXC_DABORT"
+  },
+  {
+    "ArchStdEvent": "EXC_IRQ"
+  },
+  {
+    "ArchStdEvent": "EXC_FIQ"
+  },
+  {
+    "ArchStdEvent": "EXC_SMC"
+  },
+  {
+    "ArchStdEvent": "EXC_HVC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json
new file mode 100644 (file)
index 0000000..6d258b1
--- /dev/null
@@ -0,0 +1,131 @@
+[
+  {
+    "ArchStdEvent": "SW_INCR"
+  },
+  {
+    "ArchStdEvent": "INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "EXC_RETURN"
+  },
+  {
+    "ArchStdEvent": "CID_WRITE_RETIRED"
+  },
+  {
+    "ArchStdEvent": "INST_SPEC"
+  },
+  {
+    "ArchStdEvent": "LDREX_SPEC"
+  },
+  {
+    "ArchStdEvent": "STREX_SPEC"
+  },
+  {
+    "ArchStdEvent": "LD_SPEC"
+  },
+  {
+    "ArchStdEvent": "ST_SPEC"
+  },
+  {
+    "ArchStdEvent": "LDST_SPEC"
+  },
+  {
+    "ArchStdEvent": "DP_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SPEC"
+  },
+  {
+    "ArchStdEvent": "VFP_SPEC"
+  },
+  {
+    "ArchStdEvent": "PC_WRITE_SPEC"
+  },
+  {
+    "ArchStdEvent": "CRYPTO_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_IMMED_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_RETURN_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_INDIRECT_SPEC"
+  },
+  {
+    "ArchStdEvent": "ISB_SPEC"
+  },
+  {
+    "ArchStdEvent": "DSB_SPEC"
+  },
+  {
+    "ArchStdEvent": "DMB_SPEC"
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction.",
+    "EventCode": "0x9F",
+    "EventName": "DCZVA_SPEC",
+    "BriefDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed floating-point move operations.",
+    "EventCode": "0x105",
+    "EventName": "FP_MV_SPEC",
+    "BriefDescription": "This event counts architecturally executed floating-point move operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that use a predicate register.",
+    "EventCode": "0x108",
+    "EventName": "PRD_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that use a predicate register."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed inter-element manipulation operations.",
+    "EventCode": "0x109",
+    "EventName": "IEL_SPEC",
+    "BriefDescription": "This event counts architecturally executed inter-element manipulation operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed inter-register manipulation operations.",
+    "EventCode": "0x10A",
+    "EventName": "IREG_SPEC",
+    "BriefDescription": "This event counts architecturally executed inter-register manipulation operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed NOSIMD load operations that use SIMD&FP registers.",
+    "EventCode": "0x112",
+    "EventName": "FP_LD_SPEC",
+    "BriefDescription": "This event counts architecturally executed NOSIMD load operations that use SIMD&FP registers."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed NOSIMD store operations that use SIMD&FP registers.",
+    "EventCode": "0x113",
+    "EventName": "FP_ST_SPEC",
+    "BriefDescription": "This event counts architecturally executed NOSIMD store operations that use SIMD&FP registers."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations.",
+    "EventCode": "0x11A",
+    "EventName": "BC_LD_SPEC",
+    "BriefDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction.",
+    "EventCode": "0x121",
+    "EventName": "EFFECTIVE_INST_SPEC",
+    "BriefDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that use 'pre-index' as their addressing mode.",
+    "EventCode": "0x123",
+    "EventName": "PRE_INDEX_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that use 'pre-index' as their addressing mode."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that use 'post-index' as their addressing mode.",
+    "EventCode": "0x124",
+    "EventName": "POST_INDEX_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that use 'post-index' as their addressing mode."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json
new file mode 100644 (file)
index 0000000..c1f6479
--- /dev/null
@@ -0,0 +1,8 @@
+[
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of CMG local memory.",
+    "EventCode": "0x3E8",
+    "EventName": "EA_MEMORY",
+    "BriefDescription": "This event counts energy consumption per cycle of CMG local memory."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json
new file mode 100644 (file)
index 0000000..10c823a
--- /dev/null
@@ -0,0 +1,188 @@
+[
+  {
+    "PublicDescription": "This event counts the occurrence count of the micro-operation split.",
+    "EventCode": "0x139",
+    "EventName": "UOP_SPLIT",
+    "BriefDescription": "This event counts the occurrence count of the micro-operation split."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access.",
+    "EventCode": "0x180",
+    "EventName": "LD_COMP_WAIT_L2_MISS",
+    "BriefDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access.",
+    "EventCode": "0x181",
+    "EventName": "LD_COMP_WAIT_L2_MISS_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access.",
+    "EventCode": "0x182",
+    "EventName": "LD_COMP_WAIT_L1_MISS",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access.",
+    "EventCode": "0x183",
+    "EventName": "LD_COMP_WAIT_L1_MISS_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access.",
+    "EventCode": "0x184",
+    "EventName": "LD_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access.",
+    "EventCode": "0x185",
+    "EventName": "LD_COMP_WAIT_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port.",
+    "EventCode": "0x186",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port."
+  },
+  {
+    "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation.",
+    "EventCode": "0x187",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY_EX",
+    "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation."
+  },
+  {
+    "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction.",
+    "EventCode": "0x188",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY_SWPF",
+    "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction.",
+    "EventCode": "0x189",
+    "EventName": "EU_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction.",
+    "EventCode": "0x18A",
+    "EventName": "FL_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction.",
+    "EventCode": "0x18B",
+    "EventName": "BR_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty.",
+    "EventCode": "0x18C",
+    "EventName": "ROB_EMPTY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full.",
+    "EventCode": "0x18D",
+    "EventName": "ROB_EMPTY_STQ_BUSY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction.",
+    "EventCode": "0x18E",
+    "EventName": "WFE_WFI_CYCLE",
+    "BriefDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only.",
+    "EventCode": "0x190",
+    "EventName": "_0INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that one instruction is committed.",
+    "EventCode": "0x191",
+    "EventName": "_1INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that one instruction is committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that two instructions are committed.",
+    "EventCode": "0x192",
+    "EventName": "_2INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that two instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that three instructions are committed.",
+    "EventCode": "0x193",
+    "EventName": "_3INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that three instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that four instructions are committed.",
+    "EventCode": "0x194",
+    "EventName": "_4INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that four instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that only any micro-operations are committed.",
+    "EventCode": "0x198",
+    "EventName": "UOP_ONLY_COMMIT",
+    "BriefDescription": "This event counts every cycle that only any micro-operations are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that only the MOVPRFX instruction is committed.",
+    "EventCode": "0x199",
+    "EventName": "SINGLE_MOVPRFX_COMMIT",
+    "BriefDescription": "This event counts every cycle that only the MOVPRFX instruction is committed."
+  },
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of core.",
+    "EventCode": "0x1E0",
+    "EventName": "EA_CORE",
+    "BriefDescription": "This event counts energy consumption per cycle of core."
+  },
+  {
+    "PublicDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x230",
+    "EventName": "L1HWPF_STREAM_PF",
+    "BriefDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x231",
+    "EventName": "L1HWPF_INJ_ALLOC_PF",
+    "BriefDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x232",
+    "EventName": "L1HWPF_INJ_NOALLOC_PF",
+    "BriefDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x233",
+    "EventName": "L2HWPF_STREAM_PF",
+    "BriefDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x234",
+    "EventName": "L2HWPF_INJ_ALLOC_PF",
+    "BriefDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x235",
+    "EventName": "L2HWPF_INJ_NOALLOC_PF",
+    "BriefDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts prefetch requests to L2 cache generated by the other causes.",
+    "EventCode": "0x236",
+    "EventName": "L2HWPF_OTHER",
+    "BriefDescription": "This event counts prefetch requests to L2 cache generated by the other causes."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json
new file mode 100644 (file)
index 0000000..dd7c97a
--- /dev/null
@@ -0,0 +1,194 @@
+[
+  {
+    "ArchStdEvent": "STALL_FRONTEND"
+  },
+  {
+    "ArchStdEvent": "STALL_BACKEND"
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EAGA pipeline.",
+    "EventCode": "0x1A0",
+    "EventName": "EAGA_VAL",
+    "BriefDescription": "This event counts valid cycles of EAGA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EAGB pipeline.",
+    "EventCode": "0x1A1",
+    "EventName": "EAGB_VAL",
+    "BriefDescription": "This event counts valid cycles of EAGB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EXA pipeline.",
+    "EventCode": "0x1A2",
+    "EventName": "EXA_VAL",
+    "BriefDescription": "This event counts valid cycles of EXA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EXB pipeline.",
+    "EventCode": "0x1A3",
+    "EventName": "EXB_VAL",
+    "BriefDescription": "This event counts valid cycles of EXB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of FLA pipeline.",
+    "EventCode": "0x1A4",
+    "EventName": "FLA_VAL",
+    "BriefDescription": "This event counts valid cycles of FLA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of FLB pipeline.",
+    "EventCode": "0x1A5",
+    "EventName": "FLB_VAL",
+    "BriefDescription": "This event counts valid cycles of FLB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of PRX pipeline.",
+    "EventCode": "0x1A6",
+    "EventName": "PRX_VAL",
+    "BriefDescription": "This event counts valid cycles of PRX pipeline."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x1B4",
+    "EventName": "FLA_VAL_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x1B5",
+    "EventName": "FLB_VAL_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1D cache pipeline#0.",
+    "EventCode": "0x240",
+    "EventName": "L1_PIPE0_VAL",
+    "BriefDescription": "This event counts valid cycles of L1D cache pipeline#0."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1D cache pipeline#1.",
+    "EventCode": "0x241",
+    "EventName": "L1_PIPE1_VAL",
+    "BriefDescription": "This event counts valid cycles of L1D cache pipeline#1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1.",
+    "EventCode": "0x250",
+    "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_SCE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1.",
+    "EventCode": "0x251",
+    "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_PFE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1.",
+    "EventCode": "0x252",
+    "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_SCE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1.",
+    "EventCode": "0x253",
+    "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_PFE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1D cache pipeline#0.",
+    "EventCode": "0x260",
+    "EventName": "L1_PIPE0_COMP",
+    "BriefDescription": "This event counts completed requests in L1D cache pipeline#0."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1D cache pipeline#1.",
+    "EventCode": "0x261",
+    "EventName": "L1_PIPE1_COMP",
+    "BriefDescription": "This event counts completed requests in L1D cache pipeline#1."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1I cache pipeline.",
+    "EventCode": "0x268",
+    "EventName": "L1I_PIPE_COMP",
+    "BriefDescription": "This event counts completed requests in L1I cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1I cache pipeline.",
+    "EventCode": "0x269",
+    "EventName": "L1I_PIPE_VAL",
+    "BriefDescription": "This event counts valid cycles of L1I cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts aborted requests in L1D pipelines due to store-load interlock.",
+    "EventCode": "0x274",
+    "EventName": "L1_PIPE_ABORT_STLD_INTLK",
+    "BriefDescription": "This event counts aborted requests in L1D pipelines due to store-load interlock."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0.",
+    "EventCode": "0x2A0",
+    "EventName": "L1_PIPE0_VAL_IU_NOT_SEC0",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0.",
+    "EventCode": "0x2A1",
+    "EventName": "L1_PIPE1_VAL_IU_NOT_SEC0",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined.",
+    "EventCode": "0x2B0",
+    "EventName": "L1_PIPE_COMP_GATHER_2FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined.",
+    "EventCode": "0x2B1",
+    "EventName": "L1_PIPE_COMP_GATHER_1FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0.",
+    "EventCode": "0x2B2",
+    "EventName": "L1_PIPE_COMP_GATHER_0FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0."
+  },
+  {
+    "PublicDescription": "This event counts the number of flows of the scatter instructions.",
+    "EventCode": "0x2B3",
+    "EventName": "L1_PIPE_COMP_SCATTER_1FLOW",
+    "BriefDescription": "This event counts the number of flows of the scatter instructions."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x2B8",
+    "EventName": "L1_PIPE0_COMP_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x2B9",
+    "EventName": "L1_PIPE1_COMP_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L2 cache pipeline.",
+    "EventCode": "0x330",
+    "EventName": "L2_PIPE_VAL",
+    "BriefDescription": "This event counts valid cycles of L2 cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L2 cache pipeline.",
+    "EventCode": "0x350",
+    "EventName": "L2_PIPE_COMP_ALL",
+    "BriefDescription": "This event counts completed requests in L2 cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.",
+    "EventCode": "0x370",
+    "EventName": "L2_PIPE_COMP_PF_L2MIB_MCH",
+    "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json
new file mode 100644 (file)
index 0000000..dc1b95e
--- /dev/null
@@ -0,0 +1,110 @@
+[
+  {
+    "ArchStdEvent": "SIMD_INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "SVE_INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "UOP_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MATH_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_FMA_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_RECPE_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_CVT_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_INT_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRED_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MOVPRFX_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MOVPRFX_U_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_LD_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_ST_SPEC"
+  },
+  {
+    "ArchStdEvent": "PRF_SPEC"
+  },
+  {
+    "ArchStdEvent": "BASE_LD_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "BASE_ST_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDR_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_STR_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDR_PREG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_STR_PREG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRF_CONTIG_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_LD_MULTI_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_ST_MULTI_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LD_GATHER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_ST_SCATTER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRF_GATHER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDFF_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
new file mode 100644 (file)
index 0000000..dda8e59
--- /dev/null
@@ -0,0 +1,233 @@
+[
+    {
+        "MetricExpr": "FETCH_BUBBLE / (4 * CPU_CYCLES)",
+        "PublicDescription": "Frontend bound L1 topdown metric",
+        "BriefDescription": "Frontend bound L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "frontend_bound"
+    },
+    {
+        "MetricExpr": "(INST_SPEC - INST_RETIRED) / (4 * CPU_CYCLES)",
+        "PublicDescription": "Bad Speculation L1 topdown metric",
+        "BriefDescription": "Bad Speculation L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "bad_speculation"
+    },
+    {
+        "MetricExpr": "INST_RETIRED / (CPU_CYCLES * 4)",
+        "PublicDescription": "Retiring L1 topdown metric",
+        "BriefDescription": "Retiring L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "retiring"
+    },
+    {
+        "MetricExpr": "1 - (frontend_bound + bad_speculation + retiring)",
+        "PublicDescription": "Backend Bound L1 topdown metric",
+        "BriefDescription": "Backend Bound L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "backend_bound"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x201d@ / CPU_CYCLES",
+        "PublicDescription": "Fetch latency bound L2 topdown metric",
+        "BriefDescription": "Fetch latency bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "fetch_latency_bound"
+    },
+    {
+        "MetricExpr": "frontend_bound - fetch_latency_bound",
+        "PublicDescription": "Fetch bandwidth bound L2 topdown metric",
+        "BriefDescription": "Fetch bandwidth bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "fetch_bandwidth_bound"
+    },
+    {
+        "MetricExpr": "(bad_speculation * BR_MIS_PRED) / (BR_MIS_PRED + armv8_pmuv3_0@event\\=0x2013@)",
+        "PublicDescription": "Branch mispredicts L2 topdown metric",
+        "BriefDescription": "Branch mispredicts L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "branch_mispredicts"
+    },
+    {
+        "MetricExpr": "bad_speculation - branch_mispredicts",
+        "PublicDescription": "Machine clears L2 topdown metric",
+        "BriefDescription": "Machine clears L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "machine_clears"
+    },
+    {
+        "MetricExpr": "(EXE_STALL_CYCLE - (MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@)) / CPU_CYCLES",
+        "PublicDescription": "Core bound L2 topdown metric",
+        "BriefDescription": "Core bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "core_bound"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@) / CPU_CYCLES",
+        "PublicDescription": "Memory bound L2 topdown metric",
+        "BriefDescription": "Memory bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "memory_bound"
+    },
+    {
+        "MetricExpr": "(((L2I_TLB - L2I_TLB_REFILL) * 15) + (L2I_TLB_REFILL * 100)) / CPU_CYCLES",
+        "PublicDescription": "Idle by itlb miss L3 topdown metric",
+        "BriefDescription": "Idle by itlb miss L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "idle_by_itlb_miss"
+    },
+    {
+        "MetricExpr": "(((L2I_CACHE - L2I_CACHE_REFILL) * 15) + (L2I_CACHE_REFILL * 100)) / CPU_CYCLES",
+        "PublicDescription": "Idle by icache miss L3 topdown metric",
+        "BriefDescription": "Idle by icache miss L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "idle_by_icache_miss"
+    },
+    {
+        "MetricExpr": "(BR_MIS_PRED * 5) / CPU_CYCLES",
+        "PublicDescription": "BP misp flush L3 topdown metric",
+        "BriefDescription": "BP misp flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "bp_misp_flush"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x2013@ * 5) / CPU_CYCLES",
+        "PublicDescription": "OOO flush L3 topdown metric",
+        "BriefDescription": "OOO flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "ooo_flush"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x1001@ * 5) / CPU_CYCLES",
+        "PublicDescription": "Static predictor flush L3 topdown metric",
+        "BriefDescription": "Static predictor flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "sp_flush"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x1010@ / BR_MIS_PRED",
+        "PublicDescription": "Indirect branch L3 topdown metric",
+        "BriefDescription": "Indirect branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "indirect_branch"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x1014@ + armv8_pmuv3_0@event\\=0x1018@) / BR_MIS_PRED",
+        "PublicDescription": "Push branch L3 topdown metric",
+        "BriefDescription": "Push branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "push_branch"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x100c@ / BR_MIS_PRED",
+        "PublicDescription": "Pop branch L3 topdown metric",
+        "BriefDescription": "Pop branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "pop_branch"
+    },
+    {
+        "MetricExpr": "(BR_MIS_PRED - armv8_pmuv3_0@event\\=0x1010@ - armv8_pmuv3_0@event\\=0x1014@ - armv8_pmuv3_0@event\\=0x1018@ - armv8_pmuv3_0@event\\=0x100c@) / BR_MIS_PRED",
+        "PublicDescription": "Other branch L3 topdown metric",
+        "BriefDescription": "Other branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "other_branch"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2012@ / armv8_pmuv3_0@event\\=0x2013@",
+        "PublicDescription": "Nuke flush L3 topdown metric",
+        "BriefDescription": "Nuke flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "nuke_flush"
+    },
+    {
+        "MetricExpr": "1 - nuke_flush",
+        "PublicDescription": "Other flush L3 topdown metric",
+        "BriefDescription": "Other flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "other_flush"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2010@ / CPU_CYCLES",
+        "PublicDescription": "Sync stall L3 topdown metric",
+        "BriefDescription": "Sync stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "sync_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2004@ / CPU_CYCLES",
+        "PublicDescription": "Rob stall L3 topdown metric",
+        "BriefDescription": "Rob stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "rob_stall"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x2006@ + armv8_pmuv3_0@event\\=0x2007@ + armv8_pmuv3_0@event\\=0x2008@) / CPU_CYCLES",
+        "PublicDescription": "Ptag stall L3 topdown metric",
+        "BriefDescription": "Ptag stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "ptag_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x201e@ / CPU_CYCLES",
+        "PublicDescription": "SaveOpQ stall L3 topdown metric",
+        "BriefDescription": "SaveOpQ stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "saveopq_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2005@ / CPU_CYCLES",
+        "PublicDescription": "PC buffer stall L3 topdown metric",
+        "BriefDescription": "PC buffer stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "pc_buffer_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7002@ / CPU_CYCLES",
+        "PublicDescription": "Divider L3 topdown metric",
+        "BriefDescription": "Divider L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "divider"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7003@ / CPU_CYCLES",
+        "PublicDescription": "FSU stall L3 topdown metric",
+        "BriefDescription": "FSU stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "fsu_stall"
+    },
+    {
+        "MetricExpr": "core_bound - divider - fsu_stall",
+        "PublicDescription": "EXE ports util L3 topdown metric",
+        "BriefDescription": "EXE ports util L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "exe_ports_util"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_ANYLOAD - MEM_STALL_L1MISS) / CPU_CYCLES",
+        "PublicDescription": "L1 bound L3 topdown metric",
+        "BriefDescription": "L1 bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "l1_bound"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_L1MISS - MEM_STALL_L2MISS) / CPU_CYCLES",
+        "PublicDescription": "L2 bound L3 topdown metric",
+        "BriefDescription": "L2 bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "l2_bound"
+    },
+    {
+        "MetricExpr": "MEM_STALL_L2MISS / CPU_CYCLES",
+        "PublicDescription": "Mem bound L3 topdown metric",
+        "BriefDescription": "Mem bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "mem_bound"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7005@ / CPU_CYCLES",
+        "PublicDescription": "Store bound L3 topdown metric",
+        "BriefDescription": "Store bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "store_bound"
+    }
+]
index 0d60914..c43591d 100644 (file)
@@ -20,5 +20,6 @@
 0x00000000410fd0c0,v1,arm/cortex-a76-n1,core
 0x00000000420f5160,v1,cavium/thunderx2,core
 0x00000000430f0af0,v1,cavium/thunderx2,core
+0x00000000460f0010,v1,fujitsu/a64fx,core
 0x00000000480fd010,v1,hisilicon/hip08,core
 0x00000000500f0000,v1,ampere/emag,core
index 229150e..4abdfc3 100644 (file)
@@ -15,3 +15,4 @@
 # Power8 entries
 004[bcd][[:xdigit:]]{4},1,power8,core
 004e[[:xdigit:]]{4},1,power9,core
+0080[[:xdigit:]]{4},1,power10,core
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json
new file mode 100644 (file)
index 0000000..616f290
--- /dev/null
@@ -0,0 +1,47 @@
+[
+  {
+    "EventCode": "1003C",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from either the local L2 or local L3."
+  },
+  {
+    "EventCode": "34056",
+    "EventName": "PM_EXEC_STALL_LOAD_FINISH",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ."
+  },
+  {
+    "EventCode": "3006C",
+    "EventName": "PM_RUN_CYC_SMT2_MODE",
+    "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT2 mode."
+  },
+  {
+    "EventCode": "300F4",
+    "EventName": "PM_RUN_INST_CMPL_CONC",
+    "BriefDescription": "PowerPC instructions completed by this thread when all threads in the core had the run-latch set."
+  },
+  {
+    "EventCode": "4C016",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, with a dispatch conflict."
+  },
+  {
+    "EventCode": "4D014",
+    "EventName": "PM_EXEC_STALL_LOAD",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a load instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "4D016",
+    "EventName": "PM_EXEC_STALL_PTESYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a PTESYNC instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "401EA",
+    "EventName": "PM_THRESH_EXC_128",
+    "BriefDescription": "Threshold counter exceeded a value of 128."
+  },
+  {
+    "EventCode": "400F6",
+    "EventName": "PM_BR_MPRED_CMPL",
+    "BriefDescription": "A mispredicted branch completed. Includes direction and target."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json
new file mode 100644 (file)
index 0000000..703cd43
--- /dev/null
@@ -0,0 +1,7 @@
+[
+  {
+    "EventCode": "4016E",
+    "EventName": "PM_THRESH_NOT_MET",
+    "BriefDescription": "Threshold counter did not meet threshold."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json
new file mode 100644 (file)
index 0000000..eac8609
--- /dev/null
@@ -0,0 +1,217 @@
+[
+  {
+    "EventCode": "10004",
+    "EventName": "PM_EXEC_STALL_TRANSLATION",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss or ERAT miss and waited for it to resolve."
+  },
+  {
+    "EventCode": "10010",
+    "EventName": "PM_PMC4_OVERFLOW",
+    "BriefDescription": "The event selected for PMC4 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "10020",
+    "EventName": "PM_PMC4_REWIND",
+    "BriefDescription": "The speculative event selected for PMC4 rewinds and the counter for PMC4 is not charged."
+  },
+  {
+    "EventCode": "10038",
+    "EventName": "PM_DISP_STALL_TRANSLATION",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread because the MMU was handling a translation miss."
+  },
+  {
+    "EventCode": "1003A",
+    "EventName": "PM_DISP_STALL_BR_MPRED_IC_L2",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2 after suffering a branch mispredict."
+  },
+  {
+    "EventCode": "1E050",
+    "EventName": "PM_DISP_STALL_HELD_STF_MAPPER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the STF mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR."
+  },
+  {
+    "EventCode": "1F054",
+    "EventName": "PM_DTLB_HIT",
+    "BriefDescription": "The PTE required by the instruction was resident in the TLB (data TLB access). When MMCR1[16]=0 this event counts only demand hits. When MMCR1[16]=1 this event includes demand and prefetch. Applies to both HPT and RPT."
+  },
+  {
+    "EventCode": "101E8",
+    "EventName": "PM_THRESH_EXC_256",
+    "BriefDescription": "Threshold counter exceeded a count of 256."
+  },
+  {
+    "EventCode": "101EC",
+    "EventName": "PM_THRESH_MET",
+    "BriefDescription": "Threshold exceeded."
+  },
+  {
+    "EventCode": "100F2",
+    "EventName": "PM_1PLUS_PPC_CMPL",
+    "BriefDescription": "Cycles in which at least one instruction is completed by this thread."
+  },
+  {
+    "EventCode": "100F6",
+    "EventName": "PM_IERAT_MISS",
+    "BriefDescription": "IERAT Reloaded to satisfy an IERAT miss. All page sizes are counted by this event."
+  },
+  {
+    "EventCode": "100F8",
+    "EventName": "PM_DISP_STALL_CYC",
+    "BriefDescription": "Cycles the ICT has no itags assigned to this thread (no instructions were dispatched during these cycles)."
+  },
+  {
+    "EventCode": "20114",
+    "EventName": "PM_MRK_L2_RC_DISP",
+    "BriefDescription": "Marked instruction RC dispatched in L2."
+  },
+  {
+    "EventCode": "2C010",
+    "EventName": "PM_EXEC_STALL_LSU",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Load Store Unit. This does not include simple fixed point instructions."
+  },
+  {
+    "EventCode": "2C016",
+    "EventName": "PM_DISP_STALL_IERAT_ONLY_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction ERAT miss."
+  },
+  {
+    "EventCode": "2C01E",
+    "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3 after suffering a branch mispredict."
+  },
+  {
+    "EventCode": "2D01A",
+    "EventName": "PM_DISP_STALL_IC_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread due to an Icache Miss."
+  },
+  {
+    "EventCode": "2D01C",
+    "EventName": "PM_CMPL_STALL_STCX",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing."
+  },
+  {
+    "EventCode": "2E018",
+    "EventName": "PM_DISP_STALL_FETCH",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread because Fetch was being held."
+  },
+  {
+    "EventCode": "2E01A",
+    "EventName": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the XVFC mapper/SRB was full."
+  },
+  {
+    "EventCode": "2C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC2",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "24050",
+    "EventName": "PM_IOPS_DISP",
+    "BriefDescription": "Internal Operations dispatched. PM_IOPS_DISP / PM_INST_DISP will show the average number of internal operations per PowerPC instruction."
+  },
+  {
+    "EventCode": "2405E",
+    "EventName": "PM_ISSUE_CANCEL",
+    "BriefDescription": "An instruction issued and the issue was later cancelled. Only one cancel per PowerPC instruction."
+  },
+  {
+    "EventCode": "200FA",
+    "EventName": "PM_BR_TAKEN_CMPL",
+    "BriefDescription": "Branch Taken instruction completed."
+  },
+  {
+    "EventCode": "30012",
+    "EventName": "PM_FLUSH_COMPLETION",
+    "BriefDescription": "The instruction that was next to complete (oldest in the pipeline) did not complete because it suffered a flush."
+  },
+  {
+    "EventCode": "30014",
+    "EventName": "PM_EXEC_STALL_STORE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "30018",
+    "EventName": "PM_DISP_STALL_HELD_SCOREBOARD_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch while waiting on the Scoreboard. This event combines VSCR and FPSCR together."
+  },
+  {
+    "EventCode": "30026",
+    "EventName": "PM_EXEC_STALL_STORE_MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store whose cache line was not resident in the L1 and was waiting for allocation of the missing line into the L1."
+  },
+  {
+    "EventCode": "3012A",
+    "EventName": "PM_MRK_L2_RC_DONE",
+    "BriefDescription": "L2 RC machine completed the transaction for the marked instruction."
+  },
+  {
+    "EventCode": "3F046",
+    "EventName": "PM_ITLB_HIT_1G",
+    "BriefDescription": "Instruction TLB hit (IERAT reload) page size 1G, which implies Radix Page Table translation is in use. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "34058",
+    "EventName": "PM_DISP_STALL_BR_MPRED_ICMISS",
+    "BriefDescription": "Cycles when dispatch was stalled after a mispredicted branch resulted in an instruction cache miss."
+  },
+  {
+    "EventCode": "3D05C",
+    "EventName": "PM_DISP_STALL_HELD_RENAME_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR and XVFC."
+  },
+  {
+    "EventCode": "3E052",
+    "EventName": "PM_DISP_STALL_IC_L3",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3."
+  },
+  {
+    "EventCode": "3E054",
+    "EventName": "PM_LD_MISS_L1",
+    "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count (i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
+  },
+  {
+    "EventCode": "301EA",
+    "EventName": "PM_THRESH_EXC_1024",
+    "BriefDescription": "Threshold counter exceeded a value of 1024."
+  },
+  {
+    "EventCode": "300FA",
+    "EventName": "PM_INST_FROM_L3MISS",
+    "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss."
+  },
+  {
+    "EventCode": "40006",
+    "EventName": "PM_ISSUE_KILL",
+    "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group."
+  },
+  {
+    "EventCode": "40116",
+    "EventName": "PM_MRK_LARX_FIN",
+    "BriefDescription": "Marked load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "4C010",
+    "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from sources beyond the local L3 after suffering a mispredicted branch."
+  },
+  {
+    "EventCode": "4D01E",
+    "EventName": "PM_DISP_STALL_BR_MPRED",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread due to a mispredicted branch."
+  },
+  {
+    "EventCode": "4E010",
+    "EventName": "PM_DISP_STALL_IC_L3MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from any source beyond the local L3."
+  },
+  {
+    "EventCode": "4E01A",
+    "EventName": "PM_DISP_STALL_HELD_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any reason."
+  },
+  {
+    "EventCode": "44056",
+    "EventName": "PM_VECTOR_ST_CMPL",
+    "BriefDescription": "Vector store instructions completed."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json
new file mode 100644 (file)
index 0000000..016d8de
--- /dev/null
@@ -0,0 +1,12 @@
+[
+  {
+    "EventCode": "1E058",
+    "EventName": "PM_STCX_FAIL_FIN",
+    "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "4E050",
+    "EventName": "PM_STCX_PASS_FIN",
+    "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/marked.json b/tools/perf/pmu-events/arch/powerpc/power10/marked.json
new file mode 100644 (file)
index 0000000..93a5a59
--- /dev/null
@@ -0,0 +1,147 @@
+[
+  {
+    "EventCode": "1002C",
+    "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS",
+    "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request."
+  },
+  {
+    "EventCode": "10132",
+    "EventName": "PM_MRK_INST_ISSUED",
+    "BriefDescription": "Marked instruction issued. Note that stores always get issued twice, the address gets issued to the LSU and the data gets issued to the VSU. Also, issues can sometimes get killed/cancelled and cause multiple sequential issues for the same instruction."
+  },
+  {
+    "EventCode": "101E0",
+    "EventName": "PM_MRK_INST_DISP",
+    "BriefDescription": "The thread has dispatched a randomly sampled marked instruction."
+  },
+  {
+    "EventCode": "101E2",
+    "EventName": "PM_MRK_BR_TAKEN_CMPL",
+    "BriefDescription": "Marked Branch Taken instruction completed."
+  },
+  {
+    "EventCode": "20112",
+    "EventName": "PM_MRK_NTF_FIN",
+    "BriefDescription": "The marked instruction became the oldest in the pipeline before it finished. It excludes instructions that finish at dispatch."
+  },
+  {
+    "EventCode": "2C01C",
+    "EventName": "PM_EXEC_STALL_DMISS_OFF_CHIP",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a remote chip."
+  },
+  {
+    "EventCode": "20138",
+    "EventName": "PM_MRK_ST_NEST",
+    "BriefDescription": "A store has been sampled/marked and is at the point of execution where it has completed in the core and can no longer be flushed. At this point the store is sent to the L2."
+  },
+  {
+    "EventCode": "2013A",
+    "EventName": "PM_MRK_BRU_FIN",
+    "BriefDescription": "Marked Branch instruction finished."
+  },
+  {
+    "EventCode": "2C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC2",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[15:27]."
+  },
+  {
+    "EventCode": "24156",
+    "EventName": "PM_MRK_STCX_FIN",
+    "BriefDescription": "Marked conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "24158",
+    "EventName": "PM_MRK_INST",
+    "BriefDescription": "An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens."
+  },
+  {
+    "EventCode": "2415C",
+    "EventName": "PM_MRK_BR_CMPL",
+    "BriefDescription": "A marked branch completed. All branches are included."
+  },
+  {
+    "EventCode": "200FD",
+    "EventName": "PM_L1_ICACHE_MISS",
+    "BriefDescription": "Demand iCache Miss."
+  },
+  {
+    "EventCode": "30130",
+    "EventName": "PM_MRK_INST_FIN",
+    "BriefDescription": "Marked instruction finished. Excludes instructions that finish at dispatch. Note that stores always finish twice since the address gets issued to the LSU and the data gets issued to the VSU."
+  },
+  {
+    "EventCode": "34146",
+    "EventName": "PM_MRK_LD_CMPL",
+    "BriefDescription": "Marked loads completed."
+  },
+  {
+    "EventCode": "3E158",
+    "EventName": "PM_MRK_STCX_FAIL",
+    "BriefDescription": "Marked conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "3E15A",
+    "EventName": "PM_MRK_ST_FIN",
+    "BriefDescription": "The marked instruction was a store of any kind."
+  },
+  {
+    "EventCode": "30068",
+    "EventName": "PM_L1_ICACHE_RELOADED_PREF",
+    "BriefDescription": "Counts all Icache prefetch reloads (includes demand turned into prefetch)."
+  },
+  {
+    "EventCode": "301E4",
+    "EventName": "PM_MRK_BR_MPRED_CMPL",
+    "BriefDescription": "Marked Branch Mispredicted. Includes direction and target."
+  },
+  {
+    "EventCode": "300F6",
+    "EventName": "PM_LD_DEMAND_MISS_L1",
+    "BriefDescription": "The L1 cache was reloaded with a line that fulfills a demand miss request. Counted at reload time, before finish."
+  },
+  {
+    "EventCode": "300FE",
+    "EventName": "PM_DATA_FROM_L3MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss."
+  },
+  {
+    "EventCode": "40012",
+    "EventName": "PM_L1_ICACHE_RELOADED_ALL",
+    "BriefDescription": "Counts all Icache reloads, including demand, prefetch, prefetch turned into demand and demand turned into prefetch."
+  },
+  {
+    "EventCode": "40134",
+    "EventName": "PM_MRK_INST_TIMEO",
+    "BriefDescription": "Marked instruction finish timeout (instruction was lost)."
+  },
+  {
+    "EventCode": "4003C",
+    "EventName": "PM_DISP_STALL_HELD_SYNC_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch."
+  },
+  {
+    "EventCode": "4505A",
+    "EventName": "PM_SP_FLOP_CMPL",
+    "BriefDescription": "Single Precision floating point instructions completed."
+  },
+  {
+    "EventCode": "4D058",
+    "EventName": "PM_VECTOR_FLOP_CMPL",
+    "BriefDescription": "Vector floating point instructions completed."
+  },
+  {
+    "EventCode": "4D05A",
+    "EventName": "PM_NON_MATH_FLOP_CMPL",
+    "BriefDescription": "Non Math instructions completed."
+  },
+  {
+    "EventCode": "401E0",
+    "EventName": "PM_MRK_INST_CMPL",
+    "BriefDescription": "Marked instruction completed."
+  },
+  {
+    "EventCode": "400FE",
+    "EventName": "PM_DATA_FROM_MEMORY",
+    "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json
new file mode 100644 (file)
index 0000000..b01141e
--- /dev/null
@@ -0,0 +1,192 @@
+[
+  {
+    "EventCode": "1000A",
+    "EventName": "PM_PMC3_REWIND",
+    "BriefDescription": "The speculative event selected for PMC3 rewinds and the counter for PMC3 is not charged."
+  },
+  {
+    "EventCode": "1C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC1",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "1C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC1",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "1C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC1",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[0:12]."
+  },
+  {
+    "EventCode": "1C056",
+    "EventName": "PM_DERAT_MISS_4K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 4K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "1C058",
+    "EventName": "PM_DTLB_MISS_16G",
+    "BriefDescription": "Data TLB reload (after a miss) page size 16G. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "1C05C",
+    "EventName": "PM_DTLB_MISS_2M",
+    "BriefDescription": "Data TLB reload (after a miss) page size 2M. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "1E056",
+    "EventName": "PM_EXEC_STALL_STORE_PIPE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the store unit. This does not include cycles spent handling store misses, PTESYNC instructions or TLBIE instructions."
+  },
+  {
+    "EventCode": "1F150",
+    "EventName": "PM_MRK_ST_L2_CYC",
+    "BriefDescription": "Cycles from L2 RC dispatch to L2 RC completion."
+  },
+  {
+    "EventCode": "10062",
+    "EventName": "PM_LD_L3MISS_PEND_CYC",
+    "BriefDescription": "Cycles L3 miss was pending for this thread."
+  },
+  {
+    "EventCode": "20010",
+    "EventName": "PM_PMC1_OVERFLOW",
+    "BriefDescription": "The event selected for PMC1 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "2001A",
+    "EventName": "PM_ITLB_HIT",
+    "BriefDescription": "The PTE required to translate the instruction address was resident in the TLB (instruction TLB access/IERAT reload). Applies to both HPT and RPT. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "2003E",
+    "EventName": "PM_PTESYNC_FIN",
+    "BriefDescription": "Ptesync instruction finished in the store unit. Only one ptesync can finish at a time."
+  },
+  {
+    "EventCode": "2C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC2",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "2C054",
+    "EventName": "PM_DERAT_MISS_64K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "2C056",
+    "EventName": "PM_DTLB_MISS_4K",
+    "BriefDescription": "Data TLB reload (after a miss) page size 4K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "2D154",
+    "EventName": "PM_MRK_DERAT_MISS_64K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "200F6",
+    "EventName": "PM_DERAT_MISS",
+    "BriefDescription": "DERAT Reloaded to satisfy a DERAT miss. All page sizes are counted by this event. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "3000A",
+    "EventName": "PM_DISP_STALL_ITLB_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss."
+  },
+  {
+    "EventCode": "30016",
+    "EventName": "PM_EXEC_STALL_DERAT_DTLB_MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss and waited for it to resolve."
+  },
+  {
+    "EventCode": "3C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC3",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "3C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC3",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "3C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC3",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[30:42]."
+  },
+  {
+    "EventCode": "3C054",
+    "EventName": "PM_DERAT_MISS_16M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16M. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "3C056",
+    "EventName": "PM_DTLB_MISS_64K",
+    "BriefDescription": "Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "3C058",
+    "EventName": "PM_LARX_FIN",
+    "BriefDescription": "Load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "301E2",
+    "EventName": "PM_MRK_ST_CMPL",
+    "BriefDescription": "Marked store completed and sent to nest. Note that this count excludes cache-inhibited stores."
+  },
+  {
+    "EventCode": "300FC",
+    "EventName": "PM_DTLB_MISS",
+    "BriefDescription": "The DPTEG required for the load/store instruction in execution was missing from the TLB. It includes pages of all sizes for demand and prefetch activity."
+  },
+  {
+    "EventCode": "4D02C",
+    "EventName": "PM_PMC1_REWIND",
+    "BriefDescription": "The speculative event selected for PMC1 rewinds and the counter for PMC1 is not charged."
+  },
+  {
+    "EventCode": "4003E",
+    "EventName": "PM_LD_CMPL",
+    "BriefDescription": "Loads completed."
+  },
+  {
+    "EventCode": "4C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC4",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "4C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC4",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "4C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC4",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[45:57]."
+  },
+  {
+    "EventCode": "4C056",
+    "EventName": "PM_DTLB_MISS_16M",
+    "BriefDescription": "Data TLB reload (after a miss) page size 16M. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "4C05A",
+    "EventName": "PM_DTLB_MISS_1G",
+    "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "4C15E",
+    "EventName": "PM_MRK_DTLB_MISS_64K",
+    "BriefDescription": "Marked Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "4D056",
+    "EventName": "PM_NON_FMA_FLOP_CMPL",
+    "BriefDescription": "Non FMA instruction completed."
+  },
+  {
+    "EventCode": "40164",
+    "EventName": "PM_MRK_DERAT_MISS_2M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json
new file mode 100644 (file)
index 0000000..a119e56
--- /dev/null
@@ -0,0 +1,297 @@
+[
+  {
+    "EventCode": "10016",
+    "EventName": "PM_VSU0_ISSUE",
+    "BriefDescription": "VSU instructions issued to VSU pipe 0."
+  },
+  {
+    "EventCode": "1001C",
+    "EventName": "PM_ULTRAVISOR_INST_CMPL",
+    "BriefDescription": "PowerPC instructions that completed while the thread was in ultravisor state."
+  },
+  {
+    "EventCode": "100F0",
+    "EventName": "PM_CYC",
+    "BriefDescription": "Processor cycles."
+  },
+  {
+    "EventCode": "10134",
+    "EventName": "PM_MRK_ST_DONE_L2",
+    "BriefDescription": "Marked stores completed in L2 (RC machine done)."
+  },
+  {
+    "EventCode": "1505E",
+    "EventName": "PM_LD_HIT_L1",
+    "BriefDescription": "Loads that finished without experiencing an L1 miss."
+  },
+  {
+    "EventCode": "1D05E",
+    "EventName": "PM_DISP_STALL_HELD_HALT_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management."
+  },
+  {
+    "EventCode": "1E054",
+    "EventName": "PM_EXEC_STALL_DMISS_L21_L31",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip."
+  },
+  {
+    "EventCode": "1E05A",
+    "EventName": "PM_CMPL_STALL_LWSYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete."
+  },
+  {
+    "EventCode": "1F056",
+    "EventName": "PM_DISP_SS0_2_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions."
+  },
+  {
+    "EventCode": "1F15C",
+    "EventName": "PM_MRK_STCX_L2_CYC",
+    "BriefDescription": "Cycles spent in the nest portion of a marked Stcx instruction. It starts counting when the operation starts to drain to the L2 and it stops counting when the instruction retires from the Instruction Completion Table (ICT) in the Instruction Sequencing Unit (ISU)."
+  },
+  {
+    "EventCode": "10066",
+    "EventName": "PM_ADJUNCT_CYC",
+    "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011."
+  },
+  {
+    "EventCode": "101E4",
+    "EventName": "PM_MRK_L1_ICACHE_MISS",
+    "BriefDescription": "Marked Instruction suffered an icache Miss."
+  },
+  {
+    "EventCode": "101EA",
+    "EventName": "PM_MRK_L1_RELOAD_VALID",
+    "BriefDescription": "Marked demand reload."
+  },
+  {
+    "EventCode": "100F4",
+    "EventName": "PM_FLOP_CMPL",
+    "BriefDescription": "Floating Point Operations Completed. Includes any type. It counts once for each 1, 2, 4 or 8 flop instruction. Use PM_1|2|4|8_FLOP_CMPL events to count flops."
+  },
+  {
+    "EventCode": "100FA",
+    "EventName": "PM_RUN_LATCH_ANY_THREAD_CYC",
+    "BriefDescription": "Cycles when at least one thread has the run latch set."
+  },
+  {
+    "EventCode": "100FC",
+    "EventName": "PM_LD_REF_L1",
+    "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included."
+  },
+  {
+    "EventCode": "20006",
+    "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue."
+  },
+  {
+    "EventCode": "2000C",
+    "EventName": "PM_RUN_LATCH_ALL_THREADS_CYC",
+    "BriefDescription": "Cycles when the run latch is set for all threads."
+  },
+  {
+    "EventCode": "2E010",
+    "EventName": "PM_ADJUNCT_INST_CMPL",
+    "BriefDescription": "PowerPC instructions that completed while the thread is in Adjunct state."
+  },
+  {
+    "EventCode": "2E014",
+    "EventName": "PM_STCX_FIN",
+    "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "20130",
+    "EventName": "PM_MRK_INST_DECODED",
+    "BriefDescription": "An instruction was marked at decode time. Random Instruction Sampling (RIS) only."
+  },
+  {
+    "EventCode": "20132",
+    "EventName": "PM_MRK_DFU_ISSUE",
+    "BriefDescription": "The marked instruction was a decimal floating point operation issued to the VSU. Measured at issue time."
+  },
+  {
+    "EventCode": "20134",
+    "EventName": "PM_MRK_FXU_ISSUE",
+    "BriefDescription": "The marked instruction was a fixed point operation issued to the VSU. Measured at issue time."
+  },
+  {
+    "EventCode": "2505C",
+    "EventName": "PM_VSU_ISSUE",
+    "BriefDescription": "At least one VSU instruction was issued to one of the VSU pipes. Up to 4 per cycle. Includes fixed point operations."
+  },
+  {
+    "EventCode": "2F054",
+    "EventName": "PM_DISP_SS1_2_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions."
+  },
+  {
+    "EventCode": "2F056",
+    "EventName": "PM_DISP_SS1_4_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions."
+  },
+  {
+    "EventCode": "2006C",
+    "EventName": "PM_RUN_CYC_SMT4_MODE",
+    "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT4 mode."
+  },
+  {
+    "EventCode": "201E0",
+    "EventName": "PM_MRK_DATA_FROM_MEMORY",
+    "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss for a marked load."
+  },
+  {
+    "EventCode": "201E4",
+    "EventName": "PM_MRK_DATA_FROM_L3MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked load."
+  },
+  {
+    "EventCode": "201E8",
+    "EventName": "PM_THRESH_EXC_512",
+    "BriefDescription": "Threshold counter exceeded a value of 512."
+  },
+  {
+    "EventCode": "200F2",
+    "EventName": "PM_INST_DISP",
+    "BriefDescription": "PowerPC instructions dispatched."
+  },
+  {
+    "EventCode": "30132",
+    "EventName": "PM_MRK_VSU_FIN",
+    "BriefDescription": "VSU marked instructions finished. Excludes simple FX instructions issued to the Store Unit."
+  },
+  {
+    "EventCode": "30038",
+    "EventName": "PM_EXEC_STALL_DMISS_LMEM",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local memory, local OpenCapp cache, or local OpenCapp memory."
+  },
+  {
+    "EventCode": "3F04A",
+    "EventName": "PM_LSU_ST5_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST2 port."
+  },
+  {
+    "EventCode": "34054",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict."
+  },
+  {
+    "EventCode": "3405A",
+    "EventName": "PM_PRIVILEGED_INST_CMPL",
+    "BriefDescription": "PowerPC Instructions that completed while the thread is in Privileged state."
+  },
+  {
+    "EventCode": "3F150",
+    "EventName": "PM_MRK_ST_DRAIN_CYC",
+    "BriefDescription": "Cycles to drain a store from the core to L2."
+  },
+  {
+    "EventCode": "3F054",
+    "EventName": "PM_DISP_SS0_4_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions."
+  },
+  {
+    "EventCode": "3F056",
+    "EventName": "PM_DISP_SS0_8_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions."
+  },
+  {
+    "EventCode": "30162",
+    "EventName": "PM_MRK_ISSUE_DEPENDENT_LOAD",
+    "BriefDescription": "The marked instruction was dependent on a load. It is eligible for issue kill."
+  },
+  {
+    "EventCode": "40114",
+    "EventName": "PM_MRK_START_PROBE_NOP_DISP",
+    "BriefDescription": "Marked Start probe nop dispatched. Instruction AND R0,R0,R0."
+  },
+  {
+    "EventCode": "4001C",
+    "EventName": "PM_VSU_FIN",
+    "BriefDescription": "VSU instructions finished."
+  },
+  {
+    "EventCode": "4C01A",
+    "EventName": "PM_EXEC_STALL_DMISS_OFF_NODE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a distant chip."
+  },
+  {
+    "EventCode": "4D012",
+    "EventName": "PM_PMC3_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC3 are met and PMC3 is charged."
+  },
+  {
+    "EventCode": "4D022",
+    "EventName": "PM_HYPERVISOR_INST_CMPL",
+    "BriefDescription": "PowerPC instructions that completed while the thread is in hypervisor state."
+  },
+  {
+    "EventCode": "4D026",
+    "EventName": "PM_ULTRAVISOR_CYC",
+    "BriefDescription": "Cycles when the thread is in Ultravisor state. MSR[S HV PR]=110."
+  },
+  {
+    "EventCode": "4D028",
+    "EventName": "PM_PRIVILEGED_CYC",
+    "BriefDescription": "Cycles when the thread is in Privileged state. MSR[S HV PR]=x00."
+  },
+  {
+    "EventCode": "40030",
+    "EventName": "PM_INST_FIN",
+    "BriefDescription": "Instructions finished."
+  },
+  {
+    "EventCode": "44146",
+    "EventName": "PM_MRK_STCX_CORE_CYC",
+    "BriefDescription": "Cycles spent in the core portion of a marked Stcx instruction. It starts counting when the instruction is decoded and stops counting when it drains into the L2."
+  },
+  {
+    "EventCode": "44054",
+    "EventName": "PM_VECTOR_LD_CMPL",
+    "BriefDescription": "Vector load instructions completed."
+  },
+  {
+    "EventCode": "45054",
+    "EventName": "PM_FMA_CMPL",
+    "BriefDescription": "Two floating point instructions completed (FMA class of instructions: fmadd, fnmadd, fmsub, fnmsub). Scalar instructions only."
+  },
+  {
+    "EventCode": "45056",
+    "EventName": "PM_SCALAR_FLOP_CMPL",
+    "BriefDescription": "Scalar floating point instructions completed."
+  },
+  {
+    "EventCode": "4505C",
+    "EventName": "PM_MATH_FLOP_CMPL",
+    "BriefDescription": "Math floating point instructions completed."
+  },
+  {
+    "EventCode": "4D05E",
+    "EventName": "PM_BR_CMPL",
+    "BriefDescription": "A branch completed. All branches are included."
+  },
+  {
+    "EventCode": "4E15E",
+    "EventName": "PM_MRK_INST_FLUSHED",
+    "BriefDescription": "The marked instruction was flushed."
+  },
+  {
+    "EventCode": "401E6",
+    "EventName": "PM_MRK_INST_FROM_L3MISS",
+    "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked instruction."
+  },
+  {
+    "EventCode": "401E8",
+    "EventName": "PM_MRK_DATA_FROM_L2MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss for a marked load."
+  },
+  {
+    "EventCode": "400F0",
+    "EventName": "PM_LD_DEMAND_MISS_L1_FIN",
+    "BriefDescription": "Load Missed L1, counted at finish time."
+  },
+  {
+    "EventCode": "400FA",
+    "EventName": "PM_RUN_INST_CMPL",
+    "BriefDescription": "Completed PowerPC instructions gated by the run latch."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json
new file mode 100644 (file)
index 0000000..b61b5cc
--- /dev/null
@@ -0,0 +1,297 @@
+[
+  {
+    "EventCode": "100FE",
+    "EventName": "PM_INST_CMPL",
+    "BriefDescription": "PowerPC instructions completed."
+  },
+  {
+    "EventCode": "10006",
+    "EventName": "PM_DISP_STALL_HELD_OTHER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason."
+  },
+  {
+    "EventCode": "1000C",
+    "EventName": "PM_LSU_LD0_FIN",
+    "BriefDescription": "LSU Finished an internal operation in LD0 port."
+  },
+  {
+    "EventCode": "1000E",
+    "EventName": "PM_MMA_ISSUED",
+    "BriefDescription": "MMA instructions issued."
+  },
+  {
+    "EventCode": "10012",
+    "EventName": "PM_LSU_ST0_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST0 port."
+  },
+  {
+    "EventCode": "10014",
+    "EventName": "PM_LSU_ST4_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST4 port."
+  },
+  {
+    "EventCode": "10018",
+    "EventName": "PM_IC_DEMAND_CYC",
+    "BriefDescription": "Cycles in which an instruction reload is pending to satisfy a demand miss."
+  },
+  {
+    "EventCode": "10022",
+    "EventName": "PM_PMC2_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC2 are met and PMC2 is charged."
+  },
+  {
+    "EventCode": "10024",
+    "EventName": "PM_PMC5_OVERFLOW",
+    "BriefDescription": "The event selected for PMC5 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "10058",
+    "EventName": "PM_EXEC_STALL_FIN_AT_DISP",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline finished at dispatch and did not require execution in the LSU, BRU or VSU."
+  },
+  {
+    "EventCode": "1005A",
+    "EventName": "PM_FLUSH_MPRED",
+    "BriefDescription": "A flush occurred due to a mispredicted branch. Includes target and direction."
+  },
+  {
+    "EventCode": "1C05A",
+    "EventName": "PM_DERAT_MISS_2M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "10064",
+    "EventName": "PM_DISP_STALL_IC_L2",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2."
+  },
+  {
+    "EventCode": "10068",
+    "EventName": "PM_BR_FIN",
+    "BriefDescription": "A branch instruction finished. Includes predicted/mispredicted/unconditional."
+  },
+  {
+    "EventCode": "1006A",
+    "EventName": "PM_FX_LSU_FIN",
+    "BriefDescription": "Simple fixed point instruction issued to the store unit. Measured at finish time."
+  },
+  {
+    "EventCode": "1006C",
+    "EventName": "PM_RUN_CYC_ST_MODE",
+    "BriefDescription": "Cycles when the run latch is set and the core is in ST mode."
+  },
+  {
+    "EventCode": "20004",
+    "EventName": "PM_ISSUE_STALL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was dispatched but not issued yet."
+  },
+  {
+    "EventCode": "2000A",
+    "EventName": "PM_HYPERVISOR_CYC",
+    "BriefDescription": "Cycles when the thread is in Hypervisor state. MSR[S HV PR]=010."
+  },
+  {
+    "EventCode": "2000E",
+    "EventName": "PM_LSU_LD1_FIN",
+    "BriefDescription": "LSU Finished an internal operation in LD1 port."
+  },
+  {
+    "EventCode": "2C014",
+    "EventName": "PM_CMPL_STALL_SPECIAL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline required special handling before completing."
+  },
+  {
+    "EventCode": "2C018",
+    "EventName": "PM_EXEC_STALL_DMISS_L3MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a source beyond the local L2 or local L3."
+  },
+  {
+    "EventCode": "2D010",
+    "EventName": "PM_LSU_ST1_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST1 port."
+  },
+  {
+    "EventCode": "2D012",
+    "EventName": "PM_VSU1_ISSUE",
+    "BriefDescription": "VSU instructions issued to VSU pipe 1."
+  },
+  {
+    "EventCode": "2D018",
+    "EventName": "PM_EXEC_STALL_VSU",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the VSU (includes FXU, VSU, CRU)."
+  },
+  {
+    "EventCode": "2E01E",
+    "EventName": "PM_EXEC_STALL_NTC_FLUSH",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children."
+  },
+  {
+    "EventCode": "2013C",
+    "EventName": "PM_MRK_FX_LSU_FIN",
+    "BriefDescription": "The marked instruction was simple fixed point that was issued to the store unit. Measured at finish time."
+  },
+  {
+    "EventCode": "2405A",
+    "EventName": "PM_NTC_FIN",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status."
+  },
+  {
+    "EventCode": "201E2",
+    "EventName": "PM_MRK_LD_MISS_L1",
+    "BriefDescription": "Marked DL1 Demand Miss counted at finish time."
+  },
+  {
+    "EventCode": "200F4",
+    "EventName": "PM_RUN_CYC",
+    "BriefDescription": "Processor cycles gated by the run latch."
+  },
+  {
+    "EventCode": "30004",
+    "EventName": "PM_DISP_STALL_FLUSH",
+    "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC."
+  },
+  {
+    "EventCode": "30008",
+    "EventName": "PM_EXEC_STALL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting to finish in one of the execution units (BRU, LSU, VSU). Only cycles between issue and finish are counted in this category."
+  },
+  {
+    "EventCode": "3001A",
+    "EventName": "PM_LSU_ST2_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST2 port."
+  },
+  {
+    "EventCode": "30020",
+    "EventName": "PM_PMC2_REWIND",
+    "BriefDescription": "The speculative event selected for PMC2 rewinds and the counter for PMC2 is not charged."
+  },
+  {
+    "EventCode": "30022",
+    "EventName": "PM_PMC4_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC4 are met and PMC4 is charged."
+  },
+  {
+    "EventCode": "30024",
+    "EventName": "PM_PMC6_OVERFLOW",
+    "BriefDescription": "The event selected for PMC6 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "30028",
+    "EventName": "PM_CMPL_STALL_MEM_ECC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC."
+  },
+  {
+    "EventCode": "30036",
+    "EventName": "PM_EXEC_STALL_SIMPLE_FX",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a simple fixed point instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "3003A",
+    "EventName": "PM_CMPL_STALL_EXCEPTION",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete."
+  },
+  {
+    "EventCode": "3F044",
+    "EventName": "PM_VSU2_ISSUE",
+    "BriefDescription": "VSU instructions issued to VSU pipe 2."
+  },
+  {
+    "EventCode": "30058",
+    "EventName": "PM_TLBIE_FIN",
+    "BriefDescription": "TLBIE instructions finished in the LSU. Two TLBIEs can finish each cycle. All will be counted."
+  },
+  {
+    "EventCode": "3D058",
+    "EventName": "PM_SCALAR_FSQRT_FDIV_ISSUE",
+    "BriefDescription": "Scalar versions of four floating point operations: fdiv, fsqrt (xvdivdp, xvdivsp, xvsqrtdp, xvsqrtsp)."
+  },
+  {
+    "EventCode": "30066",
+    "EventName": "PM_LSU_FIN",
+    "BriefDescription": "LSU Finished an internal operation (up to 4 per cycle)."
+  },
+  {
+    "EventCode": "40004",
+    "EventName": "PM_FXU_ISSUE",
+    "BriefDescription": "A fixed point instruction was issued to the VSU."
+  },
+  {
+    "EventCode": "40008",
+    "EventName": "PM_NTC_ALL_FIN",
+    "BriefDescription": "Cycles in which both instructions in the ICT entry pair show as finished. These are the cycles between finish and completion for the oldest pair of instructions in the pipeline."
+  },
+  {
+    "EventCode": "40010",
+    "EventName": "PM_PMC3_OVERFLOW",
+    "BriefDescription": "The event selected for PMC3 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "4C012",
+    "EventName": "PM_EXEC_STALL_DERAT_ONLY_MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered an ERAT miss and waited for it to resolve."
+  },
+  {
+    "EventCode": "4C018",
+    "EventName": "PM_CMPL_STALL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline cannot complete because the thread was blocked for any reason."
+  },
+  {
+    "EventCode": "4C01E",
+    "EventName": "PM_LSU_ST3_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST3 port."
+  },
+  {
+    "EventCode": "4D018",
+    "EventName": "PM_EXEC_STALL_BRU",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Branch unit."
+  },
+  {
+    "EventCode": "4D01A",
+    "EventName": "PM_CMPL_STALL_HWSYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a hwsync waiting for response from L2 before completing."
+  },
+  {
+    "EventCode": "4D01C",
+    "EventName": "PM_EXEC_STALL_TLBIEL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIEL instruction executing in the Load Store Unit. TLBIEL instructions have lower overhead than TLBIE instructions because they don't get sent to the nest."
+  },
+  {
+    "EventCode": "4E012",
+    "EventName": "PM_EXEC_STALL_UNKNOWN",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline completed without an ntf_type pulse. The ntf_pulse was missed by the ISU because the NTF finishes and completions came too close together."
+  },
+  {
+    "EventCode": "4D020",
+    "EventName": "PM_VSU3_ISSUE",
+    "BriefDescription": "VSU instruction was issued to VSU pipe 3."
+  },
+  {
+    "EventCode": "40132",
+    "EventName": "PM_MRK_LSU_FIN",
+    "BriefDescription": "LSU marked instruction finish."
+  },
+  {
+    "EventCode": "45058",
+    "EventName": "PM_IC_MISS_CMPL",
+    "BriefDescription": "Non-speculative icache miss, counted at completion."
+  },
+  {
+    "EventCode": "4D050",
+    "EventName": "PM_VSU_NON_FLOP_CMPL",
+    "BriefDescription": "Non-floating point VSU instructions completed."
+  },
+  {
+    "EventCode": "4D052",
+    "EventName": "PM_2FLOP_CMPL",
+    "BriefDescription": "Double Precision vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg completed."
+  },
+  {
+    "EventCode": "400F2",
+    "EventName": "PM_1PLUS_PPC_DISP",
+    "BriefDescription": "Cycles in which at least one instruction was dispatched."
+  },
+  {
+    "EventCode": "400F8",
+    "EventName": "PM_FLUSH",
+    "BriefDescription": "Flush (any type)."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json
new file mode 100644 (file)
index 0000000..ea122a9
--- /dev/null
@@ -0,0 +1,22 @@
+[
+  {
+    "EventCode": "301E8",
+    "EventName": "PM_THRESH_EXC_64",
+    "BriefDescription": "Threshold counter exceeded a value of 64."
+  },
+  {
+    "EventCode": "45050",
+    "EventName": "PM_1FLOP_CMPL",
+    "BriefDescription": "One floating point instruction completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)."
+  },
+  {
+    "EventCode": "45052",
+    "EventName": "PM_4FLOP_CMPL",
+    "BriefDescription": "Four floating point instructions completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)."
+  },
+  {
+    "EventCode": "4D054",
+    "EventName": "PM_8FLOP_CMPL",
+    "BriefDescription": "Four Double Precision vector instructions completed."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/translation.json b/tools/perf/pmu-events/arch/powerpc/power10/translation.json
new file mode 100644 (file)
index 0000000..5a714e3
--- /dev/null
@@ -0,0 +1,57 @@
+[
+  {
+    "EventCode": "1F15E",
+    "EventName": "PM_MRK_START_PROBE_NOP_CMPL",
+    "BriefDescription": "Marked Start probe nop (AND R0,R0,R0) completed."
+  },
+  {
+    "EventCode": "20016",
+    "EventName": "PM_ST_FIN",
+    "BriefDescription": "Store finish count. Includes speculative activity."
+  },
+  {
+    "EventCode": "20018",
+    "EventName": "PM_ST_FWD",
+    "BriefDescription": "Store forwards that finished."
+  },
+  {
+    "EventCode": "2011C",
+    "EventName": "PM_MRK_NTF_CYC",
+    "BriefDescription": "Cycles during which the marked instruction is the oldest in the pipeline (NTF or NTC)."
+  },
+  {
+    "EventCode": "2E01C",
+    "EventName": "PM_EXEC_STALL_TLBIE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIE instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "201E6",
+    "EventName": "PM_THRESH_EXC_32",
+    "BriefDescription": "Threshold counter exceeded a value of 32."
+  },
+  {
+    "EventCode": "200F0",
+    "EventName": "PM_ST_CMPL",
+    "BriefDescription": "Stores completed from S2Q (2nd-level store queue). This event includes regular stores, stcx and cache inhibited stores. The following operations are excluded (pteupdate, snoop tlbie complete, store atomics, miso, load atomic payloads, tlbie, tlbsync, slbieg, isync, msgsnd, slbiag, cpabort, copy, tcheck, tend, stsync, dcbst, icbi, dcbf, hwsync, lwsync, ptesync, eieio, msgsync)."
+  },
+  {
+    "EventCode": "200FE",
+    "EventName": "PM_DATA_FROM_L2MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss."
+  },
+  {
+    "EventCode": "30010",
+    "EventName": "PM_PMC2_OVERFLOW",
+    "BriefDescription": "The event selected for PMC2 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "4D010",
+    "EventName": "PM_PMC1_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC1 are met and PMC1 is charged."
+  },
+  {
+    "EventCode": "4D05C",
+    "EventName": "PM_DPP_FLOP_CMPL",
+    "BriefDescription": "Double-Precision or Quad-Precision instructions completed."
+  }
+]
index fc4aa6c..4e25525 100644 (file)
         "MetricName": "flush_rate_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_11_14_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_11to14_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_15_17_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_15to17_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_18_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_18plus_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_1_2_ENTRIES /  ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_1to2_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_3_6_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_3to6_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_7_10_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_7to10_slots_percent"
index f8784c6..db86ba3 100644 (file)
         "MetricGroup": "instruction_stats_percent_per_ref",
         "MetricName": "inst_from_rmem_percent"
     },
-    {
-        "BriefDescription": "%L2 Modified CO Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_CASTOUT_MOD/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_co_m_rd_util"
-    },
-    {
-        "BriefDescription": "L2 dcache invalidates per run inst (per core)",
-        "MetricExpr": "(PM_L2_DC_INV / 2) / PM_RUN_INST_CMPL * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_dc_inv_rate_percent"
-    },
     {
         "BriefDescription": "Demand load misses as a % of L2 LD dispatches (per thread)",
         "MetricExpr": "PM_L1_DCACHE_RELOAD_VALID / (PM_L2_LD / 2) * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_dem_ld_disp_percent"
     },
-    {
-        "BriefDescription": "L2 Icache invalidates per run inst (per core)",
-        "MetricExpr": "(PM_L2_IC_INV / 2) / PM_RUN_INST_CMPL * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ic_inv_rate_percent"
-    },
-    {
-        "BriefDescription": "L2 Inst misses as a % of total L2 Inst dispatches (per thread)",
-        "MetricExpr": "PM_L2_INST_MISS / PM_L2_INST * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_inst_miss_ratio_percent"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L2 Load hits",
-        "MetricExpr": "(PM_L2_LD_HIT / PM_RUN_CYC) / 2",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_hit_frequency"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L2 Load misses",
-        "MetricExpr": "(PM_L2_LD_MISS / PM_RUN_CYC) / 2",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_miss_frequency"
-    },
-    {
-        "BriefDescription": "L2 Load misses as a % of total L2 Load dispatches (per thread)",
-        "MetricExpr": "PM_L2_LD_MISS / PM_L2_LD * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_miss_ratio_percent"
-    },
-    {
-        "BriefDescription": "% L2 load disp attempts Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_rd_util"
-    },
-    {
-        "BriefDescription": "L2 load misses that require a cache write (4 pclks per disp attempt) % of pclks",
-        "MetricExpr": "((( PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ldmiss_wr_util"
-    },
-    {
-        "BriefDescription": "L2 local pump prediction success",
-        "MetricExpr": "PM_L2_LOC_GUESS_CORRECT / (PM_L2_LOC_GUESS_CORRECT + PM_L2_LOC_GUESS_WRONG) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_local_pred_correct_percent"
-    },
-    {
-        "BriefDescription": "L2 COs that were in M,Me,Mu state as a % of all L2 COs",
-        "MetricExpr": "PM_L2_CASTOUT_MOD / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_mod_co_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Load RC dispatch atampts that failed because of address collisions and cclass conflicts",
-        "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR )/ PM_L2_RCLD_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_ld_disp_addr_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Load RC dispatch attempts that failed",
-        "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR + PM_L2_RCLD_DISP_FAIL_OTHER)/ PM_L2_RCLD_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_ld_disp_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Store RC dispatch atampts that failed because of address collisions and cclass conflicts",
-        "MetricExpr": "PM_L2_RCST_DISP_FAIL_ADDR / PM_L2_RCST_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_st_disp_addr_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Store RC dispatch attempts that failed",
-        "MetricExpr": "(PM_L2_RCST_DISP_FAIL_ADDR + PM_L2_RCST_DISP_FAIL_OTHER)/ PM_L2_RCST_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_st_disp_fail_percent"
-    },
-    {
-        "BriefDescription": "L2 Cache Read Utilization (per core)",
-        "MetricExpr": "(((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100) + (((PM_L2_RCST_DISP/2)*4)/PM_RUN_CYC * 100) + (((PM_L2_CASTOUT_MOD/2)*4)/PM_RUN_CYC * 100)",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rd_util_percent"
-    },
-    {
-        "BriefDescription": "L2 COs that were in T,Te,Si,S state as a % of all L2 COs",
-        "MetricExpr": "PM_L2_CASTOUT_SHR / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_shr_co_percent"
-    },
     {
         "BriefDescription": "L2 Store misses as a % of total L2 Store dispatches (per thread)",
         "MetricExpr": "PM_L2_ST_MISS / PM_L2_ST * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_st_miss_ratio_percent"
     },
-    {
-        "BriefDescription": "% L2 store disp attempts Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_RCST_DISP/2)*4) / PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_st_rd_util"
-    },
     {
         "BriefDescription": "L2 stores that require a cache write (4 pclks per disp attempt) % of pclks",
         "MetricExpr": "((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_st_wr_util"
     },
-    {
-        "BriefDescription": "L2 Cache Write Utilization (per core)",
-        "MetricExpr": "((((PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4) / PM_RUN_CYC * 100) + (((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100)",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_wr_util_percent"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L3 Load hits",
-        "MetricExpr": "(PM_L3_LD_HIT / PM_RUN_CYC) / 2",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_ld_hit_frequency"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L3 Load misses",
-        "MetricExpr": "(PM_L3_LD_MISS / PM_RUN_CYC) / 2",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_ld_miss_frequency"
-    },
-    {
-        "BriefDescription": "Average number of Write-in machines used. 1 of 8 WI machines is sampled every L3 cycle",
-        "MetricExpr": "(PM_L3_WI_USAGE / PM_RUN_CYC) * 8",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_wi_usage"
-    },
     {
         "BriefDescription": "Average icache miss latency",
         "MetricExpr": "PM_IC_DEMAND_CYC / PM_IC_DEMAND_REQ",
         "MetricName": "custom_secs"
     },
     {
-        "BriefDescription": "Percentage Cycles atleast one instruction dispatched",
+        "BriefDescription": "Percentage Cycles at least one instruction dispatched",
         "MetricExpr": "PM_1PLUS_PPC_DISP / PM_CYC * 100",
         "MetricName": "cycles_atleast_one_inst_dispatched_percent"
     },
index 4ea7ec4..0d46cb8 100644 (file)
     "EventName": "ic_fetch_stall.ic_stall_any",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_dq_empty",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_back_pressure",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_cache_inval.l2_invalidating_probe",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_cache_inval.fill_invalidated",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_tlb_rel",
     "EventName": "l2_request_g1.change_to_x",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g1.prefetch_l2_cmd",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g1.l2_hw_pf",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g1.group2",
     "EventCode": "0x60",
     "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_request_g1.all_no_prefetch",
     "EventName": "l2_request_g2.ic_rd_sized_nc",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g2.smc_inval",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g2.bus_locks_originator",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g2.bus_locks_responses",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_latency.l2_cycles_waiting_on_fills",
     "EventCode": "0x62",
     "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_wcb_req.wcb_write",
     "EventName": "l2_wcb_req.zero_byte_store",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_wcb_req.cl_zero",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
     "EventName": "l2_cache_req_stat.ls_rd_blk_c",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_x",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_s",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_miss",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ic_access_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
-    "UMask": "0x9"
+    "UMask": "0x09"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
     "EventName": "l2_fill_pending.l2_fill_busy",
     "EventCode": "0x6d",
     "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_pf_hit_l2",
     "EventCode": "0x70",
-    "BriefDescription": "L2 prefetch hit in L2.",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
     "UMask": "0xff"
   },
   {
index 653b11b..4dceeab 100644 (file)
     "EventCode": "0xcb",
     "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
     "EventCode": "0xcb",
     "BriefDescription": "MMX instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.x87_instr",
     "EventCode": "0xcb",
     "BriefDescription": "x87 instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_cond",
     "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_fus_brnch_inst",
index a35542b..3995b52 100644 (file)
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to all fpu pipes.",
     "PublicDescription": "The number of operations (uOps) and dual-pipe uOps dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to all pipes.",
-    "UMask": "0xf"
+    "UMask": "0x0f"
   },
   {
     "EventName": "fpu_pipe_assignment.total3",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 3.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fpu_pipe_assignment.total2",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 2.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fpu_pipe_assignment.total1",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 1.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fpu_pipe_assignment.total0",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps  on pipe 0.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_sched_empty",
     "EventCode": "0x02",
     "BriefDescription": "All Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "fp_retx87_fp_ops.div_sqr_r_ops",
     "EventCode": "0x02",
     "BriefDescription": "Divide and square root Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Divide and square root Ops.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retx87_fp_ops.mul_ops",
     "EventCode": "0x02",
     "BriefDescription": "Multiply Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Multiply Ops.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retx87_fp_ops.add_sub_ops",
     "EventCode": "0x02",
     "BriefDescription": "Add/subtract Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Add/subtract Ops.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.all",
     "EventCode": "0x03",
     "BriefDescription": "Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_div_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision divide/square root FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision divide/square root FLOPS.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_mult_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision multiply FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision multiply FLOPS.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_add_sub_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision add/subtract FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision add/subtract FLOPS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.optimized",
     "EventCode": "0x04",
     "BriefDescription": "Number of Scalar Ops optimized.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Scalar Ops optimized.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.opt_potential",
     "EventCode": "0x04",
     "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass).",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Ops that are candidates for optimization (have Z-bit either set or pass).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops eliminated.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops eliminated.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.",
     "PublicDescription": "The number of serializing Ops retired. x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 bottom-executing uOps retired.",
     "PublicDescription": "The number of serializing Ops retired. x87 bottom-executing uOps retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
     "PublicDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE bottom-executing uOps retired.",
     "PublicDescription": "The number of serializing Ops retired. SSE bottom-executing uOps retired.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
index b33a3c3..385022f 100644 (file)
@@ -3,25 +3,25 @@
     "EventName": "ls_locks.bus_lock",
     "EventCode": "0x25",
     "BriefDescription": "Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_dispatch.ld_st_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed. Load-op-Stores.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_dispatch.store_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of stores dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_dispatch.ld_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of loads dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_stlf",
     "EventName": "ls_mab_alloc.dc_prefetcher",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB allocates by type - DC prefetcher.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.stores",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB allocates by type - stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_mab_alloc.loads",
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 1G size.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 2M size.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_32k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 32K size.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 4K size.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_tablewalker.iside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on I-side.",
-    "UMask": "0xc"
+    "UMask": "0x0c"
   },
   {
     "EventName": "ls_tablewalker.ic_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 1.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_tablewalker.ic_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 0.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_tablewalker.dside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on D-side.",
-    "UMask": "0x3"
+    "UMask": "0x03"
   },
   {
     "EventName": "ls_tablewalker.dc_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_tablewalker.dc_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_misal_accesses",
     "EventName": "ls_pref_instr_disp.prefetch_nta",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions (PREFETCHNTA instruction) Dispatched.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_pref_instr_disp.store_prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions (3DNow PREFETCHW instruction) Dispatched.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_pref_instr_disp.load_prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched. Prefetch, Prefetch_T0_T1_T2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_inef_sw_pref.mab_mch_cnt",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_not_halted_cyc",
index ff78009..7626986 100644 (file)
@@ -3,13 +3,13 @@
     "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.retire_token_stall",
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3_0 Tokens unavailable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3 Tokens unavailable.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
index 2cfe2d2..bf5083c 100644 (file)
@@ -10,7 +10,7 @@
     "EventName": "all_dc_accesses",
     "EventCode": "0x29",
     "BriefDescription": "All L1 Data Cache Accesses",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "MetricName": "all_l2_cache_accesses",
     "UMask": "0x70"
   },
   {
-    "MetricName": "l2_cache_hits_from_l2_hwpf",
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
     "BriefDescription": "L2 Cache Hits from L2 HWPF",
-    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
-    "MetricGroup": "l2_cache"
+    "UMask": "0xff"
   },
   {
     "EventName": "l3_accesses",
index ef4166a..84fb43f 100644 (file)
     "EventName": "bp_l1_tlb_fetch_hit",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.",
-    "UMask": "0xFF"
+    "UMask": "0xff"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if1g",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 1GB page.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if2m",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 2MB page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if4k",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 4KB page.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_tlb_rel",
index f61b982..c858fb9 100644 (file)
     "EventName": "l2_request_g1.change_to_x",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g1.prefetch_l2_cmd",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g1.l2_hw_pf",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g1.group2",
     "EventCode": "0x60",
     "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_request_g1.all_no_prefetch",
     "EventName": "l2_request_g2.ic_rd_sized_nc",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g2.smc_inval",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g2.bus_locks_originator",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g2.bus_locks_responses",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_latency.l2_cycles_waiting_on_fills",
     "EventCode": "0x62",
     "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_wcb_req.wcb_write",
     "EventName": "l2_wcb_req.zero_byte_store",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_wcb_req.cl_zero",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
     "EventName": "l2_cache_req_stat.ls_rd_blk_c",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_x",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_s",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_miss",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ic_access_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
-    "UMask": "0x9"
+    "UMask": "0x09"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
     "EventName": "l2_fill_pending.l2_fill_busy",
     "EventCode": "0x6d",
     "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_pf_hit_l2",
     "EventCode": "0x70",
-    "BriefDescription": "L2 prefetch hit in L2.",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
     "UMask": "0xff"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 1GB page.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 2MB page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 4KB page.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_snp_re_sync",
     "EventName": "ic_fetch_stall.ic_stall_any",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_dq_empty",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_back_pressure",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_cache_inval.l2_invalidating_probe",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_cache_inval.fill_invalidated",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l3_request_g1.caching_l3_cache_accesses",
   },
   {
     "EventName": "xi_ccx_sdp_req1.all_l3_miss_req_typs",
-    "EventCode": "0x9A",
+    "EventCode": "0x9a",
     "BriefDescription": "All L3 Miss Request Types. Ignores SliceMask and ThreadMask.",
     "UMask": "0x3f",
     "Unit": "L3PMC"
index 4b75183..bed1482 100644 (file)
     "EventCode": "0xcb",
     "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
     "EventCode": "0xcb",
     "BriefDescription": "MMX instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.x87_instr",
     "EventCode": "0xcb",
     "BriefDescription": "x87 instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_cond",
     "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_fus_brnch_inst",
index 622a0c4..91ed96f 100644 (file)
@@ -4,35 +4,35 @@
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps.",
     "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.",
-    "UMask": "0xf"
+    "UMask": "0x0f"
   },
   {
     "EventName": "fpu_pipe_assignment.total3",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 3.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fpu_pipe_assignment.total2",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 2.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fpu_pipe_assignment.total1",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 1.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fpu_pipe_assignment.total0",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 0.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.all",
     "EventCode": "0x03",
     "BriefDescription": "Multiply-add FLOPS. Multiply-add counts as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
     "PublicDescription": "",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.div_flops",
     "EventCode": "0x03",
     "BriefDescription": "Divide/square root FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.mult_flops",
     "EventCode": "0x03",
     "BriefDescription": "Multiply FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
     "EventCode": "0x03",
     "BriefDescription": "Add/subtract FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.optimized",
     "EventCode": "0x04",
     "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.opt_potential",
     "EventCode": "0x04",
     "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE bottom-executing uOps retired. The number of serializing Ops retired.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 bottom-executing uOps retired. The number of serializing Ops retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_disp_faults.ymm_spill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_disp_faults.ymm_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_disp_faults.xmm_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_disp_faults.x87_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
index 715046b..89822b9 100644 (file)
@@ -4,31 +4,31 @@
     "EventCode": "0x24",
     "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.",
     "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_locks.spec_lock_hi_spec",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_locks.spec_lock_lo_spec",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_locks.non_spec_lock",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_locks.bus_lock",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Bus lock when a locked operation crosses a cache boundary or is done on an uncacheable memory type. Comparable to legacy bus lock.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_ret_cl_flush",
     "EventName": "ls_dispatch.ld_st_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Dispatch of a single op that performs a load from and store to the same memory address. Number of single ops that do load/store to an address.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_dispatch.store_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Number of stores dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_dispatch.ld_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Number of loads dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_smi_rx",
-    "EventCode": "0x2B",
+    "EventCode": "0x2b",
     "BriefDescription": "Number of SMIs received."
   },
   {
     "EventName": "ls_int_taken",
-    "EventCode": "0x2C",
+    "EventCode": "0x2c",
     "BriefDescription": "Number of interrupts taken."
   },
   {
     "EventName": "ls_rdtsc",
-    "EventCode": "0x2D",
+    "EventCode": "0x2d",
     "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative."
   },
   {
     "EventName": "ls_mab_alloc.dc_prefetcher",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.stores",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. Stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_mab_alloc.loads",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. Loads.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_rmt_dram",
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_dram",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. DRAM or IO from this thread's die.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_cache",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. Hit in cache; local CCX (not Local L2), or Remote CCX and the address's Home Node is on this thread's die.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_l2",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.all",
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload hit a coalesced page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_tablewalker.iside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on I-side.",
-    "UMask": "0xc"
+    "UMask": "0x0c"
   },
   {
     "EventName": "ls_tablewalker.ic_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 1.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_tablewalker.ic_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 0.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_tablewalker.dside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on D-side.",
-    "UMask": "0x3"
+    "UMask": "0x03"
   },
   {
     "EventName": "ls_tablewalker.dc_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_tablewalker.dc_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_misal_accesses",
     "EventName": "ls_pref_instr_disp.prefetch_nta",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_pref_instr_disp.prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). See docAPM3 PREFETCHW.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_pref_instr_disp.prefetch",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). Prefetch_T0_T1_T2. PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_inef_sw_pref.mab_mch_cnt",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_rmt_dram",
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_dram",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. DRAM or IO from this thread's die. From DRAM (home node local).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_cache",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From another cache (home node local).",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_l2",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_dram",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node remote).",
     "UMask": "0x40"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_cache",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node remote).",
     "UMask": "0x10"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_dram",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node local).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_cache",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node local).",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_l2",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_not_halted_cyc",
index e94994d..1bdf106 100644 (file)
     "EventName": "de_dis_uops_from_decoder.opcache_dispatched",
     "EventCode": "0xaa",
     "BriefDescription": "Count of dispatched Ops from OpCache.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_uops_from_decoder.decoder_dispatched",
     "EventCode": "0xaa",
     "BriefDescription": "Count of dispatched Ops from Decoder.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.fp_misc_rsrc_stall",
     "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.store_queue_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Store queue resource stall. Applies to all ops with store semantics.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.load_queue_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Load queue resource stall. Applies to all ops with load semantics.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Physical Register File resource stall. Applies to all ops that have an integer destination register.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.sc_agu_dispatch_stall",
     "EventName": "de_dis_dispatch_token_stalls0.alu_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALU tokens total unavailable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ3_0_TokenStall.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
index 2ef91e2..a71694a 100644 (file)
@@ -10,7 +10,7 @@
     "EventName": "all_dc_accesses",
     "EventCode": "0x29",
     "BriefDescription": "All L1 Data Cache Accesses",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "MetricName": "all_l2_cache_accesses",
     "UMask": "0x70"
   },
   {
-    "MetricName": "l2_cache_hits_from_l2_hwpf",
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
     "BriefDescription": "L2 Cache Hits from L2 HWPF",
-    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
-    "MetricGroup": "l2_cache"
+    "UMask": "0xff"
   },
   {
     "EventName": "l3_accesses",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/branch.json b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json
new file mode 100644 (file)
index 0000000..018a7fe
--- /dev/null
@@ -0,0 +1,53 @@
+[
+  {
+    "EventName": "bp_l1_btb_correct",
+    "EventCode": "0x8a",
+    "BriefDescription": "L1 Branch Prediction Overrides Existing Prediction (speculative)."
+  },
+  {
+    "EventName": "bp_l2_btb_correct",
+    "EventCode": "0x8b",
+    "BriefDescription": "L2 Branch Prediction Overrides Existing Prediction (speculative)."
+  },
+  {
+    "EventName": "bp_dyn_ind_pred",
+    "EventCode": "0x8e",
+    "BriefDescription": "Dynamic Indirect Predictions.",
+    "PublicDescription": "The number of times a branch used the indirect predictor to make a prediction."
+  },
+  {
+    "EventName": "bp_de_redirect",
+    "EventCode": "0x91",
+    "BriefDescription": "Decode Redirects",
+    "PublicDescription": "The number of times the instruction decoder overrides the predicted target."
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if1g",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (1G page size).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if2m",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (2M page size).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if4k",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (4K or 16K page size).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_tlb_rel",
+    "EventCode": "0x99",
+    "BriefDescription": "The number of ITLB reload requests."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/cache.json b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json
new file mode 100644 (file)
index 0000000..fa1d749
--- /dev/null
@@ -0,0 +1,402 @@
+[
+  {
+    "EventName": "l2_request_g1.rd_blk_l",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache reads (including hardware and software prefetch).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_x",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache stores.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g1.ls_rd_blk_c_s",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache shared reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g1.cacheable_ic_read",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Instruction cache reads.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g1.change_to_x",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_request_g1.prefetch_l2_cmd",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g1.l2_hw_pf",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g1.group2",
+    "EventCode": "0x60",
+    "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_request_g1.all_no_prefetch",
+    "EventCode": "0x60",
+    "UMask": "0xf9"
+  },
+  {
+    "EventName": "l2_request_g2.group1",
+    "EventCode": "0x61",
+    "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g1 (PMCx060).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized non-cacheable.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g2.ic_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g2.ic_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_request_g2.smc_inval",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g2.bus_locks_originator",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g2.bus_locks_responses",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_latency.l2_cycles_waiting_on_fills",
+    "EventCode": "0x62",
+    "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_write",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB write requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) write requests.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_close",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB close requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) close requests.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_wcb_req.zero_byte_store",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_wcb_req.cl_zero",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache shared read hit in L2",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit in L2. Modifiable.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit non-modifiable line in L2.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache store or state change hit in L2.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_c",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types). Use l2_cache_misses_from_dc_misses instead.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit non-modifiable line in L2.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2. Use l2_cache_misses_from_ic_miss instead.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request hit in L2 and Data cache request hit in L2 (all types).",
+    "UMask": "0xf6"
+  },
+  {
+    "EventName": "l2_fill_pending.l2_fill_busy",
+    "EventCode": "0x6d",
+    "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_pf_hit_l2",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetcher hits in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 cache and hit the L3.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetcher misses in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3 caches.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ic_fw32",
+    "EventCode": "0x80",
+    "BriefDescription": "The number of 32B fetch windows transferred from IC pipe to DE instruction decoder (includes non-cacheable and cacheable fill responses)."
+  },
+  {
+    "EventName": "ic_fw32_miss",
+    "EventCode": "0x81",
+    "BriefDescription": "The number of 32B fetch windows tried to read the L1 IC and missed in the full tag."
+  },
+  {
+    "EventName": "ic_cache_fill_l2",
+    "EventCode": "0x82",
+    "BriefDescription": "Instruction Cache Refills from L2. The number of 64 byte instruction cache lines fulfilled from the L2 cache."
+  },
+  {
+    "EventName": "ic_cache_fill_sys",
+    "EventCode": "0x83",
+    "BriefDescription": "Instruction Cache Refills from System. The number of 64 byte instruction cache lines fulfilled from system memory or another cache."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_hit",
+    "EventCode": "0x84",
+    "BriefDescription": "L1 ITLB Miss, L2 ITLB Hit. The number of instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for >4K Coalesced page.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 1G page.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 2M page.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 4K page.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_snp_re_sync",
+    "EventCode": "0x86",
+    "BriefDescription": "The number of pipeline restarts caused by invalidating probes that hit on the instruction stream currently being executed. This would happen if the active instruction stream was being modified by another processor in an MP system - typically a highly unlikely event."
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_any",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_dq_empty",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_back_pressure",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_cache_inval.l2_invalidating_probe",
+    "EventCode": "0x8c",
+    "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_cache_inval.fill_invalidated",
+    "EventCode": "0x8c",
+    "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.all_instruction_cache_accesses",
+    "EventCode": "0x18e",
+    "BriefDescription": "All Instruction Cache Accesses. Counts various IC tag related hit and miss events.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.instruction_cache_miss",
+    "EventCode": "0x18e",
+    "BriefDescription": "Instruction Cache Miss. Counts various IC tag related hit and miss events.",
+    "UMask": "0x18"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.instruction_cache_hit",
+    "EventCode": "0x18e",
+    "BriefDescription": "Instruction Cache Hit. Counts various IC tag related hit and miss events.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
+    "EventCode": "0x28a",
+    "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
+    "EventCode": "0x28a",
+    "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "op_cache_hit_miss.all_op_cache_accesses",
+    "EventCode": "0x28f",
+    "BriefDescription": "All Op Cache accesses. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "op_cache_hit_miss.op_cache_miss",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op Cache Miss. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "op_cache_hit_miss.op_cache_hit",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op Cache Hit. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "l3_request_g1.caching_l3_cache_accesses",
+    "EventCode": "0x01",
+    "BriefDescription": "Caching: L3 cache accesses",
+    "UMask": "0x80",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.all_l3_req_typs",
+    "EventCode": "0x04",
+    "BriefDescription": "All L3 Request Types. All L3 cache Requests",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.other_l3_miss_typs",
+    "EventCode": "0x06",
+    "BriefDescription": "Other L3 Miss Request Types",
+    "UMask": "0xfe",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.request_miss",
+    "EventCode": "0x06",
+    "BriefDescription": "L3 cache misses",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_sys_fill_latency",
+    "EventCode": "0x90",
+    "BriefDescription": "L3 Cache Miss Latency. Total cycles for all transactions divided by 16. Ignores SliceMask and ThreadMask.",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_ccx_sdp_req1",
+    "EventCode": "0x9a",
+    "BriefDescription": "L3 Misses by Request Type. Ignores SliceID, EnAllSlices, CoreID, EnAllCores and ThreadMask. Requires unit mask 0xFF to engage event for counting.",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/core.json b/tools/perf/pmu-events/arch/x86/amdzen3/core.json
new file mode 100644 (file)
index 0000000..4e27a2b
--- /dev/null
@@ -0,0 +1,137 @@
+[
+  {
+    "EventName": "ex_ret_instr",
+    "EventCode": "0xc0",
+    "BriefDescription": "Retired Instructions."
+  },
+  {
+    "EventName": "ex_ret_ops",
+    "EventCode": "0xc1",
+    "BriefDescription": "Retired Ops. Use macro_ops_retired instead.",
+    "PublicDescription": "The number of macro-ops retired."
+  },
+  {
+    "EventName": "ex_ret_brn",
+    "EventCode": "0xc2",
+    "BriefDescription": "Retired Branch Instructions.",
+    "PublicDescription": "The number of branch instructions retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
+  },
+  {
+    "EventName": "ex_ret_brn_misp",
+    "EventCode": "0xc3",
+    "BriefDescription": "Retired Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of retired branch instructions, that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn",
+    "EventCode": "0xc4",
+    "BriefDescription": "Retired Taken Branch Instructions.",
+    "PublicDescription": "The number of taken branches that were retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn_misp",
+    "EventCode": "0xc5",
+    "BriefDescription": "Retired Taken Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of retired taken branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_far",
+    "EventCode": "0xc6",
+    "BriefDescription": "Retired Far Control Transfers.",
+    "PublicDescription": "The number of far control transfers retired including far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts. Far control transfers are not subject to branch prediction."
+  },
+  {
+    "EventName": "ex_ret_brn_resync",
+    "EventCode": "0xc7",
+    "BriefDescription": "Retired Branch Resyncs.",
+    "PublicDescription": "The number of resync branches. These reflect pipeline restarts due to certain microcode assists and events such as writes to the active instruction stream, among other things. Each occurrence reflects a restart penalty similar to a branch mispredict. This is relatively rare."
+  },
+  {
+    "EventName": "ex_ret_near_ret",
+    "EventCode": "0xc8",
+    "BriefDescription": "Retired Near Returns.",
+    "PublicDescription": "The number of near return instructions (RET or RET Iw) retired."
+  },
+  {
+    "EventName": "ex_ret_near_ret_mispred",
+    "EventCode": "0xc9",
+    "BriefDescription": "Retired Near Returns Mispredicted.",
+    "PublicDescription": "The number of near returns retired that were not correctly predicted by the return address predictor. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind_misp",
+    "EventCode": "0xca",
+    "BriefDescription": "Retired Indirect Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of indirect branches retired that were not correctly predicted. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction. Note that only EX mispredicts are counted."
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.sse_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "MMX instructions.",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.x87_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "x87 instructions.",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_ret_ind_brch_instr",
+    "EventCode": "0xcc",
+    "BriefDescription": "Retired Indirect Branch Instructions. The number of indirect branches retired."
+  },
+  {
+    "EventName": "ex_ret_cond",
+    "EventCode": "0xd1",
+    "BriefDescription": "Retired Conditional Branch Instructions."
+  },
+  {
+    "EventName": "ex_div_busy",
+    "EventCode": "0xd3",
+    "BriefDescription": "Div Cycles Busy count."
+  },
+  {
+    "EventName": "ex_div_count",
+    "EventCode": "0xd4",
+    "BriefDescription": "Div Op Count."
+  },
+  {
+    "EventName": "ex_ret_msprd_brnch_instr_dir_msmtch",
+    "EventCode": "0x1c7",
+    "BriefDescription": "Retired Mispredicted Branch Instructions due to Direction Mismatch",
+    "PublicDescription": "The number of retired conditional branch instructions that were not correctly predicted because of a branch direction mismatch."
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_ret_fused_instr",
+    "EventCode": "0x1d0",
+    "BriefDescription": "Counts retired Fused Instructions."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json
new file mode 100644 (file)
index 0000000..40271df
--- /dev/null
@@ -0,0 +1,98 @@
+[
+  {
+    "EventName": "remote_outbound_data_controller_0",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 0",
+    "EventCode": "0x7c7",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_1",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 1",
+    "EventCode": "0x807",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_2",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 2",
+    "EventCode": "0x847",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_3",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 3",
+    "EventCode": "0x887",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_0",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x07",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_1",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 1",
+    "EventCode": "0x47",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_2",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 2",
+    "EventCode": "0x87",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_3",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 3",
+    "EventCode": "0xc7",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_4",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 4",
+    "EventCode": "0x107",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_5",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 5",
+    "EventCode": "0x147",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_6",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 6",
+    "EventCode": "0x187",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_7",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 7",
+    "EventCode": "0x1c7",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json
new file mode 100644 (file)
index 0000000..98cfcb9
--- /dev/null
@@ -0,0 +1,139 @@
+[
+  {
+    "EventName": "fpu_pipe_assignment.total",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number of fp uOps.",
+    "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total3",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 3.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total2",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 2.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total1",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 1.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total0",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number of fp uOps on pipe 0.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.all",
+    "EventCode": "0x03",
+    "BriefDescription": "All FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mac_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Multiply-Accumulate FLOPs. Each MAC operation is counted as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.div_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Divide/square root FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mult_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Multiply FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Add/subtract FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.optimized",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.opt_potential",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.sse_bot_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "SSE/AVX bottom-executing ops retired. The number of serializing Ops retired.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "SSE/AVX control word mispredict traps. The number of serializing Ops retired.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.x87_bot_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "x87 bottom-executing ops retired. The number of serializing Ops retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_spill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_disp_faults.xmm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_disp_faults.x87_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.",
+    "UMask": "0x01"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/memory.json b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json
new file mode 100644 (file)
index 0000000..a283395
--- /dev/null
@@ -0,0 +1,428 @@
+[
+  {
+    "EventName": "ls_bad_status2.stli_other",
+    "EventCode": "0x24",
+    "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.",
+    "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.spec_lock_hi_spec",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_locks.spec_lock_lo_spec",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_locks.non_spec_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.bus_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Comparable to legacy bus lock.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_ret_cl_flush",
+    "EventCode": "0x26",
+    "BriefDescription": "The number of retired CLFLUSH instructions. This is a non-speculative event."
+  },
+  {
+    "EventName": "ls_ret_cpuid",
+    "EventCode": "0x27",
+    "BriefDescription": "The number of CPUID instructions retired."
+  },
+  {
+    "EventName": "ls_dispatch.ld_st_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Load-op-Store Dispatch. Dispatch of a single op that performs a load from and store to the same memory address. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dispatch.store_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Dispatch of a single op that performs a memory store. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dispatch.ld_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Dispatch of a single op that performs a memory load. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_smi_rx",
+    "EventCode": "0x2b",
+    "BriefDescription": "Counts the number of SMIs received."
+  },
+  {
+    "EventName": "ls_int_taken",
+    "EventCode": "0x2c",
+    "BriefDescription": "Counts the number of interrupts taken."
+  },
+  {
+    "EventName": "ls_rdtsc",
+    "EventCode": "0x2d",
+    "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative."
+  },
+  {
+    "EventName": "ls_stlf",
+    "EventCode": "0x35",
+    "BriefDescription": "Number of STLF hits."
+  },
+  {
+    "EventName": "ls_st_commit_cancel2.st_commit_cancel_wcb_full",
+    "EventCode": "0x37",
+    "BriefDescription": "A non-cacheable store and the non-cacheable commit buffer is full.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dc_accesses",
+    "EventCode": "0x40",
+    "BriefDescription": "Number of accesses to the dcache for load/store references.",
+    "PublicDescription": "The number of accesses to the data cache for load and store references. This may include certain microcode scratchpad accesses, although these are generally rare. Each increment represents an eight-byte access, although the instruction may only be accessing a portion of that. This event is a speculative event."
+  },
+  {
+    "EventName": "ls_mab_alloc.all_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "All Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x7f"
+  },
+  {
+    "EventName": "ls_mab_alloc.hardware_prefetcher_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "Hardware Prefetcher Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_mab_alloc.load_store_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "Load Store Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x3f"
+  },
+  {
+    "EventName": "ls_mab_alloc.dc_prefetcher",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_mab_alloc.stores",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. Stores.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_mab_alloc.loads",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. Loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.mem_io_remote",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.ext_cache_remote",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.mem_io_local",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.ext_cache_local",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.int_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.lcl_l2",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.mem_io_remote",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.ext_cache_remote",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.mem_io_local",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.ext_cache_local",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.int_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.lcl_l2",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.all",
+    "EventCode": "0x45",
+    "BriefDescription": "All L1 DTLB Misses or Reloads. Use l1_dtlb_misses instead.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that also missed in the L2 TLB.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that also missed in the L2 TLB.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload coalesced page that also missed in the L2 TLB.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that missed the L2 TLB.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a coalesced page that hit in the L2 TLB.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_tablewalker.iside",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks on I-side.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "ls_tablewalker.ic_type1",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks IC Type 1.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_tablewalker.ic_type0",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks IC Type 0.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_tablewalker.dside",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks on D-side.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_tablewalker.dc_type1",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks DC Type 1.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_tablewalker.dc_type0",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks DC Type 0.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_misal_loads.ma4k",
+    "EventCode": "0x47",
+    "BriefDescription": "The number of 4KB misaligned (i.e., page crossing) loads.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_misal_loads.ma64",
+    "EventCode": "0x47",
+    "BriefDescription": "The number of 64B misaligned (i.e., cacheline crossing) loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_pref_instr_disp",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative).",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_nta",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_w",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchW instruction. See docAPM3 PREFETCHW.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.mab_mch_cnt",
+    "EventCode": "0x52",
+    "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.mem_io_remote",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.ext_cache_remote",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.mem_io_local",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.ext_cache_local",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.int_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.lcl_l2",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.mem_io_remote",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.ext_cache_remote",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.mem_io_local",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.ext_cache_local",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.int_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.lcl_l2",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_alloc_mab_count",
+    "EventCode": "0x5f",
+    "BriefDescription": "Count of Allocated Mabs",
+    "PublicDescription": "This event counts the in-flight L1 data cache misses (allocated Miss Address Buffers) divided by 4 and rounded down each cycle unless used with the MergeEvent functionality. If the MergeEvent is used, it counts the exact number of outstanding L1 data cache misses. See 2.1.17.3 [Large Increment per Cycle Events]."
+  },
+  {
+    "EventName": "ls_not_halted_cyc",
+    "EventCode": "0x76",
+    "BriefDescription": "Cycles not in Halt."
+  },
+  {
+    "EventName": "ls_tlb_flush.all_tlb_flushes",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLB Flushes. Requires unit mask 0xFF to engage event for counting. Use all_tlbs_flushed instead.",
+    "UMask": "0xff"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/other.json b/tools/perf/pmu-events/arch/x86/amdzen3/other.json
new file mode 100644 (file)
index 0000000..7da5d07
--- /dev/null
@@ -0,0 +1,103 @@
+[
+  {
+    "EventName": "de_dis_uop_queue_empty_di0",
+    "EventCode": "0xa9",
+    "BriefDescription": "Cycles where the Micro-Op Queue is empty."
+  },
+  {
+    "EventName": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch",
+    "EventCode": "0xab",
+    "BriefDescription": "Any Integer dispatch. Types of Ops Dispatched from Decoder.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch",
+    "EventCode": "0xab",
+    "BriefDescription": "Any FP dispatch. Types of Ops Dispatched from Decoder.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_flush_recovery_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP Flush recovery stall.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_sch_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP scheduler resource stall. Applies to ops that use the FP scheduler.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Floating point register file resource stall. Applies to all FP ops that have a destination register.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.taken_brnch_buffer_rsrc",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Taken branch buffer resource stall.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.store_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Store Queue resource stall. Applies to all ops with store semantics.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.load_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Load Queue resource stall. Applies to all ops with load semantics.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Integer Physical Register File resource stall. Integer Physical Register File, applies to all ops that have an integer destination register.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.retire_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Insufficient Retire Queue tokens available.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.agsq_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. AGSQ Tokens unavailable.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch3_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 3 available.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch2_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 2 available.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch1_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 1 available.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch0_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 0 available.",
+    "UMask": "0x01"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json
new file mode 100644 (file)
index 0000000..988cf68
--- /dev/null
@@ -0,0 +1,214 @@
+[
+  {
+    "MetricName": "branch_misprediction_ratio",
+    "BriefDescription": "Execution-Time Branch Misprediction Ratio (Non-Speculative)",
+    "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)",
+    "MetricGroup": "branch_prediction",
+    "ScaleUnit": "100%"
+  },
+  {
+    "EventName": "all_data_cache_accesses",
+    "EventCode": "0x29",
+    "BriefDescription": "All L1 Data Cache Accesses",
+    "UMask": "0x07"
+  },
+  {
+    "MetricName": "all_l2_cache_accesses",
+    "BriefDescription": "All L2 Cache Accesses",
+    "MetricExpr": "l2_request_g1.all_no_prefetch + l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_accesses_from_ic_misses",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 Cache Accesses from L1 Instruction Cache Misses (including prefetch)",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_accesses_from_dc_misses",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 Cache Accesses from L1 Data Cache Misses (including prefetch)",
+    "UMask": "0xe8"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l2_hwpf",
+    "BriefDescription": "L2 Cache Accesses from L2 HWPF",
+    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "all_l2_cache_misses",
+    "BriefDescription": "All L2 Cache Misses",
+    "MetricExpr": "l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_misses_from_ic_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Misses from L1 Instruction Cache Misses",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_misses_from_dc_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Misses from L1 Data Cache Misses",
+    "UMask": "0x08"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l2_hwpf",
+    "BriefDescription": "L2 Cache Misses from L2 Cache HWPF",
+    "MetricExpr": "l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "all_l2_cache_hits",
+    "BriefDescription": "All L2 Cache Hits",
+    "MetricExpr": "l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_hits_from_ic_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Hits from L1 Instruction Cache Misses",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "l2_cache_hits_from_dc_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Hits from L1 Data Cache Misses",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 Cache Hits from L2 Cache HWPF",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l3_cache_accesses",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 Cache Accesses",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_misses",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 Misses (includes cacheline state change requests)",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "MetricName": "l3_read_miss_latency",
+    "BriefDescription": "Average L3 Read Miss Latency (in core clocks)",
+    "MetricExpr": "(xi_sys_fill_latency * 16) / xi_ccx_sdp_req1",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1core clocks"
+  },
+  {
+    "MetricName": "op_cache_fetch_miss_ratio",
+    "BriefDescription": "Op Cache (64B) Fetch Miss Ratio",
+    "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "ic_fetch_miss_ratio",
+    "BriefDescription": "Instruction Cache (32B) Fetch Miss Ratio",
+    "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "100%"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_memory",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From Memory",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_remote_node",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From Remote Node",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_within_same_ccx",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From within same CCX",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_external_ccx_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From External CCX Cache",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "l1_data_cache_fills_all",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: All",
+    "UMask": "0xff"
+  },
+  {
+    "MetricName": "l1_itlb_misses",
+    "BriefDescription": "L1 ITLB Misses",
+    "MetricExpr": "bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss",
+    "MetricGroup": "tlb"
+  },
+  {
+    "EventName": "l2_itlb_misses",
+    "EventCode": "0x85",
+    "BriefDescription": "L2 ITLB Misses & Instruction page walks",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l1_dtlb_misses",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Misses",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_dtlb_misses",
+    "EventCode": "0x45",
+    "BriefDescription": "L2 DTLB Misses & Data page walks",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "all_tlbs_flushed",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLBs Flushed",
+    "UMask": "0xff"
+  },
+  {
+    "MetricName": "macro_ops_dispatched",
+    "BriefDescription": "Macro-ops Dispatched",
+    "MetricExpr": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch + de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch",
+    "MetricGroup": "decoder"
+  },
+  {
+    "EventName": "sse_avx_stalls",
+    "EventCode": "0x0e",
+    "BriefDescription": "Mixed SSE/AVX Stalls",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "macro_ops_retired",
+    "EventCode": "0xc1",
+    "BriefDescription": "Macro-ops Retired"
+  },
+  {
+    "MetricName": "all_remote_links_outbound",
+    "BriefDescription": "Approximate: Outbound data bytes for all Remote Links for a node (die)",
+    "MetricExpr": "remote_outbound_data_controller_0 + remote_outbound_data_controller_1 + remote_outbound_data_controller_2 + remote_outbound_data_controller_3",
+    "MetricGroup": "data_fabric",
+    "PerPkg": "1",
+    "ScaleUnit": "3e-5MiB"
+  },
+  {
+    "MetricName": "nps1_die_to_dram",
+    "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)",
+    "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7",
+    "MetricGroup": "data_fabric",
+    "PerPkg": "1",
+    "ScaleUnit": "6.1e-5MiB"
+  }
+]
index 2f2a209..0a6a8c7 100644 (file)
@@ -24,6 +24,7 @@ GenuineIntel-6-1F,v2,nehalemep,core
 GenuineIntel-6-1A,v2,nehalemep,core
 GenuineIntel-6-2E,v2,nehalemex,core
 GenuineIntel-6-[4589]E,v24,skylake,core
+GenuineIntel-6-A[56],v24,skylake,core
 GenuineIntel-6-37,v13,silvermont,core
 GenuineIntel-6-4D,v13,silvermont,core
 GenuineIntel-6-4C,v13,silvermont,core
@@ -35,7 +36,9 @@ GenuineIntel-6-55-[01234],v1,skylakex,core
 GenuineIntel-6-55-[56789ABCDEF],v1,cascadelakex,core
 GenuineIntel-6-7D,v1,icelake,core
 GenuineIntel-6-7E,v1,icelake,core
+GenuineIntel-6-8[CD],v1,icelake,core
+GenuineIntel-6-A7,v1,icelake,core
 GenuineIntel-6-86,v1,tremontx,core
 AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core
 AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core
-AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen2,core
+AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen3,core
index e1f3f5c..ed4f0bd 100644 (file)
@@ -285,6 +285,8 @@ static struct map {
        { "imx8_ddr", "imx8_ddr" },
        { "L3PMC", "amd_l3" },
        { "DFPMC", "amd_df" },
+       { "cpu_core", "cpu_core" },
+       { "cpu_atom", "cpu_atom" },
        {}
 };
 
@@ -1149,7 +1151,7 @@ static int process_one_file(const char *fpath, const struct stat *sb,
         * and directory tree could result in build failure due to table
         * names not being found.
         *
-        * Atleast for now, be strict with processing JSON file names.
+        * At least for now, be strict with processing JSON file names.
         * i.e. if JSON file name cannot be mapped to C-style table name,
         * fail.
         */
index ea0c8b9..a0cfc7f 100644 (file)
@@ -356,7 +356,7 @@ def handle_irq_softirq_exit(event_info):
                return
        rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
                        'irq_list':irq_list, 'event_list':event_list}
-       # merge information realted to a NET_RX softirq
+       # merge information related to a NET_RX softirq
        receive_hunk_list.append(rec_data)
 
 def handle_napi_poll(event_info):
index dd39ce9..9b40a25 100644 (file)
@@ -34,6 +34,7 @@
 #include "event.h"
 #include "util.h"
 #include "tests.h"
+#include "pmu.h"
 
 #define ENV "PERF_TEST_ATTR"
 
@@ -184,6 +185,9 @@ int test__attr(struct test *test __maybe_unused, int subtest __maybe_unused)
        char path_dir[PATH_MAX];
        char *exec_path;
 
+       if (perf_pmu__has_hybrid())
+               return TEST_SKIP;
+
        /* First try development tree tests. */
        if (!lstat("./tests", &st))
                return run_dir("./tests", "./perf");
index cc9fbce..ef37353 100644 (file)
@@ -225,11 +225,11 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
         *
         * The test case check following error conditions:
         * - we get stuck in signal handler because of debug
-        *   exception being triggered receursively due to
+        *   exception being triggered recursively due to
         *   the wrong RF EFLAG management
         *
         * - we never trigger the sig_handler breakpoint due
-        *   to the rong RF EFLAG management
+        *   to the wrong RF EFLAG management
         *
         */
 
@@ -242,7 +242,7 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
        ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
 
        /*
-        * Kick off the test by trigering 'fd1'
+        * Kick off the test by triggering 'fd1'
         * breakpoint.
         */
        test_function();
index 2fdc7b2..9866cdd 100644 (file)
@@ -658,7 +658,7 @@ static int do_test_code_reading(bool try_kcore)
                                /*
                                 * Both cpus and threads are now owned by evlist
                                 * and will be freed by following perf_evlist__set_maps
-                                * call. Getting refference to keep them alive.
+                                * call. Getting reference to keep them alive.
                                 */
                                perf_cpu_map__get(cpus);
                                perf_thread_map__get(threads);
index a273ed5..0043be8 100644 (file)
@@ -19,14 +19,14 @@ int test__demangle_ocaml(struct test *test __maybe_unused, int subtest __maybe_u
                { "main",
                  NULL },
                { "camlStdlib__array__map_154",
-                 "Stdlib.array.map" },
+                 "Stdlib.array.map_154" },
                { "camlStdlib__anon_fn$5bstdlib$2eml$3a334$2c0$2d$2d54$5d_1453",
-                 "Stdlib.anon_fn[stdlib.ml:334,0--54]" },
+                 "Stdlib.anon_fn[stdlib.ml:334,0--54]_1453" },
                { "camlStdlib__bytes__$2b$2b_2205",
-                 "Stdlib.bytes.++" },
+                 "Stdlib.bytes.++_2205" },
        };
 
-       for (i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
+       for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
                buf = ocaml_demangle_sym(test_cases[i].mangled);
                if ((buf == NULL && test_cases[i].demangled != NULL)
                                || (buf != NULL && test_cases[i].demangled == NULL)
index f7f3e5b..b74cf80 100644 (file)
@@ -4,6 +4,7 @@
 #include "parse-events.h"
 #include "tests.h"
 #include "debug.h"
+#include "pmu.h"
 #include <errno.h>
 #include <linux/kernel.h>
 
@@ -62,7 +63,8 @@ static int perf_evsel__roundtrip_cache_name_test(void)
        return ret;
 }
 
-static int __perf_evsel__name_array_test(const char *names[], int nr_names)
+static int __perf_evsel__name_array_test(const char *names[], int nr_names,
+                                        int distance)
 {
        int i, err;
        struct evsel *evsel;
@@ -82,9 +84,9 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names)
 
        err = 0;
        evlist__for_each_entry(evlist, evsel) {
-               if (strcmp(evsel__name(evsel), names[evsel->idx])) {
+               if (strcmp(evsel__name(evsel), names[evsel->idx / distance])) {
                        --err;
-                       pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx]);
+                       pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx / distance]);
                }
        }
 
@@ -93,18 +95,21 @@ out_delete_evlist:
        return err;
 }
 
-#define perf_evsel__name_array_test(names) \
-       __perf_evsel__name_array_test(names, ARRAY_SIZE(names))
+#define perf_evsel__name_array_test(names, distance) \
+       __perf_evsel__name_array_test(names, ARRAY_SIZE(names), distance)
 
 int test__perf_evsel__roundtrip_name_test(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
        int err = 0, ret = 0;
 
-       err = perf_evsel__name_array_test(evsel__hw_names);
+       if (perf_pmu__has_hybrid())
+               return perf_evsel__name_array_test(evsel__hw_names, 2);
+
+       err = perf_evsel__name_array_test(evsel__hw_names, 1);
        if (err)
                ret = err;
 
-       err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1);
+       err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1, 1);
        if (err)
                ret = err;
 
index 3f2e1a5..890cb1f 100644 (file)
@@ -47,7 +47,7 @@ static struct sample fake_samples[] = {
 };
 
 /*
- * Will be casted to struct ip_callchain which has all 64 bit entries
+ * Will be cast to struct ip_callchain which has all 64 bit entries
  * of nr and ips[].
  */
 static u64 fake_callchains[][10] = {
@@ -297,7 +297,7 @@ out:
        return err;
 }
 
-/* callcain + NO children */
+/* callchain + NO children */
 static int test2(struct evsel *evsel, struct machine *machine)
 {
        int err;
index 123e07d..ca6120c 100644 (file)
@@ -150,13 +150,13 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
                }
 
                TEST_ASSERT_VAL("Invalid nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+                               hists->stats.nr_samples == 10);
                TEST_ASSERT_VAL("Invalid nr hist entries",
                                hists->nr_entries == 9);
                TEST_ASSERT_VAL("Invalid total period",
                                hists->stats.total_period == 1000);
                TEST_ASSERT_VAL("Unmatched nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] ==
+                               hists->stats.nr_samples ==
                                hists->stats.nr_non_filtered_samples);
                TEST_ASSERT_VAL("Unmatched nr hist entries",
                                hists->nr_entries == hists->nr_non_filtered_entries);
@@ -175,7 +175,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
                /* normal stats should be invariant */
                TEST_ASSERT_VAL("Invalid nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+                               hists->stats.nr_samples == 10);
                TEST_ASSERT_VAL("Invalid nr hist entries",
                                hists->nr_entries == 9);
                TEST_ASSERT_VAL("Invalid total period",
@@ -204,7 +204,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
                /* normal stats should be invariant */
                TEST_ASSERT_VAL("Invalid nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+                               hists->stats.nr_samples == 10);
                TEST_ASSERT_VAL("Invalid nr hist entries",
                                hists->nr_entries == 9);
                TEST_ASSERT_VAL("Invalid total period",
@@ -239,7 +239,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
                /* normal stats should be invariant */
                TEST_ASSERT_VAL("Invalid nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+                               hists->stats.nr_samples == 10);
                TEST_ASSERT_VAL("Invalid nr hist entries",
                                hists->nr_entries == 9);
                TEST_ASSERT_VAL("Invalid total period",
@@ -268,7 +268,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
                /* normal stats should be invariant */
                TEST_ASSERT_VAL("Invalid nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+                               hists->stats.nr_samples == 10);
                TEST_ASSERT_VAL("Invalid nr hist entries",
                                hists->nr_entries == 9);
                TEST_ASSERT_VAL("Invalid total period",
@@ -299,7 +299,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
                /* normal stats should be invariant */
                TEST_ASSERT_VAL("Invalid nr samples",
-                               hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+                               hists->stats.nr_samples == 10);
                TEST_ASSERT_VAL("Invalid nr hist entries",
                                hists->nr_entries == 9);
                TEST_ASSERT_VAL("Invalid total period",
index a90fa04..94bd5d2 100644 (file)
@@ -155,7 +155,6 @@ run += make_no_syscall_tbl
 run += make_with_babeltrace
 run += make_with_clangllvm
 run += make_with_libpfm4
-run += make_with_gtk2
 run += make_help
 run += make_doc
 run += make_perf_o
@@ -172,7 +171,6 @@ run += make_install_prefix_slash
 # run += make_install_info
 # run += make_install_pdf
 run += make_minimal
-run += make_static
 
 ifneq ($(call has,ctags),)
 run += make_tags
@@ -307,6 +305,26 @@ $(run):
        $(call test,$@) && \
        rm -rf $@ $$TMP_DEST || (cat $@ ; false)
 
+make_with_gtk2:
+       $(call clean)
+       @TMP_DEST=$$(mktemp -d); \
+       cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
+       ( eval $$cmd ) >> $@ 2>&1; \
+       echo "  test: $(call test,$@)" >> $@ 2>&1; \
+       $(call test,$@) && \
+       rm -rf $@ $$TMP_DEST || (cat $@ ; false)
+
+make_static:
+       $(call clean)
+       @TMP_DEST=$$(mktemp -d); \
+       cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+       printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
+       ( eval $$cmd ) >> $@ 2>&1; \
+       echo "  test: $(call test,$@)" >> $@ 2>&1; \
+       $(call test,$@) && \
+       rm -rf $@ $$TMP_DEST || (cat $@ ; false)
+
 $(run_O):
        $(call clean)
        @TMP_O=$$(mktemp -d); \
index a7f6661..0f113b2 100644 (file)
@@ -20,7 +20,7 @@
 
 #if defined(__s390x__)
 /* Return true if kvm module is available and loaded. Test this
- * and retun success when trace point kvm_s390_create_vm
+ * and return success when trace point kvm_s390_create_vm
  * exists. Otherwise this test always fails.
  */
 static bool kvm_s390_create_vm_valid(void)
@@ -1512,6 +1512,124 @@ static int test__all_tracepoints(struct evlist *evlist)
        return test__checkevent_tracepoint_multi(evlist);
 }
 
+static int test__hybrid_hw_event_with_pmu(struct evlist *evlist)
+{
+       struct evsel *evsel = evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+       return 0;
+}
+
+static int test__hybrid_hw_group_event(struct evlist *evlist)
+{
+       struct evsel *evsel, *leader;
+
+       evsel = leader = evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       evsel = evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+       return 0;
+}
+
+static int test__hybrid_sw_hw_group_event(struct evlist *evlist)
+{
+       struct evsel *evsel, *leader;
+
+       evsel = leader = evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       evsel = evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+       return 0;
+}
+
+static int test__hybrid_hw_sw_group_event(struct evlist *evlist)
+{
+       struct evsel *evsel, *leader;
+
+       evsel = leader = evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+       evsel = evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+       return 0;
+}
+
+static int test__hybrid_group_modifier1(struct evlist *evlist)
+{
+       struct evsel *evsel, *leader;
+
+       evsel = leader = evlist__first(evlist);
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
+
+       evsel = evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config);
+       TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
+       return 0;
+}
+
+static int test__hybrid_raw1(struct evlist *evlist)
+{
+       struct evsel *evsel = evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config);
+
+       /* The type of the second event is a random value */
+       evsel = evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config);
+       return 0;
+}
+
+static int test__hybrid_raw2(struct evlist *evlist)
+{
+       struct evsel *evsel = evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config);
+       return 0;
+}
+
+static int test__hybrid_cache_event(struct evlist *evlist)
+{
+       struct evsel *evsel = evlist__first(evlist);
+
+       TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x2 == (evsel->core.attr.config & 0xffffffff));
+
+       evsel = evsel__next(evsel);
+       TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type);
+       TEST_ASSERT_VAL("wrong config", 0x10002 == (evsel->core.attr.config & 0xffffffff));
+       return 0;
+}
+
 struct evlist_test {
        const char *name;
        __u32 type;
@@ -1868,6 +1986,54 @@ static struct terms_test test__terms[] = {
        },
 };
 
+static struct evlist_test test__hybrid_events[] = {
+       {
+               .name  = "cpu_core/cpu-cycles/",
+               .check = test__hybrid_hw_event_with_pmu,
+               .id    = 0,
+       },
+       {
+               .name  = "{cpu_core/cpu-cycles/,cpu_core/instructions/}",
+               .check = test__hybrid_hw_group_event,
+               .id    = 1,
+       },
+       {
+               .name  = "{cpu-clock,cpu_core/cpu-cycles/}",
+               .check = test__hybrid_sw_hw_group_event,
+               .id    = 2,
+       },
+       {
+               .name  = "{cpu_core/cpu-cycles/,cpu-clock}",
+               .check = test__hybrid_hw_sw_group_event,
+               .id    = 3,
+       },
+       {
+               .name  = "{cpu_core/cpu-cycles/k,cpu_core/instructions/u}",
+               .check = test__hybrid_group_modifier1,
+               .id    = 4,
+       },
+       {
+               .name  = "r1a",
+               .check = test__hybrid_raw1,
+               .id    = 5,
+       },
+       {
+               .name  = "cpu_core/r1a/",
+               .check = test__hybrid_raw2,
+               .id    = 6,
+       },
+       {
+               .name  = "cpu_core/config=10,config1,config2=3,period=1000/u",
+               .check = test__checkevent_pmu,
+               .id    = 7,
+       },
+       {
+               .name  = "cpu_core/LLC-loads/,cpu_atom/LLC-load-misses/",
+               .check = test__hybrid_cache_event,
+               .id    = 8,
+       },
+};
+
 static int test_event(struct evlist_test *e)
 {
        struct parse_events_error err;
@@ -2035,6 +2201,11 @@ do {                                                     \
                ret2 = ret1;                            \
 } while (0)
 
+       if (perf_pmu__has_hybrid()) {
+               TEST_EVENTS(test__hybrid_events);
+               return ret2;
+       }
+
        TEST_EVENTS(test__events);
 
        if (test_pmu())
index 6dc1db1..4f6f490 100644 (file)
@@ -11,6 +11,7 @@
 #include "debug.h"
 #include "expr.h"
 #include "stat.h"
+#include "pmu.h"
 
 static struct pmu_event pme_test[] = {
 {
@@ -98,7 +99,7 @@ static u64 find_value(const char *name, struct value *values)
                if (!strcmp(name, v->event))
                        return v->val;
                v++;
-       };
+       }
        return 0;
 }
 
@@ -186,7 +187,7 @@ static int __compute_metric(const char *name, struct value *vals,
                *ratio2 = compute_single(&metric_events, evlist, &st, name2);
 
 out:
-       /* ... clenup. */
+       /* ... cleanup. */
        metricgroup__rblist_exit(&metric_events);
        runtime_stat__exit(&st);
        evlist__free_stats(evlist);
@@ -372,10 +373,13 @@ int test__parse_metric(struct test *test __maybe_unused, int subtest __maybe_unu
 {
        TEST_ASSERT_VAL("IPC failed", test_ipc() == 0);
        TEST_ASSERT_VAL("frontend failed", test_frontend() == 0);
-       TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0);
        TEST_ASSERT_VAL("DCache_L2 failed", test_dcache_l2() == 0);
        TEST_ASSERT_VAL("recursion fail failed", test_recursion_fail() == 0);
-       TEST_ASSERT_VAL("test metric group", test_metric_group() == 0);
        TEST_ASSERT_VAL("Memory bandwidth", test_memory_bandwidth() == 0);
+
+       if (!perf_pmu__has_hybrid()) {
+               TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0);
+               TEST_ASSERT_VAL("test metric group", test_metric_group() == 0);
+       }
        return 0;
 }
index 680c3cf..85d75b9 100644 (file)
@@ -20,6 +20,7 @@
 #include "tsc.h"
 #include "mmap.h"
 #include "tests.h"
+#include "pmu.h"
 
 #define CHECK__(x) {                           \
        while ((x) < 0) {                       \
@@ -88,6 +89,17 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe
        evsel->core.attr.disabled = 1;
        evsel->core.attr.enable_on_exec = 0;
 
+       /*
+        * For hybrid "cycles:u", it creates two events.
+        * Init the second evsel here.
+        */
+       if (perf_pmu__has_hybrid()) {
+               evsel = evsel__next(evsel);
+               evsel->core.attr.comm = 1;
+               evsel->core.attr.disabled = 1;
+               evsel->core.attr.enable_on_exec = 0;
+       }
+
        CHECK__(evlist__open(evlist));
 
        CHECK__(evlist__mmap(evlist, UINT_MAX));
index 0ca6a5a..b8aff8f 100644 (file)
@@ -12,6 +12,7 @@
 #include "util/evlist.h"
 #include "util/expr.h"
 #include "util/parse-events.h"
+#include "metricgroup.h"
 
 struct perf_pmu_test_event {
        /* used for matching against events from generated pmu-events.c */
@@ -471,9 +472,74 @@ static void expr_failure(const char *msg,
        pr_debug("On expression %s\n", pe->metric_expr);
 }
 
+struct metric {
+       struct list_head list;
+       struct metric_ref metric_ref;
+};
+
+static int resolve_metric_simple(struct expr_parse_ctx *pctx,
+                                struct list_head *compound_list,
+                                struct pmu_events_map *map,
+                                const char *metric_name)
+{
+       struct hashmap_entry *cur, *cur_tmp;
+       struct metric *metric, *tmp;
+       size_t bkt;
+       bool all;
+       int rc;
+
+       do {
+               all = true;
+               hashmap__for_each_entry_safe((&pctx->ids), cur, cur_tmp, bkt) {
+                       struct metric_ref *ref;
+                       struct pmu_event *pe;
+
+                       pe = metricgroup__find_metric(cur->key, map);
+                       if (!pe)
+                               continue;
+
+                       if (!strcmp(metric_name, (char *)cur->key)) {
+                               pr_warning("Recursion detected for metric %s\n", metric_name);
+                               rc = -1;
+                               goto out_err;
+                       }
+
+                       all = false;
+
+                       /* The metric key itself needs to go out.. */
+                       expr__del_id(pctx, cur->key);
+
+                       metric = malloc(sizeof(*metric));
+                       if (!metric) {
+                               rc = -ENOMEM;
+                               goto out_err;
+                       }
+
+                       ref = &metric->metric_ref;
+                       ref->metric_name = pe->metric_name;
+                       ref->metric_expr = pe->metric_expr;
+                       list_add_tail(&metric->list, compound_list);
+
+                       rc = expr__find_other(pe->metric_expr, NULL, pctx, 0);
+                       if (rc)
+                               goto out_err;
+                       break; /* The hashmap has been modified, so restart */
+               }
+       } while (!all);
+
+       return 0;
+
+out_err:
+       list_for_each_entry_safe(metric, tmp, compound_list, list)
+               free(metric);
+
+       return rc;
+
+}
+
 static int test_parsing(void)
 {
-       struct pmu_events_map *cpus_map = perf_pmu__find_map(NULL);
+       struct pmu_events_map *cpus_map = pmu_events_map__find();
        struct pmu_events_map *map;
        struct pmu_event *pe;
        int i, j, k;
@@ -488,7 +554,9 @@ static int test_parsing(void)
                        break;
                j = 0;
                for (;;) {
+                       struct metric *metric, *tmp;
                        struct hashmap_entry *cur;
+                       LIST_HEAD(compound_list);
                        size_t bkt;
 
                        pe = &map->table[j++];
@@ -504,6 +572,13 @@ static int test_parsing(void)
                                continue;
                        }
 
+                       if (resolve_metric_simple(&ctx, &compound_list, map,
+                                                 pe->metric_name)) {
+                               expr_failure("Could not resolve metrics", map, pe);
+                               ret++;
+                               goto exit; /* Don't tolerate errors due to severity */
+                       }
+
                        /*
                         * Add all ids with a made up value. The value may
                         * trigger divide by zero when subtracted and so try to
@@ -519,6 +594,11 @@ static int test_parsing(void)
                                        ret++;
                        }
 
+                       list_for_each_entry_safe(metric, tmp, &compound_list, list) {
+                               expr__add_ref(&ctx, &metric->metric_ref);
+                               free(metric);
+                       }
+
                        if (expr__parse(&result, &ctx, pe->metric_expr, 0)) {
                                expr_failure("Parse failed", map, pe);
                                ret++;
@@ -527,6 +607,7 @@ static int test_parsing(void)
                }
        }
        /* TODO: fail when not ok */
+exit:
        return ret == 0 ? TEST_OK : TEST_SKIP;
 }
 
index 416af61..f05670d 100755 (executable)
@@ -14,18 +14,56 @@ if ! [ -x "$(command -v cc)" ]; then
        exit 2
 fi
 
+# check what we need to test windows binaries
+add_pe=1
+run_pe=1
+if ! perf version --build-options | grep -q 'libbfd: .* on '; then
+       echo "WARNING: perf not built with libbfd. PE binaries will not be tested."
+       add_pe=0
+       run_pe=0
+fi
+if ! which wine > /dev/null; then
+       echo "WARNING: wine not found. PE binaries will not be run."
+       run_pe=0
+fi
+
+# set up wine
+if [ ${run_pe} -eq 1 ]; then
+       wineprefix=$(mktemp -d /tmp/perf.wineprefix.XXX)
+       export WINEPREFIX=${wineprefix}
+       # clear display variables to prevent wine from popping up dialogs
+       unset DISPLAY
+       unset WAYLAND_DISPLAY
+fi
+
 ex_md5=$(mktemp /tmp/perf.ex.MD5.XXX)
 ex_sha1=$(mktemp /tmp/perf.ex.SHA1.XXX)
+ex_pe=$(dirname $0)/../pe-file.exe
 
 echo 'int main(void) { return 0; }' | cc -Wl,--build-id=sha1 -o ${ex_sha1} -x c -
 echo 'int main(void) { return 0; }' | cc -Wl,--build-id=md5 -o ${ex_md5} -x c -
 
-echo "test binaries: ${ex_sha1} ${ex_md5}"
+echo "test binaries: ${ex_sha1} ${ex_md5} ${ex_pe}"
 
 check()
 {
-       id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'`
-
+       case $1 in
+       *.exe)
+               # We don't have a tool that can pull a nicely formatted build-id out of
+               # a PE file, but we can extract the whole section with objcopy and
+               # format it ourselves. The .buildid section is a Debug Directory
+               # containing a CodeView entry:
+               #     https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#debug-directory-image-only
+               #     https://github.com/dotnet/runtime/blob/da94c022576a5c3bbc0e896f006565905eb137f9/docs/design/specs/PE-COFF.md
+               # The build-id starts at byte 33 and must be rearranged into a GUID.
+               id=`objcopy -O binary --only-section=.buildid $1 /dev/stdout | \
+                       cut -c 33-48 | hexdump -ve '/1 "%02x"' | \
+                       sed 's@^\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(.*\)0a$@\4\3\2\1\6\5\8\7\9@'`
+               ;;
+       *)
+               id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'`
+               ;;
+       esac
        echo "build id: ${id}"
 
        link=${build_id_dir}/.build-id/${id:0:2}/${id:2}
@@ -50,7 +88,7 @@ check()
                exit 1
        fi
 
-       ${perf} buildid-cache -l | grep $id
+       ${perf} buildid-cache -l | grep ${id}
        if [ $? -ne 0 ]; then
                echo "failed: ${id} is not reported by \"perf buildid-cache -l\""
                exit 1
@@ -79,16 +117,20 @@ test_record()
 {
        data=$(mktemp /tmp/perf.data.XXX)
        build_id_dir=$(mktemp -d /tmp/perf.debug.XXX)
+       log=$(mktemp /tmp/perf.log.XXX)
        perf="perf --buildid-dir ${build_id_dir}"
 
-       ${perf} record --buildid-all -o ${data} ${1}
+       echo "running: perf record $@"
+       ${perf} record --buildid-all -o ${data} $@ &> ${log}
        if [ $? -ne 0 ]; then
-               echo "failed: record ${1}"
+               echo "failed: record $@"
+               echo "see log: ${log}"
                exit 1
        fi
 
-       check ${1}
+       check ${@: -1}
 
+       rm -f ${log}
        rm -rf ${build_id_dir}
        rm -rf ${data}
 }
@@ -96,12 +138,21 @@ test_record()
 # add binaries manual via perf buildid-cache -a
 test_add ${ex_sha1}
 test_add ${ex_md5}
+if [ ${add_pe} -eq 1 ]; then
+       test_add ${ex_pe}
+fi
 
 # add binaries via perf record post processing
 test_record ${ex_sha1}
 test_record ${ex_md5}
+if [ ${run_pe} -eq 1 ]; then
+       test_record wine ${ex_pe}
+fi
 
 # cleanup
 rm ${ex_sha1} ${ex_md5}
+if [ ${run_pe} -eq 1 ]; then
+       rm -r ${wineprefix}
+fi
 
 exit ${err}
index 5898438..45fc24a 100755 (executable)
@@ -98,6 +98,23 @@ check_line_other()
        fi
 }
 
+daemon_exit()
+{
+       local config=$1
+
+       local line=`perf daemon --config ${config} -x: | head -1`
+       local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'`
+
+       # Reset trap handler.
+       trap - SIGINT SIGTERM
+
+       # stop daemon
+       perf daemon stop --config ${config}
+
+       # ... and wait for the pid to go away
+       tail --pid=${pid} -f /dev/null
+}
+
 daemon_start()
 {
        local config=$1
@@ -105,29 +122,24 @@ daemon_start()
 
        perf daemon start --config ${config}
 
+       # Clean up daemon if interrupted.
+       trap "echo 'FAILED: Signal caught'; daemon_exit ${config}; exit 1" SIGINT SIGTERM
+
        # wait for the session to ping
        local state="FAIL"
+       local retries=0
        while [ "${state}" != "OK" ]; do
                state=`perf daemon ping --config ${config} --session ${session} | awk '{ print $1 }'`
                sleep 0.05
+               retries=$((${retries} +1))
+               if [ ${retries} -ge 600 ]; then
+                       echo "FAILED: Timeout waiting for daemon to ping"
+                       daemon_exit ${config}
+                       exit 1
+               fi
        done
 }
 
-daemon_exit()
-{
-       local base=$1
-       local config=$2
-
-       local line=`perf daemon --config ${config} -x: | head -1`
-       local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'`
-
-       # stop daemon
-       perf daemon stop --config ${config}
-
-       # ... and wait for the pid to go away
-       tail --pid=${pid} -f /dev/null
-}
-
 test_list()
 {
        echo "test daemon list"
@@ -171,7 +183,7 @@ EOF
                         ${base}/session-time/ack "0"
 
        # stop daemon
-       daemon_exit ${base} ${config}
+       daemon_exit ${config}
 
        rm -rf ${base}
        rm -f ${config}
@@ -288,7 +300,7 @@ EOF
        done
 
        # stop daemon
-       daemon_exit ${base} ${config}
+       daemon_exit ${config}
 
        rm -rf ${base}
        rm -f ${config}
@@ -333,7 +345,7 @@ EOF
        fi
 
        # stop daemon
-       daemon_exit ${base} ${config}
+       daemon_exit ${config}
 
        # check that sessions are gone
        if [ -d "/proc/${pid_size}" ]; then
@@ -374,7 +386,7 @@ EOF
        perf daemon signal --config ${config}
 
        # stop daemon
-       daemon_exit ${base} ${config}
+       daemon_exit ${config}
 
        # count is 2 perf.data for signals and 1 for perf record finished
        count=`ls ${base}/session-test/ | grep perf.data | wc -l`
@@ -420,7 +432,7 @@ EOF
        fi
 
        # stop daemon
-       daemon_exit ${base} ${config}
+       daemon_exit ${config}
 
        rm -rf ${base}
        rm -f ${config}
@@ -457,7 +469,7 @@ EOF
        fi
 
        # stop daemon
-       daemon_exit ${base} ${config}
+       daemon_exit ${config}
 
        rm -rf ${base}
        rm -f ${config}
diff --git a/tools/perf/tests/shell/stat+csv_summary.sh b/tools/perf/tests/shell/stat+csv_summary.sh
new file mode 100755 (executable)
index 0000000..5571ff7
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+# perf stat csv summary test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+#
+#     1.001364330 9224197  cycles 8012885033 100.00
+#         summary 9224197  cycles 8012885033 100.00
+#
+perf stat -e cycles  -x' ' -I1000 --interval-count 1 --summary 2>&1 | \
+grep -e summary | \
+while read summary num event run pct
+do
+       if [ $summary != "summary" ]; then
+               exit 1
+       fi
+done
+
+#
+#     1.001360298 9148534  cycles 8012853854 100.00
+#9148534  cycles 8012853854 100.00
+#
+perf stat -e cycles  -x' ' -I1000 --interval-count 1 --summary --no-csv-summary 2>&1 | \
+grep -e summary | \
+while read num event run pct
+do
+       exit 1
+done
+
+exit 0
index ebebd35..e6e35fc 100755 (executable)
@@ -7,6 +7,9 @@ set -e
 # skip if system-wide mode is forbidden
 perf stat -a true > /dev/null 2>&1 || exit 2
 
+# skip if on hybrid platform
+perf stat -a -e cycles sleep 1 2>&1 | grep -e cpu_core && exit 2
+
 test_global_aggr()
 {
        perf stat -a --no-big-num -e cycles,instructions sleep 1  2>&1 | \
diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh
new file mode 100755 (executable)
index 0000000..22eb31e
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+# perf stat --bpf-counters test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# check whether $2 is within +/- 10% of $1
+compare_number()
+{
+       first_num=$1
+       second_num=$2
+
+       # upper bound is first_num * 110%
+       upper=$(( $first_num + $first_num / 10 ))
+       # lower bound is first_num * 90%
+       lower=$(( $first_num - $first_num / 10 ))
+
+       if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then
+               echo "The difference between $first_num and $second_num are greater than 10%."
+               exit 1
+       fi
+}
+
+# skip if --bpf-counters is not supported
+perf stat --bpf-counters true > /dev/null 2>&1 || exit 2
+
+base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
+bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
+
+compare_number $base_cycles $bpf_cycles
+exit 0
index 3ebaa75..62c0ec2 100644 (file)
@@ -18,6 +18,7 @@
 #include "record.h"
 #include "tests.h"
 #include "util/mmap.h"
+#include "pmu.h"
 
 static int spin_sleep(void)
 {
@@ -371,7 +372,10 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_
        cpu_clocks_evsel = evlist__last(evlist);
 
        /* Second event */
-       err = parse_events(evlist, "cycles:u", NULL);
+       if (perf_pmu__has_hybrid())
+               err = parse_events(evlist, "cpu_core/cycles/u", NULL);
+       else
+               err = parse_events(evlist, "cycles:u", NULL);
        if (err) {
                pr_debug("Failed to parse event cycles:u\n");
                goto out_err;
index 74748ed..ec4e3b2 100644 (file)
@@ -8,6 +8,7 @@
 #include "session.h"
 #include "evlist.h"
 #include "debug.h"
+#include "pmu.h"
 #include <linux/err.h>
 
 #define TEMPL "/tmp/perf-test-XXXXXX"
@@ -40,8 +41,16 @@ static int session_write_header(char *path)
        session = perf_session__new(&data, false, NULL);
        TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
 
-       session->evlist = evlist__new_default();
-       TEST_ASSERT_VAL("can't get evlist", session->evlist);
+       if (!perf_pmu__has_hybrid()) {
+               session->evlist = evlist__new_default();
+               TEST_ASSERT_VAL("can't get evlist", session->evlist);
+       } else {
+               struct parse_events_error err;
+
+               session->evlist = evlist__new();
+               TEST_ASSERT_VAL("can't get evlist", session->evlist);
+               parse_events(session->evlist, "cpu_core/cycles/", &err);
+       }
 
        perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
        perf_header__set_feat(&session->header, HEADER_NRCPUS);
@@ -80,7 +89,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
         *   CPU 1 is on core_id 1 and physical_package_id 3
         *
         *   Core_id and physical_package_id are platform and architecture
-        *   dependend and might have higher numbers than the CPU id.
+        *   dependent and might have higher numbers than the CPU id.
         *   This actually depends on the configuration.
         *
         *  In this case process_cpu_topology() prints error message:
index 83fb24d..bc6ef7b 100755 (executable)
@@ -10,8 +10,7 @@ fi
 linux_mount=${linux_header_dir}/mount.h
 
 printf "static const char *fsconfig_cmds[] = {\n"
-regex='^[[:space:]]*+FSCONFIG_([[:alnum:]_]+)[[:space:]]*=[[:space:]]*([[:digit:]]+)[[:space:]]*,[[:space:]]*.*'
-egrep $regex ${linux_mount} | \
-       sed -r "s/$regex/\2 \1/g"       | \
-       xargs printf "\t[%s] = \"%s\",\n"
+ms='[[:space:]]*'
+sed -nr "s/^${ms}FSCONFIG_([[:alnum:]_]+)${ms}=${ms}([[:digit:]]+)${ms},.*/\t[\2] = \"\1\",/p" \
+       ${linux_mount}
 printf "};\n"
index 385894b..b8fc5c5 100644 (file)
@@ -85,7 +85,7 @@ struct mmsghdr {
 
 /*
  *     POSIX 1003.1g - ancillary data object information
- *     Ancillary data consits of a sequence of pairs of
+ *     Ancillary data consists of a sequence of pairs of
  *     (cmsghdr, cmsg_data[])
  */
 
index 35b82ca..ad0a70f 100644 (file)
@@ -381,6 +381,25 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser)
        return true;
 }
 
+#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64)
+
+/* Show the selected source line's file location in the help line, truncated to fit. */
+static void annotate_browser__show_full_location(struct ui_browser *browser)
+{
+       struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
+       struct annotation_line *al = &disasm_line(ab->selection)->al;
+
+       if (al->offset != -1)
+               ui_helpline__puts("Only available for source code lines.");
+       else if (al->fileloc == NULL)
+               ui_helpline__puts("No source file location.");
+       else {
+               char help_line[SYM_TITLE_MAX_SIZE];
+               snprintf(help_line, sizeof(help_line), "Source file location: %s", al->fileloc);
+               ui_helpline__puts(help_line);
+       }
+}
+
 static void ui_browser__init_asm_mode(struct ui_browser *browser)
 {
        struct annotation *notes = browser__annotation(browser);
@@ -388,8 +407,6 @@ static void ui_browser__init_asm_mode(struct ui_browser *browser)
        browser->nr_entries = notes->nr_asm_entries;
 }
 
-#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64)
-
 static int sym_title(struct symbol *sym, struct map *map, char *title,
                     size_t sz, int percent_type)
 {
@@ -398,7 +415,7 @@ static int sym_title(struct symbol *sym, struct map *map, char *title,
 }
 
 /*
- * This can be called from external jumps, i.e. jumps from one functon
+ * This can be called from external jumps, i.e. jumps from one function
  * to another, like from the kernel's entry_SYSCALL_64 function to the
  * swapgs_restore_regs_and_return_to_usermode() function.
  *
@@ -747,6 +764,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
                "c             Show min/max cycle\n"
                "/             Search string\n"
                "k             Toggle line numbers\n"
+               "l             Show full source file location\n"
                "P             Print to [symbol_name].annotation file.\n"
                "r             Run available scripts\n"
                "p             Toggle percent type [local/global]\n"
@@ -760,6 +778,9 @@ static int annotate_browser__run(struct annotate_browser *browser,
                case 'k':
                        notes->options->show_linenr = !notes->options->show_linenr;
                        continue;
+               case 'l':
+                       annotate_browser__show_full_location (&browser->b);
+                       continue;
                case 'H':
                        nd = browser->curr_hot;
                        break;
index 3b9818e..b72ee68 100644 (file)
@@ -117,7 +117,7 @@ static void hist_browser__update_rows(struct hist_browser *hb)
        browser->rows -= browser->extra_title_lines;
        /*
         * Verify if we were at the last line and that line isn't
-        * visibe because we now show the header line(s).
+        * visible because we now show the header line(s).
         */
        index_row = browser->index - browser->top_idx;
        if (index_row >= browser->rows)
@@ -682,6 +682,7 @@ static int hist_browser__handle_hotkey(struct hist_browser *browser, bool warn_l
        switch (key) {
        case K_TIMER: {
                struct hist_browser_timer *hbt = browser->hbt;
+               struct evsel *evsel = hists_to_evsel(browser->hists);
                u64 nr_entries;
 
                WARN_ON_ONCE(!hbt);
@@ -696,10 +697,10 @@ static int hist_browser__handle_hotkey(struct hist_browser *browser, bool warn_l
                ui_browser__update_nr_entries(&browser->b, nr_entries);
 
                if (warn_lost_event &&
-                   (browser->hists->stats.nr_lost_warned !=
-                   browser->hists->stats.nr_events[PERF_RECORD_LOST])) {
-                       browser->hists->stats.nr_lost_warned =
-                               browser->hists->stats.nr_events[PERF_RECORD_LOST];
+                   (evsel->evlist->stats.nr_lost_warned !=
+                    evsel->evlist->stats.nr_events[PERF_RECORD_LOST])) {
+                       evsel->evlist->stats.nr_lost_warned =
+                               evsel->evlist->stats.nr_events[PERF_RECORD_LOST];
                        ui_browser__warn_lost_events(&browser->b);
                }
 
@@ -3416,7 +3417,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
        struct evsel *evsel = list_entry(entry, struct evsel, core.node);
        struct hists *hists = evsel__hists(evsel);
        bool current_entry = ui_browser__is_current_entry(browser, row);
-       unsigned long nr_events = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+       unsigned long nr_events = hists->stats.nr_samples;
        const char *ev_name = evsel__name(evsel);
        char bf[256], unit;
        const char *warn = " ";
@@ -3432,7 +3433,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
 
                for_each_group_member(pos, evsel) {
                        struct hists *pos_hists = evsel__hists(pos);
-                       nr_events += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE];
+                       nr_events += pos_hists->stats.nr_samples;
                }
        }
 
@@ -3441,7 +3442,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
                           unit, unit == ' ' ? "" : " ", ev_name);
        ui_browser__printf(browser, "%s", bf);
 
-       nr_events = hists->stats.nr_events[PERF_RECORD_LOST];
+       nr_events = evsel->evlist->stats.nr_events[PERF_RECORD_LOST];
        if (nr_events != 0) {
                menu->lost_events = true;
                if (!current_entry)
@@ -3647,7 +3648,7 @@ static int block_hists_browser__title(struct hist_browser *browser, char *bf,
 {
        struct hists *hists = evsel__hists(browser->block_evsel);
        const char *evname = evsel__name(browser->block_evsel);
-       unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+       unsigned long nr_samples = hists->stats.nr_samples;
        int ret;
 
        ret = scnprintf(bf, size, "# Samples: %lu", nr_samples);
index 2ab2af4..f362704 100644 (file)
@@ -897,10 +897,12 @@ out:
        return ret;
 }
 
-size_t events_stats__fprintf(struct events_stats *stats, FILE *fp)
+size_t events_stats__fprintf(struct events_stats *stats, FILE *fp,
+                            bool skip_empty)
 {
        int i;
        size_t ret = 0;
+       u32 total = stats->nr_events[0];
 
        for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
                const char *name;
@@ -908,8 +910,17 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp)
                name = perf_event__name(i);
                if (!strcmp(name, "UNKNOWN"))
                        continue;
+               if (skip_empty && !stats->nr_events[i])
+                       continue;
 
-               ret += fprintf(fp, "%16s events: %10d\n", name, stats->nr_events[i]);
+               if (i && total) {
+                       ret += fprintf(fp, "%16s events: %10d  (%4.1f%%)\n",
+                                      name, stats->nr_events[i],
+                                      100.0 * stats->nr_events[i] / total);
+               } else {
+                       ret += fprintf(fp, "%16s events: %10d\n",
+                                      name, stats->nr_events[i]);
+               }
        }
 
        return ret;
index e3e12f9..8c0d9f3 100644 (file)
@@ -10,6 +10,7 @@ perf-y += db-export.o
 perf-y += env.o
 perf-y += event.o
 perf-y += evlist.o
+perf-y += evlist-hybrid.o
 perf-y += sideband_evlist.o
 perf-y += evsel.o
 perf-y += evsel_fprintf.o
@@ -23,6 +24,7 @@ perf-y += llvm-utils.o
 perf-y += mmap.o
 perf-y += memswap.o
 perf-y += parse-events.o
+perf-y += parse-events-hybrid.o
 perf-y += perf_regs.o
 perf-y += path.o
 perf-y += print_binary.o
@@ -69,6 +71,7 @@ perf-y += parse-events-bison.o
 perf-y += pmu.o
 perf-y += pmu-flex.o
 perf-y += pmu-bison.o
+perf-y += pmu-hybrid.o
 perf-y += trace-event-read.o
 perf-y += trace-event-info.o
 perf-y += trace-event-scripting.o
@@ -102,6 +105,7 @@ perf-y += rwsem.o
 perf-y += thread-stack.o
 perf-y += spark.o
 perf-y += topdown.o
+perf-y += iostat.o
 perf-y += stream.o
 perf-$(CONFIG_AUXTRACE) += auxtrace.o
 perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
@@ -164,6 +168,7 @@ perf-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
 perf-$(CONFIG_LIBUNWIND_AARCH64)  += libunwind/arm64.o
 
 perf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o
+perf-y += data-convert-json.o
 
 perf-y += scripting-engines/
 
index e60841b..abe1499 100644 (file)
@@ -1161,6 +1161,7 @@ struct annotate_args {
        s64                       offset;
        char                      *line;
        int                       line_nr;
+       char                      *fileloc;
 };
 
 static void annotation_line__init(struct annotation_line *al,
@@ -1170,6 +1171,7 @@ static void annotation_line__init(struct annotation_line *al,
        al->offset = args->offset;
        al->line = strdup(args->line);
        al->line_nr = args->line_nr;
+       al->fileloc = args->fileloc;
        al->data_nr = nr;
 }
 
@@ -1366,7 +1368,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
 {
        struct disasm_line *dl = container_of(al, struct disasm_line, al);
        static const char *prev_line;
-       static const char *prev_color;
 
        if (al->offset != -1) {
                double max_percent = 0.0;
@@ -1405,20 +1406,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
 
                color = get_percent_color(max_percent);
 
-               /*
-                * Also color the filename and line if needed, with
-                * the same color than the percentage. Don't print it
-                * twice for close colored addr with the same filename:line
-                */
-               if (al->path) {
-                       if (!prev_line || strcmp(prev_line, al->path)
-                                      || color != prev_color) {
-                               color_fprintf(stdout, color, " %s", al->path);
-                               prev_line = al->path;
-                               prev_color = color;
-                       }
-               }
-
                for (i = 0; i < nr_percent; i++) {
                        struct annotation_data *data = &al->data[i];
                        double percent;
@@ -1439,6 +1426,19 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
                printf(" : ");
 
                disasm_line__print(dl, start, addr_fmt_width);
+
+               /*
+                * Also print the filename and line if needed, in the same
+                * color as the percentage. Don't repeat it while it matches
+                * the last filename:line printed.
+                */
+               if (al->path) {
+                       if (!prev_line || strcmp(prev_line, al->path)) {
+                               color_fprintf(stdout, color, " // %s", al->path);
+                               prev_line = al->path;
+                       }
+               }
+
                printf("\n");
        } else if (max_lines && printed >= max_lines)
                return 1;
@@ -1454,7 +1454,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
                if (!*al->line)
                        printf(" %*s:\n", width, " ");
                else
-                       printf(" %*s:     %*s %s\n", width, " ", addr_fmt_width, " ", al->line);
+                       printf(" %*s: %-*d %s\n", width, " ", addr_fmt_width, al->line_nr, al->line);
        }
 
        return 0;
@@ -1482,7 +1482,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
  */
 static int symbol__parse_objdump_line(struct symbol *sym,
                                      struct annotate_args *args,
-                                     char *parsed_line, int *line_nr)
+                                     char *parsed_line, int *line_nr, char **fileloc)
 {
        struct map *map = args->ms.map;
        struct annotation *notes = symbol__annotation(sym);
@@ -1494,6 +1494,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
        /* /filename:linenr ? Save line number and ignore. */
        if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) {
                *line_nr = atoi(parsed_line + match[1].rm_so);
+               *fileloc = strdup(parsed_line);
                return 0;
        }
 
@@ -1513,6 +1514,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
        args->offset  = offset;
        args->line    = parsed_line;
        args->line_nr = *line_nr;
+       args->fileloc = *fileloc;
        args->ms.sym  = sym;
 
        dl = disasm_line__new(args);
@@ -1807,6 +1809,7 @@ static int symbol__disassemble_bpf(struct symbol *sym,
                        args->offset = -1;
                        args->line = strdup(srcline);
                        args->line_nr = 0;
+                       args->fileloc = NULL;
                        args->ms.sym  = sym;
                        dl = disasm_line__new(args);
                        if (dl) {
@@ -1818,6 +1821,7 @@ static int symbol__disassemble_bpf(struct symbol *sym,
                args->offset = pc;
                args->line = buf + prev_buf_size;
                args->line_nr = 0;
+               args->fileloc = NULL;
                args->ms.sym  = sym;
                dl = disasm_line__new(args);
                if (dl)
@@ -1852,6 +1856,7 @@ symbol__disassemble_bpf_image(struct symbol *sym,
        args->offset = -1;
        args->line = strdup("to be implemented");
        args->line_nr = 0;
+       args->fileloc = NULL;
        dl = disasm_line__new(args);
        if (dl)
                annotation_line__add(&dl->al, &notes->src->source);
@@ -1933,6 +1938,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
        bool delete_extract = false;
        bool decomp = false;
        int lineno = 0;
+       char *fileloc = NULL;
        int nline;
        char *line;
        size_t line_len;
@@ -2060,7 +2066,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
                 * See disasm_line__new() and struct disasm_line::line_nr.
                 */
                if (symbol__parse_objdump_line(sym, args, expanded_line,
-                                              &lineno) < 0)
+                                              &lineno, &fileloc) < 0)
                        break;
                nline++;
        }
@@ -3144,6 +3150,10 @@ static int annotation__config(const char *var, const char *value, void *data)
                opt->use_offset = perf_config_bool("use_offset", value);
        } else if (!strcmp(var, "annotate.disassembler_style")) {
                opt->disassembler_style = value;
+       } else if (!strcmp(var, "annotate.demangle")) {
+               symbol_conf.demangle = perf_config_bool("demangle", value);
+       } else if (!strcmp(var, "annotate.demangle_kernel")) {
+               symbol_conf.demangle_kernel = perf_config_bool("demangle_kernel", value);
        } else {
                pr_debug("%s variable unknown, ignoring...", var);
        }
index 096cdaf..3757416 100644 (file)
@@ -84,6 +84,7 @@ struct annotation_options {
             print_lines,
             full_path,
             show_linenr,
+            show_fileloc,
             show_nr_jumps,
             show_minmax_cycle,
             show_asm_raw,
@@ -136,6 +137,7 @@ struct annotation_line {
        s64                      offset;
        char                    *line;
        int                      line_nr;
+       char                    *fileloc;
        int                      jump_sources;
        float                    ipc;
        u64                      cycles;
index 9087f1b..fbb3c40 100644 (file)
@@ -671,7 +671,7 @@ int bpf__probe(struct bpf_object *obj)
                 * After probing, let's consider prologue, which
                 * adds program fetcher to BPF programs.
                 *
-                * hook_load_preprocessorr() hooks pre-processor
+                * hook_load_preprocessor() hooks pre-processor
                 * to bpf_program, let it generate prologue
                 * dynamically during loading.
                 */
index 04f8912..ddb52f7 100644 (file)
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <unistd.h>
+#include <sys/file.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <linux/err.h>
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
+#include <api/fs/fs.h>
+#include <perf/bpf_perf.h>
 
 #include "bpf_counter.h"
 #include "counts.h"
 #include "debug.h"
 #include "evsel.h"
+#include "evlist.h"
 #include "target.h"
+#include "cpumap.h"
+#include "thread_map.h"
 
 #include "bpf_skel/bpf_prog_profiler.skel.h"
+#include "bpf_skel/bperf_u.h"
+#include "bpf_skel/bperf_leader.skel.h"
+#include "bpf_skel/bperf_follower.skel.h"
+
+#define ATTR_MAP_SIZE 16
 
 static inline void *u64_to_ptr(__u64 ptr)
 {
@@ -204,6 +215,17 @@ static int bpf_program_profiler__enable(struct evsel *evsel)
        return 0;
 }
 
+static int bpf_program_profiler__disable(struct evsel *evsel)
+{
+       struct bpf_counter *counter;
+
+       list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
+               assert(counter->skel != NULL);
+               bpf_prog_profiler_bpf__detach(counter->skel);
+       }
+       return 0;
+}
+
 static int bpf_program_profiler__read(struct evsel *evsel)
 {
        // perf_cpu_map uses /sys/devices/system/cpu/online
@@ -269,22 +291,527 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu,
 struct bpf_counter_ops bpf_program_profiler_ops = {
        .load       = bpf_program_profiler__load,
        .enable     = bpf_program_profiler__enable,
+       .disable    = bpf_program_profiler__disable,
        .read       = bpf_program_profiler__read,
        .destroy    = bpf_program_profiler__destroy,
        .install_pe = bpf_program_profiler__install_pe,
 };
 
+static __u32 bpf_link_get_id(int fd)
+{
+       struct bpf_link_info link_info = {0};
+       __u32 link_info_len = sizeof(link_info);
+
+       bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+       return link_info.id;
+}
+
+static __u32 bpf_link_get_prog_id(int fd)
+{
+       struct bpf_link_info link_info = {0};
+       __u32 link_info_len = sizeof(link_info);
+
+       bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+       return link_info.prog_id;
+}
+
+static __u32 bpf_map_get_id(int fd)
+{
+       struct bpf_map_info map_info = {0};
+       __u32 map_info_len = sizeof(map_info);
+
+       bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len);
+       return map_info.id;
+}
+
+/* Check that the map behind @attr_map_fd has the key and value sizes bperf uses. */
+static bool bperf_attr_map_compatible(int attr_map_fd)
+{
+       struct bpf_map_info info = {0};
+       __u32 info_len = sizeof(info);
+
+       /* a map we cannot query is treated as incompatible */
+       if (bpf_obj_get_info_by_fd(attr_map_fd, &info, &info_len))
+               return false;
+
+       return info.key_size == sizeof(struct perf_event_attr) &&
+              info.value_size == sizeof(struct perf_event_attr_map_entry);
+}
+
+/*
+ * Open the map pinned at target->attr_map (default: <sysfs>/fs/bpf/ plus
+ * BPF_PERF_DEFAULT_ATTR_MAP_PATH), creating and pinning it if the path is
+ * missing, and flock() it exclusively.  Returns the map fd, or -1 on failure.
+ */
+static int bperf_lock_attr_map(struct target *target)
+{
+       char path[PATH_MAX];
+       int map_fd;
+
+       if (!target->attr_map) {
+               scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(),
+                         BPF_PERF_DEFAULT_ATTR_MAP_PATH);
+       } else {
+               scnprintf(path, PATH_MAX, "%s", target->attr_map);
+       }
+
+       if (access(path, F_OK) == 0) {
+               /* path exists: open whatever is pinned there */
+               map_fd = bpf_obj_get(path);
+               if (map_fd < 0)
+                       return -1;
+       } else {
+               map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+                                       sizeof(struct perf_event_attr),
+                                       sizeof(struct perf_event_attr_map_entry),
+                                       ATTR_MAP_SIZE, 0);
+               if (map_fd < 0)
+                       return -1;
+
+               if (bpf_obj_pin(map_fd, path)) {
+                       /* someone pinned the map in parallel? */
+                       close(map_fd);
+                       map_fd = bpf_obj_get(path);
+                       if (map_fd < 0)
+                               return -1;
+               }
+       }
+
+       /* an incompatible map and a failed lock both give the fd back */
+       if (!bperf_attr_map_compatible(map_fd) || flock(map_fd, LOCK_EX)) {
+               close(map_fd);
+               return -1;
+       }
+       return map_fd;
+}
+
+/* Test-run the leader program on @cpu; returns bpf_prog_test_run_opts()'s result. */
+static int bperf_trigger_reading(int prog_fd, int cpu)
+{
+       DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+                           .ctx_in = NULL,
+                           .ctx_size_in = 0,
+                           .flags = BPF_F_TEST_RUN_ON_CPU,
+                           .cpu = cpu,
+                           .retval = 0,
+               );
+
+       return bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static int bperf_check_target(struct evsel *evsel,
+                             struct target *target,
+                             enum bperf_filter_type *filter_type,
+                             __u32 *filter_entry_cnt)
+{
+       if (evsel->leader->core.nr_members > 1) {
+               pr_err("bpf managed perf events do not yet support groups.\n");
+               return -1;
+       }
+
+       /* determine filter type based on target */
+       if (target->system_wide) {
+               *filter_type = BPERF_FILTER_GLOBAL;
+               *filter_entry_cnt = 1;
+       } else if (target->cpu_list) {
+               *filter_type = BPERF_FILTER_CPU;
+               *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
+       } else if (target->tid) {
+               *filter_type = BPERF_FILTER_PID;
+               *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+       } else if (target->pid || evsel->evlist->workload.pid != -1) {
+               *filter_type = BPERF_FILTER_TGID;
+               *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+       } else {
+               pr_err("bpf managed perf events do not yet support these targets.\n");
+               return -1;
+       }
+
+       return 0;
+}
+
+static struct perf_cpu_map *all_cpu_map;
+
+static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
+                                      struct perf_event_attr_map_entry *entry)
+{
+       struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
+       int link_fd, diff_map_fd, err;
+       struct bpf_link *link = NULL;
+
+       if (!skel) {
+               pr_err("Failed to open leader skeleton\n");
+               return -1;
+       }
+
+       bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus());
+       err = bperf_leader_bpf__load(skel);
+       if (err) {
+               pr_err("Failed to load leader skeleton\n");
+               goto out;
+       }
+
+       err = -1;
+       link = bpf_program__attach(skel->progs.on_switch);
+       if (!link) {
+               pr_err("Failed to attach leader program\n");
+               goto out;
+       }
+
+       link_fd = bpf_link__fd(link);
+       diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
+       entry->link_id = bpf_link_get_id(link_fd);
+       entry->diff_map_id = bpf_map_get_id(diff_map_fd);
+       err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
+       assert(err == 0);
+
+       evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
+       assert(evsel->bperf_leader_link_fd >= 0);
+
+       /*
+        * save leader_skel for install_pe, which is called within
+        * following evsel__open_per_cpu call
+        */
+       evsel->leader_skel = skel;
+       evsel__open_per_cpu(evsel, all_cpu_map, -1);
+
+out:
+       bperf_leader_bpf__destroy(skel);
+       bpf_link__destroy(link);
+       return err;
+}
+
+/*
+ * bperf__load - set up the shared leader program and this session's follower.
+ *
+ * Step 1 takes the lock on the perf_event_attr map, looks up (or inserts) the
+ * entry for evsel->core.attr and obtains fds on the leader bpf_link and
+ * program, reloading the leader when its link id can no longer be opened.
+ * Step 2 opens, configures, loads and attaches the follower skeleton and
+ * fills its filter map from the evsel's thread or cpu map.
+ *
+ * Returns 0 on success and a non-zero value on failure.  On failure the
+ * leader link/prog fds obtained here are closed.  The diff map fd and the
+ * attr map lock/fd are always released before returning.
+ */
+static int bperf__load(struct evsel *evsel, struct target *target)
+{
+       struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
+       int attr_map_fd, diff_map_fd = -1, err;
+       enum bperf_filter_type filter_type;
+       __u32 filter_entry_cnt, i;
+
+       if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
+               return -1;
+
+       if (!all_cpu_map) {
+               all_cpu_map = perf_cpu_map__new(NULL);
+               if (!all_cpu_map)
+                       return -1;
+       }
+
+       evsel->bperf_leader_prog_fd = -1;
+       evsel->bperf_leader_link_fd = -1;
+
+       /*
+        * Step 1: hold a fd on the leader program and the bpf_link; if
+        * the leader's link is already gone, reload the program.
+        * Use flock() to ensure exclusive access to the perf_event_attr
+        * map.
+        */
+       attr_map_fd = bperf_lock_attr_map(target);
+       if (attr_map_fd < 0) {
+               pr_err("Failed to lock perf_event_attr map\n");
+               return -1;
+       }
+
+       err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
+       if (err) {
+               /* no entry for this attr yet: insert the initial one */
+               err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
+               if (err)
+                       goto out;
+       }
+
+       evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
+       if (evsel->bperf_leader_link_fd < 0 &&
+           bperf_reload_leader_program(evsel, attr_map_fd, &entry)) {
+               /* err is still 0 here; make sure the failure is reported */
+               err = -1;
+               goto out;
+       }
+
+       /*
+        * The bpf_link holds reference to the leader program, and the
+        * leader program holds reference to the maps. Therefore, if
+        * link_id is valid, diff_map_id should also be valid.
+        */
+       evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
+               bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
+       assert(evsel->bperf_leader_prog_fd >= 0);
+
+       diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
+       assert(diff_map_fd >= 0);
+
+       /*
+        * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
+        * whether the kernel supports it
+        */
+       err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
+       if (err) {
+               pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
+                      "Therefore, --use-bpf might show inaccurate readings\n");
+               goto out;
+       }
+
+       /* Step 2: load the follower skeleton */
+       evsel->follower_skel = bperf_follower_bpf__open();
+       if (!evsel->follower_skel) {
+               /* err is 0 from the successful test run above */
+               err = -1;
+               pr_err("Failed to open follower skeleton\n");
+               goto out;
+       }
+
+       /* attach fexit program to the leader program */
+       bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
+                                      evsel->bperf_leader_prog_fd, "on_switch");
+
+       /* connect to leader diff_reading map */
+       bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);
+
+       /* set up reading map */
+       bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
+                                filter_entry_cnt);
+       /* set up follower filter based on target */
+       bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
+                                filter_entry_cnt);
+       err = bperf_follower_bpf__load(evsel->follower_skel);
+       if (err) {
+               pr_err("Failed to load follower skeleton\n");
+               bperf_follower_bpf__destroy(evsel->follower_skel);
+               evsel->follower_skel = NULL;
+               goto out;
+       }
+
+       /* map each monitored pid/tgid/cpu to its index in accum_readings */
+       for (i = 0; i < filter_entry_cnt; i++) {
+               int filter_map_fd;
+               __u32 key;
+
+               if (filter_type == BPERF_FILTER_PID ||
+                   filter_type == BPERF_FILTER_TGID)
+                       key = evsel->core.threads->map[i].pid;
+               else if (filter_type == BPERF_FILTER_CPU)
+                       key = evsel->core.cpus->map[i];
+               else
+                       break;
+
+               filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
+               bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+       }
+
+       evsel->follower_skel->bss->type = filter_type;
+
+       err = bperf_follower_bpf__attach(evsel->follower_skel);
+
+out:
+       if (err && evsel->bperf_leader_link_fd >= 0)
+               close(evsel->bperf_leader_link_fd);
+       if (err && evsel->bperf_leader_prog_fd >= 0)
+               close(evsel->bperf_leader_prog_fd);
+       if (diff_map_fd >= 0)
+               close(diff_map_fd);
+
+       flock(attr_map_fd, LOCK_UN);
+       close(attr_map_fd);
+
+       return err;
+}
+
+/*
+ * Store the perf_event fd for @cpu in the leader skeleton's "events" map.
+ * Returns the result of bpf_map_update_elem().
+ */
+static int bperf__install_pe(struct evsel *evsel, int cpu, int fd)
+{
+       int events_map_fd = bpf_map__fd(evsel->leader_skel->maps.events);
+
+       return bpf_map_update_elem(events_map_fd, &cpu, &fd, BPF_ANY);
+}
+
+/*
+ * Run the leader prog once on every cpu in all_cpu_map, so that the
+ * accumulated readings are up to date before they are read.  The result
+ * of each individual trigger is ignored; always returns 0.
+ */
+static int bperf_sync_counters(struct evsel *evsel)
+{
+       const int nr_cpus = all_cpu_map->nr;
+       int idx;
+
+       for (idx = 0; idx < nr_cpus; idx++)
+               bperf_trigger_reading(evsel->bperf_leader_prog_fd,
+                                     all_cpu_map->map[idx]);
+
+       return 0;
+}
+
+/* Set the follower prog's "enabled" flag.  Always returns 0. */
+static int bperf__enable(struct evsel *evsel)
+{
+       struct bperf_follower_bpf *skel = evsel->follower_skel;
+
+       skel->bss->enabled = 1;
+       return 0;
+}
+
+/* Clear the follower prog's "enabled" flag.  Always returns 0. */
+static int bperf__disable(struct evsel *evsel)
+{
+       struct bperf_follower_bpf *skel = evsel->follower_skel;
+
+       skel->bss->enabled = 0;
+       return 0;
+}
+
+/*
+ * Read this evsel's accumulated readings into evsel->counts.
+ *
+ * Triggers the leader program through bperf_sync_counters() and then looks
+ * up every entry of the follower's accum_readings map.  How an entry is
+ * copied into evsel->counts depends on the follower's filter type.
+ *
+ * Returns 0 on success, or the bpf_map_lookup_elem() error on failure.
+ */
+static int bperf__read(struct evsel *evsel)
+{
+       struct bperf_follower_bpf *skel = evsel->follower_skel;
+       __u32 num_cpu_bpf = cpu__max_cpu();
+       /* receives the per-cpu values of one accum_readings entry */
+       struct bpf_perf_event_value values[num_cpu_bpf];
+       int reading_map_fd, err = 0;
+       __u32 i, j, num_cpu;
+
+       bperf_sync_counters(evsel);
+       reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
+
+       for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+               __u32 cpu;
+
+               err = bpf_map_lookup_elem(reading_map_fd, &i, values);
+               if (err)
+                       goto out;
+               switch (evsel->follower_skel->bss->type) {
+               case BPERF_FILTER_GLOBAL:
+                       /* global mode keeps everything in map entry 0 */
+                       assert(i == 0);
+
+                       num_cpu = all_cpu_map->nr;
+                       for (j = 0; j < num_cpu; j++) {
+                               cpu = all_cpu_map->map[j];
+                               perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
+                               perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
+                               perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
+                       }
+                       break;
+               case BPERF_FILTER_CPU:
+                       /* entry i belongs to the i-th cpu of the evsel's cpu map */
+                       cpu = evsel->core.cpus->map[i];
+                       perf_counts(evsel->counts, i, 0)->val = values[cpu].counter;
+                       perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled;
+                       perf_counts(evsel->counts, i, 0)->run = values[cpu].running;
+                       break;
+               case BPERF_FILTER_PID:
+               case BPERF_FILTER_TGID:
+                       /* entry i belongs to the i-th thread: sum it over all cpus */
+                       perf_counts(evsel->counts, 0, i)->val = 0;
+                       perf_counts(evsel->counts, 0, i)->ena = 0;
+                       perf_counts(evsel->counts, 0, i)->run = 0;
+
+                       for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
+                               perf_counts(evsel->counts, 0, i)->val += values[cpu].counter;
+                               perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled;
+                               perf_counts(evsel->counts, 0, i)->run += values[cpu].running;
+                       }
+                       break;
+               default:
+                       break;
+               }
+       }
+out:
+       return err;
+}
+
+/*
+ * Tear down the per-session state: destroy the follower skeleton, then
+ * close the fds held on the leader program and on its bpf_link.  Always
+ * returns 0.
+ */
+static int bperf__destroy(struct evsel *evsel)
+{
+       bperf_follower_bpf__destroy(evsel->follower_skel);
+       close(evsel->bperf_leader_prog_fd);
+       close(evsel->bperf_leader_link_fd);
+       return 0;
+}
+
+/*
+ * bperf: share hardware PMCs with BPF
+ *
+ * perf uses performance monitoring counters (PMC) to monitor system
+ * performance. The PMCs are limited hardware resources. For example,
+ * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
+ *
+ * Modern data center systems use these PMCs in many different ways:
+ * system level monitoring, (maybe nested) container level monitoring, per
+ * process monitoring, profiling (in sample mode), etc. In some cases,
+ * there are more active perf_events than available hardware PMCs. To allow
+ * all perf_events to have a chance to run, it is necessary to do expensive
+ * time multiplexing of events.
+ *
+ * On the other hand, many monitoring tools count the common metrics
+ * (cycles, instructions). It is a waste to have multiple tools create
+ * multiple perf_events of "cycles" and occupy multiple PMCs.
+ *
+ * bperf tries to reduce such waste by allowing multiple perf_events of
+ * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
+ * of having each perf-stat session read its own perf_events, bperf uses
+ * BPF programs to read the perf_events and aggregate readings to BPF maps.
+ * Then, the perf-stat session(s) read the values from these BPF maps.
+ *
+ *                                ||
+ *       shared progs and maps <- || -> per session progs and maps
+ *                                ||
+ *   ---------------              ||
+ *   | perf_events |              ||
+ *   ---------------       fexit  ||      -----------------
+ *          |             --------||----> | follower prog |
+ *       --------------- /        || ---  -----------------
+ * cs -> | leader prog |/         ||/        |         |
+ *   --> ---------------         /||  --------------  ------------------
+ *  /       |         |         / ||  | filter map |  | accum_readings |
+ * /  ------------  ------------  ||  --------------  ------------------
+ * |  | prev map |  | diff map |  ||                        |
+ * |  ------------  ------------  ||                        |
+ *  \                             ||                        |
+ * = \ ==================================================== | ============
+ *    \                                                    /   user space
+ *     \                                                  /
+ *      \                                                /
+ *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
+ *        \                                            /
+ *         \                                          /
+ *          \------  perf-stat ----------------------/
+ *
+ * The figure above shows the architecture of bperf. Note that the figure
+ * is divided into 3 regions: shared progs and maps (top left), per session
+ * progs and maps (top right), and user space (bottom).
+ *
+ * The leader prog is triggered on each context switch (cs). The leader
+ * prog reads perf_events and stores the difference (current_reading -
+ * previous_reading) to the diff map. For the same metric, e.g. "cycles",
+ * multiple perf-stat sessions share the same leader prog.
+ *
+ * Each perf-stat session creates a follower prog as fexit program to the
+ * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
+ * follower progs to the same leader prog. The follower prog checks current
+ * task and processor ID to decide whether to add the value from the diff
+ * map to its accumulated reading map (accum_readings).
+ *
+ * Finally, perf-stat user space reads the value from accum_reading map.
+ *
+ * Besides context switch, it is also necessary to trigger the leader prog
+ * before perf-stat reads the value. Otherwise, the accum_reading map may
+ * not have the latest reading from the perf_events. This is achieved by
+ * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU.
+ *
+ * Comment before the definition of struct perf_event_attr_map_entry
+ * describes how different sessions of perf-stat share information about
+ * the leader prog.
+ */
+
+/* bpf_counter_ops callbacks implemented by the bperf__*() functions. */
+struct bpf_counter_ops bperf_ops = {
+       .load       = bperf__load,
+       .enable     = bperf__enable,
+       .disable    = bperf__disable,
+       .read       = bperf__read,
+       .install_pe = bperf__install_pe,
+       .destroy    = bperf__destroy,
+};
+
+/*
+ * True when the evsel has nothing for the bpf_counter ops to act on: its
+ * bpf_counter_list is empty and it has no follower skeleton.
+ */
+static inline bool bpf_counter_skip(struct evsel *evsel)
+{
+       if (evsel->follower_skel != NULL)
+               return false;
+
+       return list_empty(&evsel->bpf_counter_list);
+}
+
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
 {
-       if (list_empty(&evsel->bpf_counter_list))
+       if (bpf_counter_skip(evsel))
                return 0;
        return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
 }
 
 int bpf_counter__load(struct evsel *evsel, struct target *target)
 {
-       if (target__has_bpf(target))
+       if (target->bpf_str)
                evsel->bpf_counter_ops = &bpf_program_profiler_ops;
+       else if (target->use_bpf || evsel->bpf_counter ||
+                evsel__match_bpf_counter_events(evsel->name))
+               evsel->bpf_counter_ops = &bperf_ops;
 
        if (evsel->bpf_counter_ops)
                return evsel->bpf_counter_ops->load(evsel, target);
@@ -293,21 +820,28 @@ int bpf_counter__load(struct evsel *evsel, struct target *target)
 
 int bpf_counter__enable(struct evsel *evsel)
 {
-       if (list_empty(&evsel->bpf_counter_list))
+       if (bpf_counter_skip(evsel))
                return 0;
        return evsel->bpf_counter_ops->enable(evsel);
 }
 
+/* Call the evsel's ->disable() op; returns 0 when bpf_counter_skip(). */
+int bpf_counter__disable(struct evsel *evsel)
+{
+       return bpf_counter_skip(evsel) ?
+               0 : evsel->bpf_counter_ops->disable(evsel);
+}
+
 int bpf_counter__read(struct evsel *evsel)
 {
-       if (list_empty(&evsel->bpf_counter_list))
+       if (bpf_counter_skip(evsel))
                return -EAGAIN;
        return evsel->bpf_counter_ops->read(evsel);
 }
 
 void bpf_counter__destroy(struct evsel *evsel)
 {
-       if (list_empty(&evsel->bpf_counter_list))
+       if (bpf_counter_skip(evsel))
                return;
        evsel->bpf_counter_ops->destroy(evsel);
        evsel->bpf_counter_ops = NULL;
index 2eca210..d6d907c 100644 (file)
@@ -18,6 +18,7 @@ typedef int (*bpf_counter_evsel_install_pe_op)(struct evsel *evsel,
 struct bpf_counter_ops {
        bpf_counter_evsel_target_op load;
        bpf_counter_evsel_op enable;
+       bpf_counter_evsel_op disable;
        bpf_counter_evsel_op read;
        bpf_counter_evsel_op destroy;
        bpf_counter_evsel_install_pe_op install_pe;
@@ -32,13 +33,14 @@ struct bpf_counter {
 
 int bpf_counter__load(struct evsel *evsel, struct target *target);
 int bpf_counter__enable(struct evsel *evsel);
+int bpf_counter__disable(struct evsel *evsel);
 int bpf_counter__read(struct evsel *evsel);
 void bpf_counter__destroy(struct evsel *evsel);
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd);
 
 #else /* HAVE_BPF_SKEL */
 
-#include<linux/err.h>
+#include <linux/err.h>
 
 static inline int bpf_counter__load(struct evsel *evsel __maybe_unused,
                                    struct target *target __maybe_unused)
@@ -51,6 +53,11 @@ static inline int bpf_counter__enable(struct evsel *evsel __maybe_unused)
        return 0;
 }
 
+static inline int bpf_counter__disable(struct evsel *evsel __maybe_unused)
+{
+       return 0;
+}
+
 static inline int bpf_counter__read(struct evsel *evsel __maybe_unused)
 {
        return -EAGAIN;
diff --git a/tools/perf/util/bpf_skel/bperf.h b/tools/perf/util/bpf_skel/bperf.h
new file mode 100644 (file)
index 0000000..186a555
--- /dev/null
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_H
+#define __BPERF_STAT_H
+
+typedef struct {       /* map layout used by the bperf *_readings maps */
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(struct bpf_perf_event_value));
+       __uint(max_entries, 1); /* accum_readings is resized by user space */
+} reading_map;
+
+#endif /* __BPERF_STAT_H */
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
new file mode 100644 (file)
index 0000000..b8fa3cb
--- /dev/null
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+#include "bperf_u.h"
+
+reading_map diff_readings SEC(".maps"); /* user space reuses the leader's map fd */
+reading_map accum_readings SEC(".maps");        /* this session's accumulated values */
+
+struct {       /* cpu, pid or tgid -> key into accum_readings */
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(__u32));
+} filter SEC(".maps");
+
+enum bperf_filter_type type = 0;       /* written by user space after load */
+int enabled = 0;       /* fexit_XXX does nothing while this is 0 */
+
+SEC("fexit/XXX")       /* user space sets the real attach target before load */
+int BPF_PROG(fexit_XXX)
+{
+       struct bpf_perf_event_value *diff_val, *accum_val;
+       __u32 filter_key, zero = 0;
+       __u32 *accum_key;
+
+       if (!enabled)
+               return 0;
+
+       switch (type) {
+       case BPERF_FILTER_GLOBAL:
+               accum_key = &zero;      /* always entry 0; the filter map is not used */
+               goto do_add;
+       case BPERF_FILTER_CPU:
+               filter_key = bpf_get_smp_processor_id();
+               break;
+       case BPERF_FILTER_PID:
+               filter_key = bpf_get_current_pid_tgid() & 0xffffffff;   /* low 32 bits */
+               break;
+       case BPERF_FILTER_TGID:
+               filter_key = bpf_get_current_pid_tgid() >> 32;  /* high 32 bits */
+               break;
+       default:
+               return 0;       /* includes the initial type of 0 */
+       }
+
+       accum_key = bpf_map_lookup_elem(&filter, &filter_key);
+       if (!accum_key) /* this cpu/pid/tgid is not in the filter map */
+               return 0;
+
+do_add:
+       diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+       if (!diff_val)
+               return 0;
+
+       accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+       if (!accum_val)
+               return 0;
+
+       accum_val->counter += diff_val->counter;        /* add the diff to the accumulator */
+       accum_val->enabled += diff_val->enabled;
+       accum_val->running += diff_val->running;
+
+       return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
new file mode 100644 (file)
index 0000000..4f70d14
--- /dev/null
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+
+struct {       /* perf_event fds, keyed by cpu */
+       __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(int));
+       __uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+reading_map prev_readings SEC(".maps"); /* reading saved by the previous run */
+reading_map diff_readings SEC(".maps"); /* current reading minus prev_readings */
+
+SEC("raw_tp/sched_switch")
+int BPF_PROG(on_switch)        /* stores (current - previous) reading in diff_readings */
+{
+       struct bpf_perf_event_value val, *prev_val, *diff_val;
+       __u32 key = bpf_get_smp_processor_id(); /* index into the events map */
+       __u32 zero = 0;
+       long err;
+
+       prev_val = bpf_map_lookup_elem(&prev_readings, &zero);
+       if (!prev_val)
+               return 0;
+
+       diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+       if (!diff_val)
+               return 0;
+
+       err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+       if (err)
+               return 0;       /* NOTE(review): diff_readings keeps its old value here */
+
+       diff_val->counter = val.counter - prev_val->counter;
+       diff_val->enabled = val.enabled - prev_val->enabled;
+       diff_val->running = val.running - prev_val->running;
+       *prev_val = val;        /* baseline for the next run */
+       return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
new file mode 100644 (file)
index 0000000..1ce0c2c
--- /dev/null
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_U_H
+#define __BPERF_STAT_U_H
+
+enum bperf_filter_type {       /* how the follower prog selects what to accumulate */
+       BPERF_FILTER_GLOBAL = 1,        /* no filtering; 0 is left as "no type set" */
+       BPERF_FILTER_CPU,       /* filter key is the current processor id */
+       BPERF_FILTER_PID,       /* filter key is the low 32 bits of pid_tgid */
+       BPERF_FILTER_TGID,      /* filter key is the high 32 bits of pid_tgid */
+};
+
+#endif /* __BPERF_STAT_U_H */
index c7cec92..ab12b4c 100644 (file)
@@ -52,7 +52,7 @@ int BPF_PROG(fentry_XXX)
 static inline void
 fexit_update_maps(struct bpf_perf_event_value *after)
 {
-       struct bpf_perf_event_value *before, diff, *accum;
+       struct bpf_perf_event_value *before, diff;
        __u32 zero = 0;
 
        before = bpf_map_lookup_elem(&fentry_readings, &zero);
@@ -78,7 +78,6 @@ int BPF_PROG(fexit_XXX)
 {
        struct bpf_perf_event_value reading;
        __u32 cpu = bpf_get_smp_processor_id();
-       __u32 one = 1, zero = 0;
        int err;
 
        /* read all events before updating the maps, to reduce error */
index 6b32291..5875cfc 100644 (file)
@@ -23,7 +23,7 @@
  * @children: tree of call paths of functions called
  *
  * In combination with the call_return structure, the call_path structure
- * defines a context-sensitve call-graph.
+ * defines a context-sensitive call-graph.
  */
 struct call_path {
        struct call_path *parent;
index 1b60985..8e27771 100644 (file)
@@ -877,7 +877,7 @@ append_chain_children(struct callchain_node *root,
        if (!node)
                return -1;
 
-       /* lookup in childrens */
+       /* lookup in children */
        while (*p) {
                enum match_result ret;
 
index 6984c77..63d472b 100644 (file)
@@ -18,6 +18,7 @@
 #include "util/hist.h"  /* perf_hist_config */
 #include "util/llvm-utils.h"   /* perf_llvm_config */
 #include "util/stat.h"  /* perf_stat__set_big_num */
+#include "util/evsel.h"  /* evsel__hw_names, evsel__use_bpf_counters */
 #include "build-id.h"
 #include "debug.h"
 #include "config.h"
@@ -457,6 +458,12 @@ static int perf_stat_config(const char *var, const char *value)
        if (!strcmp(var, "stat.big-num"))
                perf_stat__set_big_num(perf_config_bool(var, value));
 
+       if (!strcmp(var, "stat.no-csv-summary"))
+               perf_stat__set_no_csv_summary(perf_config_bool(var, value));
+
+       if (!strcmp(var, "stat.bpf-counter-events"))
+               evsel__bpf_counter_events = strdup(value);
+
        /* Add other config variables here. */
        return 0;
 }
@@ -699,7 +706,7 @@ static int collect_config(const char *var, const char *value,
        /* perf_config_set can contain both user and system config items.
         * So we should know where each value is from.
         * The classification would be needed when a particular config file
-        * is overwrited by setting feature i.e. set_config().
+        * is overwritten by setting feature i.e. set_config().
         */
        if (strcmp(config_file_name, perf_etc_perfconfig()) == 0) {
                section->from_system_config = true;
index 3f4bc40..059bcec 100644 (file)
@@ -6,6 +6,7 @@
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
  */
 
+#include <linux/coresight-pmu.h>
 #include <linux/err.h>
 #include <linux/list.h>
 #include <linux/zalloc.h>
@@ -316,7 +317,7 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
         * This is the first timestamp we've seen since the beginning of traces
         * or a discontinuity.  Since timestamps packets are generated *after*
         * range packets have been generated, we need to estimate the time at
-        * which instructions started by substracting the number of instructions
+        * which instructions started by subtracting the number of instructions
         * executed to the timestamp.
         */
        packet_queue->timestamp = elem->timestamp - packet_queue->instr_count;
@@ -491,13 +492,42 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
                        const ocsd_generic_trace_elem *elem,
                        const uint8_t trace_chan_id)
 {
-       pid_t tid;
+       pid_t tid = -1;
+       static u64 pid_fmt;
+       int ret;
 
-       /* Ignore PE_CONTEXT packets that don't have a valid contextID */
-       if (!elem->context.ctxt_id_valid)
+       /*
+        * As all the ETMs run at the same exception level, the system should
+        * have the same PID format crossing CPUs.  So cache the PID format
+        * and reuse it for sequential decoding.
+        */
+       if (!pid_fmt) {
+               ret = cs_etm__get_pid_fmt(trace_chan_id, &pid_fmt);
+               if (ret)
+                       return OCSD_RESP_FATAL_SYS_ERR;
+       }
+
+       /*
+        * Process the PE_CONTEXT packets if we have a valid contextID or VMID.
+        * If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
+        * as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
+        */
+       switch (pid_fmt) {
+       case BIT(ETM_OPT_CTXTID):
+               if (elem->context.ctxt_id_valid)
+                       tid = elem->context.context_id;
+               break;
+       case BIT(ETM_OPT_CTXTID2):
+               if (elem->context.vmid_valid)
+                       tid = elem->context.vmid;
+               break;
+       default:
+               break;
+       }
+
+       if (tid == -1)
                return OCSD_RESP_CONT;
 
-       tid =  elem->context.context_id;
        if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
                return OCSD_RESP_FATAL_SYS_ERR;
 
index a2a369e..7e63e7d 100644 (file)
@@ -7,6 +7,7 @@
  */
 
 #include <linux/bitops.h>
+#include <linux/coresight-pmu.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -156,11 +157,52 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
        return 0;
 }
 
+/*
+ * Report which context ID register carries the PID for @trace_chan_id.
+ *
+ * When a context ID is traced, *pid_fmt is set to exactly one of:
+ *
+ *   BIT(ETM_OPT_CTXTID):  CONTEXTIDR or CONTEXTIDR_EL1 is traced;
+ *   BIT(ETM_OPT_CTXTID2): CONTEXTIDR_EL2 is traced.
+ *
+ * A session on an EL2 kernel may trace both registers at the same time;
+ * CONTEXTIDR_EL2 is then the one reported.  *pid_fmt is left unmodified
+ * when the metadata has none of the tested bits set.
+ *
+ * Returns 0, or -EINVAL if @trace_chan_id is not found in traceid_list.
+ */
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
+{
+       struct int_node *node = intlist__find(traceid_list, trace_chan_id);
+       u64 *meta, cfg;
+
+       if (!node)
+               return -EINVAL;
+
+       meta = node->priv;
+
+       if (meta[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
+               /* CONTEXTIDR is traced */
+               cfg = meta[CS_ETM_ETMCR];
+               if (cfg & BIT(ETM_OPT_CTXTID))
+                       *pid_fmt = BIT(ETM_OPT_CTXTID);
+               return 0;
+       }
+
+       cfg = meta[CS_ETMV4_TRCCONFIGR];
+       if (cfg & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
+               *pid_fmt = BIT(ETM_OPT_CTXTID2);        /* CONTEXTIDR_EL2 */
+       else if (cfg & BIT(ETM4_CFG_BIT_CTXTID))
+               *pid_fmt = BIT(ETM_OPT_CTXTID); /* CONTEXTIDR_EL1 */
+
+       return 0;
+}
+
 void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
                                              u8 trace_chan_id)
 {
        /*
-        * Wnen a timestamp packet is encountered the backend code
+        * When a timestamp packet is encountered the backend code
         * is stopped so that the front end has time to process packets
         * that were accumulated in the traceID queue.  Since there can
         * be more than one channel per cs_etm_queue, we need to specify
@@ -1655,7 +1697,7 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
                 * | 1 1 0 1 1 1 1 1 |  imm8  |
                 * +-----------------+--------+
                 *
-                * According to the specifiction, it only defines SVC for T32
+                * According to the specification, it only defines SVC for T32
                 * with 16 bits instruction and has no definition for 32bits;
                 * so below only read 2 bytes as instruction size for T32.
                 */
@@ -1887,7 +1929,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq,
 
                /*
                 * If the previous packet is an exception return packet
-                * and the return address just follows SVC instuction,
+                * and the return address just follows SVC instruction,
                 * it needs to calibrate the previous packet sample flags
                 * as PERF_IP_FLAG_SYSCALLRET.
                 */
@@ -1961,7 +2003,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq,
                 * contain exception type related info so we cannot decide
                 * the exception type purely based on exception return packet.
                 * If we record the exception number from exception packet and
-                * reuse it for excpetion return packet, this is not reliable
+                * reuse it for exception return packet, this is not reliable
                 * due the trace can be discontinuity or the interrupt can
                 * be nested, thus the recorded exception number cannot be
                 * used for exception return packet for these two cases.
@@ -2435,7 +2477,7 @@ static bool cs_etm__is_timeless_decoding(struct cs_etm_auxtrace *etm)
 }
 
 static const char * const cs_etm_global_header_fmts[] = {
-       [CS_HEADER_VERSION_0]   = "     Header version                 %llx\n",
+       [CS_HEADER_VERSION]     = "     Header version                 %llx\n",
        [CS_PMU_TYPE_CPUS]      = "     PMU type/num cpus              %llx\n",
        [CS_ETM_SNAPSHOT]       = "     Snapshot                       %llx\n",
 };
@@ -2443,6 +2485,7 @@ static const char * const cs_etm_global_header_fmts[] = {
 static const char * const cs_etm_priv_fmts[] = {
        [CS_ETM_MAGIC]          = "     Magic number                   %llx\n",
        [CS_ETM_CPU]            = "     CPU                            %lld\n",
+       [CS_ETM_NR_TRC_PARAMS]  = "     NR_TRC_PARAMS                  %llx\n",
        [CS_ETM_ETMCR]          = "     ETMCR                          %llx\n",
        [CS_ETM_ETMTRACEIDR]    = "     ETMTRACEIDR                    %llx\n",
        [CS_ETM_ETMCCER]        = "     ETMCCER                        %llx\n",
@@ -2452,6 +2495,7 @@ static const char * const cs_etm_priv_fmts[] = {
 static const char * const cs_etmv4_priv_fmts[] = {
        [CS_ETM_MAGIC]          = "     Magic number                   %llx\n",
        [CS_ETM_CPU]            = "     CPU                            %lld\n",
+       [CS_ETM_NR_TRC_PARAMS]  = "     NR_TRC_PARAMS                  %llx\n",
        [CS_ETMV4_TRCCONFIGR]   = "     TRCCONFIGR                     %llx\n",
        [CS_ETMV4_TRCTRACEIDR]  = "     TRCTRACEIDR                    %llx\n",
        [CS_ETMV4_TRCIDR0]      = "     TRCIDR0                        %llx\n",
@@ -2461,26 +2505,167 @@ static const char * const cs_etmv4_priv_fmts[] = {
        [CS_ETMV4_TRCAUTHSTATUS] = "    TRCAUTHSTATUS                  %llx\n",
 };
 
-static void cs_etm__print_auxtrace_info(__u64 *val, int num)
+static const char * const param_unk_fmt =
+       "       Unknown parameter [%d]         %llx\n";
+static const char * const magic_unk_fmt =
+       "       Magic number Unknown           %llx\n";
+
+static int cs_etm__print_cpu_metadata_v0(__u64 *val, int *offset)
 {
-       int i, j, cpu = 0;
+       int i = *offset, j, nr_params = 0, fmt_offset;
+       __u64 magic;
 
-       for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++)
-               fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+       /* check magic value */
+       magic = val[i + CS_ETM_MAGIC];
+       if ((magic != __perf_cs_etmv3_magic) &&
+           (magic != __perf_cs_etmv4_magic)) {
+               /* failure - note bad magic value */
+               fprintf(stdout, magic_unk_fmt, magic);
+               return -EINVAL;
+       }
+
+       /* print common header block */
+       fprintf(stdout, cs_etm_priv_fmts[CS_ETM_MAGIC], val[i++]);
+       fprintf(stdout, cs_etm_priv_fmts[CS_ETM_CPU], val[i++]);
+
+       if (magic == __perf_cs_etmv3_magic) {
+               nr_params = CS_ETM_NR_TRC_PARAMS_V0;
+               fmt_offset = CS_ETM_ETMCR;
+               /* after common block, offset format index past NR_PARAMS */
+               for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+                       fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
+       } else if (magic == __perf_cs_etmv4_magic) {
+               nr_params = CS_ETMV4_NR_TRC_PARAMS_V0;
+               fmt_offset = CS_ETMV4_TRCCONFIGR;
+               /* after common block, offset format index past NR_PARAMS */
+               for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+                       fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
+       }
+       *offset = i;
+       return 0;
+}
+
+static int cs_etm__print_cpu_metadata_v1(__u64 *val, int *offset)
+{
+       int i = *offset, j, total_params = 0;
+       __u64 magic;
+
+       magic = val[i + CS_ETM_MAGIC];
+       /* total params to print is NR_PARAMS + common block size for v1 */
+       total_params = val[i + CS_ETM_NR_TRC_PARAMS] + CS_ETM_COMMON_BLK_MAX_V1;
 
-       for (i = CS_HEADER_VERSION_0_MAX; cpu < num; cpu++) {
-               if (val[i] == __perf_cs_etmv3_magic)
-                       for (j = 0; j < CS_ETM_PRIV_MAX; j++, i++)
+       if (magic == __perf_cs_etmv3_magic) {
+               for (j = 0; j < total_params; j++, i++) {
+                       /* if newer record - could be excess params */
+                       if (j >= CS_ETM_PRIV_MAX)
+                               fprintf(stdout, param_unk_fmt, j, val[i]);
+                       else
                                fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
-               else if (val[i] == __perf_cs_etmv4_magic)
-                       for (j = 0; j < CS_ETMV4_PRIV_MAX; j++, i++)
+               }
+       } else if (magic == __perf_cs_etmv4_magic) {
+               for (j = 0; j < total_params; j++, i++) {
+                       /* if newer record - could be excess params */
+                       if (j >= CS_ETMV4_PRIV_MAX)
+                               fprintf(stdout, param_unk_fmt, j, val[i]);
+                       else
                                fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
-               else
-                       /* failure.. return */
+               }
+       } else {
+               /* failure - note bad magic value and error out */
+               fprintf(stdout, magic_unk_fmt, magic);
+               return -EINVAL;
+       }
+       *offset = i;
+       return 0;
+}
+
+static void cs_etm__print_auxtrace_info(__u64 *val, int num)
+{
+       int i, cpu = 0, version, err = 0;
+
+       /* bail out early on bad version - check the untruncated value */
+       version = val[0];
+       if (val[0] > CS_HEADER_CURRENT_VERSION) {
+               /* report the version found and the newest one understood */
+               fprintf(stdout, "       Unknown Header Version = %x, ", version);
+               fprintf(stdout, "Version supported <= %x\n", CS_HEADER_CURRENT_VERSION);
+               return;
+       }
+
+       for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
+               fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+
+       for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) {
+               if (version == 0)
+                       err = cs_etm__print_cpu_metadata_v0(val, &i);
+               else if (version == 1)
+                       err = cs_etm__print_cpu_metadata_v1(val, &i);
+               if (err)
                        return;
        }
 }
 
+/*
+ * Read a single cpu parameter block from the auxtrace_info priv block into
+ * a newly allocated, zeroed metadata block of out_blk_size entries.
+ * Version 1 blocks carry a per cpu nr_params entry which may be less than,
+ * equal to, or more than the compile time number we understand; excess
+ * params are skipped. Version 0 blocks hold nr_params_v0 params and no
+ * count, so the nr_params value is filled in here.
+ * Returns the block, or NULL if allocation fails. On success
+ * *buff_in_offset is advanced past the input block that was consumed.
+ */
+static u64 *cs_etm__create_meta_blk(u64 *buff_in, int *buff_in_offset,
+                                   int out_blk_size, int nr_params_v0)
+{
+       u64 *metadata = NULL;
+       int hdr_version;
+       int nr_in_params, nr_out_params, nr_cmn_params;
+       int i, k;
+
+       metadata = zalloc(sizeof(*metadata) * out_blk_size);
+       if (!metadata)
+               return NULL;
+
+       /* read block current index & version */
+       i = *buff_in_offset;
+       hdr_version = buff_in[CS_HEADER_VERSION];
+
+       if (!hdr_version) {
+       /* read version 0 info block into a version 1 metadata block  */
+               nr_in_params = nr_params_v0;
+               metadata[CS_ETM_MAGIC] = buff_in[i + CS_ETM_MAGIC];
+               metadata[CS_ETM_CPU] = buff_in[i + CS_ETM_CPU];
+               metadata[CS_ETM_NR_TRC_PARAMS] = nr_in_params;
+               /* version 0 has 2 common params */
+               nr_cmn_params = 2;
+               /* all trace params follow them; output is at +1 from source */
+               for (k = nr_cmn_params; k < nr_cmn_params + nr_in_params; k++)
+                       metadata[k + 1] = buff_in[i + k];
+       } else {
+       /* read version 1 info block - input and output nr_params may differ */
+               /* version 1 has 3 common params */
+               nr_cmn_params = 3;
+               nr_in_params = buff_in[i + CS_ETM_NR_TRC_PARAMS];
+
+               /* if input has more params than output - skip excess */
+               nr_out_params = nr_in_params + nr_cmn_params;
+               if (nr_out_params > out_blk_size)
+                       nr_out_params = out_blk_size;
+
+               for (k = CS_ETM_MAGIC; k < nr_out_params; k++)
+                       metadata[k] = buff_in[i + k];
+
+               /* record the actual nr params we copied */
+               metadata[CS_ETM_NR_TRC_PARAMS] = nr_out_params - nr_cmn_params;
+       }
+
+       /* adjust in offset by number of in params used */
+       i += nr_in_params + nr_cmn_params;
+       *buff_in_offset = i;
+       return metadata;
+}
+
 int cs_etm__process_auxtrace_info(union perf_event *event,
                                  struct perf_session *session)
 {
@@ -2492,11 +2677,12 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
        int info_header_size;
        int total_size = auxtrace_info->header.size;
        int priv_size = 0;
-       int num_cpu;
-       int err = 0, idx = -1;
-       int i, j, k;
+       int num_cpu, trcidr_idx;
+       int err = 0;
+       int i, j;
        u64 *ptr, *hdr = NULL;
        u64 **metadata = NULL;
+       u64 hdr_version;
 
        /*
         * sizeof(auxtrace_info_event::type) +
@@ -2512,16 +2698,21 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
        /* First the global part */
        ptr = (u64 *) auxtrace_info->priv;
 
-       /* Look for version '0' of the header */
-       if (ptr[0] != 0)
+       /* Look for version of the header */
+       hdr_version = ptr[0];
+       if (hdr_version > CS_HEADER_CURRENT_VERSION) {
+               /* print routine will print an error on bad version */
+               if (dump_trace)
+                       cs_etm__print_auxtrace_info(auxtrace_info->priv, 0);
                return -EINVAL;
+       }
 
-       hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_0_MAX);
+       hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_MAX);
        if (!hdr)
                return -ENOMEM;
 
        /* Extract header information - see cs-etm.h for format */
-       for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++)
+       for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
                hdr[i] = ptr[i];
        num_cpu = hdr[CS_PMU_TYPE_CPUS] & 0xffffffff;
        pmu_type = (unsigned int) ((hdr[CS_PMU_TYPE_CPUS] >> 32) &
@@ -2552,35 +2743,31 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
         */
        for (j = 0; j < num_cpu; j++) {
                if (ptr[i] == __perf_cs_etmv3_magic) {
-                       metadata[j] = zalloc(sizeof(*metadata[j]) *
-                                            CS_ETM_PRIV_MAX);
-                       if (!metadata[j]) {
-                               err = -ENOMEM;
-                               goto err_free_metadata;
-                       }
-                       for (k = 0; k < CS_ETM_PRIV_MAX; k++)
-                               metadata[j][k] = ptr[i + k];
+                       metadata[j] =
+                               cs_etm__create_meta_blk(ptr, &i,
+                                                       CS_ETM_PRIV_MAX,
+                                                       CS_ETM_NR_TRC_PARAMS_V0);
 
                        /* The traceID is our handle */
-                       idx = metadata[j][CS_ETM_ETMTRACEIDR];
-                       i += CS_ETM_PRIV_MAX;
+                       trcidr_idx = CS_ETM_ETMTRACEIDR;
+
                } else if (ptr[i] == __perf_cs_etmv4_magic) {
-                       metadata[j] = zalloc(sizeof(*metadata[j]) *
-                                            CS_ETMV4_PRIV_MAX);
-                       if (!metadata[j]) {
-                               err = -ENOMEM;
-                               goto err_free_metadata;
-                       }
-                       for (k = 0; k < CS_ETMV4_PRIV_MAX; k++)
-                               metadata[j][k] = ptr[i + k];
+                       metadata[j] =
+                               cs_etm__create_meta_blk(ptr, &i,
+                                                       CS_ETMV4_PRIV_MAX,
+                                                       CS_ETMV4_NR_TRC_PARAMS_V0);
 
                        /* The traceID is our handle */
-                       idx = metadata[j][CS_ETMV4_TRCTRACEIDR];
-                       i += CS_ETMV4_PRIV_MAX;
+                       trcidr_idx = CS_ETMV4_TRCTRACEIDR;
+               }
+
+               if (!metadata[j]) {
+                       err = -ENOMEM;
+                       goto err_free_metadata;
                }
 
                /* Get an RB node for this CPU */
-               inode = intlist__findnew(traceid_list, idx);
+               inode = intlist__findnew(traceid_list, metadata[j][trcidr_idx]);
 
                /* Something went wrong, no need to continue */
                if (!inode) {
@@ -2601,7 +2788,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
        }
 
        /*
-        * Each of CS_HEADER_VERSION_0_MAX, CS_ETM_PRIV_MAX and
+        * Each of CS_HEADER_VERSION_MAX, CS_ETM_PRIV_MAX and
         * CS_ETMV4_PRIV_MAX mark how many double words are in the
         * global metadata, and each cpu's metadata respectively.
         * The following tests if the correct number of double words was
@@ -2703,6 +2890,12 @@ err_free_traceid_list:
        intlist__delete(traceid_list);
 err_free_hdr:
        zfree(&hdr);
-
+       /*
+        * At this point, as a minimum we have valid header. Dump the rest of
+        * the info section - the print routines will error out on structural
+        * issues.
+        */
+       if (dump_trace)
+               cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu);
        return err;
 }
index 4ad925d..3642891 100644 (file)
 
 struct perf_session;
 
-/* Versionning header in case things need tro change in the future.  That way
+/*
+ * Versioning header in case things need to change in the future.  That way
  * decoding of old snapshot is still possible.
  */
 enum {
        /* Starting with 0x0 */
-       CS_HEADER_VERSION_0,
+       CS_HEADER_VERSION,
        /* PMU->type (32 bit), total # of CPUs (32 bit) */
        CS_PMU_TYPE_CPUS,
        CS_ETM_SNAPSHOT,
-       CS_HEADER_VERSION_0_MAX,
+       CS_HEADER_VERSION_MAX,
 };
 
+/*
+ * Update the version for new format.
+ *
+ * New version 1 format adds a param count to the per cpu metadata.
+ * This allows easy adding of new metadata parameters.
+ * Requires that new params always added after current ones.
+ * Also allows client reader to handle file versions that are different by
+ * checking the number of params in the file vs the number expected.
+ */
+#define CS_HEADER_CURRENT_VERSION 1
+
 /* Beginning of header common to both ETMv3 and V4 */
 enum {
        CS_ETM_MAGIC,
        CS_ETM_CPU,
+       /* Number of trace config params in following ETM specific block */
+       CS_ETM_NR_TRC_PARAMS,
+       CS_ETM_COMMON_BLK_MAX_V1,
 };
 
 /* ETMv3/PTM metadata */
 enum {
        /* Dynamic, configurable parameters */
-       CS_ETM_ETMCR = CS_ETM_CPU + 1,
+       CS_ETM_ETMCR = CS_ETM_COMMON_BLK_MAX_V1,
        CS_ETM_ETMTRACEIDR,
        /* RO, taken from sysFS */
        CS_ETM_ETMCCER,
@@ -41,10 +56,13 @@ enum {
        CS_ETM_PRIV_MAX,
 };
 
+/* define fixed version 0 length - allow new format reader to read old files. */
+#define CS_ETM_NR_TRC_PARAMS_V0 (CS_ETM_ETMIDR - CS_ETM_ETMCR + 1)
+
 /* ETMv4 metadata */
 enum {
        /* Dynamic, configurable parameters */
-       CS_ETMV4_TRCCONFIGR = CS_ETM_CPU + 1,
+       CS_ETMV4_TRCCONFIGR = CS_ETM_COMMON_BLK_MAX_V1,
        CS_ETMV4_TRCTRACEIDR,
        /* RO, taken from sysFS */
        CS_ETMV4_TRCIDR0,
@@ -55,9 +73,12 @@ enum {
        CS_ETMV4_PRIV_MAX,
 };
 
+/* define fixed version 0 length - allow new format reader to read old files. */
+#define CS_ETMV4_NR_TRC_PARAMS_V0 (CS_ETMV4_TRCAUTHSTATUS - CS_ETMV4_TRCCONFIGR + 1)
+
 /*
  * ETMv3 exception encoding number:
- * See Embedded Trace Macrocell spcification (ARM IHI 0014Q)
+ * See Embedded Trace Macrocell specification (ARM IHI 0014Q)
  * table 7-12 Encoding of Exception[3:0] for non-ARMv7-M processors.
  */
 enum {
@@ -162,7 +183,7 @@ struct cs_etm_packet_queue {
 
 #define BMVAL(val, lsb, msb)   ((val & GENMASK(msb, lsb)) >> lsb)
 
-#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_0_MAX * sizeof(u64))
+#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_MAX * sizeof(u64))
 
 #define __perf_cs_etmv3_magic 0x3030303030303030ULL
 #define __perf_cs_etmv4_magic 0x4040404040404040ULL
@@ -173,6 +194,7 @@ struct cs_etm_packet_queue {
 int cs_etm__process_auxtrace_info(union perf_event *event,
                                  struct perf_session *session);
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
                         pid_t tid, u8 trace_chan_id);
 bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
index 8b67bd9..cace349 100644 (file)
@@ -21,7 +21,7 @@
 #include <babeltrace/ctf/events.h>
 #include <traceevent/event-parse.h>
 #include "asm/bug.h"
-#include "data-convert-bt.h"
+#include "data-convert.h"
 #include "session.h"
 #include "debug.h"
 #include "tool.h"
@@ -949,7 +949,7 @@ static char *change_name(char *name, char *orig_name, int dup)
        /*
         * Add '_' prefix to potential keywork.  According to
         * Mathieu Desnoyers (https://lore.kernel.org/lkml/1074266107.40857.1422045946295.JavaMail.zimbra@efficios.com),
-        * futher CTF spec updating may require us to use '$'.
+        * further CTF spec updating may require us to use '$'.
         */
        if (dup < 0)
                len = strlen(name) + sizeof("_");
diff --git a/tools/perf/util/data-convert-bt.h b/tools/perf/util/data-convert-bt.h
deleted file mode 100644 (file)
index 821674d..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __DATA_CONVERT_BT_H
-#define __DATA_CONVERT_BT_H
-#include "data-convert.h"
-#ifdef HAVE_LIBBABELTRACE_SUPPORT
-
-int bt_convert__perf2ctf(const char *input_name, const char *to_ctf,
-                        struct perf_data_convert_opts *opts);
-
-#endif /* HAVE_LIBBABELTRACE_SUPPORT */
-#endif /* __DATA_CONVERT_BT_H */
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
new file mode 100644 (file)
index 0000000..355cd19
--- /dev/null
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * JSON export.
+ *
+ * Copyright (C) 2021, CodeWeavers Inc. <nfraser@codeweavers.com>
+ */
+
+#include "data-convert.h"
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "linux/compiler.h"
+#include "linux/err.h"
+#include "util/auxtrace.h"
+#include "util/debug.h"
+#include "util/dso.h"
+#include "util/event.h"
+#include "util/evsel.h"
+#include "util/evlist.h"
+#include "util/header.h"
+#include "util/map.h"
+#include "util/session.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/tool.h"
+
+struct convert_json {
+       struct perf_tool tool;
+       FILE *out;
+       bool first;
+       u64 events_count;
+};
+
+// Outputs a JSON-encoded string surrounded by quotes with characters escaped.
+static void output_json_string(FILE *out, const char *s)
+{
+       fputc('"', out);
+       while (*s) {
+               switch (*s) {
+
+               // required escapes with special forms as per RFC 8259
+               case '"':  fputs("\\\"", out); break;
+               case '\\': fputs("\\\\", out); break;
+               case '\b': fputs("\\b", out);  break;
+               case '\f': fputs("\\f", out);  break;
+               case '\n': fputs("\\n", out);  break;
+               case '\r': fputs("\\r", out);  break;
+               case '\t': fputs("\\t", out);  break;
+
+               default:
+                       // all other control characters must be escaped by hex code;
+                       // test as unsigned so bytes >= 0x80 are passed through as-is
+                       if ((unsigned char)*s <= 0x1f)
+                               fprintf(out, "\\u%04x", (unsigned char)*s);
+                       else
+                               fputc(*s, out);
+                       break;
+               }
+               ++s;
+       }
+       fputc('"', out);
+}
+
+// Outputs an optional comma, newline and indentation to delimit a new value
+// from the previous one in a JSON object or array.
+static void output_json_delimiters(FILE *out, bool comma, int depth)
+{
+       int i;
+
+       if (comma)
+               fputc(',', out);
+       fputc('\n', out);
+       for (i = 0; i < depth; ++i)
+               fputc('\t', out);
+}
+
+// Outputs a printf format string (with delimiter) as a JSON value.
+__printf(4, 5)
+static void output_json_format(FILE *out, bool comma, int depth, const char *format, ...)
+{
+       va_list args;
+
+       output_json_delimiters(out, comma, depth);
+       va_start(args, format);
+       vfprintf(out,  format, args);
+       va_end(args);
+}
+
+// Outputs a JSON key-value pair where the value is a string.
+static void output_json_key_string(FILE *out, bool comma, int depth,
+               const char *key, const char *value)
+{
+       output_json_delimiters(out, comma, depth);
+       output_json_string(out, key);
+       fputs(": ", out);
+       output_json_string(out, value);
+}
+
+// Outputs a JSON key-value pair where the value is a printf format string.
+__printf(5, 6)
+static void output_json_key_format(FILE *out, bool comma, int depth,
+               const char *key, const char *format, ...)
+{
+       va_list args;
+
+       output_json_delimiters(out, comma, depth);
+       output_json_string(out, key);
+       fputs(": ", out);
+       va_start(args, format);
+       vfprintf(out,  format, args);
+       va_end(args);
+}
+
+static void output_sample_callchain_entry(struct perf_tool *tool,
+               u64 ip, struct addr_location *al)
+{
+       struct convert_json *c = container_of(tool, struct convert_json, tool);
+       FILE *out = c->out;
+
+       output_json_format(out, false, 4, "{");
+       output_json_key_format(out, false, 5, "ip", "\"0x%" PRIx64 "\"", ip);
+
+       if (al && al->sym && al->sym->namelen) {
+               fputc(',', out);
+               output_json_key_string(out, false, 5, "symbol", al->sym->name);
+
+               if (al->map && al->map->dso) {
+                       const char *dso = al->map->dso->short_name;
+
+                       if (dso && strlen(dso) > 0) {
+                               fputc(',', out);
+                               output_json_key_string(out, false, 5, "dso", dso);
+                       }
+               }
+       }
+
+       output_json_format(out, false, 4, "}");
+}
+
+static int process_sample_event(struct perf_tool *tool,
+                               union perf_event *event __maybe_unused,
+                               struct perf_sample *sample,
+                               struct evsel *evsel __maybe_unused,
+                               struct machine *machine)
+{
+       struct convert_json *c = container_of(tool, struct convert_json, tool);
+       FILE *out = c->out;
+       struct addr_location al, tal;
+       u8 cpumode = PERF_RECORD_MISC_USER;
+
+       if (machine__resolve(machine, &al, sample) < 0) {
+               pr_err("Sample resolution failed!\n");
+               return -1;
+       }
+
+       ++c->events_count;
+
+       if (c->first)
+               c->first = false;
+       else
+               fputc(',', out);
+       output_json_format(out, false, 2, "{");
+
+       output_json_key_format(out, false, 3, "timestamp", "%" PRIi64, sample->time);
+       output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_);
+       output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid);
+
+       if (al.thread->cpu >= 0)
+               output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu);
+
+       output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread));
+
+       output_json_key_format(out, true, 3, "callchain", "[");
+       if (sample->callchain) {
+               unsigned int i;
+               bool ok;
+               bool first_callchain = true;
+
+               for (i = 0; i < sample->callchain->nr; ++i) {
+                       u64 ip = sample->callchain->ips[i];
+
+                       if (ip >= PERF_CONTEXT_MAX) {
+                               switch (ip) {
+                               case PERF_CONTEXT_HV:
+                                       cpumode = PERF_RECORD_MISC_HYPERVISOR;
+                                       break;
+                               case PERF_CONTEXT_KERNEL:
+                                       cpumode = PERF_RECORD_MISC_KERNEL;
+                                       break;
+                               case PERF_CONTEXT_USER:
+                                       cpumode = PERF_RECORD_MISC_USER;
+                                       break;
+                               default:
+                                       pr_debug("invalid callchain context: %"
+                                                       PRId64 "\n", (s64) ip);
+                                       break;
+                               }
+                               continue;
+                       }
+
+                       if (first_callchain)
+                               first_callchain = false;
+                       else
+                               fputc(',', out);
+
+                       ok = thread__find_symbol(al.thread, cpumode, ip, &tal);
+                       output_sample_callchain_entry(tool, ip, ok ? &tal : NULL);
+               }
+       } else {
+               output_sample_callchain_entry(tool, sample->ip, &al);
+       }
+       output_json_format(out, false, 3, "]");
+
+       output_json_format(out, false, 2, "}");
+       return 0;
+}
+
+static void output_headers(struct perf_session *session, struct convert_json *c)
+{
+       struct stat st;
+       struct perf_header *header = &session->header;
+       int ret;
+       int fd = perf_data__fd(session->data);
+       int i;
+       FILE *out = c->out;
+
+       output_json_key_format(out, false, 2, "header-version", "%u", header->version);
+       // "captured-on" is the mtime of the input file, formatted as UTC
+       ret = fstat(fd, &st);
+       if (ret >= 0) {
+               time_t stctime = st.st_mtime;
+               char buf[256];
+
+               strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&stctime));
+               output_json_key_string(out, true, 2, "captured-on", buf);
+       } else {
+               pr_debug("Failed to get mtime of source file, not writing captured-on\n");
+       }
+
+       output_json_key_format(out, true, 2, "data-offset", "%" PRIu64, header->data_offset);
+       output_json_key_format(out, true, 2, "data-size", "%" PRIu64, header->data_size);
+       output_json_key_format(out, true, 2, "feat-offset", "%" PRIu64, header->feat_offset);
+
+       output_json_key_string(out, true, 2, "hostname", header->env.hostname);
+       output_json_key_string(out, true, 2, "os-release", header->env.os_release);
+       output_json_key_string(out, true, 2, "arch", header->env.arch);
+
+       output_json_key_string(out, true, 2, "cpu-desc", header->env.cpu_desc);
+       output_json_key_string(out, true, 2, "cpuid", header->env.cpuid);
+       output_json_key_format(out, true, 2, "nrcpus-online", "%u", header->env.nr_cpus_online);
+       output_json_key_format(out, true, 2, "nrcpus-avail", "%u", header->env.nr_cpus_avail);
+
+       if (header->env.clock.enabled) {
+               output_json_key_format(out, true, 2, "clockid",
+                               "%u", header->env.clock.clockid);
+               output_json_key_format(out, true, 2, "clock-time",
+                               "%" PRIu64, header->env.clock.clockid_ns);
+               output_json_key_format(out, true, 2, "real-time",
+                               "%" PRIu64, header->env.clock.tod_ns);
+       }
+
+       output_json_key_string(out, true, 2, "perf-version", header->env.version);
+       // "cmdline" is written as a JSON array, one string per argv entry
+       output_json_key_format(out, true, 2, "cmdline", "[");
+       for (i = 0; i < header->env.nr_cmdline; i++) {
+               output_json_delimiters(out, i != 0, 3);
+               output_json_string(c->out, header->env.cmdline_argv[i]);
+       }
+       output_json_format(out, false, 2, "]");
+}
+
+int bt_convert__perf2json(const char *input_name, const char *output_name,
+               struct perf_data_convert_opts *opts __maybe_unused)
+{
+       struct perf_session *session;
+       int fd;
+       int ret = -1;
+
+       struct convert_json c = {
+               .tool = {
+                       .sample         = process_sample_event,
+                       .mmap           = perf_event__process_mmap,
+                       .mmap2          = perf_event__process_mmap2,
+                       .comm           = perf_event__process_comm,
+                       .namespaces     = perf_event__process_namespaces,
+                       .cgroup         = perf_event__process_cgroup,
+                       .exit           = perf_event__process_exit,
+                       .fork           = perf_event__process_fork,
+                       .lost           = perf_event__process_lost,
+                       .tracing_data   = perf_event__process_tracing_data,
+                       .build_id       = perf_event__process_build_id,
+                       .id_index       = perf_event__process_id_index,
+                       .auxtrace_info  = perf_event__process_auxtrace_info,
+                       .auxtrace       = perf_event__process_auxtrace,
+                       .event_update   = perf_event__process_event_update,
+                       .ordered_events = true,
+                       .ordering_requires_timestamps = true,
+               },
+               .first = true,
+               .events_count = 0,
+       };
+
+       struct perf_data data = {
+               .mode = PERF_DATA_MODE_READ,
+               .path = input_name,
+               .force = opts->force,
+       };
+
+       if (opts->all) {
+               pr_err("--all is currently unsupported for JSON output.\n");
+               goto err;
+       }
+       if (opts->tod) {
+               pr_err("--tod is currently unsupported for JSON output.\n");
+               goto err;
+       }
+
+       fd = open(output_name, O_CREAT | O_WRONLY | (opts->force ? O_TRUNC : O_EXCL), 0666);
+       if (fd == -1) {
+               if (errno == EEXIST)
+                       pr_err("Output file exists. Use --force to overwrite it.\n");
+               else
+                       pr_err("Error opening output file!\n");
+               goto err;
+       }
+
+       c.out = fdopen(fd, "w");
+       if (!c.out) {
+               fprintf(stderr, "Error opening output file!\n");
+               close(fd);
+               goto err;
+       }
+
+       session = perf_session__new(&data, false, &c.tool);
+       if (IS_ERR(session)) {
+               fprintf(stderr, "Error creating perf session!\n");
+               goto err_fclose;
+       }
+
+       if (symbol__init(&session->header.env) < 0) {
+               fprintf(stderr, "Symbol init error!\n");
+               goto err_session_delete;
+       }
+
+       // The opening brace is printed manually because it isn't delimited from a
+       // previous value (i.e. we don't want a leading newline)
+       fputc('{', c.out);
+
+       // Version number for future-proofing. Most additions should be able to be
+       // done in a backwards-compatible way so this should only need to be bumped
+       // if some major breaking change must be made.
+       output_json_format(c.out, false, 1, "\"linux-perf-json-version\": 1");
+
+       // Output headers
+       output_json_format(c.out, true, 1, "\"headers\": {");
+       output_headers(session, &c);
+       output_json_format(c.out, false, 1, "}");
+
+       // Output samples
+       output_json_format(c.out, true, 1, "\"samples\": [");
+       perf_session__process_events(session);
+       output_json_format(c.out, false, 1, "]");
+       output_json_format(c.out, false, 0, "}");
+       fputc('\n', c.out);
+
+       fprintf(stderr,
+                       "[ perf data convert: Converted '%s' into JSON data '%s' ]\n",
+                       data.path, output_name);
+
+       fprintf(stderr,
+                       "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
+                       (ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
+
+       ret = 0;
+err_session_delete:
+       perf_session__delete(session);
+err_fclose:
+       fclose(c.out);
+err:
+       return ret;
+}
index feab5f1..1b4c5f5 100644 (file)
@@ -2,10 +2,20 @@
 #ifndef __DATA_CONVERT_H
 #define __DATA_CONVERT_H
 
+#include <stdbool.h>
+
 struct perf_data_convert_opts {
        bool force;
        bool all;
        bool tod;
 };
 
+#ifdef HAVE_LIBBABELTRACE_SUPPORT
+int bt_convert__perf2ctf(const char *input_name, const char *to_ctf,
+                        struct perf_data_convert_opts *opts);
+#endif /* HAVE_LIBBABELTRACE_SUPPORT */
+
+int bt_convert__perf2json(const char *input_name, const char *output_name,
+                        struct perf_data_convert_opts *opts);
+
 #endif /* __DATA_CONVERT_H */
index 39c0520..ddf33d5 100644 (file)
@@ -147,7 +147,7 @@ error:
  * Demangle Java function signature (openJDK, not GCJ)
  * input:
  *     str: string to parse. String is not modified
- *    flags: comobination of JAVA_DEMANGLE_* flags to modify demangling
+ *    flags: combination of JAVA_DEMANGLE_* flags to modify demangling
  * return:
  *     if input can be demangled, then a newly allocated string is returned.
  *     if input cannot be demangled, then NULL is returned
@@ -164,7 +164,7 @@ java_demangle_sym(const char *str, int flags)
        if (!str)
                return NULL;
 
-       /* find start of retunr type */
+       /* find start of return type */
        p = strrchr(str, ')');
        if (!p)
                return NULL;
index 3df14e6..9d707bb 100644 (file)
@@ -64,17 +64,5 @@ ocaml_demangle_sym(const char *sym)
        }
        result[j] = '\0';
 
-       /* scan backwards to remove an "_" followed by decimal digits */
-       if (j != 0 && isdigit(result[j - 1])) {
-               while (--j) {
-                       if (!isdigit(result[j])) {
-                               break;
-                       }
-               }
-               if (result[j] == '_') {
-                       result[j] = '\0';
-               }
-       }
-
        return result;
 }
index cd2fe64..52e7101 100644 (file)
@@ -216,7 +216,7 @@ struct dso {
 
 /* dso__for_each_symbol - iterate over the symbols of given type
  *
- * @dso: the 'struct dso *' in which symbols itereated
+ * @dso: the 'struct dso *' in which symbols are iterated
  * @pos: the 'struct symbol *' to use as a loop cursor
  * @n: the 'struct rb_node *' to use as a temporary storage
  */
index 7b2d471..b2f4920 100644 (file)
@@ -91,7 +91,7 @@ static Dwarf_Line *cu_getsrc_die(Dwarf_Die *cu_die, Dwarf_Addr addr)
                        return NULL;
        } while (laddr == addr);
        l++;
-       /* Going foward to find the statement line */
+       /* Going forward to find the statement line */
        do {
                line = dwarf_onesrcline(lines, l++);
                if (!line || dwarf_lineaddr(line, &laddr) != 0 ||
@@ -177,7 +177,7 @@ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
  * die_get_linkage_name - Get the linkage name of the object
  * @dw_die: A DIE of the object
  *
- * Get the linkage name attiribute of given @dw_die.
+ * Get the linkage name attribute of given @dw_die.
  * For C++ binary, the linkage name will be the mangled symbol.
  */
 const char *die_get_linkage_name(Dwarf_Die *dw_die)
@@ -739,7 +739,7 @@ static int __die_walk_instances_cb(Dwarf_Die *inst, void *data)
  * @data: user data
  *
  * Walk on the instances of give @in_die. @in_die must be an inlined function
- * declartion. This returns the return value of @callback if it returns
+ * declaration. This returns the return value of @callback if it returns
  * non-zero value, or -ENOENT if there is no instance.
  */
 int die_walk_instances(Dwarf_Die *or_die, int (*callback)(Dwarf_Die *, void *),
index 506006e..cb99646 100644 (file)
@@ -22,7 +22,7 @@ const char *cu_get_comp_dir(Dwarf_Die *cu_die);
 int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr,
                     const char **fname, int *lineno);
 
-/* Walk on funcitons at given address */
+/* Walk on functions at given address */
 int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
                         int (*callback)(Dwarf_Die *, void *), void *data);
 
index 1b49ece..3fa4486 100644 (file)
@@ -24,6 +24,7 @@
 #include "../arch/s390/include/dwarf-regs-table.h"
 #include "../arch/sparc/include/dwarf-regs-table.h"
 #include "../arch/xtensa/include/dwarf-regs-table.h"
+#include "../arch/mips/include/dwarf-regs-table.h"
 
 #define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
 
@@ -53,6 +54,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine)
                return __get_dwarf_regstr(sparc_regstr_tbl, n);
        case EM_XTENSA:
                return __get_dwarf_regstr(xtensa_regstr_tbl, n);
+       case EM_MIPS:
+               return __get_dwarf_regstr(mips_regstr_tbl, n);
        default:
                pr_err("ELF MACHINE %x is not supported.\n", machine);
        }
index f603edb..8a62fb3 100644 (file)
@@ -147,6 +147,7 @@ struct perf_sample {
        u8  cpumode;
        u16 misc;
        u16 ins_lat;
+       u16 p_stage_cyc;
        bool no_hw_idx;         /* No hw_idx collected in branch_stack */
        char insn[MAX_INSN];
        void *raw_data;
@@ -427,5 +428,7 @@ char *get_page_size_name(u64 size, char *str);
 
 void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
+const char *arch_perf_header_entry(const char *se_header);
+int arch_support_sort_key(const char *sort_key);
 
 #endif /* __PERF_RECORD_H */
index 859cb34..3480baf 100644 (file)
  * all struct perf_record_lost_samples.lost fields reported.
  *
  * The total_period is needed because by default auto-freq is used, so
- * multipling nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
+ * multiplying nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
  * the total number of low level events, it is necessary to to sum all struct
  * perf_record_sample.period and stash the result in total_period.
  */
 struct events_stats {
-       u64 total_period;
-       u64 total_non_filtered_period;
        u64 total_lost;
        u64 total_lost_samples;
        u64 total_aux_lost;
        u64 total_aux_partial;
        u64 total_invalid_chains;
        u32 nr_events[PERF_RECORD_HEADER_MAX];
-       u32 nr_non_filtered_samples;
        u32 nr_lost_warned;
        u32 nr_unknown_events;
        u32 nr_invalid_chains;
@@ -44,8 +41,16 @@ struct events_stats {
        u32 nr_proc_map_timeout;
 };
 
+struct hists_stats {
+       u64 total_period;
+       u64 total_non_filtered_period;
+       u32 nr_samples;
+       u32 nr_non_filtered_samples;
+};
+
 void events_stats__inc(struct events_stats *stats, u32 type);
 
-size_t events_stats__fprintf(struct events_stats *stats, FILE *fp);
+size_t events_stats__fprintf(struct events_stats *stats, FILE *fp,
+                            bool skip_empty);
 
 #endif /* __PERF_EVENTS_STATS_ */
diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c
new file mode 100644 (file)
index 0000000..db3f5fb
--- /dev/null
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <inttypes.h>
+#include "cpumap.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "../perf.h"
+#include "util/pmu-hybrid.h"
+#include "util/evlist-hybrid.h"
+#include "debug.h"
+#include <unistd.h>
+#include <stdlib.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <perf/evlist.h>
+#include <perf/evsel.h>
+#include <perf/cpumap.h>
+
+int evlist__add_default_hybrid(struct evlist *evlist, bool precise)
+{
+       struct evsel *evsel;
+       struct perf_pmu *pmu;
+       __u64 config;
+       struct perf_cpu_map *cpus;
+
+       perf_pmu__for_each_hybrid_pmu(pmu) {
+               config = PERF_COUNT_HW_CPU_CYCLES |
+                        ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT);
+               evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE,
+                                         config);
+               if (!evsel)
+                       return -ENOMEM;
+
+               cpus = perf_cpu_map__get(pmu->cpus);
+               evsel->core.cpus = cpus;
+               evsel->core.own_cpus = perf_cpu_map__get(cpus);
+               evsel->pmu_name = strdup(pmu->name);
+               evlist__add(evlist, evsel);
+       }
+
+       return 0;
+}
+
+static bool group_hybrid_conflict(struct evsel *leader)
+{
+       struct evsel *pos, *prev = NULL;
+
+       for_each_group_evsel(pos, leader) {
+               if (!evsel__is_hybrid(pos))
+                       continue;
+
+               if (prev && strcmp(prev->pmu_name, pos->pmu_name))
+                       return true;
+
+               prev = pos;
+       }
+
+       return false;
+}
+
+void evlist__warn_hybrid_group(struct evlist *evlist)
+{
+       struct evsel *evsel;
+
+       evlist__for_each_entry(evlist, evsel) {
+               if (evsel__is_group_leader(evsel) &&
+                   evsel->core.nr_members > 1 &&
+                   group_hybrid_conflict(evsel)) {
+                       pr_warning("WARNING: events in group from "
+                                  "different hybrid PMUs!\n");
+                       return;
+               }
+       }
+}
+
+bool evlist__has_hybrid(struct evlist *evlist)
+{
+       struct evsel *evsel;
+
+       evlist__for_each_entry(evlist, evsel) {
+               if (evsel->pmu_name &&
+                   perf_pmu__is_hybrid(evsel->pmu_name)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h
new file mode 100644 (file)
index 0000000..19f74b4
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_EVLIST_HYBRID_H
+#define __PERF_EVLIST_HYBRID_H
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include "evlist.h"
+#include <unistd.h>
+
+int evlist__add_default_hybrid(struct evlist *evlist, bool precise);
+void evlist__warn_hybrid_group(struct evlist *evlist);
+bool evlist__has_hybrid(struct evlist *evlist);
+
+#endif /* __PERF_EVLIST_HYBRID_H */
index 882cd1f..6e5c415 100644 (file)
@@ -17,6 +17,7 @@
 #include "evsel.h"
 #include "debug.h"
 #include "units.h"
+#include "bpf_counter.h"
 #include <internal/lib.h> // page_size
 #include "affinity.h"
 #include "../perf.h"
@@ -25,6 +26,7 @@
 #include "util/string2.h"
 #include "util/perf_api_probe.h"
 #include "util/evsel_fprintf.h"
+#include "util/evlist-hybrid.h"
 #include <signal.h>
 #include <unistd.h>
 #include <sched.h>
@@ -36,6 +38,7 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/prctl.h>
 
 #include <linux/bitops.h>
 #include <linux/hash.h>
@@ -246,8 +249,10 @@ void evlist__set_leader(struct evlist *evlist)
 
 int __evlist__add_default(struct evlist *evlist, bool precise)
 {
-       struct evsel *evsel = evsel__new_cycles(precise);
+       struct evsel *evsel;
 
+       evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE,
+                                 PERF_COUNT_HW_CPU_CYCLES);
        if (evsel == NULL)
                return -ENOMEM;
 
@@ -420,6 +425,9 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name)
        if (affinity__setup(&affinity) < 0)
                return;
 
+       evlist__for_each_entry(evlist, pos)
+               bpf_counter__disable(pos);
+
        /* Disable 'immediate' events last */
        for (imm = 0; imm <= 1; imm++) {
                evlist__for_each_cpu(evlist, i, cpu) {
@@ -1209,7 +1217,7 @@ bool evlist__valid_read_format(struct evlist *evlist)
                }
        }
 
-       /* PERF_SAMPLE_READ imples PERF_FORMAT_ID. */
+       /* PERF_SAMPLE_READ implies PERF_FORMAT_ID. */
        if ((sample_type & PERF_SAMPLE_READ) &&
            !(read_format & PERF_FORMAT_ID)) {
                return false;
@@ -1405,6 +1413,13 @@ int evlist__prepare_workload(struct evlist *evlist, struct target *target, const
                close(go_pipe[1]);
                fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
 
+               /*
+                * Change the name of this process so as not to confuse --exclude-perf
+                * users who see 'perf' in the window up to the execvp() and think that
+                * perf samples are not being excluded.
+                */
+               prctl(PR_SET_NAME, "perf-exec");
+
                /*
                 * Tell the parent we're ready to go
                 */
@@ -2130,3 +2145,22 @@ struct evsel *evlist__find_evsel(struct evlist *evlist, int idx)
        }
        return NULL;
 }
+
+int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf)
+{
+       struct evsel *evsel;
+       int printed = 0;
+
+       evlist__for_each_entry(evlist, evsel) {
+               if (evsel__is_dummy_event(evsel))
+                       continue;
+               if (size > (strlen(evsel__name(evsel)) + (printed ? 2 : 1))) {
+                       printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "," : "", evsel__name(evsel));
+               } else {
+                       printed += scnprintf(bf + printed, size - printed, "%s...", printed ? "," : "");
+                       break;
+               }
+       }
+
+       return printed;
+}
index b695ffa..a8b97b5 100644 (file)
@@ -365,4 +365,6 @@ int evlist__ctlfd_ack(struct evlist *evlist);
 #define EVLIST_DISABLED_MSG "Events disabled\n"
 
 struct evsel *evlist__find_evsel(struct evlist *evlist, int idx);
+
+int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf);
 #endif /* __PERF_EVLIST_H */
index 7ecbc8e..4a3cd1b 100644 (file)
@@ -47,6 +47,7 @@
 #include "memswap.h"
 #include "util.h"
 #include "hashmap.h"
+#include "pmu-hybrid.h"
 #include "../perf-sys.h"
 #include "util/parse-branch-options.h"
 #include <internal/xyarray.h>
@@ -295,11 +296,11 @@ static bool perf_event_can_profile_kernel(void)
        return perf_event_paranoid_check(1);
 }
 
-struct evsel *evsel__new_cycles(bool precise)
+struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config)
 {
        struct perf_event_attr attr = {
-               .type   = PERF_TYPE_HARDWARE,
-               .config = PERF_COUNT_HW_CPU_CYCLES,
+               .type   = type,
+               .config = config,
                .exclude_kernel = !perf_event_can_profile_kernel(),
        };
        struct evsel *evsel;
@@ -492,6 +493,28 @@ const char *evsel__hw_names[PERF_COUNT_HW_MAX] = {
        "ref-cycles",
 };
 
+char *evsel__bpf_counter_events;
+
+bool evsel__match_bpf_counter_events(const char *name)
+{
+       int name_len;
+       bool match;
+       char *ptr;
+
+       if (!evsel__bpf_counter_events)
+               return false;
+
+       ptr = strstr(evsel__bpf_counter_events, name);
+       name_len = strlen(name);
+
+       /* check name matches a full token in evsel__bpf_counter_events */
+       match = (ptr != NULL) &&
+               ((ptr == evsel__bpf_counter_events) || (*(ptr - 1) == ',')) &&
+               ((*(ptr + name_len) == ',') || (*(ptr + name_len) == '\0'));
+
+       return match;
+}
+
 static const char *__evsel__hw_name(u64 config)
 {
        if (config < PERF_COUNT_HW_MAX && evsel__hw_names[config])
@@ -621,7 +644,7 @@ const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_AL
 #define COP(x)         (1 << x)
 
 /*
- * cache operartion stat
+ * cache operation stat
  * L1I : Read and prefetch only
  * ITLB and BPU : Read-only
  */
@@ -2275,7 +2298,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
                /*
                 * Undo swap of u64, then swap on individual u32s,
                 * get the size of the raw area and undo all of the
-                * swap. The pevent interface handles endianity by
+                * swap. The pevent interface handles endianness by
                 * itself.
                 */
                if (swapped) {
@@ -2797,3 +2820,8 @@ void evsel__zero_per_pkg(struct evsel *evsel)
                hashmap__clear(evsel->per_pkg_mask);
        }
 }
+
+bool evsel__is_hybrid(struct evsel *evsel)
+{
+       return evsel->pmu_name && perf_pmu__is_hybrid(evsel->pmu_name);
+}
index 6026487..75cf5db 100644 (file)
@@ -20,6 +20,8 @@ union perf_event;
 struct bpf_counter_ops;
 struct target;
 struct hashmap;
+struct bperf_leader_bpf;
+struct bperf_follower_bpf;
 
 typedef int (evsel__sb_cb_t)(union perf_event *event, void *data);
 
@@ -80,6 +82,7 @@ struct evsel {
                bool                    auto_merge_stats;
                bool                    collect_stat;
                bool                    weak_group;
+               bool                    bpf_counter;
                int                     bpf_fd;
                struct bpf_object       *bpf_obj;
        };
@@ -113,6 +116,7 @@ struct evsel {
        bool                    merged_stat;
        bool                    reset_group;
        bool                    errored;
+       bool                    use_config_name;
        struct hashmap          *per_pkg_mask;
        struct evsel            *leader;
        struct list_head        config_terms;
@@ -130,8 +134,24 @@ struct evsel {
         * See also evsel__has_callchain().
         */
        __u64                   synth_sample_type;
-       struct list_head        bpf_counter_list;
+
+       /*
+        * bpf_counter_ops serves two use cases:
+        *   1. perf-stat -b          counting events used by BPF programs
+        *   2. perf-stat --use-bpf   use BPF programs to aggregate counts
+        */
        struct bpf_counter_ops  *bpf_counter_ops;
+
+       /* for perf-stat -b */
+       struct list_head        bpf_counter_list;
+
+       /* for perf-stat --use-bpf */
+       int                     bperf_leader_prog_fd;
+       int                     bperf_leader_link_fd;
+       union {
+               struct bperf_leader_bpf *leader_skel;
+               struct bperf_follower_bpf *follower_skel;
+       };
 };
 
 struct perf_missing_features {
@@ -157,7 +177,6 @@ struct perf_missing_features {
 extern struct perf_missing_features perf_missing_features;
 
 struct perf_cpu_map;
-struct target;
 struct thread_map;
 struct record_opts;
 
@@ -202,7 +221,7 @@ static inline struct evsel *evsel__newtp(const char *sys, const char *name)
        return evsel__newtp_idx(sys, name, 0);
 }
 
-struct evsel *evsel__new_cycles(bool precise);
+struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config);
 
 struct tep_event *event_format__new(const char *sys, const char *name);
 
@@ -222,6 +241,11 @@ void evsel__calc_id_pos(struct evsel *evsel);
 
 bool evsel__is_cache_op_valid(u8 type, u8 op);
 
+static inline bool evsel__is_bpf(struct evsel *evsel)
+{
+       return evsel->bpf_counter_ops != NULL;
+}
+
 #define EVSEL__MAX_ALIASES 8
 
 extern const char *evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX][EVSEL__MAX_ALIASES];
@@ -229,6 +253,9 @@ extern const char *evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][EVSEL__MAX_ALI
 extern const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_ALIASES];
 extern const char *evsel__hw_names[PERF_COUNT_HW_MAX];
 extern const char *evsel__sw_names[PERF_COUNT_SW_MAX];
+extern char *evsel__bpf_counter_events;
+bool evsel__match_bpf_counter_events(const char *name);
+
 int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size);
 const char *evsel__name(struct evsel *evsel);
 
@@ -435,4 +462,5 @@ struct perf_env *evsel__env(struct evsel *evsel);
 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
 
 void evsel__zero_per_pkg(struct evsel *evsel);
+bool evsel__is_hybrid(struct evsel *evsel);
 #endif /* __PERF_EVSEL_H */
index dcf8d19..85df3e4 100644 (file)
@@ -3,7 +3,7 @@
 #define PARSE_CTX_H 1
 
 // There are fixes that need to land upstream before we can use libbpf's headers,
-// for now use our copy uncoditionally, since the data structures at this point
+// for now use our copy unconditionally, since the data structures at this point
 // are exactly the same, no problem.
 //#ifdef HAVE_LIBBPF_SUPPORT
 //#include <bpf/hashmap.h>
index 20effdf..aa1e425 100644 (file)
@@ -127,7 +127,7 @@ static int __do_write_buf(struct feat_fd *ff,  const void *buf, size_t size)
        return 0;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 int do_write(struct feat_fd *ff, const void *buf, size_t size)
 {
        if (!ff->buf)
@@ -135,7 +135,7 @@ int do_write(struct feat_fd *ff, const void *buf, size_t size)
        return __do_write_buf(ff, buf, size);
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size)
 {
        u64 *p = (u64 *) set;
@@ -154,7 +154,7 @@ static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size)
        return 0;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 int write_padded(struct feat_fd *ff, const void *bf,
                 size_t count, size_t count_aligned)
 {
@@ -170,7 +170,7 @@ int write_padded(struct feat_fd *ff, const void *bf,
 #define string_size(str)                                               \
        (PERF_ALIGN((strlen(str) + 1), NAME_ALIGN) + sizeof(u32))
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_write_string(struct feat_fd *ff, const char *str)
 {
        u32 len, olen;
@@ -266,7 +266,7 @@ static char *do_read_string(struct feat_fd *ff)
        return NULL;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize)
 {
        unsigned long *set;
@@ -2874,7 +2874,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
        int err = -1;
 
        if (ff->ph->needs_swap) {
-               pr_warning("interpreting bpf_prog_info from systems with endianity is not yet supported\n");
+               pr_warning("interpreting bpf_prog_info from systems with endianness is not yet supported\n");
                return 0;
        }
 
@@ -2942,7 +2942,7 @@ static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused)
        int err = -1;
 
        if (ff->ph->needs_swap) {
-               pr_warning("interpreting btf from systems with endianity is not yet supported\n");
+               pr_warning("interpreting btf from systems with endianness is not yet supported\n");
                return 0;
        }
 
@@ -3481,11 +3481,11 @@ static const size_t attr_pipe_abi_sizes[] = {
 };
 
 /*
- * In the legacy pipe format, there is an implicit assumption that endiannesss
+ * In the legacy pipe format, there is an implicit assumption that endianness
  * between host recording the samples, and host parsing the samples is the
  * same. This is not always the case given that the pipe output may always be
  * redirected into a file and analyzed on a different machine with possibly a
- * different endianness and perf_event ABI revsions in the perf tool itself.
+ * different endianness and perf_event ABI revisions in the perf tool itself.
  */
 static int try_all_pipe_abis(uint64_t hdr_sz, struct perf_header *ph)
 {
index c82f5fc..65fe65b 100644 (file)
@@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
        hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
        hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
+       hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
        if (symbol_conf.nanosecs)
                hists__new_col_len(hists, HISTC_TIME, 16);
        else
@@ -289,13 +290,14 @@ static long hist_time(unsigned long htime)
 }
 
 static void he_stat__add_period(struct he_stat *he_stat, u64 period,
-                               u64 weight, u64 ins_lat)
+                               u64 weight, u64 ins_lat, u64 p_stage_cyc)
 {
 
        he_stat->period         += period;
        he_stat->weight         += weight;
        he_stat->nr_events      += 1;
        he_stat->ins_lat        += ins_lat;
+       he_stat->p_stage_cyc    += p_stage_cyc;
 }
 
 static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
        dest->nr_events         += src->nr_events;
        dest->weight            += src->weight;
        dest->ins_lat           += src->ins_lat;
+       dest->p_stage_cyc               += src->p_stage_cyc;
 }
 
 static void he_stat__decay(struct he_stat *he_stat)
@@ -597,6 +600,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
        u64 period = entry->stat.period;
        u64 weight = entry->stat.weight;
        u64 ins_lat = entry->stat.ins_lat;
+       u64 p_stage_cyc = entry->stat.p_stage_cyc;
        bool leftmost = true;
 
        p = &hists->entries_in->rb_root.rb_node;
@@ -615,11 +619,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 
                if (!cmp) {
                        if (sample_self) {
-                               he_stat__add_period(&he->stat, period, weight, ins_lat);
+                               he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc);
                                hist_entry__add_callchain_period(he, period);
                        }
                        if (symbol_conf.cumulate_callchain)
-                               he_stat__add_period(he->stat_acc, period, weight, ins_lat);
+                               he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc);
 
                        /*
                         * This mem info was allocated from sample__resolve_mem
@@ -731,6 +735,7 @@ __hists__add_entry(struct hists *hists,
                        .period = sample->period,
                        .weight = sample->weight,
                        .ins_lat = sample->ins_lat,
+                       .p_stage_cyc = sample->p_stage_cyc,
                },
                .parent = sym_parent,
                .filtered = symbol__parent_filter(sym_parent) | al->filtered,
@@ -2320,14 +2325,19 @@ void events_stats__inc(struct events_stats *stats, u32 type)
        ++stats->nr_events[type];
 }
 
-void hists__inc_nr_events(struct hists *hists, u32 type)
+static void hists_stats__inc(struct hists_stats *stats)
 {
-       events_stats__inc(&hists->stats, type);
+       ++stats->nr_samples;
+}
+
+void hists__inc_nr_events(struct hists *hists)
+{
+       hists_stats__inc(&hists->stats);
 }
 
 void hists__inc_nr_samples(struct hists *hists, bool filtered)
 {
-       events_stats__inc(&hists->stats, PERF_RECORD_SAMPLE);
+       hists_stats__inc(&hists->stats);
        if (!filtered)
                hists->stats.nr_non_filtered_samples++;
 }
@@ -2666,14 +2676,21 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
        }
 }
 
-size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp)
+size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp,
+                                bool skip_empty)
 {
        struct evsel *pos;
        size_t ret = 0;
 
        evlist__for_each_entry(evlist, pos) {
+               struct hists *hists = evsel__hists(pos);
+
+               if (skip_empty && !hists->stats.nr_samples)
+                       continue;
+
                ret += fprintf(fp, "%s stats:\n", evsel__name(pos));
-               ret += events_stats__fprintf(&evsel__hists(pos)->stats, fp);
+               ret += fprintf(fp, "%16s events: %10d\n",
+                              "SAMPLE", hists->stats.nr_samples);
        }
 
        return ret;
@@ -2693,7 +2710,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh
        const struct dso *dso = hists->dso_filter;
        struct thread *thread = hists->thread_filter;
        int socket_id = hists->socket_filter;
-       unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+       unsigned long nr_samples = hists->stats.nr_samples;
        u64 nr_events = hists->stats.total_period;
        struct evsel *evsel = hists_to_evsel(hists);
        const char *ev_name = evsel__name(evsel);
@@ -2720,7 +2737,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh
                                nr_samples += pos_hists->stats.nr_non_filtered_samples;
                                nr_events += pos_hists->stats.total_non_filtered_period;
                        } else {
-                               nr_samples += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE];
+                               nr_samples += pos_hists->stats.nr_samples;
                                nr_events += pos_hists->stats.total_period;
                        }
                }
index 3c53723..5343b62 100644 (file)
@@ -75,6 +75,7 @@ enum hist_column {
        HISTC_MEM_BLOCKED,
        HISTC_LOCAL_INS_LAT,
        HISTC_GLOBAL_INS_LAT,
+       HISTC_P_STAGE_CYC,
        HISTC_NR_COLS, /* Last entry */
 };
 
@@ -95,7 +96,7 @@ struct hists {
        const char              *uid_filter_str;
        const char              *symbol_filter_str;
        pthread_mutex_t         lock;
-       struct events_stats     stats;
+       struct hists_stats      stats;
        u64                     event_stream;
        u16                     col_len[HISTC_NR_COLS];
        bool                    has_callchains;
@@ -195,13 +196,14 @@ struct hist_entry *hists__get_entry(struct hists *hists, int idx);
 u64 hists__total_period(struct hists *hists);
 void hists__reset_stats(struct hists *hists);
 void hists__inc_stats(struct hists *hists, struct hist_entry *h);
-void hists__inc_nr_events(struct hists *hists, u32 type);
+void hists__inc_nr_events(struct hists *hists);
 void hists__inc_nr_samples(struct hists *hists, bool filtered);
 
 size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
                      int max_cols, float min_pcnt, FILE *fp,
                      bool ignore_callchains);
-size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp);
+size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp,
+                                bool skip_empty);
 
 void hists__filter_by_dso(struct hists *hists);
 void hists__filter_by_thread(struct hists *hists);
index f6e28ac..8658d42 100644 (file)
@@ -3569,7 +3569,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
        /*
         * Since this thread will not be kept in any rbtree not in a
         * list, initialize its list node so that at thread__put() the
-        * current thread lifetime assuption is kept and we don't segfault
+        * current thread lifetime assumption is kept and we don't segfault
         * at list_del_init().
         */
        INIT_LIST_HEAD(&pt->unknown_thread->node);
diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c
new file mode 100644 (file)
index 0000000..57dd49d
--- /dev/null
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/iostat.h"
+#include "util/debug.h"
+
+enum iostat_mode_t iostat_mode = IOSTAT_NONE;
+
+__weak int iostat_prepare(struct evlist *evlist __maybe_unused,
+                         struct perf_stat_config *config __maybe_unused)
+{
+       return -1;
+}
+
+__weak int iostat_parse(const struct option *opt __maybe_unused,
+                        const char *str __maybe_unused,
+                        int unset __maybe_unused)
+{
+       pr_err("iostat mode is not supported on current platform\n");
+       return -1;
+}
+
+__weak void iostat_list(struct evlist *evlist __maybe_unused,
+                      struct perf_stat_config *config __maybe_unused)
+{
+}
+
+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
+__weak void iostat_print_header_prefix(struct perf_stat_config *config __maybe_unused)
+{
+}
+
+__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused,
+                               struct evsel *evsel __maybe_unused,
+                               struct perf_stat_output_ctx *out __maybe_unused)
+{
+}
+
+__weak void iostat_prefix(struct evlist *evlist __maybe_unused,
+                         struct perf_stat_config *config __maybe_unused,
+                         char *prefix __maybe_unused,
+                         struct timespec *ts __maybe_unused)
+{
+}
+
+__weak void iostat_print_counters(struct evlist *evlist __maybe_unused,
+                                 struct perf_stat_config *config __maybe_unused,
+                                 struct timespec *ts __maybe_unused,
+                                 char *prefix __maybe_unused,
+                                 iostat_print_counter_t print_cnt_cb __maybe_unused)
+{
+}
diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h
new file mode 100644 (file)
index 0000000..23c1c46
--- /dev/null
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov <alexander.antonov@linux.intel.com>
+ */
+
+#ifndef _IOSTAT_H
+#define _IOSTAT_H
+
+#include <subcmd/parse-options.h>
+#include "util/stat.h"
+#include "util/parse-events.h"
+#include "util/evlist.h"
+
+struct option;
+struct perf_stat_config;
+struct evlist;
+struct timespec;
+
+/*
+ * Values of the global 'iostat_mode' selector.
+ * NOTE(review): judging by the names only, IOSTAT_NONE means iostat mode
+ * is not in use, IOSTAT_RUN selects counting and IOSTAT_LIST selects
+ * listing - confirm against the platform implementation.
+ */
+enum iostat_mode_t {
+       IOSTAT_NONE             = -1,
+       IOSTAT_RUN              = 0,
+       IOSTAT_LIST             = 1
+};
+
+extern enum iostat_mode_t iostat_mode;
+
+typedef void (*iostat_print_counter_t)(struct perf_stat_config *, struct evsel *, char *);
+
+int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config);
+int iostat_parse(const struct option *opt, const char *str,
+                int unset __maybe_unused);
+void iostat_list(struct evlist *evlist, struct perf_stat_config *config);
+void iostat_release(struct evlist *evlist);
+void iostat_prefix(struct evlist *evlist, struct perf_stat_config *config,
+                  char *prefix, struct timespec *ts);
+void iostat_print_header_prefix(struct perf_stat_config *config);
+void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+                        struct perf_stat_output_ctx *out);
+void iostat_print_counters(struct evlist *evlist,
+                          struct perf_stat_config *config, struct timespec *ts,
+                          char *prefix, iostat_print_counter_t print_cnt_cb);
+
+#endif /* _IOSTAT_H */
index 9760d8e..917a9c7 100644 (file)
@@ -396,21 +396,31 @@ static pid_t jr_entry_tid(struct jit_buf_desc *jd, union jr_entry *jr)
 
 static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp)
 {
-       struct perf_tsc_conversion tc;
+       struct perf_tsc_conversion tc = { .time_shift = 0, };
+       struct perf_record_time_conv *time_conv = &jd->session->time_conv;
 
        if (!jd->use_arch_timestamp)
                return timestamp;
 
-       tc.time_shift          = jd->session->time_conv.time_shift;
-       tc.time_mult           = jd->session->time_conv.time_mult;
-       tc.time_zero           = jd->session->time_conv.time_zero;
-       tc.time_cycles         = jd->session->time_conv.time_cycles;
-       tc.time_mask           = jd->session->time_conv.time_mask;
-       tc.cap_user_time_zero  = jd->session->time_conv.cap_user_time_zero;
-       tc.cap_user_time_short = jd->session->time_conv.cap_user_time_short;
+       tc.time_shift = time_conv->time_shift;
+       tc.time_mult  = time_conv->time_mult;
+       tc.time_zero  = time_conv->time_zero;
 
-       if (!tc.cap_user_time_zero)
-               return 0;
+       /*
+        * The TIME_CONV event was extended with the fields starting at
+        * "time_cycles" when support for cap_user_time_short was added.
+        * For backward compatibility, check the event size and assign the
+        * extended fields only if the event actually contains them.
+        */
+       if (event_contains(*time_conv, time_cycles)) {
+               tc.time_cycles         = time_conv->time_cycles;
+               tc.time_mask           = time_conv->time_mask;
+               tc.cap_user_time_zero  = time_conv->cap_user_time_zero;
+               tc.cap_user_time_short = time_conv->cap_user_time_short;
+
+               if (!tc.cap_user_time_zero)
+                       return 0;
+       }
 
        return tsc_to_perf_time(timestamp, &tc);
 }
index a217ecf..6a67126 100644 (file)
@@ -30,7 +30,7 @@
  *
  * It does so by calculating the costs of the path ending in characters
  * i (in string1) and j (in string2), respectively, given that the last
- * operation is a substition, a swap, a deletion, or an insertion.
+ * operation is a substitution, a swap, a deletion, or an insertion.
  *
  * This implementation allows the costs to be weighted:
  *
index 6b4e5a0..c397be0 100644 (file)
@@ -4,7 +4,7 @@
  * generic one.
  *
  * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch
- * name and the defination of this function is included directly from
+ * name and the definition of this function is included directly from
  * 'arch/arm64/util/unwind-libunwind.c', to make sure that this function
  * is defined no matter what arch the host is.
  *
index 21c216c..b2b92d0 100644 (file)
@@ -4,7 +4,7 @@
  * generic one.
  *
  * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch
- * name and the defination of this function is included directly from
+ * name and the definition of this function is included directly from
  * 'arch/x86/util/unwind-libunwind.c', to make sure that this function
  * is defined no matter what arch the host is.
  *
index dbdffb6..3ceaf7e 100644 (file)
@@ -471,7 +471,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
 
        /*
         * This is an optional work. Even it fail we can continue our
-        * work. Needn't to check error return.
+        * work. Needn't check error return.
         */
        llvm__get_kbuild_opts(&kbuild_dir, &kbuild_include_opts);
 
index b5c2d8b..3ff4936 100644 (file)
@@ -905,7 +905,7 @@ static struct map *machine__addnew_module_map(struct machine *machine, u64 start
 
        maps__insert(&machine->kmaps, map);
 
-       /* Put the map here because maps__insert alread got it */
+       /* Put the map here because maps__insert already got it */
        map__put(map);
 out:
        /* put the dso here, corresponding to  machine__findnew_module_dso */
@@ -1952,7 +1952,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
         * maps because that is what the kernel just did.
         *
         * But when synthesizing, this should not be done.  If we do, we end up
-        * with overlapping maps as we process the sythesized MMAP2 events that
+        * with overlapping maps as we process the synthesized MMAP2 events that
         * get delivered shortly thereafter.
         *
         * Use the FORK event misc flags in an internal way to signal this
@@ -2038,8 +2038,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 static bool symbol__match_regex(struct symbol *sym, regex_t *regex)
 {
        if (!regexec(regex, sym->name, 0, NULL, 0))
-               return 1;
-       return 0;
+               return true;
+       return false;
 }
 
 static void ip__resolve_ams(struct thread *thread,
@@ -2518,7 +2518,7 @@ static bool has_stitched_lbr(struct thread *thread,
 
        /*
         * Check if there are identical LBRs between two samples.
-        * Identicall LBRs must have same from, to and flags values. Also,
+        * Identical LBRs must have same from, to and flags values. Also,
         * they have to be saved in the same LBR registers (same physical
         * index).
         *
@@ -2588,7 +2588,7 @@ err:
 }
 
 /*
- * Recolve LBR callstack chain sample
+ * Resolve LBR callstack chain sample
  * Return:
  * 1 on success get LBR callchain information
  * 0 no available LBR callchain information, should try fp
index 9f32825..d32f5b2 100644 (file)
@@ -75,7 +75,7 @@ struct thread;
 
 /* map__for_each_symbol - iterate over the symbols in the given map
  *
- * @map: the 'struct map *' in which symbols itereated
+ * @map: the 'struct map *' in which symbols are iterated
  * @pos: the 'struct symbol *' to use as a loop cursor
  * @n: the 'struct rb_node *' to use as a temporary storage
  * Note: caller must ensure map->dso is not NULL (map is loaded).
@@ -86,7 +86,7 @@ struct thread;
 /* map__for_each_symbol_with_name - iterate over the symbols in the given map
  *                                  that have the given name
  *
- * @map: the 'struct map *' in which symbols itereated
+ * @map: the 'struct map *' in which symbols are iterated
  * @sym_name: the symbol name
  * @pos: the 'struct symbol *' to use as a loop cursor
  */
index 755cef7..cacdebd 100644 (file)
@@ -44,7 +44,6 @@ bool is_mem_loads_aux_event(struct evsel *leader);
 
 void perf_mem_events__list(void);
 
-struct mem_info;
 int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
@@ -81,7 +80,7 @@ struct c2c_stats {
        u32     rmt_dram;            /* count of loads miss to remote DRAM */
        u32     blk_data;            /* count of loads blocked by data */
        u32     blk_addr;            /* count of loads blocked by address conflict */
-       u32     nomap;               /* count of load/stores with no phys adrs */
+       u32     nomap;               /* count of load/stores with no phys addrs */
        u32     noparse;             /* count of unparsable data sources */
 };
 
index 26c990e..8336dd8 100644 (file)
@@ -181,7 +181,7 @@ static bool evsel_same_pmu(struct evsel *ev1, struct evsel *ev2)
  * @pctx: the parse context for the metric expression.
  * @metric_no_merge: don't attempt to share events for the metric with other
  * metrics.
- * @has_constraint: is there a contraint on the group of events? In which case
+ * @has_constraint: is there a constraint on the group of events? In which case
  * the events won't be grouped.
  * @metric_events: out argument, null terminated array of evsel's associated
  * with the metric.
@@ -618,7 +618,7 @@ static int metricgroup__print_sys_event_iter(struct pmu_event *pe, void *data)
 void metricgroup__print(bool metrics, bool metricgroups, char *filter,
                        bool raw, bool details)
 {
-       struct pmu_events_map *map = perf_pmu__find_map(NULL);
+       struct pmu_events_map *map = pmu_events_map__find();
        struct pmu_event *pe;
        int i;
        struct rblist groups;
@@ -900,7 +900,8 @@ static int __add_metric(struct list_head *metric_list,
                    (match_metric(__pe->metric_group, __metric) ||      \
                     match_metric(__pe->metric_name, __metric)))
 
-static struct pmu_event *find_metric(const char *metric, struct pmu_events_map *map)
+struct pmu_event *metricgroup__find_metric(const char *metric,
+                                          struct pmu_events_map *map)
 {
        struct pmu_event *pe;
        int i;
@@ -985,7 +986,7 @@ static int __resolve_metric(struct metric *m,
                        struct expr_id *parent;
                        struct pmu_event *pe;
 
-                       pe = find_metric(cur->key, map);
+                       pe = metricgroup__find_metric(cur->key, map);
                        if (!pe)
                                continue;
 
@@ -1253,8 +1254,7 @@ int metricgroup__parse_groups(const struct option *opt,
                              struct rblist *metric_events)
 {
        struct evlist *perf_evlist = *(struct evlist **)opt->value;
-       struct pmu_events_map *map = perf_pmu__find_map(NULL);
-
+       struct pmu_events_map *map = pmu_events_map__find();
 
        return parse_groups(perf_evlist, str, metric_no_group,
                            metric_no_merge, NULL, metric_events, map);
@@ -1273,7 +1273,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist,
 
 bool metricgroup__has_metric(const char *metric)
 {
-       struct pmu_events_map *map = perf_pmu__find_map(NULL);
+       struct pmu_events_map *map = pmu_events_map__find();
        struct pmu_event *pe;
        int i;
 
index ed1b939..cc4a924 100644 (file)
@@ -9,7 +9,6 @@
 
 struct evlist;
 struct evsel;
-struct evlist;
 struct option;
 struct rblist;
 struct pmu_events_map;
@@ -44,7 +43,8 @@ int metricgroup__parse_groups(const struct option *opt,
                              bool metric_no_group,
                              bool metric_no_merge,
                              struct rblist *metric_events);
-
+struct pmu_event *metricgroup__find_metric(const char *metric,
+                                          struct pmu_events_map *map);
 int metricgroup__parse_groups_test(struct evlist *evlist,
                                   struct pmu_events_map *map,
                                   const char *str,
diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c
new file mode 100644 (file)
index 0000000..10160ab
--- /dev/null
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/zalloc.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/param.h>
+#include "evlist.h"
+#include "evsel.h"
+#include "parse-events.h"
+#include "parse-events-hybrid.h"
+#include "debug.h"
+#include "pmu.h"
+#include "pmu-hybrid.h"
+#include "perf.h"
+
+/*
+ * Tag @attr for one hybrid PMU: store @type in attr->type and OR
+ * @pmu_type, shifted left by PERF_PMU_TYPE_SHIFT, into attr->config.
+ *
+ * Resulting attr.config layout for PERF_TYPE_HARDWARE and
+ * PERF_TYPE_HW_CACHE:
+ *
+ * PERF_TYPE_HARDWARE:                 0xEEEEEEEE000000AA
+ *                                     AA: hardware event ID
+ *                                     EEEEEEEE: PMU type ID
+ * PERF_TYPE_HW_CACHE:                 0xEEEEEEEE00DDCCBB
+ *                                     BB: hardware cache ID
+ *                                     CC: hardware cache op ID
+ *                                     DD: hardware cache op result ID
+ *                                     EEEEEEEE: PMU type ID
+ *
+ * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
+ */
+static void config_hybrid_attr(struct perf_event_attr *attr,
+                              int type, int pmu_type)
+{
+       __u64 pmu_bits = (__u64)pmu_type << PERF_PMU_TYPE_SHIFT;
+
+       attr->config |= pmu_bits;
+       attr->type = type;
+}
+
+/*
+ * Add one event for hybrid PMU @pmu to @list.
+ *
+ * @attr is temporarily tagged with @config_type and pmu->type (see
+ * config_hybrid_attr()) while the evsel is created. Its original type and
+ * config are put back before returning, on failure as well as on success,
+ * so the caller never sees a half-modified @attr.
+ *
+ * Returns 0 on success, -ENOMEM if the evsel could not be created or the
+ * copy of the PMU name could not be allocated.
+ */
+static int create_event_hybrid(__u32 config_type, int *idx,
+                              struct list_head *list,
+                              struct perf_event_attr *attr, char *name,
+                              struct list_head *config_terms,
+                              struct perf_pmu *pmu)
+{
+       struct evsel *evsel;
+       __u32 type = attr->type;
+       __u64 config = attr->config;
+
+       config_hybrid_attr(attr, config_type, pmu->type);
+       evsel = parse_events__add_event_hybrid(list, idx, attr, name,
+                                              pmu, config_terms);
+
+       /* Undo the temporary tagging whether or not the evsel was created. */
+       attr->type = type;
+       attr->config = config;
+
+       if (!evsel)
+               return -ENOMEM;
+
+       /* strdup() can fail; do not leave a silently NULL pmu_name behind. */
+       evsel->pmu_name = strdup(pmu->name);
+       if (!evsel->pmu_name)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Decide whether @pmu is selected by @parse_state.
+ *
+ * Returns 0 when no hybrid PMU name was requested or when the requested
+ * name equals pmu->name; otherwise the non-zero strcmp() result.
+ */
+static int pmu_cmp(struct parse_events_state *parse_state,
+                  struct perf_pmu *pmu)
+{
+       const char *wanted = parse_state->hybrid_pmu_name;
+
+       return wanted ? strcmp(wanted, pmu->name) : 0;
+}
+
+/*
+ * Create a PERF_TYPE_HARDWARE event on every hybrid PMU accepted by
+ * pmu_cmp(). Stops at, and returns, the first non-zero result of
+ * create_event_hybrid(); returns 0 if every selected PMU succeeded.
+ */
+static int add_hw_hybrid(struct parse_events_state *parse_state,
+                        struct list_head *list, struct perf_event_attr *attr,
+                        char *name, struct list_head *config_terms)
+{
+       struct perf_pmu *pmu;
+
+       perf_pmu__for_each_hybrid_pmu(pmu) {
+               int err;
+
+               if (pmu_cmp(parse_state, pmu) != 0)
+                       continue;
+
+               err = create_event_hybrid(PERF_TYPE_HARDWARE,
+                                         &parse_state->idx, list, attr, name,
+                                         config_terms, pmu);
+               if (err != 0)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Add one raw event for hybrid PMU @pmu to @list.
+ *
+ * attr->type is overwritten with pmu->type and is not restored afterwards.
+ *
+ * Returns 0 on success, -ENOMEM if the evsel could not be created or the
+ * copy of the PMU name could not be allocated.
+ */
+static int create_raw_event_hybrid(int *idx, struct list_head *list,
+                                  struct perf_event_attr *attr, char *name,
+                                  struct list_head *config_terms,
+                                  struct perf_pmu *pmu)
+{
+       struct evsel *evsel;
+
+       attr->type = pmu->type;
+       evsel = parse_events__add_event_hybrid(list, idx, attr, name,
+                                              pmu, config_terms);
+       if (!evsel)
+               return -ENOMEM;
+
+       /* strdup() can fail; do not leave a silently NULL pmu_name behind. */
+       evsel->pmu_name = strdup(pmu->name);
+       if (!evsel->pmu_name)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Create a raw event on every hybrid PMU accepted by pmu_cmp(). Stops at,
+ * and returns, the first non-zero result of create_raw_event_hybrid();
+ * returns 0 if every selected PMU succeeded.
+ */
+static int add_raw_hybrid(struct parse_events_state *parse_state,
+                         struct list_head *list, struct perf_event_attr *attr,
+                         char *name, struct list_head *config_terms)
+{
+       struct perf_pmu *pmu;
+
+       perf_pmu__for_each_hybrid_pmu(pmu) {
+               int err;
+
+               if (pmu_cmp(parse_state, pmu) != 0)
+                       continue;
+
+               err = create_raw_event_hybrid(&parse_state->idx, list, attr,
+                                             name, config_terms, pmu);
+               if (err != 0)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Hybrid-aware handling of a numeric event.
+ *
+ * *@hybrid is set to false, and 0 is returned without adding anything,
+ * when @attr is a PERF_TYPE_SOFTWARE event or perf_pmu__has_hybrid()
+ * reports no hybrid PMUs. Otherwise *@hybrid is set to true and the
+ * events are created per hybrid PMU: PERF_TYPE_RAW goes through
+ * add_raw_hybrid(), every other type through add_hw_hybrid(), whose
+ * result is returned.
+ */
+int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state,
+                                    struct list_head *list,
+                                    struct perf_event_attr *attr,
+                                    char *name, struct list_head *config_terms,
+                                    bool *hybrid)
+{
+       if (attr->type == PERF_TYPE_SOFTWARE || !perf_pmu__has_hybrid()) {
+               *hybrid = false;
+               return 0;
+       }
+
+       *hybrid = true;
+
+       if (attr->type == PERF_TYPE_RAW)
+               return add_raw_hybrid(parse_state, list, attr, name,
+                                     config_terms);
+
+       return add_hw_hybrid(parse_state, list, attr, name,
+                            config_terms);
+}
+
+/*
+ * Hybrid-aware handling of a cache event.
+ *
+ * *@hybrid receives the result of perf_pmu__has_hybrid(). When it is
+ * false nothing is added and 0 is returned. Otherwise a
+ * PERF_TYPE_HW_CACHE event is created on every hybrid PMU accepted by
+ * pmu_cmp(); the first non-zero result of create_event_hybrid() is
+ * returned immediately, 0 if every selected PMU succeeded.
+ */
+int parse_events__add_cache_hybrid(struct list_head *list, int *idx,
+                                  struct perf_event_attr *attr, char *name,
+                                  struct list_head *config_terms,
+                                  bool *hybrid,
+                                  struct parse_events_state *parse_state)
+{
+       struct perf_pmu *pmu;
+
+       if (!perf_pmu__has_hybrid()) {
+               *hybrid = false;
+               return 0;
+       }
+
+       *hybrid = true;
+
+       perf_pmu__for_each_hybrid_pmu(pmu) {
+               int err;
+
+               if (pmu_cmp(parse_state, pmu) != 0)
+                       continue;
+
+               err = create_event_hybrid(PERF_TYPE_HW_CACHE, idx, list,
+                                         attr, name, config_terms, pmu);
+               if (err != 0)
+                       return err;
+       }
+
+       return 0;
+}
diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h
new file mode 100644 (file)
index 0000000..f33bd67
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_PARSE_EVENTS_HYBRID_H
+#define __PERF_PARSE_EVENTS_HYBRID_H
+
+#include <linux/list.h>
+#include <stdbool.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include <string.h>
+
+/*
+ * Only used through pointers in this header; declare the tag here so the
+ * header does not depend on being included after parse-events.h.
+ */
+struct parse_events_state;
+
+/*
+ * Add a numeric event on the hybrid PMUs. *@hybrid tells the caller
+ * whether the hybrid path handled the event; when it is false the caller
+ * is expected to add the event itself.
+ */
+int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state,
+                                    struct list_head *list,
+                                    struct perf_event_attr *attr,
+                                    char *name, struct list_head *config_terms,
+                                    bool *hybrid);
+
+int parse_events__add_cache_hybrid(struct list_head *list, int *idx,
+                                  struct perf_event_attr *attr, char *name,
+                                  struct list_head *config_terms,
+                                  bool *hybrid,
+                                  struct parse_events_state *parse_state);
+
+#endif /* __PERF_PARSE_EVENTS_HYBRID_H */
index c0c0fab..4dad142 100644 (file)
@@ -37,6 +37,8 @@
 #include "util/evsel_config.h"
 #include "util/event.h"
 #include "util/pfm.h"
+#include "util/parse-events-hybrid.h"
+#include "util/pmu-hybrid.h"
 #include "perf.h"
 
 #define MAX_NAME_LEN 100
@@ -47,6 +49,9 @@ extern int parse_events_debug;
 int parse_events_parse(void *parse_state, void *scanner);
 static int get_config_terms(struct list_head *head_config,
                            struct list_head *head_terms __maybe_unused);
+static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state,
+                                        const char *str, char *pmu_name,
+                                        struct list_head *list);
 
 static struct perf_pmu_event_symbol *perf_pmu_events_list;
 /*
@@ -452,14 +457,16 @@ static int config_attr(struct perf_event_attr *attr,
 int parse_events_add_cache(struct list_head *list, int *idx,
                           char *type, char *op_result1, char *op_result2,
                           struct parse_events_error *err,
-                          struct list_head *head_config)
+                          struct list_head *head_config,
+                          struct parse_events_state *parse_state)
 {
        struct perf_event_attr attr;
        LIST_HEAD(config_terms);
        char name[MAX_NAME_LEN], *config_name;
        int cache_type = -1, cache_op = -1, cache_result = -1;
        char *op_result[2] = { op_result1, op_result2 };
-       int i, n;
+       int i, n, ret;
+       bool hybrid;
 
        /*
         * No fallback - if we cannot get a clear cache type
@@ -519,6 +526,13 @@ int parse_events_add_cache(struct list_head *list, int *idx,
                if (get_config_terms(head_config, &config_terms))
                        return -ENOMEM;
        }
+
+       ret = parse_events__add_cache_hybrid(list, idx, &attr,
+                                            config_name ? : name, &config_terms,
+                                            &hybrid, parse_state);
+       if (hybrid)
+               return ret;
+
        return add_event(list, idx, &attr, config_name ? : name, &config_terms);
 }
 
@@ -846,9 +860,9 @@ split_bpf_config_terms(struct list_head *evt_head_config,
        struct parse_events_term *term, *temp;
 
        /*
-        * Currectly, all possible user config term
+        * Currently, all possible user config term
         * belong to bpf object. parse_events__is_hardcoded_term()
-        * happends to be a good flag.
+        * happens to be a good flag.
         *
         * See parse_events_config_bpf() and
         * config_term_tracepoint().
@@ -898,7 +912,7 @@ int parse_events_load_bpf(struct parse_events_state *parse_state,
 
        /*
         * Caller doesn't know anything about obj_head_config,
-        * so combine them together again before returnning.
+        * so combine them together again before returning.
         */
        if (head_config)
                list_splice_tail(&obj_head_config, head_config);
@@ -1185,10 +1199,10 @@ do {                                                                       \
        }
 
        /*
-        * Check term availbility after basic checking so
+        * Check term availability after basic checking so
         * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered.
         *
-        * If check availbility at the entry of this function,
+        * If check availability at the entry of this function,
         * user will see "'<sysfs term>' is not usable in 'perf stat'"
         * if an invalid config term is provided for legacy events
         * (for example, instructions/badterm/...), which is confusing.
@@ -1419,6 +1433,8 @@ int parse_events_add_numeric(struct parse_events_state *parse_state,
 {
        struct perf_event_attr attr;
        LIST_HEAD(config_terms);
+       bool hybrid;
+       int ret;
 
        memset(&attr, 0, sizeof(attr));
        attr.type = type;
@@ -1433,6 +1449,12 @@ int parse_events_add_numeric(struct parse_events_state *parse_state,
                        return -ENOMEM;
        }
 
+       ret = parse_events__add_numeric_hybrid(parse_state, list, &attr,
+                                              get_config_name(head_config),
+                                              &config_terms, &hybrid);
+       if (hybrid)
+               return ret;
+
        return add_event(list, &parse_state->idx, &attr,
                         get_config_name(head_config), &config_terms);
 }
@@ -1456,6 +1478,33 @@ static bool config_term_percore(struct list_head *config_terms)
        return false;
 }
 
+/*
+ * Handle a hybrid PMU (@name) whose config list holds exactly one term by
+ * re-parsing that term's config string as an event restricted to @name.
+ *
+ * Returns -1 without adding anything when a fake PMU is in use, when
+ * @head_config is NULL, empty or holds more than one term, when @name is
+ * not a hybrid PMU, or when the term has no config string or its config
+ * string is "event". Otherwise returns the result of
+ * parse_events__with_hybrid_pmu().
+ */
+static int parse_events__inside_hybrid_pmu(struct parse_events_state *parse_state,
+                                          struct list_head *list, char *name,
+                                          struct list_head *head_config)
+{
+       struct parse_events_term *term;
+
+       if (parse_state->fake_pmu)
+               return -1;
+
+       if (!head_config || list_empty(head_config))
+               return -1;
+
+       if (!perf_pmu__is_hybrid(name))
+               return -1;
+
+       /* More than one term in list. */
+       if (head_config->next && head_config->next->next != head_config)
+               return -1;
+
+       term = list_first_entry(head_config, struct parse_events_term, list);
+       if (!term || !term->config || !strcmp(term->config, "event"))
+               return -1;
+
+       return parse_events__with_hybrid_pmu(parse_state, term->config,
+                                            name, list);
+}
+
 int parse_events_add_pmu(struct parse_events_state *parse_state,
                         struct list_head *list, char *name,
                         struct list_head *head_config,
@@ -1549,6 +1598,11 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
        if (pmu->default_config && get_config_chgs(pmu, head_config, &config_terms))
                return -ENOMEM;
 
+       if (!parse_events__inside_hybrid_pmu(parse_state, list, name,
+                                            head_config)) {
+               return 0;
+       }
+
        if (!parse_state->fake_pmu && perf_pmu__config(pmu, &attr, head_config, parse_state->error)) {
                struct evsel_config_term *pos, *tmp;
 
@@ -1567,6 +1621,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
        if (!evsel)
                return -ENOMEM;
 
+       if (evsel->name)
+               evsel->use_config_name = true;
+
        evsel->pmu_name = name ? strdup(name) : NULL;
        evsel->use_uncore_alias = use_uncore_alias;
        evsel->percore = config_term_percore(&evsel->config_terms);
@@ -1804,6 +1861,7 @@ struct event_modifier {
        int pinned;
        int weak;
        int exclusive;
+       int bpf_counter;
 };
 
 static int get_event_modifier(struct event_modifier *mod, char *str,
@@ -1824,6 +1882,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
        int exclude = eu | ek | eh;
        int exclude_GH = evsel ? evsel->exclude_GH : 0;
        int weak = 0;
+       int bpf_counter = 0;
 
        memset(mod, 0, sizeof(*mod));
 
@@ -1867,6 +1926,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
                        exclusive = 1;
                } else if (*str == 'W') {
                        weak = 1;
+               } else if (*str == 'b') {
+                       bpf_counter = 1;
                } else
                        break;
 
@@ -1898,6 +1959,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
        mod->sample_read = sample_read;
        mod->pinned = pinned;
        mod->weak = weak;
+       mod->bpf_counter = bpf_counter;
        mod->exclusive = exclusive;
 
        return 0;
@@ -1912,7 +1974,7 @@ static int check_modifier(char *str)
        char *p = str;
 
        /* The sizeof includes 0 byte as well. */
-       if (strlen(str) > (sizeof("ukhGHpppPSDIWe") - 1))
+       if (strlen(str) > (sizeof("ukhGHpppPSDIWeb") - 1))
                return -1;
 
        while (*p) {
@@ -1953,6 +2015,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add)
                evsel->sample_read         = mod.sample_read;
                evsel->precise_max         = mod.precise_max;
                evsel->weak_group          = mod.weak;
+               evsel->bpf_counter         = mod.bpf_counter;
 
                if (evsel__is_group_leader(evsel)) {
                        evsel->core.attr.pinned = mod.pinned;
@@ -2162,6 +2225,33 @@ int parse_events_terms(struct list_head *terms, const char *str)
        return ret;
 }
 
+/*
+ * Parse @str with a private parse state whose hybrid_pmu_name is
+ * @pmu_name and whose event index starts at parse_state->idx.
+ *
+ * On success the parsed events are spliced onto @list and
+ * parse_state->idx is updated from the private state.
+ *
+ * Returns 0 on success, -1 if the scanner succeeded but produced no
+ * events, otherwise the scanner's non-zero result.
+ */
+static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state,
+                                        const char *str, char *pmu_name,
+                                        struct list_head *list)
+{
+       struct parse_events_state ps = {
+               .list            = LIST_HEAD_INIT(ps.list),
+               .stoken          = PE_START_EVENTS,
+               .hybrid_pmu_name = pmu_name,
+               .idx             = parse_state->idx,
+       };
+       int err = parse_events__scanner(str, &ps);
+
+       perf_pmu__parse_cleanup();
+
+       if (err)
+               return err;
+
+       if (list_empty(&ps.list))
+               return -1;
+
+       list_splice(&ps.list, list);
+       parse_state->idx = ps.idx;
+       return 0;
+}
+
 int __parse_events(struct evlist *evlist, const char *str,
                   struct parse_events_error *err, struct perf_pmu *fake_pmu)
 {
@@ -3185,3 +3275,12 @@ char *parse_events_formats_error_string(char *additional_terms)
 fail:
        return NULL;
 }
+
+/*
+ * Exported wrapper around __add_event() for the hybrid event code:
+ * forwards @list, @idx, @attr, @name, @pmu and @config_terms and returns
+ * whatever __add_event() returns.
+ * NOTE(review): the fixed arguments (true, false, NULL) are positional
+ * flags of __add_event(); check its prototype for their meaning.
+ */
+struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx,
+                                            struct perf_event_attr *attr,
+                                            char *name, struct perf_pmu *pmu,
+                                            struct list_head *config_terms)
+{
+       return __add_event(list, idx, attr, true, name, pmu,
+                          config_terms, false, NULL);
+}
index e80c9b7..bf6e41a 100644 (file)
@@ -138,6 +138,7 @@ struct parse_events_state {
        struct list_head          *terms;
        int                        stoken;
        struct perf_pmu           *fake_pmu;
+       char                      *hybrid_pmu_name;
 };
 
 void parse_events__handle_error(struct parse_events_error *err, int idx,
@@ -188,7 +189,8 @@ int parse_events_add_tool(struct parse_events_state *parse_state,
 int parse_events_add_cache(struct list_head *list, int *idx,
                           char *type, char *op_result1, char *op_result2,
                           struct parse_events_error *error,
-                          struct list_head *head_config);
+                          struct list_head *head_config,
+                          struct parse_events_state *parse_state);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
                                u64 addr, char *type, u64 len);
 int parse_events_add_pmu(struct parse_events_state *parse_state,
@@ -263,4 +265,9 @@ static inline bool is_sdt_event(char *str __maybe_unused)
 
 int perf_pmu__test_parse_init(void);
 
+struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx,
+                                            struct perf_event_attr *attr,
+                                            char *name, struct perf_pmu *pmu,
+                                            struct list_head *config_terms);
+
 #endif /* __PERF_PARSE_EVENTS_H */
index 0b36285..fb8646c 100644 (file)
@@ -210,7 +210,7 @@ name_tag    [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\']
 name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]*
 drv_cfg_term   [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)?
 /* If you add a modifier you need to update check_modifier() */
-modifier_event [ukhpPGHSDIWe]+
+modifier_event [ukhpPGHSDIWeb]+
 modifier_bp    [rwx]{1,3}
 
 %%
index d57ac86..aba12a4 100644 (file)
@@ -454,7 +454,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_e
 
        list = alloc_list();
        ABORT_ON(!list);
-       err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6);
+       err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6,
+                                    parse_state);
        parse_events_terms__delete($6);
        free($1);
        free($3);
@@ -475,7 +476,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 
        list = alloc_list();
        ABORT_ON(!list);
-       err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4);
+       err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4,
+                                    parse_state);
        parse_events_terms__delete($4);
        free($1);
        free($3);
@@ -495,7 +497,8 @@ PE_NAME_CACHE_TYPE opt_event_config
 
        list = alloc_list();
        ABORT_ON(!list);
-       err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2);
+       err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2,
+                                    parse_state);
        parse_events_terms__delete($2);
        free($1);
        if (err) {
diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c
new file mode 100644 (file)
index 0000000..f51ccaa
--- /dev/null
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <locale.h>
+#include <api/fs/fs.h>
+#include "fncache.h"
+#include "pmu-hybrid.h"
+
+LIST_HEAD(perf_pmu__hybrid_pmus);
+
+/*
+ * Report whether @name is a hybrid core PMU that is present in sysfs.
+ *
+ * Only names starting with "cpu_" are considered.  The PMU counts as
+ * mounted when its "cpus" file (CPUS_TEMPLATE_CPU) can be opened and
+ * begins with a number.  Returns false on any failure along the way.
+ */
+bool perf_pmu__hybrid_mounted(const char *name)
+{
+       char path[PATH_MAX];
+       const char *sysfs;
+       FILE *file;
+       int n, cpu;
+
+       if (strncmp(name, "cpu_", 4))
+               return false;
+
+       sysfs = sysfs__mountpoint();
+       if (!sysfs)
+               return false;
+
+       snprintf(path, PATH_MAX, CPUS_TEMPLATE_CPU, sysfs, name);
+       if (!file_available(path))
+               return false;
+
+       file = fopen(path, "r");
+       if (!file)
+               return false;
+
+       /* 'cpu' is an int, so scan with %d; the value itself is unused. */
+       n = fscanf(file, "%d", &cpu);
+       fclose(file);
+       if (n <= 0)
+               return false;
+
+       return true;
+}
+
+/* Look up a PMU on the hybrid list by exact name; NULL if absent or @name is NULL. */
+struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name)
+{
+       struct perf_pmu *pos, *found = NULL;
+
+       if (name == NULL)
+               return NULL;
+
+       perf_pmu__for_each_hybrid_pmu(pos) {
+               if (strcmp(pos->name, name) == 0) {
+                       found = pos;
+                       break;
+               }
+       }
+
+       return found;
+}
+
+/* True when @name matches a PMU on the hybrid list. */
+bool perf_pmu__is_hybrid(const char *name)
+{
+       struct perf_pmu *pmu = perf_pmu__find_hybrid_pmu(name);
+
+       return pmu ? true : false;
+}
+
+/*
+ * Build "cpu_<type>" and return it if that names a hybrid PMU, either one
+ * already on the hybrid list or one found mounted in sysfs.  The returned
+ * string is heap-allocated by asprintf() and is not freed here.  Returns
+ * NULL when asprintf() fails or when no such PMU is found.
+ */
+char *perf_pmu__hybrid_type_to_pmu(const char *type)
+{
+       char *pmu_name = NULL;
+       bool known;
+
+       if (asprintf(&pmu_name, "cpu_%s", type) < 0)
+               return NULL;
+
+       /* The PMU may not have been scanned yet, so fall back to sysfs. */
+       known = perf_pmu__is_hybrid(pmu_name) ||
+               perf_pmu__hybrid_mounted(pmu_name);
+       if (!known) {
+               free(pmu_name);
+               pmu_name = NULL;
+       }
+
+       return pmu_name;
+}
diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h
new file mode 100644 (file)
index 0000000..d0fa7bc
--- /dev/null
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PMU_HYBRID_H
+#define __PMU_HYBRID_H
+
+#include <linux/perf_event.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <stdbool.h>
+#include "pmu.h"
+
+extern struct list_head perf_pmu__hybrid_pmus;
+
+#define perf_pmu__for_each_hybrid_pmu(pmu)     \
+       list_for_each_entry(pmu, &perf_pmu__hybrid_pmus, hybrid_list)
+
+bool perf_pmu__hybrid_mounted(const char *name);
+
+struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name);
+bool perf_pmu__is_hybrid(const char *name);
+char *perf_pmu__hybrid_type_to_pmu(const char *type);
+
+#endif /* __PMU_HYBRID_H */
index 46fd0f9..88c8ecd 100644 (file)
@@ -25,6 +25,7 @@
 #include "string2.h"
 #include "strbuf.h"
 #include "fncache.h"
+#include "pmu-hybrid.h"
 
 struct perf_pmu perf_pmu__fake;
 
@@ -39,6 +40,7 @@ int perf_pmu_parse(struct list_head *list, char *name);
 extern FILE *perf_pmu_in;
 
 static LIST_HEAD(pmus);
+static bool hybrid_scanned;
 
 /*
  * Parse & process all the sysfs attributes located under
@@ -283,6 +285,7 @@ void perf_pmu_free_alias(struct perf_pmu_alias *newalias)
        zfree(&newalias->str);
        zfree(&newalias->metric_expr);
        zfree(&newalias->metric_name);
+       zfree(&newalias->pmu_name);
        parse_events_terms__purge(&newalias->terms);
        free(newalias);
 }
@@ -297,6 +300,10 @@ static bool perf_pmu_merge_alias(struct perf_pmu_alias *newalias,
 
        list_for_each_entry(a, alist, list) {
                if (!strcasecmp(newalias->name, a->name)) {
+                       if (newalias->pmu_name && a->pmu_name &&
+                           !strcasecmp(newalias->pmu_name, a->pmu_name)) {
+                               continue;
+                       }
                        perf_pmu_update_alias(a, newalias);
                        perf_pmu_free_alias(newalias);
                        return true;
@@ -306,18 +313,27 @@ static bool perf_pmu_merge_alias(struct perf_pmu_alias *newalias,
 }
 
 static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
-                                char *desc, char *val,
-                                char *long_desc, char *topic,
-                                char *unit, char *perpkg,
-                                char *metric_expr,
-                                char *metric_name,
-                                char *deprecated)
+                                char *desc, char *val, struct pmu_event *pe)
 {
        struct parse_events_term *term;
        struct perf_pmu_alias *alias;
        int ret;
        int num;
        char newval[256];
+       char *long_desc = NULL, *topic = NULL, *unit = NULL, *perpkg = NULL,
+            *metric_expr = NULL, *metric_name = NULL, *deprecated = NULL,
+            *pmu_name = NULL;
+
+       if (pe) {
+               long_desc = (char *)pe->long_desc;
+               topic = (char *)pe->topic;
+               unit = (char *)pe->unit;
+               perpkg = (char *)pe->perpkg;
+               metric_expr = (char *)pe->metric_expr;
+               metric_name = (char *)pe->metric_name;
+               deprecated = (char *)pe->deprecated;
+               pmu_name = (char *)pe->pmu;
+       }
 
        alias = malloc(sizeof(*alias));
        if (!alias)
@@ -382,6 +398,7 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
        }
        alias->per_pkg = perpkg && sscanf(perpkg, "%d", &num) == 1 && num == 1;
        alias->str = strdup(newval);
+       alias->pmu_name = pmu_name ? strdup(pmu_name) : NULL;
 
        if (deprecated)
                alias->deprecated = true;
@@ -406,8 +423,7 @@ static int perf_pmu__new_alias(struct list_head *list, char *dir, char *name, FI
        /* Remove trailing newline from sysfs file */
        strim(buf);
 
-       return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL, NULL, NULL,
-                                    NULL, NULL, NULL, NULL);
+       return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL);
 }
 
 static inline bool pmu_alias_info_file(char *name)
@@ -599,7 +615,6 @@ static struct perf_cpu_map *__pmu_cpumask(const char *path)
  */
 #define SYS_TEMPLATE_ID        "./bus/event_source/devices/%s/identifier"
 #define CPUS_TEMPLATE_UNCORE   "%s/bus/event_source/devices/%s/cpumask"
-#define CPUS_TEMPLATE_CPU      "%s/bus/event_source/devices/%s/cpus"
 
 static struct perf_cpu_map *pmu_cpumask(const char *name)
 {
@@ -631,6 +646,9 @@ static bool pmu_is_uncore(const char *name)
        char path[PATH_MAX];
        const char *sysfs;
 
+       if (perf_pmu__hybrid_mounted(name))
+               return false;
+
        sysfs = sysfs__mountpoint();
        snprintf(path, PATH_MAX, CPUS_TEMPLATE_UNCORE, sysfs, name);
        return file_available(path);
@@ -717,6 +735,11 @@ struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu)
        return map;
 }
 
+/* Weak default: delegate to perf_pmu__find_map() with no PMU given. */
+struct pmu_events_map *__weak pmu_events_map__find(void)
+{
+       struct pmu_events_map *map = perf_pmu__find_map(NULL);
+
+       return map;
+}
+
 bool pmu_uncore_alias_match(const char *pmu_name, const char *name)
 {
        char *tmp = NULL, *tok, *str;
@@ -793,11 +816,7 @@ new_alias:
                /* need type casts to override 'const' */
                __perf_pmu__new_alias(head, NULL, (char *)pe->name,
                                (char *)pe->desc, (char *)pe->event,
-                               (char *)pe->long_desc, (char *)pe->topic,
-                               (char *)pe->unit, (char *)pe->perpkg,
-                               (char *)pe->metric_expr,
-                               (char *)pe->metric_name,
-                               (char *)pe->deprecated);
+                               pe);
        }
 }
 
@@ -864,13 +883,7 @@ static int pmu_add_sys_aliases_iter_fn(struct pmu_event *pe, void *data)
                                      (char *)pe->name,
                                      (char *)pe->desc,
                                      (char *)pe->event,
-                                     (char *)pe->long_desc,
-                                     (char *)pe->topic,
-                                     (char *)pe->unit,
-                                     (char *)pe->perpkg,
-                                     (char *)pe->metric_expr,
-                                     (char *)pe->metric_name,
-                                     (char *)pe->deprecated);
+                                     pe);
        }
 
        return 0;
@@ -942,6 +955,7 @@ static struct perf_pmu *pmu_lookup(const char *name)
        pmu->is_uncore = pmu_is_uncore(name);
        if (pmu->is_uncore)
                pmu->id = pmu_id(name);
+       pmu->is_hybrid = perf_pmu__hybrid_mounted(name);
        pmu->max_precise = pmu_max_precise(name);
        pmu_add_cpu_aliases(&aliases, pmu);
        pmu_add_sys_aliases(&aliases, pmu);
@@ -953,6 +967,9 @@ static struct perf_pmu *pmu_lookup(const char *name)
        list_splice(&aliases, &pmu->aliases);
        list_add_tail(&pmu->list, &pmus);
 
+       if (pmu->is_hybrid)
+               list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus);
+
        pmu->default_config = perf_pmu__get_default_config(pmu);
 
        return pmu;
@@ -1069,7 +1086,7 @@ int perf_pmu__format_type(struct list_head *formats, const char *name)
 
 /*
  * Sets value based on the format definition (format parameter)
- * and unformated value (value parameter).
+ * and unformatted value (value parameter).
  */
 static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v,
                             bool zero)
@@ -1408,7 +1425,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
        }
 
        /*
-        * if no unit or scale foundin aliases, then
+        * if no unit or scale found in aliases, then
         * set defaults as for evsel
         * unit cannot left to NULL
         */
@@ -1845,3 +1862,13 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
                   "'%llx' not supported by kernel)!\n",
                   name ?: "N/A", buf, config);
 }
+
+/*
+ * Tell whether the hybrid PMU list is non-empty.  The first call runs
+ * perf_pmu__scan(NULL) once (presumably to populate that list - confirm
+ * against perf_pmu__scan()); later calls only inspect the list.
+ */
+bool perf_pmu__has_hybrid(void)
+{
+       if (hybrid_scanned)
+               return !list_empty(&perf_pmu__hybrid_pmus);
+
+       /* The flag is set before the scan runs. */
+       hybrid_scanned = true;
+       perf_pmu__scan(NULL);
+
+       return !list_empty(&perf_pmu__hybrid_pmus);
+}
index 160b0f5..a790ef7 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/bitmap.h>
 #include <linux/compiler.h>
 #include <linux/perf_event.h>
+#include <linux/list.h>
 #include <stdbool.h>
 #include "parse-events.h"
 #include "pmu-events/pmu-events.h"
@@ -19,6 +20,7 @@ enum {
 
 #define PERF_PMU_FORMAT_BITS 64
 #define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/"
+#define CPUS_TEMPLATE_CPU      "%s/bus/event_source/devices/%s/cpus"
 
 struct perf_event_attr;
 
@@ -34,6 +36,7 @@ struct perf_pmu {
        __u32 type;
        bool selectable;
        bool is_uncore;
+       bool is_hybrid;
        bool auxtrace;
        int max_precise;
        struct perf_event_attr *default_config;
@@ -42,6 +45,7 @@ struct perf_pmu {
        struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */
        struct list_head caps;    /* HEAD struct perf_pmu_caps -> list */
        struct list_head list;    /* ELEM */
+       struct list_head hybrid_list;
 };
 
 extern struct perf_pmu perf_pmu__fake;
@@ -72,6 +76,7 @@ struct perf_pmu_alias {
        bool deprecated;
        char *metric_expr;
        char *metric_name;
+       char *pmu_name;
 };
 
 struct perf_pmu *perf_pmu__find(const char *name);
@@ -114,6 +119,7 @@ void pmu_add_cpu_aliases_map(struct list_head *head, struct perf_pmu *pmu,
                             struct pmu_events_map *map);
 
 struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu);
+struct pmu_events_map *pmu_events_map__find(void);
 bool pmu_uncore_alias_match(const char *pmu_name, const char *name);
 void perf_pmu_free_alias(struct perf_pmu_alias *alias);
 
@@ -126,4 +132,6 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu);
 void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
                                   char *name);
 
+bool perf_pmu__has_hybrid(void);
+
 #endif /* __PMU_H */
index a9cff3a..a78c8d5 100644 (file)
@@ -3228,7 +3228,7 @@ errout:
        return err;
 }
 
-/* Concatinate two arrays */
+/* Concatenate two arrays */
 static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b)
 {
        void *ret;
@@ -3258,7 +3258,7 @@ concat_probe_trace_events(struct probe_trace_event **tevs, int *ntevs,
        if (*ntevs + ntevs2 > probe_conf.max_probes)
                ret = -E2BIG;
        else {
-               /* Concatinate the array of probe_trace_event */
+               /* Concatenate the array of probe_trace_event */
                new_tevs = memcat(*tevs, (*ntevs) * sizeof(**tevs),
                                  *tevs2, ntevs2 * sizeof(**tevs2));
                if (!new_tevs)
index 1b118c9..866f2d5 100644 (file)
@@ -164,7 +164,7 @@ static struct probe_trace_arg_ref *alloc_trace_arg_ref(long offs)
 /*
  * Convert a location into trace_arg.
  * If tvar == NULL, this just checks variable can be converted.
- * If fentry == true and vr_die is a parameter, do huristic search
+ * If fentry == true and vr_die is a parameter, do heuristic search
  * for the location fuzzed by function entry mcount.
  */
 static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
@@ -498,7 +498,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
                               " nor array.\n", varname);
                        return -EINVAL;
                }
-               /* While prcessing unnamed field, we don't care about this */
+               /* While processing unnamed field, we don't care about this */
                if (field->ref && dwarf_diename(vr_die)) {
                        pr_err("Semantic error: %s must be referred by '.'\n",
                               field->name);
@@ -1832,7 +1832,7 @@ static int line_range_walk_cb(const char *fname, int lineno,
            (lf->lno_s > lineno || lf->lno_e < lineno))
                return 0;
 
-       /* Make sure this line can be reversable */
+       /* Make sure this line can be reversible */
        if (cu_find_lineinfo(&lf->cu_die, addr, &__fname, &__lineno) > 0
            && (lineno != __lineno || strcmp(fname, __fname)))
                return 0;
index 845dd46..d7c9766 100644 (file)
@@ -37,3 +37,5 @@ util/units.c
 util/affinity.c
 util/rwsem.c
 util/hashmap.c
+util/pmu-hybrid.c
+util/fncache.c
index 278abec..412f8e7 100644 (file)
@@ -90,6 +90,7 @@ int metricgroup__copy_metric_events(struct evlist *evlist, struct cgroup *cgrp,
  */
 void bpf_counter__destroy(struct evsel *evsel);
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd);
+int bpf_counter__disable(struct evsel *evsel);
 
 void bpf_counter__destroy(struct evsel *evsel __maybe_unused)
 {
@@ -100,6 +101,11 @@ int bpf_counter__install_pe(struct evsel *evsel __maybe_unused, int cpu __maybe_
        return 0;
 }
 
+/* Stub: ignores @evsel and always returns 0. */
+int bpf_counter__disable(struct evsel *evsel __maybe_unused)
+{
+       return 0;
+}
+
 /*
  * Support debug printing even though util/debug.c is not linked.  That means
  * implementing 'verbose' and 'eprintf'.
index 078a717..8130b56 100644 (file)
@@ -45,7 +45,7 @@
  * the data portion is mmap()'ed.
  *
  * To sort the queues in chronological order, all queue access is controlled
- * by the auxtrace_heap. This is basicly a stack, each stack element has two
+ * by the auxtrace_heap. This is basically a stack, each stack element has two
  * entries, the queue number and a time stamp. However the stack is sorted by
  * the time stamps. The highest time stamp is at the bottom the lowest
  * (nearest) time stamp is at the top. That sort order is maintained at all
  * stamp of the last processed entry of the auxtrace_buffer replaces the
  * current auxtrace_heap top.
  *
- * 3. Auxtrace_queues might run of out data and are feeded by the
+ * 3. Auxtrace_queues might run out of data and are fed by the
  * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
  *
  * Event Generation
- * Each sampling-data entry in the auxilary trace data generates a perf sample.
+ * Each sampling-data entry in the auxiliary trace data generates a perf sample.
  * This sample is filled
  * with data from the auxtrace such as PID/TID, instruction address, CPU state,
  * etc. This sample is processed with perf_session__deliver_synth_event() to
@@ -575,7 +575,7 @@ static unsigned long long get_trailer_time(const unsigned char *buf)
  * pointer to the queue, the second parameter is the time stamp. This
  * is the time stamp:
  * - of the event that triggered this processing.
- * - or the time stamp when the last proccesing of this queue stopped.
+ * - or the time stamp when the last processing of this queue stopped.
  *   In this case it stopped at a 4KB page boundary and record the
  *   position on where to continue processing on the next invocation
  *   (see buffer->use_data and buffer->use_size).
@@ -640,7 +640,7 @@ static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
                        goto out;
                }
 
-               pos += dsdes;   /* Skip diagnositic entry */
+               pos += dsdes;   /* Skip diagnostic entry */
 
                /* Check for trailer entry */
                if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
index cfcf8d5..08ec3c3 100644 (file)
@@ -160,11 +160,9 @@ static void s390_cpumcfdg_dump(struct perf_sample *sample)
        const char *color = PERF_COLOR_BLUE;
        struct cf_ctrset_entry *cep, ce;
        struct pmu_events_map *map;
-       struct perf_pmu pmu;
        u64 *p;
 
-       memset(&pmu, 0, sizeof(pmu));
-       map = perf_pmu__find_map(&pmu);
+       map = pmu_events_map__find();
        while (offset < len) {
                cep = (struct cf_ctrset_entry *)(buf + offset);
 
index c83c2c6..4e4aa4c 100644 (file)
@@ -1531,7 +1531,7 @@ static void set_table_handlers(struct tables *tables)
                 * Attempt to use the call path root from the call return
                 * processor, if the call return processor is in use. Otherwise,
                 * we allocate a new call path root. This prevents exporting
-                * duplicate call path ids when both are in use simultaniously.
+                * duplicate call path ids when both are in use simultaneously.
                 */
                if (tables->dbe.crp)
                        tables->dbe.cpr = tables->dbe.crp->cpr;
index 859832a..a12cf4f 100644 (file)
@@ -29,6 +29,7 @@
 #include "thread-stack.h"
 #include "sample-raw.h"
 #include "stat.h"
+#include "tsc.h"
 #include "ui/progress.h"
 #include "../perf.h"
 #include "arch/common.h"
@@ -451,6 +452,16 @@ static int process_stat_round_stub(struct perf_session *perf_session __maybe_unu
        return 0;
 }
 
+/*
+ * Default handler installed for tool->time_conv: when dump_trace is set,
+ * print the event to stdout, then log that it was not handled.
+ * Always returns 0.
+ */
+static int process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused,
+                                       union perf_event *event)
+{
+       if (dump_trace != 0)
+               perf_event__fprintf_time_conv(event, stdout);
+
+       dump_printf(": unhandled!\n");
+
+       return 0;
+}
+
 static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused,
                                                       union perf_event *event __maybe_unused,
                                                       u64 file_offset __maybe_unused)
@@ -532,7 +543,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
        if (tool->stat_round == NULL)
                tool->stat_round = process_stat_round_stub;
        if (tool->time_conv == NULL)
-               tool->time_conv = process_event_op2_stub;
+               tool->time_conv = process_event_time_conv_stub;
        if (tool->feature == NULL)
                tool->feature = process_event_op2_stub;
        if (tool->compressed == NULL)
@@ -949,6 +960,19 @@ static void perf_event__stat_round_swap(union perf_event *event,
        event->stat_round.time = bswap_64(event->stat_round.time);
 }
 
+/*
+ * Byte-swap a time_conv event in place.  time_shift, time_mult and
+ * time_zero are always swapped; time_cycles and time_mask are swapped
+ * only when event_contains() reports that the event has time_cycles.
+ */
+static void perf_event__time_conv_swap(union perf_event *event,
+                                      bool sample_id_all __maybe_unused)
+{
+       event->time_conv.time_zero  = bswap_64(event->time_conv.time_zero);
+       event->time_conv.time_mult  = bswap_64(event->time_conv.time_mult);
+       event->time_conv.time_shift = bswap_64(event->time_conv.time_shift);
+
+       if (!event_contains(event->time_conv, time_cycles))
+               return;
+
+       event->time_conv.time_mask   = bswap_64(event->time_conv.time_mask);
+       event->time_conv.time_cycles = bswap_64(event->time_conv.time_cycles);
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
                                    bool sample_id_all);
 
@@ -985,7 +1009,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
        [PERF_RECORD_STAT]                = perf_event__stat_swap,
        [PERF_RECORD_STAT_ROUND]          = perf_event__stat_round_swap,
        [PERF_RECORD_EVENT_UPDATE]        = perf_event__event_update_swap,
-       [PERF_RECORD_TIME_CONV]           = perf_event__all64_swap,
+       [PERF_RECORD_TIME_CONV]           = perf_event__time_conv_swap,
        [PERF_RECORD_HEADER_MAX]          = NULL,
 };
 
@@ -1069,7 +1093,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
                 * in "to" register.
                 * For example, there is a call stack
                 * "A"->"B"->"C"->"D".
-                * The LBR registers will recorde like
+                * The LBR registers will be recorded like
                 * "C"->"D", "B"->"C", "A"->"B".
                 * So only the first "to" register and all "from"
                 * registers are needed to construct the whole stack.
@@ -1302,8 +1326,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 
        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
                printf("... weight: %" PRIu64 "", sample->weight);
-                       if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+                       if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
                                printf(",0x%"PRIx16"", sample->ins_lat);
+                               printf(",0x%"PRIx16"", sample->p_stage_cyc);
+                       }
                printf("\n");
        }
 
@@ -1584,7 +1610,7 @@ static s64 perf_session__process_user_event(struct perf_session *session,
                return tool->event_update(tool, event, &session->evlist);
        case PERF_RECORD_HEADER_EVENT_TYPE:
                /*
-                * Depreceated, but we need to handle it for sake
+                * Deprecated, but we need to handle it for sake
                 * of old data files create in pipe mode.
                 */
                return 0;
@@ -2350,7 +2376,8 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp
        return machines__fprintf_dsos_buildid(&session->machines, fp, skip, parm);
 }
 
-size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
+size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp,
+                                      bool skip_empty)
 {
        size_t ret;
        const char *msg = "";
@@ -2360,7 +2387,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 
        ret = fprintf(fp, "\nAggregated stats:%s\n", msg);
 
-       ret += events_stats__fprintf(&session->evlist->stats, fp);
+       ret += events_stats__fprintf(&session->evlist->stats, fp, skip_empty);
        return ret;
 }
 
index f764801..e31ba4c 100644 (file)
@@ -113,7 +113,8 @@ size_t perf_session__fprintf_dsos(struct perf_session *session, FILE *fp);
 size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp,
                                          bool (fn)(struct dso *dso, int parm), int parm);
 
-size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
+size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp,
+                                      bool skip_empty);
 
 struct evsel *perf_session__find_first_evtype(struct perf_session *session,
                                            unsigned int type);
index 552b590..88ce47f 100644 (file)
@@ -25,6 +25,7 @@
 #include <traceevent/event-parse.h>
 #include "mem-events.h"
 #include "annotate.h"
+#include "event.h"
 #include "time-utils.h"
 #include "cgroup.h"
 #include "machine.h"
@@ -36,7 +37,7 @@ const char    default_parent_pattern[] = "^sys_|^do_page_fault";
 const char     *parent_pattern = default_parent_pattern;
 const char     *default_sort_order = "comm,dso,symbol";
 const char     default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char     default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
+const char     default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc";
 const char     default_top_sort_order[] = "dso,symbol";
 const char     default_diff_sort_order[] = "dso,symbol";
 const char     default_tracepoint_sort_order[] = "trace";
@@ -45,6 +46,8 @@ const char    *field_order;
 regex_t                ignore_callees_regex;
 int            have_ignore_callees = 0;
 enum sort_mode sort__mode = SORT_MODE__NORMAL;
+const char     *dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
+const char     *arch_specific_sort_keys[] = {"p_stage_cyc"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1408,6 +1411,25 @@ struct sort_entry sort_global_ins_lat = {
        .se_width_idx   = HISTC_GLOBAL_INS_LAT,
 };
 
+/* Compare two hist entries by p_stage_cyc: returns left minus right. */
+static int64_t
+sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       u64 lhs = left->stat.p_stage_cyc;
+       u64 rhs = right->stat.p_stage_cyc;
+
+       return lhs - rhs;
+}
+
+/*
+ * Format @he's p_stage_cyc left-justified in a field of @width columns
+ * into @bf (at most @size bytes) and return repsep_snprintf()'s result.
+ *
+ * p_stage_cyc is a u64, so it is printed with %llu and an explicit cast;
+ * handing a 64-bit value to a plain %u conversion is a type mismatch.
+ */
+static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
+                                       size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*llu", width,
+                              (unsigned long long)he->stat.p_stage_cyc);
+}
+
+/* Sort entry registered under the "p_stage_cyc" sort key. */
+struct sort_entry sort_p_stage_cyc = {
+       .se_width_idx   = HISTC_P_STAGE_CYC,
+       .se_header      = "Pipeline Stage Cycle",
+       .se_snprintf    = hist_entry__p_stage_cyc_snprintf,
+       .se_cmp         = sort__global_p_stage_cyc_cmp,
+};
+
 struct sort_entry sort_mem_daddr_sym = {
        .se_header      = "Data Symbol",
        .se_cmp         = sort__daddr_cmp,
@@ -1816,6 +1838,21 @@ struct sort_dimension {
        int                     taken;
 };
 
+/*
+ * Weak default: reports no support (0) for any sort key.
+ * sort_dimension__add() skips an arch-specific key when this returns 0.
+ */
+int __weak arch_support_sort_key(const char *sort_key __maybe_unused)
+{
+       return 0;
+}
+
+/* Weak default: returns @se_header unchanged. */
+const char * __weak arch_perf_header_entry(const char *se_header)
+{
+       return se_header;
+}
+
+/* Replace the header of @sd's entry with what arch_perf_header_entry() returns for it. */
+static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
+{
+       struct sort_entry *se = sd->entry;
+
+       se->se_header = arch_perf_header_entry(se->se_header);
+}
+
 #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
 
 static struct sort_dimension common_sort_dimensions[] = {
@@ -1841,6 +1878,7 @@ static struct sort_dimension common_sort_dimensions[] = {
        DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
        DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
        DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
+       DIM(SORT_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
 };
 
 #undef DIM
@@ -2739,7 +2777,20 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
                        struct evlist *evlist,
                        int level)
 {
-       unsigned int i;
+       unsigned int i, j;
+
+       /*
+        * Check to see if there are any arch specific
+        * sort dimensions not applicable for the current
+        * architecture. If so, Skip that sort key since
+        * we don't want to display it in the output fields.
+        */
+       for (j = 0; j < ARRAY_SIZE(arch_specific_sort_keys); j++) {
+               if (!strcmp(arch_specific_sort_keys[j], tok) &&
+                               !arch_support_sort_key(tok)) {
+                       return 0;
+               }
+       }
 
        for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
                struct sort_dimension *sd = &common_sort_dimensions[i];
@@ -2747,6 +2798,11 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
                if (strncasecmp(tok, sd->name, strlen(tok)))
                        continue;
 
+               for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) {
+                       if (!strcmp(dynamic_headers[j], sd->name))
+                               sort_dimension_add_dynamic_header(sd);
+               }
+
                if (sd->entry == &sort_parent) {
                        int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
                        if (ret) {
index 63f67a3..87a0926 100644 (file)
@@ -51,6 +51,7 @@ struct he_stat {
        u64                     period_guest_us;
        u64                     weight;
        u64                     ins_lat;
+       u64                     p_stage_cyc;
        u32                     nr_events;
 };
 
@@ -234,6 +235,7 @@ enum sort_type {
        SORT_CODE_PAGE_SIZE,
        SORT_LOCAL_INS_LAT,
        SORT_GLOBAL_INS_LAT,
+       SORT_PIPELINE_STAGE_CYC,
 
        /* branch stack specific sort keys */
        __SORT_BRANCH_STACK,
index 7f09cda..a76fff5 100644 (file)
@@ -17,6 +17,8 @@
 #include "cgroup.h"
 #include <api/fs/fs.h>
 #include "util.h"
+#include "iostat.h"
+#include "pmu-hybrid.h"
 
 #define CNTR_NOT_SUPPORTED     "<not supported>"
 #define CNTR_NOT_COUNTED       "<not counted>"
@@ -310,6 +312,11 @@ static void print_metric_header(struct perf_stat_config *config,
        struct outstate *os = ctx;
        char tbuf[1024];
 
+       /* In case of iostat, print metric header for first root port only */
+       if (config->iostat_run &&
+           os->evsel->priv != os->evsel->evlist->selected->priv)
+               return;
+
        if (!valid_only_metric(unit))
                return;
        unit = fixunit(tbuf, os->evsel, unit);
@@ -439,6 +446,12 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
                if (counter->cgrp)
                        os.nfields++;
        }
+
+       if (!config->no_csv_summary && config->csv_output &&
+           config->summary && !config->interval) {
+               fprintf(config->output, "%16s%s", "summary", config->csv_sep);
+       }
+
        if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
                if (config->metric_only) {
                        pm(config, &os, NULL, "", "", 0);
@@ -526,6 +539,7 @@ static void uniquify_event_name(struct evsel *counter)
 {
        char *new_name;
        char *config;
+       int ret = 0;
 
        if (counter->uniquified_name ||
            !counter->pmu_name || !strncmp(counter->name, counter->pmu_name,
@@ -540,8 +554,17 @@ static void uniquify_event_name(struct evsel *counter)
                        counter->name = new_name;
                }
        } else {
-               if (asprintf(&new_name,
-                            "%s [%s]", counter->name, counter->pmu_name) > 0) {
+               if (perf_pmu__has_hybrid()) {
+                       if (!counter->use_config_name) {
+                               ret = asprintf(&new_name, "%s/%s/",
+                                              counter->pmu_name, counter->name);
+                       }
+               } else {
+                       ret = asprintf(&new_name, "%s [%s]",
+                                      counter->name, counter->pmu_name);
+               }
+
+               if (ret) {
                        free(counter->name);
                        counter->name = new_name;
                }
@@ -644,6 +667,9 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
        if (!collect_data(config, counter, aggr_cb, &ad))
                return;
 
+       if (perf_pmu__has_hybrid() && ad.ena == 0)
+               return;
+
        nr = ad.nr;
        ena = ad.ena;
        run = ad.run;
@@ -952,8 +978,11 @@ static void print_metric_headers(struct perf_stat_config *config,
        if (config->csv_output) {
                if (config->interval)
                        fputs("time,", config->output);
-               fputs(aggr_header_csv[config->aggr_mode], config->output);
+               if (!config->iostat_run)
+                       fputs(aggr_header_csv[config->aggr_mode], config->output);
        }
+       if (config->iostat_run)
+               iostat_print_header_prefix(config);
 
        /* Print metrics headers only */
        evlist__for_each_entry(evlist, counter) {
@@ -983,7 +1012,8 @@ static void print_interval(struct perf_stat_config *config,
        if (config->interval_clear)
                puts(CONSOLE_CLEAR);
 
-       sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep);
+       if (!config->iostat_run)
+               sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep);
 
        if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) {
                switch (config->aggr_mode) {
@@ -1019,9 +1049,11 @@ static void print_interval(struct perf_stat_config *config,
                        break;
                case AGGR_GLOBAL:
                default:
-                       fprintf(output, "#           time");
-                       if (!metric_only)
-                               fprintf(output, "             counts %*s events\n", unit_width, "unit");
+                       if (!config->iostat_run) {
+                               fprintf(output, "#           time");
+                               if (!metric_only)
+                                       fprintf(output, "             counts %*s events\n", unit_width, "unit");
+                       }
                case AGGR_UNSET:
                        break;
                }
@@ -1214,6 +1246,9 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
        struct evsel *counter;
        char buf[64], *prefix = NULL;
 
+       if (config->iostat_run)
+               evlist->selected = evlist__first(evlist);
+
        if (interval)
                print_interval(config, evlist, prefix = buf, ts);
        else
@@ -1226,7 +1261,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
                        print_metric_headers(config, evlist, prefix, false);
                if (num_print_iv++ == 25)
                        num_print_iv = 0;
-               if (config->aggr_mode == AGGR_GLOBAL && prefix)
+               if (config->aggr_mode == AGGR_GLOBAL && prefix && !config->iostat_run)
                        fprintf(config->output, "%s", prefix);
        }
 
@@ -1243,11 +1278,16 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
                }
                break;
        case AGGR_GLOBAL:
-               evlist__for_each_entry(evlist, counter) {
-                       print_counter_aggr(config, counter, prefix);
+               if (config->iostat_run)
+                       iostat_print_counters(evlist, config, ts, prefix = buf,
+                                             print_counter_aggr);
+               else {
+                       evlist__for_each_entry(evlist, counter) {
+                               print_counter_aggr(config, counter, prefix);
+                       }
+                       if (metric_only)
+                               fputc('\n', config->output);
                }
-               if (metric_only)
-                       fputc('\n', config->output);
                break;
        case AGGR_NONE:
                if (metric_only)
index 6ccf21a..39967a4 100644 (file)
@@ -9,7 +9,9 @@
 #include "expr.h"
 #include "metricgroup.h"
 #include "cgroup.h"
+#include "units.h"
 #include <linux/zalloc.h>
+#include "iostat.h"
 
 /*
  * AGGR_GLOBAL: Use CPU 0
@@ -961,7 +963,9 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
        struct metric_event *me;
        int num = 1;
 
-       if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+       if (config->iostat_run) {
+               iostat_print_metric(config, evsel, out);
+       } else if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
                total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd);
 
                if (total) {
@@ -1270,18 +1274,15 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
                generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL,
                                evsel->name, evsel->metric_name, NULL, 1, cpu, out, st);
        } else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) {
-               char unit = 'M';
-               char unit_buf[10];
+               char unit = ' ';
+               char unit_buf[10] = "/sec";
 
                total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd);
-
                if (total)
-                       ratio = 1000.0 * avg / total;
-               if (ratio < 0.001) {
-                       ratio *= 1000;
-                       unit = 'K';
-               }
-               snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
+                       ratio = convert_unit_double(1000000000.0 * avg / total, &unit);
+
+               if (unit != ' ')
+                       snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
                print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
        } else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
                print_smi_cost(config, cpu, out, st, &rsd);
index c400f8d..2db46b9 100644 (file)
@@ -76,8 +76,7 @@ double rel_stddev_stats(double stddev, double avg)
        return pct;
 }
 
-bool __perf_evsel_stat__is(struct evsel *evsel,
-                          enum perf_stat_evsel_id id)
+bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id)
 {
        struct perf_stat_evsel *ps = evsel->stats;
 
index d85c292..32c8527 100644 (file)
@@ -128,10 +128,12 @@ struct perf_stat_config {
        bool                     all_user;
        bool                     percore_show_thread;
        bool                     summary;
+       bool                     no_csv_summary;
        bool                     metric_no_group;
        bool                     metric_no_merge;
        bool                     stop_read_counter;
        bool                     quiet;
+       bool                     iostat_run;
        FILE                    *output;
        unsigned int             interval;
        unsigned int             timeout;
@@ -160,6 +162,7 @@ struct perf_stat_config {
 };
 
 void perf_stat__set_big_num(int set);
+void perf_stat__set_no_csv_summary(int set);
 
 void update_stats(struct stats *stats, u64 val);
 double avg_stats(struct stats *stats);
@@ -187,11 +190,10 @@ struct perf_aggr_thread_value {
        u64 ena;
 };
 
-bool __perf_evsel_stat__is(struct evsel *evsel,
-                          enum perf_stat_evsel_id id);
+bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id);
 
 #define perf_stat_evsel__is(evsel, id) \
-       __perf_evsel_stat__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
+       __perf_stat_evsel__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
 
 extern struct runtime_stat rt_stat;
 extern struct stats walltime_nsecs_stats;
index ea94d86..be94d70 100644 (file)
@@ -12,7 +12,7 @@
  *    build complex strings/buffers whose final size isn't easily known.
  *
  *    It is NOT legal to copy the ->buf pointer away.
- *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    `strbuf_detach' is the operation that detaches a buffer from its shell
  *    while keeping the shell valid wrt its invariants.
  *
  * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
index e0c25a4..c05aca9 100644 (file)
@@ -8,8 +8,8 @@
 
 /* A node of string filter */
 struct strfilter_node {
-       struct strfilter_node *l;       /* Tree left branche (for &,|) */
-       struct strfilter_node *r;       /* Tree right branche (for !,&,|) */
+       struct strfilter_node *l;       /* Tree left branch (for &,|) */
+       struct strfilter_node *r;       /* Tree right branch (for !,&,|) */
        const char *p;          /* Operator or rule */
 };
 
index 6dff843..4c56aa8 100644 (file)
@@ -1058,7 +1058,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
                curr_dso->symtab_type = dso->symtab_type;
                maps__insert(kmaps, curr_map);
                /*
-                * Add it before we drop the referece to curr_map, i.e. while
+                * Add it before we drop the reference to curr_map, i.e. while
                 * we still are sure to have a reference to this DSO via
                 * *curr_map->dso.
                 */
index 35c936c..2664fb6 100644 (file)
@@ -68,7 +68,7 @@ size_t dso__fprintf_symbols_by_name(struct dso *dso,
 
        for (nd = rb_first_cached(&dso->symbol_names); nd; nd = rb_next(nd)) {
                pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
-               fprintf(fp, "%s\n", pos->sym.name);
+               ret += fprintf(fp, "%s\n", pos->sym.name);
        }
 
        return ret;
index dff1781..35aa0c0 100644 (file)
@@ -1211,7 +1211,7 @@ static size_t mask_size(struct perf_cpu_map *map, int *max)
        *max = 0;
 
        for (i = 0; i < map->nr; i++) {
-               /* bit possition of the cpu is + 1 */
+               /* bit position of the cpu is + 1 */
                int bit = map->map[i] + 1;
 
                if (bit > *max)
@@ -1237,7 +1237,7 @@ void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int
         *   mask  = size of 'struct perf_record_record_cpu_map' +
         *           maximum cpu bit converted to size of longs
         *
-        * and finaly + the size of 'struct perf_record_cpu_map_data'.
+        * and finally + the size of 'struct perf_record_cpu_map_data'.
         */
        size_cpus = cpus_size(map);
        size_mask = mask_size(map, max);
index 03bd99d..a2e9068 100644 (file)
@@ -34,6 +34,10 @@ static const char **syscalltbl_native = syscalltbl_powerpc_32;
 #include <asm/syscalls.c>
 const int syscalltbl_native_max_id = SYSCALLTBL_ARM64_MAX_ID;
 static const char **syscalltbl_native = syscalltbl_arm64;
+#elif defined(__mips__)
+#include <asm/syscalls_n64.c>
+const int syscalltbl_native_max_id = SYSCALLTBL_MIPS_N64_MAX_ID;
+static const char **syscalltbl_native = syscalltbl_mips_n64;
 #endif
 
 struct syscall {
index f132c6c..4ff5621 100644 (file)
@@ -16,6 +16,8 @@ struct target {
        bool         uses_mmap;
        bool         default_per_cpu;
        bool         per_thread;
+       bool         use_bpf;
+       const char   *attr_map;
 };
 
 enum target_errno {
@@ -64,11 +66,6 @@ static inline bool target__has_cpu(struct target *target)
        return target->system_wide || target->cpu_list;
 }
 
-static inline bool target__has_bpf(struct target *target)
-{
-       return target->bpf_str;
-}
-
 static inline bool target__none(struct target *target)
 {
        return !target__has_task(target) && !target__has_cpu(target);
index 3bc47a4..b3cd09b 100644 (file)
@@ -16,7 +16,6 @@ struct comm;
 struct ip_callchain;
 struct symbol;
 struct dso;
-struct comm;
 struct perf_sample;
 struct addr_location;
 struct call_path;
index 62b4c75..f19791d 100644 (file)
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
+#include <inttypes.h>
+#include <string.h>
 
 #include <linux/compiler.h>
 #include <linux/perf_event.h>
@@ -110,3 +112,31 @@ u64 __weak rdtsc(void)
 {
        return 0;
 }
+
+/* Print a TIME_CONV event's fields to @fp; returns the summed fprintf() results. */
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp)
+{
+       struct perf_record_time_conv *tc = (struct perf_record_time_conv *)event;
+       size_t ret;
+
+       ret  = fprintf(fp, "\n... Time Shift      %" PRI_lu64 "\n", tc->time_shift);
+       ret += fprintf(fp, "... Time Multiplier %" PRI_lu64 "\n", tc->time_mult);
+       ret += fprintf(fp, "... Time Zero       %" PRI_lu64 "\n", tc->time_zero);
+
+       /*
+        * The fields from "time_cycles" on were added to TIME_CONV later (with
+        * cap_user_time_short); print them only if this event contains them.
+        */
+       if (event_contains(*tc, time_cycles)) {
+               ret += fprintf(fp, "... Time Cycles     %" PRI_lu64 "\n",
+                              tc->time_cycles);
+               ret += fprintf(fp, "... Time Mask       %#" PRI_lx64 "\n",
+                              tc->time_mask);
+               ret += fprintf(fp, "... Cap Time Zero   %" PRId32 "\n",
+                              tc->cap_user_time_zero);
+               ret += fprintf(fp, "... Cap Time Short  %" PRId32 "\n",
+                              tc->cap_user_time_short);
+       }
+
+       return ret;
+}
index 72a1541..7d83a31 100644 (file)
@@ -4,6 +4,8 @@
 
 #include <linux/types.h>
 
+#include "event.h"
+
 struct perf_tsc_conversion {
        u16 time_shift;
        u32 time_mult;
@@ -24,4 +26,6 @@ u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc);
 u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
 u64 rdtsc(void);
 
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp);
+
 #endif // __PERF_TSC_H
index a46762a..32c39cf 100644 (file)
@@ -33,28 +33,35 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
        return (unsigned long) -1;
 }
 
-unsigned long convert_unit(unsigned long value, char *unit)
+double convert_unit_double(double value, char *unit)   /* returns value divided by 1000 up to three times; *unit is ' ', 'K', 'M' or 'G' */
 {
        *unit = ' ';
 
-       if (value > 1000) {
-               value /= 1000;
+       if (value > 1000.0) {   /* strict '>': exactly 1000.0 is left unscaled */
+               value /= 1000.0;
                *unit = 'K';
        }
 
-       if (value > 1000) {
-               value /= 1000;
+       if (value > 1000.0) {   /* second step: thousands -> millions */
+               value /= 1000.0;
                *unit = 'M';
        }
 
-       if (value > 1000) {
-               value /= 1000;
+       if (value > 1000.0) {   /* last step; anything larger stays expressed in 'G' */
+               value /= 1000.0;
                *unit = 'G';
        }
 
        return value;
 }
 
+/* Integer front end: scale through convert_unit_double(), which also sets *unit. */
+unsigned long convert_unit(unsigned long value, char *unit)
+{
+       /* the double result is converted back by a plain cast (fraction dropped) */
+       return (unsigned long)convert_unit_double((double)value, unit);
+}
+
 int unit_number__scnprintf(char *buf, size_t size, u64 n)
 {
        char unit[4] = "BKMG";
index 99263b6..ea43e74 100644 (file)
@@ -12,6 +12,7 @@ struct parse_tag {
 
 unsigned long parse_tag_value(const char *str, struct parse_tag *tags);
 
+double convert_unit_double(double value, char *unit);
 unsigned long convert_unit(unsigned long value, char *unit);
 int unit_number__scnprintf(char *buf, size_t size, u64 n);
 
index 9aededc..71a3533 100644 (file)
@@ -82,7 +82,7 @@ UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
 #define DW_EH_PE_funcrel       0x40    /* start-of-procedure-relative */
 #define DW_EH_PE_aligned       0x50    /* aligned pointer */
 
-/* Flags intentionaly not handled, since they're not needed:
+/* Flags intentionally not handled, since they're not needed:
  * #define DW_EH_PE_indirect      0x80
  * #define DW_EH_PE_uleb128       0x01
  * #define DW_EH_PE_udata2        0x02
index f6b7e85..9b17097 100644 (file)
@@ -54,12 +54,14 @@ name as necessary to disambiguate it from others is necessary.  Note that option
 .PP
 \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set.  If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed.  Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed.  Otherwise, the system summary plus the specified set of CPUs are printed.  The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44
 .PP
-\fB--hide column\fP do not show the specified built-in columns.  May be invoked multiple times, or with a comma-separated list of column names.  Use "--hide sysfs" to hide the sysfs statistics columns as a group.
+\fB--hide column\fP do not show the specified built-in columns.  May be invoked multiple times, or with a comma-separated list of column names.
 .PP
 \fB--enable column\fP show the specified built-in columns, which are otherwise disabled, by default.  Currently the only built-in counters disabled by default are "usec", "Time_Of_Day_Seconds", "APIC" and "X2APIC".
 The column name "all" can be used to enable all disabled-by-default built-in counters.
 .PP
-\fB--show column\fP show only the specified built-in columns.  May be invoked multiple times, or with a comma-separated list of column names.  Use "--show sysfs" to show the sysfs statistics columns as a group.
+\fB--show column\fP show only the specified built-in columns.  May be invoked multiple times, or with a comma-separated list of column names.
+.PP
+\fB--show CATEGORY --hide CATEGORY\fP  Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "sysfs", "other".
 .PP
 \fB--Dump\fP displays the raw counter values.
 .PP
index 5939615..47d3ba8 100644 (file)
@@ -3,7 +3,7 @@
  * turbostat -- show CPU frequency and C-state residency
  * on modern Intel and AMD processors.
  *
- * Copyright (c) 2013 Intel Corporation.
+ * Copyright (c) 2021 Intel Corporation.
  * Len Brown <len.brown@intel.com>
  */
 
 #include <sys/capability.h>
 #include <errno.h>
 #include <math.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+#include <stdbool.h>
 
 char *proc_stat = "/proc/stat";
 FILE *outf;
 int *fd_percpu;
-struct timeval interval_tv = {5, 0};
-struct timespec interval_ts = {5, 0};
+int *fd_instr_count_percpu;
+struct timeval interval_tv = { 5, 0 };
+struct timespec interval_ts = { 5, 0 };
+
+/* Save original CPU model */
+unsigned int model_orig;
+
 unsigned int num_iterations;
 unsigned int debug;
 unsigned int quiet;
@@ -75,30 +83,33 @@ char *output_buffer, *outp;
 unsigned int do_rapl;
 unsigned int do_dts;
 unsigned int do_ptm;
-unsigned long long  gfx_cur_rc6_ms;
+unsigned int do_ipc;
+unsigned long long gfx_cur_rc6_ms;
 unsigned long long cpuidle_cur_cpu_lpi_us;
 unsigned long long cpuidle_cur_sys_lpi_us;
 unsigned int gfx_cur_mhz;
 unsigned int gfx_act_mhz;
-unsigned int tcc_activation_temp;
-unsigned int tcc_activation_temp_override;
+unsigned int tj_max;
+unsigned int tj_max_override;
+int tcc_offset_bits;
 double rapl_power_units, rapl_time_units;
 double rapl_dram_energy_units, rapl_energy_units;
 double rapl_joule_counter_range;
 unsigned int do_core_perf_limit_reasons;
 unsigned int has_automatic_cstate_conversion;
+unsigned int dis_cstate_prewake;
 unsigned int do_gfx_perf_limit_reasons;
 unsigned int do_ring_perf_limit_reasons;
 unsigned int crystal_hz;
 unsigned long long tsc_hz;
 int base_cpu;
 double discover_bclk(unsigned int family, unsigned int model);
-unsigned int has_hwp;  /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
+unsigned int has_hwp;          /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
                        /* IA32_HWP_REQUEST, IA32_HWP_STATUS */
-unsigned int has_hwp_notify;           /* IA32_HWP_INTERRUPT */
+unsigned int has_hwp_notify;   /* IA32_HWP_INTERRUPT */
 unsigned int has_hwp_activity_window;  /* IA32_HWP_REQUEST[bits 41:32] */
-unsigned int has_hwp_epp;              /* IA32_HWP_REQUEST[bits 31:24] */
-unsigned int has_hwp_pkg;              /* IA32_HWP_REQUEST_PKG */
+unsigned int has_hwp_epp;      /* IA32_HWP_REQUEST[bits 31:24] */
+unsigned int has_hwp_pkg;      /* IA32_HWP_REQUEST_PKG */
 unsigned int has_misc_feature_control;
 unsigned int first_counter_read = 1;
 int ignore_stdin;
@@ -173,12 +184,14 @@ struct thread_data {
        unsigned long long aperf;
        unsigned long long mperf;
        unsigned long long c1;
-       unsigned long long  irq_count;
+       unsigned long long instr_count;
+       unsigned long long irq_count;
        unsigned int smi_count;
        unsigned int cpu_id;
        unsigned int apic_id;
        unsigned int x2apic_id;
        unsigned int flags;
+       bool is_atom;
 #define CPU_IS_FIRST_THREAD_IN_CORE    0x2
 #define CPU_IS_FIRST_CORE_IN_PACKAGE   0x4
        unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
@@ -240,12 +253,11 @@ struct pkg_data {
         ((node_no) * topo.cores_per_node) +                            \
         (core_no))
 
-
 #define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no)
 
-enum counter_scope {SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE};
-enum counter_type {COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC};
-enum counter_format {FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT};
+enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
+enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC };
+enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT };
 
 struct msr_counter {
        unsigned int msr_num;
@@ -281,9 +293,9 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
 struct msr_sum_array {
        /* get_msr_sum() = sum + (get_msr() - last) */
        struct {
-               /*The accumulated MSR value is updated by the timer*/
+               /*The accumulated MSR value is updated by the timer */
                unsigned long long sum;
-               /*The MSR footprint recorded in last timer*/
+               /*The MSR footprint recorded in last timer */
                unsigned long long last;
        } entries[IDX_COUNT];
 };
@@ -291,13 +303,16 @@ struct msr_sum_array {
 /* The percpu MSR sum array.*/
 struct msr_sum_array *per_cpu_msr_sum;
 
-int idx_to_offset(int idx)
+off_t idx_to_offset(int idx)
 {
-       int offset;
+       off_t offset;
 
        switch (idx) {
        case IDX_PKG_ENERGY:
-               offset = MSR_PKG_ENERGY_STATUS;
+               if (do_rapl & RAPL_AMD_F17H)
+                       offset = MSR_PKG_ENERGY_STAT;
+               else
+                       offset = MSR_PKG_ENERGY_STATUS;
                break;
        case IDX_DRAM_ENERGY:
                offset = MSR_DRAM_ENERGY_STATUS;
@@ -320,12 +335,13 @@ int idx_to_offset(int idx)
        return offset;
 }
 
-int offset_to_idx(int offset)
+int offset_to_idx(off_t offset)
 {
        int idx;
 
        switch (offset) {
        case MSR_PKG_ENERGY_STATUS:
+       case MSR_PKG_ENERGY_STAT:
                idx = IDX_PKG_ENERGY;
                break;
        case MSR_DRAM_ENERGY_STATUS:
@@ -353,7 +369,7 @@ int idx_valid(int idx)
 {
        switch (idx) {
        case IDX_PKG_ENERGY:
-               return do_rapl & RAPL_PKG;
+               return do_rapl & (RAPL_PKG | RAPL_AMD_F17H);
        case IDX_DRAM_ENERGY:
                return do_rapl & RAPL_DRAM;
        case IDX_PP0_ENERGY:
@@ -368,6 +384,7 @@ int idx_valid(int idx)
                return 0;
        }
 }
+
 struct sys_counters {
        unsigned int added_thread_counters;
        unsigned int added_core_counters;
@@ -391,7 +408,7 @@ struct cpu_topology {
        int logical_node_id;    /* 0-based count within the package */
        int physical_core_id;
        int thread_id;
-       cpu_set_t *put_ids; /* Processing Unit/Thread IDs */
+       cpu_set_t *put_ids;     /* Processing Unit/Thread IDs */
 } *cpus;
 
 struct topo_params {
@@ -408,7 +425,7 @@ struct topo_params {
 
 struct timeval tv_even, tv_odd, tv_delta;
 
-int *irq_column_2_cpu; /* /proc/interrupts column numbers */
+int *irq_column_2_cpu;         /* /proc/interrupts column numbers */
 int *irqs_per_cpu;             /* indexed by cpu_num */
 
 void setup_all_buffers(void);
@@ -421,34 +438,31 @@ int cpu_is_not_present(int cpu)
 {
        return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
 }
+
 /*
  * run func(thread, core, package) in topology order
  * skip non-present cpus
  */
 
-int for_all_cpus(int (func)(struct thread_data *, struct core_data *, struct pkg_data *),
-       struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
+int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
+                struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
 {
        int retval, pkg_no, core_no, thread_no, node_no;
 
        for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
                for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
                        for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
-                               for (thread_no = 0; thread_no <
-                                       topo.threads_per_core; ++thread_no) {
+                               for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
                                        struct thread_data *t;
                                        struct core_data *c;
                                        struct pkg_data *p;
 
-                                       t = GET_THREAD(thread_base, thread_no,
-                                                      core_no, node_no,
-                                                      pkg_no);
+                                       t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
 
                                        if (cpu_is_not_present(t->cpu_id))
                                                continue;
 
-                                       c = GET_CORE(core_base, core_no,
-                                                    node_no, pkg_no);
+                                       c = GET_CORE(core_base, core_no, node_no, pkg_no);
                                        p = GET_PKG(pkg_base, pkg_no);
 
                                        retval = func(t, c, p);
@@ -470,6 +484,7 @@ int cpu_migrate(int cpu)
        else
                return 0;
 }
+
 int get_msr_fd(int cpu)
 {
        char pathname[32];
@@ -490,6 +505,39 @@ int get_msr_fd(int cpu)
        return fd;
 }
 
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
+{      /* thin wrapper: forwards all five arguments unchanged to syscall(__NR_perf_event_open) */
+       return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+/* Open a hardware instruction-count perf fd bound to cpu_num; err() exits on failure. */
+static int perf_instr_count_open(int cpu_num)
+{
+       struct perf_event_attr pea;
+       int fd;
+
+       memset(&pea, 0, sizeof(pea));
+       pea.type = PERF_TYPE_HARDWARE;
+       pea.size = sizeof(pea);
+       pea.config = PERF_COUNT_HW_INSTRUCTIONS;
+
+       /* counter for cpu_num, including user + kernel and all processes (pid == -1) */
+       fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
+       if (fd == -1)   /* no "\n" in the format: err() appends ": <strerror>" and the newline */
+               err(-1, "cpu%d: perf instruction counter", cpu_num);
+       return fd;
+}
+
+/* Per-cpu instruction-counter fd, opened on first use; a zero slot means "not opened yet". */
+int get_instr_count_fd(int cpu)
+{
+       int *slot = &fd_instr_count_percpu[cpu];
+
+       if (!*slot)
+               *slot = perf_instr_count_open(cpu);
+       return *slot;
+}
+
 int get_msr(int cpu, off_t offset, unsigned long long *msr)
 {
        ssize_t retval;
@@ -518,7 +566,7 @@ struct msr_counter bic[] = {
        { 0x0, "Bzy_MHz" },
        { 0x0, "TSC_MHz" },
        { 0x0, "IRQ" },
-       { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL},
+       { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL },
        { 0x0, "sysfs" },
        { 0x0, "CPU%c1" },
        { 0x0, "CPU%c3" },
@@ -561,6 +609,7 @@ struct msr_counter bic[] = {
        { 0x0, "X2APIC" },
        { 0x0, "Die" },
        { 0x0, "GFXAMHz" },
+       { 0x0, "IPC" },
 };
 
 #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
@@ -616,6 +665,13 @@ struct msr_counter bic[] = {
 #define        BIC_X2APIC      (1ULL << 49)
 #define        BIC_Die         (1ULL << 50)
 #define        BIC_GFXACTMHz   (1ULL << 51)
+#define        BIC_IPC         (1ULL << 52)
+
+#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )       /* group mask: topology columns */
+#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)    /* group mask: temperature and power columns */
+#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz )      /* group mask: frequency columns */
+#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX)       /* group mask: idle-state columns */
+#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)   /* NOTE(review): BIC_CoreTmp is also in BIC_THERMAL_PWR - confirm the overlap is intended */
 
 #define BIC_DISABLED_BY_DEFAULT        (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)
 
@@ -627,7 +683,7 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC
 #define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME)
 #define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT)
 #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT)
-
+#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & (COUNTER_BIT))      /* non-zero if any given bit is enabled; argument parenthesised so "A | B" is masked as a whole */
 
 #define MAX_DEFERRED 16
 char *deferred_skip_names[MAX_DEFERRED];
@@ -642,42 +698,40 @@ enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST;
 void help(void)
 {
        fprintf(outf,
-       "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
-       "\n"
-       "Turbostat forks the specified COMMAND and prints statistics\n"
-       "when COMMAND completes.\n"
-       "If no COMMAND is specified, turbostat wakes every 5-seconds\n"
-       "to print statistics, until interrupted.\n"
-       "  -a, --add    add a counter\n"
-       "                 eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
-       "  -c, --cpu    cpu-set limit output to summary plus cpu-set:\n"
-       "                 {core | package | j,k,l..m,n-p }\n"
-       "  -d, --debug  displays usec, Time_Of_Day_Seconds and more debugging\n"
-       "  -D, --Dump   displays the raw counter values\n"
-       "  -e, --enable [all | column]\n"
-       "               shows all or the specified disabled column\n"
-       "  -H, --hide [column|column,column,...]\n"
-       "               hide the specified column(s)\n"
-       "  -i, --interval sec.subsec\n"
-       "               Override default 5-second measurement interval\n"
-       "  -J, --Joules displays energy in Joules instead of Watts\n"
-       "  -l, --list   list column headers only\n"
-       "  -n, --num_iterations num\n"
-       "               number of the measurement iterations\n"
-       "  -o, --out file\n"
-       "               create or truncate \"file\" for all output\n"
-       "  -q, --quiet  skip decoding system configuration header\n"
-       "  -s, --show [column|column,column,...]\n"
-       "               show only the specified column(s)\n"
-       "  -S, --Summary\n"
-       "               limits output to 1-line system summary per interval\n"
-       "  -T, --TCC temperature\n"
-       "               sets the Thermal Control Circuit temperature in\n"
-       "                 degrees Celsius\n"
-       "  -h, --help   print this help message\n"
-       "  -v, --version        print version information\n"
-       "\n"
-       "For more help, run \"man turbostat\"\n");
+               "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
+               "\n"
+               "Turbostat forks the specified COMMAND and prints statistics\n"
+               "when COMMAND completes.\n"
+               "If no COMMAND is specified, turbostat wakes every 5-seconds\n"
+               "to print statistics, until interrupted.\n"
+               "  -a, --add    add a counter\n"
+               "                 eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
+               "  -c, --cpu    cpu-set limit output to summary plus cpu-set:\n"
+               "                 {core | package | j,k,l..m,n-p }\n"
+               "  -d, --debug  displays usec, Time_Of_Day_Seconds and more debugging\n"
+               "  -D, --Dump   displays the raw counter values\n"
+               "  -e, --enable [all | column]\n"
+               "               shows all or the specified disabled column\n"
+               "  -H, --hide [column|column,column,...]\n"
+               "               hide the specified column(s)\n"
+               "  -i, --interval sec.subsec\n"
+               "               Override default 5-second measurement interval\n"
+               "  -J, --Joules displays energy in Joules instead of Watts\n"
+               "  -l, --list   list column headers only\n"
+               "  -n, --num_iterations num\n"
+               "               number of the measurement iterations\n"
+               "  -o, --out file\n"
+               "               create or truncate \"file\" for all output\n"
+               "  -q, --quiet  skip decoding system configuration header\n"
+               "  -s, --show [column|column,column,...]\n"
+               "               show only the specified column(s)\n"
+               "  -S, --Summary\n"
+               "               limits output to 1-line system summary per interval\n"
+               "  -T, --TCC temperature\n"
+               "               sets the Thermal Control Circuit temperature in\n"
+               "                 degrees Celsius\n"
+               "  -h, --help   print this help message\n"
+               "  -v, --version        print version information\n" "\n" "For more help, run \"man turbostat\"\n");
 }
 
 /*
@@ -700,6 +754,18 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
 
                if (!strcmp(name_list, "all"))
                        return ~0;
+               if (!strcmp(name_list, "topology"))
+                       return BIC_TOPOLOGY;
+               if (!strcmp(name_list, "power"))
+                       return BIC_THERMAL_PWR;
+               if (!strcmp(name_list, "idle"))
+                       return BIC_IDLE;
+               if (!strcmp(name_list, "frequency"))
+                       return BIC_FREQUENCY;
+               if (!strcmp(name_list, "other"))
+                       return BIC_OTHER;
+               if (!strcmp(name_list, "all"))
+                       return 0;
 
                for (i = 0; i < MAX_BIC; ++i) {
                        if (!strcmp(name_list, bic[i].name)) {
@@ -731,7 +797,6 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
        return retval;
 }
 
-
 void print_header(char *delim)
 {
        struct msr_counter *mp;
@@ -764,6 +829,9 @@ void print_header(char *delim)
        if (DO_BIC(BIC_TSC_MHz))
                outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : ""));
 
+       if (DO_BIC(BIC_IPC))
+               outp += sprintf(outp, "%sIPC", (printed++ ? delim : ""));
+
        if (DO_BIC(BIC_IRQ)) {
                if (sums_need_wide_columns)
                        outp += sprintf(outp, "%s     IRQ", (printed++ ? delim : ""));
@@ -910,8 +978,7 @@ void print_header(char *delim)
        outp += sprintf(outp, "\n");
 }
 
-int dump_counters(struct thread_data *t, struct core_data *c,
-       struct pkg_data *p)
+int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        int i;
        struct msr_counter *mp;
@@ -919,21 +986,22 @@ int dump_counters(struct thread_data *t, struct core_data *c,
        outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);
 
        if (t) {
-               outp += sprintf(outp, "CPU: %d flags 0x%x\n",
-                       t->cpu_id, t->flags);
+               outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
                outp += sprintf(outp, "TSC: %016llX\n", t->tsc);
                outp += sprintf(outp, "aperf: %016llX\n", t->aperf);
                outp += sprintf(outp, "mperf: %016llX\n", t->mperf);
                outp += sprintf(outp, "c1: %016llX\n", t->c1);
 
+               if (DO_BIC(BIC_IPC))
+                       outp += sprintf(outp, "IPC: %lld\n", t->instr_count);
+
                if (DO_BIC(BIC_IRQ))
                        outp += sprintf(outp, "IRQ: %lld\n", t->irq_count);
                if (DO_BIC(BIC_SMI))
                        outp += sprintf(outp, "SMI: %d\n", t->smi_count);
 
                for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
-                       outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n",
-                               i, mp->msr_num, t->counter[i]);
+                       outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]);
                }
        }
 
@@ -946,8 +1014,7 @@ int dump_counters(struct thread_data *t, struct core_data *c,
                outp += sprintf(outp, "Joules: %0X\n", c->core_energy);
 
                for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
-                       outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n",
-                               i, mp->msr_num, c->counter[i]);
+                       outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]);
                }
                outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
        }
@@ -976,15 +1043,12 @@ int dump_counters(struct thread_data *t, struct core_data *c,
                outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores);
                outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx);
                outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram);
-               outp += sprintf(outp, "Throttle PKG: %0llX\n",
-                       p->rapl_pkg_perf_status);
-               outp += sprintf(outp, "Throttle RAM: %0llX\n",
-                       p->rapl_dram_perf_status);
+               outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status);
+               outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status);
                outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);
 
                for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
-                       outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n",
-                               i, mp->msr_num, p->counter[i]);
+                       outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]);
                }
        }
 
@@ -996,8 +1060,7 @@ int dump_counters(struct thread_data *t, struct core_data *c,
 /*
  * column formatting convention & formats
  */
-int format_counters(struct thread_data *t, struct core_data *c,
-       struct pkg_data *p)
+int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        double interval_float, tsc;
        char *fmt8;
@@ -1006,17 +1069,16 @@ int format_counters(struct thread_data *t, struct core_data *c,
        char *delim = "\t";
        int printed = 0;
 
-        /* if showing only 1st thread in core and this isn't one, bail out */
+       /* if showing only 1st thread in core and this isn't one, bail out */
        if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
                return 0;
 
-        /* if showing only 1st thread in pkg and this isn't one, bail out */
+       /* if showing only 1st thread in pkg and this isn't one, bail out */
        if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
                return 0;
 
        /*if not summary line and --cpu is used */
-       if ((t != &average.threads) &&
-               (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
+       if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
                return 0;
 
        if (DO_BIC(BIC_USEC)) {
@@ -1031,7 +1093,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
        if (DO_BIC(BIC_TOD))
                outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec);
 
-       interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec/1000000.0;
+       interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0;
 
        tsc = t->tsc * tsc_tweak;
 
@@ -1067,11 +1129,9 @@ int format_counters(struct thread_data *t, struct core_data *c,
                if (DO_BIC(BIC_Node)) {
                        if (t)
                                outp += sprintf(outp, "%s%d",
-                                               (printed++ ? delim : ""),
-                                             cpus[t->cpu_id].physical_node_id);
+                                               (printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id);
                        else
-                               outp += sprintf(outp, "%s-",
-                                               (printed++ ? delim : ""));
+                               outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
                }
                if (DO_BIC(BIC_Core)) {
                        if (c)
@@ -1088,22 +1148,25 @@ int format_counters(struct thread_data *t, struct core_data *c,
        }
 
        if (DO_BIC(BIC_Avg_MHz))
-               outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
-                       1.0 / units * t->aperf / interval_float);
+               outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);
 
        if (DO_BIC(BIC_Busy))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc);
 
        if (DO_BIC(BIC_Bzy_MHz)) {
                if (has_base_hz)
-                       outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
+                       outp +=
+                           sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
                else
                        outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
-                               tsc / units * t->aperf / t->mperf / interval_float);
+                                       tsc / units * t->aperf / t->mperf / interval_float);
        }
 
        if (DO_BIC(BIC_TSC_MHz))
-               outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc/units/interval_float);
+               outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float);
+
+       if (DO_BIC(BIC_IPC))
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf);
 
        /* IRQ */
        if (DO_BIC(BIC_IRQ)) {
@@ -1121,7 +1184,8 @@ int format_counters(struct thread_data *t, struct core_data *c,
        for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
                if (mp->format == FORMAT_RAW) {
                        if (mp->width == 32)
-                               outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) t->counter[i]);
+                               outp +=
+                                   sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
                        else
                                outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]);
                } else if (mp->format == FORMAT_DELTA) {
@@ -1131,27 +1195,28 @@ int format_counters(struct thread_data *t, struct core_data *c,
                                outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]);
                } else if (mp->format == FORMAT_PERCENT) {
                        if (mp->type == COUNTER_USEC)
-                               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), t->counter[i]/interval_float/10000);
+                               outp +=
+                                   sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
+                                           t->counter[i] / interval_float / 10000);
                        else
-                               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i]/tsc);
+                               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc);
                }
        }
 
        /* C1 */
        if (DO_BIC(BIC_CPU_c1))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1/tsc);
-
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
 
        /* print per-core data only for 1st thread in core */
        if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
                goto done;
 
        if (DO_BIC(BIC_CPU_c3))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc);
        if (DO_BIC(BIC_CPU_c6))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc);
        if (DO_BIC(BIC_CPU_c7))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc);
 
        /* Mod%c6 */
        if (DO_BIC(BIC_Mod_c6))
@@ -1163,7 +1228,8 @@ int format_counters(struct thread_data *t, struct core_data *c,
        for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
                if (mp->format == FORMAT_RAW) {
                        if (mp->width == 32)
-                               outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) c->counter[i]);
+                               outp +=
+                                   sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
                        else
                                outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]);
                } else if (mp->format == FORMAT_DELTA) {
@@ -1172,14 +1238,15 @@ int format_counters(struct thread_data *t, struct core_data *c,
                        else
                                outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]);
                } else if (mp->format == FORMAT_PERCENT) {
-                       outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i]/tsc);
+                       outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc);
                }
        }
 
        fmt8 = "%s%.2f";
 
        if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float);
        if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY))
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units);
 
@@ -1197,7 +1264,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
                        outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
                } else {
                        outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
-                               p->gfx_rc6_ms / 10.0 / interval_float);
+                                       p->gfx_rc6_ms / 10.0 / interval_float);
                }
        }
 
@@ -1211,42 +1278,49 @@ int format_counters(struct thread_data *t, struct core_data *c,
 
        /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
        if (DO_BIC(BIC_Totl_c0))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
        if (DO_BIC(BIC_Any_c0))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc);
        if (DO_BIC(BIC_GFX_c0))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc);
        if (DO_BIC(BIC_CPUGFX))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc);
 
        if (DO_BIC(BIC_Pkgpc2))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc);
        if (DO_BIC(BIC_Pkgpc3))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc);
        if (DO_BIC(BIC_Pkgpc6))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc);
        if (DO_BIC(BIC_Pkgpc7))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc);
        if (DO_BIC(BIC_Pkgpc8))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc);
        if (DO_BIC(BIC_Pkgpc9))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc);
        if (DO_BIC(BIC_Pkgpc10))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10/tsc);
+               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
 
        if (DO_BIC(BIC_CPU_LPI))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float);
+               outp +=
+                   sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float);
        if (DO_BIC(BIC_SYS_LPI))
-               outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float);
+               outp +=
+                   sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float);
 
        if (DO_BIC(BIC_PkgWatt))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float);
        if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float);
        if (DO_BIC(BIC_GFXWatt))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float);
        if (DO_BIC(BIC_RAMWatt))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""),
+                           p->energy_dram * rapl_dram_energy_units / interval_float);
        if (DO_BIC(BIC_Pkg_J))
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units);
        if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY))
@@ -1256,14 +1330,19 @@ int format_counters(struct thread_data *t, struct core_data *c,
        if (DO_BIC(BIC_RAM_J))
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units);
        if (DO_BIC(BIC_PKG__))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""),
+                           100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float);
        if (DO_BIC(BIC_RAM__))
-               outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float);
+               outp +=
+                   sprintf(outp, fmt8, (printed++ ? delim : ""),
+                           100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float);
 
        for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
                if (mp->format == FORMAT_RAW) {
                        if (mp->width == 32)
-                               outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) p->counter[i]);
+                               outp +=
+                                   sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
                        else
                                outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]);
                } else if (mp->format == FORMAT_DELTA) {
@@ -1272,7 +1351,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
                        else
                                outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]);
                } else if (mp->format == FORMAT_PERCENT) {
-                       outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i]/tsc);
+                       outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc);
                }
        }
 
@@ -1297,12 +1376,14 @@ void flush_output_stdout(void)
 
        outp = output_buffer;
 }
+
 void flush_output_stderr(void)
 {
        fputs(output_buffer, outf);
        fflush(outf);
        outp = output_buffer;
 }
+
 void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        static int printed;
@@ -1323,13 +1404,11 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_
 #define DELTA_WRAP32(new, old)                 \
        old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32);
 
-int
-delta_package(struct pkg_data *new, struct pkg_data *old)
+int delta_package(struct pkg_data *new, struct pkg_data *old)
 {
        int i;
        struct msr_counter *mp;
 
-
        if (DO_BIC(BIC_Totl_c0))
                old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
        if (DO_BIC(BIC_Any_c0))
@@ -1354,7 +1433,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old)
        old->pkg_temp_c = new->pkg_temp_c;
 
        /* flag an error when rc6 counter resets/wraps */
-       if (old->gfx_rc6_ms >  new->gfx_rc6_ms)
+       if (old->gfx_rc6_ms > new->gfx_rc6_ms)
                old->gfx_rc6_ms = -1;
        else
                old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;
@@ -1379,8 +1458,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old)
        return 0;
 }
 
-void
-delta_core(struct core_data *new, struct core_data *old)
+void delta_core(struct core_data *new, struct core_data *old)
 {
        int i;
        struct msr_counter *mp;
@@ -1412,9 +1490,7 @@ int soft_c1_residency_display(int bic)
 /*
  * old = new - old
  */
-int
-delta_thread(struct thread_data *new, struct thread_data *old,
-       struct core_data *core_delta)
+int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta)
 {
        int i;
        struct msr_counter *mp;
@@ -1445,8 +1521,7 @@ delta_thread(struct thread_data *new, struct thread_data *old,
 
        old->c1 = new->c1 - old->c1;
 
-       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) ||
-           soft_c1_residency_display(BIC_Avg_MHz)) {
+       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) {
                if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
                        old->aperf = new->aperf - old->aperf;
                        old->mperf = new->mperf - old->mperf;
@@ -1455,7 +1530,6 @@ delta_thread(struct thread_data *new, struct thread_data *old,
                }
        }
 
-
        if (use_c1_residency_msr) {
                /*
                 * Some models have a dedicated C1 residency MSR,
@@ -1472,7 +1546,7 @@ delta_thread(struct thread_data *new, struct thread_data *old,
                else {
                        /* normal case, derive c1 */
                        old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3
-                               - core_delta->c6 - core_delta->c7;
+                           - core_delta->c6 - core_delta->c7;
                }
        }
 
@@ -1482,6 +1556,9 @@ delta_thread(struct thread_data *new, struct thread_data *old,
                old->mperf = 1; /* divide by 0 protection */
        }
 
+       if (DO_BIC(BIC_IPC))
+               old->instr_count = new->instr_count - old->instr_count;
+
        if (DO_BIC(BIC_IRQ))
                old->irq_count = new->irq_count - old->irq_count;
 
@@ -1498,8 +1575,7 @@ delta_thread(struct thread_data *new, struct thread_data *old,
 }
 
 int delta_cpu(struct thread_data *t, struct core_data *c,
-       struct pkg_data *p, struct thread_data *t2,
-       struct core_data *c2, struct pkg_data *p2)
+             struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2)
 {
        int retval = 0;
 
@@ -1522,7 +1598,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c,
 void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        int i;
-       struct msr_counter  *mp;
+       struct msr_counter *mp;
 
        t->tv_begin.tv_sec = 0;
        t->tv_begin.tv_usec = 0;
@@ -1536,6 +1612,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
        t->mperf = 0;
        t->c1 = 0;
 
+       t->instr_count = 0;
+
        t->irq_count = 0;
        t->smi_count = 0;
 
@@ -1587,8 +1665,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
        for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
                p->counter[i] = 0;
 }
-int sum_counters(struct thread_data *t, struct core_data *c,
-       struct pkg_data *p)
+
+int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        int i;
        struct msr_counter *mp;
@@ -1611,6 +1689,8 @@ int sum_counters(struct thread_data *t, struct core_data *c,
        average.threads.mperf += t->mperf;
        average.threads.c1 += t->c1;
 
+       average.threads.instr_count += t->instr_count;
+
        average.threads.irq_count += t->irq_count;
        average.threads.smi_count += t->smi_count;
 
@@ -1687,12 +1767,12 @@ int sum_counters(struct thread_data *t, struct core_data *c,
        }
        return 0;
 }
+
 /*
  * sum the counters for all cpus in the system
  * compute the weighted average
  */
-void compute_average(struct thread_data *t, struct core_data *c,
-       struct pkg_data *p)
+void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        int i;
        struct msr_counter *mp;
@@ -1707,6 +1787,7 @@ void compute_average(struct thread_data *t, struct core_data *c,
        average.threads.tsc /= topo.num_cpus;
        average.threads.aperf /= topo.num_cpus;
        average.threads.mperf /= topo.num_cpus;
+       average.threads.instr_count /= topo.num_cpus;
        average.threads.c1 /= topo.num_cpus;
 
        if (average.threads.irq_count > 9999999)
@@ -1772,7 +1853,7 @@ static unsigned long long rdtsc(void)
 {
        unsigned int low, high;
 
-       asm volatile("rdtsc" : "=a" (low), "=d" (high));
+       asm volatile ("rdtsc":"=a" (low), "=d"(high));
 
        return low | ((unsigned long long)high) << 32;
 }
@@ -1788,6 +1869,7 @@ FILE *fopen_or_die(const char *path, const char *mode)
                err(1, "%s: open failed", path);
        return filep;
 }
+
 /*
  * snapshot_sysfs_counter()
  *
@@ -1819,8 +1901,7 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp)
                char path[128 + PATH_BYTES];
 
                if (mp->flags & SYSFS_PERCPU) {
-                       sprintf(path, "/sys/devices/system/cpu/cpu%d/%s",
-                                cpu, mp->path);
+                       sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path);
 
                        *counterp = snapshot_sysfs_counter(path);
                } else {
@@ -1880,7 +1961,7 @@ void get_apic_id(struct thread_data *t)
 
                eax = ebx = ecx = edx = 0;
                __cpuid(0x80000001, eax, ebx, ecx, edx);
-                       topology_extensions = ecx & (1 << 22);
+               topology_extensions = ecx & (1 << 22);
 
                if (topology_extensions == 0)
                        return;
@@ -1903,8 +1984,7 @@ void get_apic_id(struct thread_data *t)
        t->x2apic_id = edx;
 
        if (debug && (t->apic_id != (t->x2apic_id & 0xff)))
-               fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n",
-                               t->cpu_id, t->apic_id, t->x2apic_id);
+               fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
 }
 
 /*
@@ -1932,8 +2012,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 retry:
        t->tsc = rdtsc();       /* we are running on local CPU of interest */
 
-       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) ||
-           soft_c1_residency_display(BIC_Avg_MHz)) {
+       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) {
                unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
 
                /*
@@ -1980,8 +2059,7 @@ retry:
                        if (aperf_mperf_retry_count < 5)
                                goto retry;
                        else
-                               warnx("cpu%d jitter %lld %lld",
-                                       cpu, aperf_time, mperf_time);
+                               warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
                }
                aperf_mperf_retry_count = 0;
 
@@ -1989,6 +2067,10 @@ retry:
                t->mperf = t->mperf * aperf_mperf_multiplier;
        }
 
+       if (DO_BIC(BIC_IPC))
+               if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
+                       return -4;
+
        if (DO_BIC(BIC_IRQ))
                t->irq_count = irqs_per_cpu[cpu];
        if (DO_BIC(BIC_SMI)) {
@@ -2023,9 +2105,19 @@ retry:
                        return -7;
        }
 
-       if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7))
+       if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) {
                if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7))
                        return -8;
+               else if (t->is_atom) {
+                       /*
+                        * For Atom CPUs that have core C-states deeper than C6,
+                        * MSR_CORE_C6_RESIDENCY returns residency of CC6 and deeper.
+                        * Subtract CC7 (and deeper C-state) residency to get
+                        * accurate CC6 residency.
+                        */
+                       c->c6 -= c->c7;
+               }
+       }
 
        if (DO_BIC(BIC_Mod_c6))
                if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us))
@@ -2034,7 +2126,7 @@ retry:
        if (DO_BIC(BIC_CoreTmp)) {
                if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
                        return -9;
-               c->core_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F);
+               c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
        }
 
        if (do_rapl & RAPL_AMD_F17H) {
@@ -2140,7 +2232,7 @@ retry:
        if (DO_BIC(BIC_PkgTmp)) {
                if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
                        return -17;
-               p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F);
+               p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
        }
 
        if (DO_BIC(BIC_GFX_rc6))
@@ -2168,45 +2260,81 @@ done:
  * (>= PCL__7) and to index pkg_cstate_limit_strings[].
  */
 
-#define PCLUKN 0 /* Unknown */
-#define PCLRSV 1 /* Reserved */
-#define PCL__0 2 /* PC0 */
-#define PCL__1 3 /* PC1 */
-#define PCL__2 4 /* PC2 */
-#define PCL__3 5 /* PC3 */
-#define PCL__4 6 /* PC4 */
-#define PCL__6 7 /* PC6 */
-#define PCL_6N 8 /* PC6 No Retention */
-#define PCL_6R 9 /* PC6 Retention */
-#define PCL__7 10 /* PC7 */
-#define PCL_7S 11 /* PC7 Shrink */
-#define PCL__8 12 /* PC8 */
-#define PCL__9 13 /* PC9 */
-#define PCL_10 14 /* PC10 */
-#define PCLUNL 15 /* Unlimited */
+#define PCLUKN 0               /* Unknown */
+#define PCLRSV 1               /* Reserved */
+#define PCL__0 2               /* PC0 */
+#define PCL__1 3               /* PC1 */
+#define PCL__2 4               /* PC2 */
+#define PCL__3 5               /* PC3 */
+#define PCL__4 6               /* PC4 */
+#define PCL__6 7               /* PC6 */
+#define PCL_6N 8               /* PC6 No Retention */
+#define PCL_6R 9               /* PC6 Retention */
+#define PCL__7 10              /* PC7 */
+#define PCL_7S 11              /* PC7 Shrink */
+#define PCL__8 12              /* PC8 */
+#define PCL__9 13              /* PC9 */
+#define PCL_10 14              /* PC10 */
+#define PCLUNL 15              /* Unlimited */
 
 int pkg_cstate_limit = PCLUKN;
 char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2",
-       "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"};
+       "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"
+};
 
-int nhm_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
-int snb_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
-int hsw_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
-int slv_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7};
-int amt_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
-int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
-int glm_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
-int skx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
+int nhm_pkg_cstate_limits[16] =
+    { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
 
+int snb_pkg_cstate_limits[16] =
+    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
 
-static void
-calculate_tsc_tweak()
+int hsw_pkg_cstate_limits[16] =
+    { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
+
+int slv_pkg_cstate_limits[16] =
+    { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCL__6, PCL__7
+};
+
+int amt_pkg_cstate_limits[16] =
+    { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
+
+int phi_pkg_cstate_limits[16] =
+    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
+
+int glm_pkg_cstate_limits[16] =
+    { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
+
+int skx_pkg_cstate_limits[16] =
+    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
+
+int icx_pkg_cstate_limits[16] =
+    { PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
+       PCLRSV, PCLRSV
+};
+
+static void calculate_tsc_tweak()
 {
        tsc_tweak = base_hz / tsc_hz;
 }
 
-static void
-dump_nhm_platform_info(void)
+void prewake_cstate_probe(unsigned int family, unsigned int model);
+
+static void dump_nhm_platform_info(void)
 {
        unsigned long long msr;
        unsigned int ratio;
@@ -2216,22 +2344,23 @@ dump_nhm_platform_info(void)
        fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
 
        ratio = (msr >> 40) & 0xFF;
-       fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n",
-               ratio, bclk, ratio * bclk);
+       fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 8) & 0xFF;
-       fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n",
-               ratio, bclk, ratio * bclk);
+       fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
 
        get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
        fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
                base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
 
+       /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
+       if (dis_cstate_prewake)
+               fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
+
        return;
 }
 
-static void
-dump_hsw_turbo_ratio_limits(void)
+static void dump_hsw_turbo_ratio_limits(void)
 {
        unsigned long long msr;
        unsigned int ratio;
@@ -2242,18 +2371,15 @@ dump_hsw_turbo_ratio_limits(void)
 
        ratio = (msr >> 8) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 0) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk);
        return;
 }
 
-static void
-dump_ivt_turbo_ratio_limits(void)
+static void dump_ivt_turbo_ratio_limits(void)
 {
        unsigned long long msr;
        unsigned int ratio;
@@ -2264,45 +2390,38 @@ dump_ivt_turbo_ratio_limits(void)
 
        ratio = (msr >> 56) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 48) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 40) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 32) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 24) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 16) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 8) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 0) & 0xFF;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk);
        return;
 }
+
 int has_turbo_ratio_group_limits(int family, int model)
 {
 
@@ -2312,6 +2431,7 @@ int has_turbo_ratio_group_limits(int family, int model)
        switch (model) {
        case INTEL_FAM6_ATOM_GOLDMONT:
        case INTEL_FAM6_SKYLAKE_X:
+       case INTEL_FAM6_ICELAKE_X:
        case INTEL_FAM6_ATOM_GOLDMONT_D:
        case INTEL_FAM6_ATOM_TREMONT_D:
                return 1;
@@ -2319,8 +2439,7 @@ int has_turbo_ratio_group_limits(int family, int model)
        return 0;
 }
 
-static void
-dump_turbo_ratio_limits(int family, int model)
+static void dump_turbo_ratio_limits(int family, int model)
 {
        unsigned long long msr, core_counts;
        unsigned int ratio, group_size;
@@ -2385,8 +2504,7 @@ dump_turbo_ratio_limits(int family, int model)
        return;
 }
 
-static void
-dump_atom_turbo_ratio_limits(void)
+static void dump_atom_turbo_ratio_limits(void)
 {
        unsigned long long msr;
        unsigned int ratio;
@@ -2396,45 +2514,37 @@ dump_atom_turbo_ratio_limits(void)
 
        ratio = (msr >> 0) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 8) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 16) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
 
        get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
        fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
 
        ratio = (msr >> 24) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 16) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 8) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk);
 
        ratio = (msr >> 0) & 0x3F;
        if (ratio)
-               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n",
-                       ratio, bclk, ratio * bclk);
+               fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk);
 }
 
-static void
-dump_knl_turbo_ratio_limits(void)
+static void dump_knl_turbo_ratio_limits(void)
 {
        const unsigned int buckets_no = 7;
 
@@ -2446,8 +2556,7 @@ dump_knl_turbo_ratio_limits(void)
 
        get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
 
-       fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n",
-               base_cpu, msr);
+       fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);
 
        /*
         * Turbo encoding in KNL is as follows:
@@ -2492,8 +2601,7 @@ dump_knl_turbo_ratio_limits(void)
                                ratio[i], bclk, ratio[i] * bclk, cores[i]);
 }
 
-static void
-dump_nhm_cst_cfg(void)
+static void dump_nhm_cst_cfg(void)
 {
        unsigned long long msr;
 
@@ -2506,14 +2614,11 @@ dump_nhm_cst_cfg(void)
                (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
                (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
                (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
-               (msr & (1 << 15)) ? "" : "UN",
-               (unsigned int)msr & 0xF,
-               pkg_cstate_limit_strings[pkg_cstate_limit]);
+               (msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);
 
 #define AUTOMATIC_CSTATE_CONVERSION            (1UL << 16)
        if (has_automatic_cstate_conversion) {
-               fprintf(outf, ", automatic c-state conversion=%s",
-                       (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
+               fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
        }
 
        fprintf(outf, ")\n");
@@ -2521,8 +2626,7 @@ dump_nhm_cst_cfg(void)
        return;
 }
 
-static void
-dump_config_tdp(void)
+static void dump_config_tdp(void)
 {
        unsigned long long msr;
 
@@ -2564,7 +2668,7 @@ dump_config_tdp(void)
        fprintf(outf, ")\n");
 }
 
-unsigned int irtl_time_units[] = {1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
+unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
 
 void print_irtl(void)
 {
@@ -2604,6 +2708,7 @@ void print_irtl(void)
                (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 
 }
+
 void free_fd_percpu(void)
 {
        int i;
@@ -2660,7 +2765,6 @@ void free_all_buffers(void)
        free(cpus);
 }
 
-
 /*
  * Parse a file containing a single int.
  * Return 0 if file can not be opened
@@ -2735,8 +2839,7 @@ void set_node_data(void)
                         * the logical_node_id
                         */
                        for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
-                               if ((cpus[cpux].physical_package_id == pkg) &&
-                                  (cpus[cpux].physical_node_id == node)) {
+                               if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
                                        cpus[cpux].logical_node_id = lnode;
                                        cpu_count++;
                                }
@@ -2758,8 +2861,7 @@ int get_physical_node_id(struct cpu_topology *thiscpu)
        int cpu = thiscpu->logical_cpu_id;
 
        for (i = 0; i <= topo.max_cpu_num; i++) {
-               sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist",
-                       cpu, i);
+               sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
                filep = fopen(path, "r");
                if (!filep)
                        continue;
@@ -2789,8 +2891,7 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
        size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
        CPU_ZERO_S(size, thiscpu->put_ids);
 
-       sprintf(path,
-               "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
+       sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
        filep = fopen(path, "r");
 
        if (!filep) {
@@ -2807,10 +2908,8 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
                                sib_core = get_core_id(so);
                                if (sib_core == thiscpu->physical_core_id) {
                                        CPU_SET_S(so, size, thiscpu->put_ids);
-                                       if ((so != cpu) &&
-                                           (cpus[so].thread_id < 0))
-                                               cpus[so].thread_id =
-                                                                   thread_id++;
+                                       if ((so != cpu) && (cpus[so].thread_id < 0))
+                                               cpus[so].thread_id = thread_id++;
                                }
                        }
                }
@@ -2825,41 +2924,31 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
  * skip non-present cpus
  */
 
-int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *,
-       struct pkg_data *, struct thread_data *, struct core_data *,
-       struct pkg_data *), struct thread_data *thread_base,
-       struct core_data *core_base, struct pkg_data *pkg_base,
-       struct thread_data *thread_base2, struct core_data *core_base2,
-       struct pkg_data *pkg_base2)
+int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
+                              struct pkg_data *, struct thread_data *, struct core_data *,
+                              struct pkg_data *), struct thread_data *thread_base,
+                  struct core_data *core_base, struct pkg_data *pkg_base,
+                  struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
 {
        int retval, pkg_no, node_no, core_no, thread_no;
 
        for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
                for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
-                       for (core_no = 0; core_no < topo.cores_per_node;
-                            ++core_no) {
-                               for (thread_no = 0; thread_no <
-                                       topo.threads_per_core; ++thread_no) {
+                       for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
+                               for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
                                        struct thread_data *t, *t2;
                                        struct core_data *c, *c2;
                                        struct pkg_data *p, *p2;
 
-                                       t = GET_THREAD(thread_base, thread_no,
-                                                      core_no, node_no,
-                                                      pkg_no);
+                                       t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
 
                                        if (cpu_is_not_present(t->cpu_id))
                                                continue;
 
-                                       t2 = GET_THREAD(thread_base2, thread_no,
-                                                       core_no, node_no,
-                                                       pkg_no);
+                                       t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
 
-                                       c = GET_CORE(core_base, core_no,
-                                                    node_no, pkg_no);
-                                       c2 = GET_CORE(core_base2, core_no,
-                                                     node_no,
-                                                     pkg_no);
+                                       c = GET_CORE(core_base, core_no, node_no, pkg_no);
+                                       c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
 
                                        p = GET_PKG(pkg_base, pkg_no);
                                        p2 = GET_PKG(pkg_base2, pkg_no);
@@ -2878,7 +2967,7 @@ int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *,
  * run func(cpu) on every cpu in /proc/stat
  * return max_cpu number
  */
-int for_all_proc_cpus(int (func)(int))
+int for_all_proc_cpus(int (func) (int))
 {
        FILE *fp;
        int cpu_num;
@@ -2898,7 +2987,7 @@ int for_all_proc_cpus(int (func)(int))
                retval = func(cpu_num);
                if (retval) {
                        fclose(fp);
-                       return(retval);
+                       return (retval);
                }
        }
        fclose(fp);
@@ -2922,16 +3011,14 @@ void set_max_cpu_num(void)
        base_cpu = sched_getcpu();
        if (base_cpu < 0)
                err(1, "cannot find calling cpu ID");
-       sprintf(pathname,
-               "/sys/devices/system/cpu/cpu%d/topology/thread_siblings",
-               base_cpu);
+       sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
 
        filep = fopen_or_die(pathname, "r");
        topo.max_cpu_num = 0;
        while (fscanf(filep, "%lx,", &dummy) == 1)
                topo.max_cpu_num += BITMASK_SIZE;
        fclose(filep);
-       topo.max_cpu_num--; /* 0 based */
+       topo.max_cpu_num--;     /* 0 based */
 }
 
 /*
@@ -2943,6 +3030,7 @@ int count_cpus(int cpu)
        topo.num_cpus++;
        return 0;
 }
+
 int mark_cpu_present(int cpu)
 {
        CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
@@ -3012,12 +3100,12 @@ int snapshot_proc_interrupts(void)
 
                }
 
-               while (getc(fp) != '\n')
-                       ;       /* flush interrupt description */
+               while (getc(fp) != '\n') ;      /* flush interrupt description */
 
        }
        return 0;
 }
+
 /*
  * snapshot_gfx_rc6_ms()
  *
@@ -3041,6 +3129,7 @@ int snapshot_gfx_rc6_ms(void)
 
        return 0;
 }
+
 /*
  * snapshot_gfx_mhz()
  *
@@ -3120,6 +3209,7 @@ int snapshot_cpu_lpi_us(void)
 
        return 0;
 }
+
 /*
  * snapshot_sys_lpi()
  *
@@ -3143,6 +3233,7 @@ int snapshot_sys_lpi_us(void)
 
        return 0;
 }
+
 /*
  * snapshot /proc and /sys files
  *
@@ -3174,7 +3265,7 @@ int snapshot_proc_sysfs_files(void)
 
 int exit_requested;
 
-static void signal_handler (int signal)
+static void signal_handler(int signal)
 {
        switch (signal) {
        case SIGINT:
@@ -3272,7 +3363,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
 
        for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
                unsigned long long msr_cur, msr_last;
-               int offset;
+               off_t offset;
 
                if (!idx_valid(i))
                        continue;
@@ -3281,7 +3372,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
                        continue;
                ret = get_msr(cpu, offset, &msr_cur);
                if (ret) {
-                       fprintf(outf, "Can not update msr(0x%x)\n", offset);
+                       fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset);
                        continue;
                }
 
@@ -3294,8 +3385,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
        return 0;
 }
 
-static void
-msr_record_handler(union sigval v)
+static void msr_record_handler(union sigval v)
 {
        for_all_cpus(update_msr_sum, EVEN_COUNTERS);
 }
@@ -3340,12 +3430,38 @@ void msr_sum_record(void)
        }
        return;
 
- release_timer:
+release_timer:
        timer_delete(timerid);
- release_msr:
+release_msr:
        free(per_cpu_msr_sum);
 }
 
+/*
+ * set_my_sched_priority(priority): renice the calling process and verify it.
+ * Returns the previous nice value; every failure exits through err().
+ */
+int set_my_sched_priority(int priority)
+{
+       int previous, rc;
+
+       errno = 0;      /* -1 is a valid nice value, so errno flags failure */
+       previous = getpriority(PRIO_PROCESS, 0);
+       if ((previous == -1) && errno)
+               err(errno, "getpriority");
+
+       rc = setpriority(PRIO_PROCESS, 0, priority);
+       if (rc != 0)
+               err(rc, "setpriority(%d)", priority);
+
+       /* read the value back to confirm the new priority is in effect */
+       errno = 0;
+       rc = getpriority(PRIO_PROCESS, 0);
+       if (rc != priority)
+               err(-1, "getpriority(%d) != setpriority(%d)", rc, priority);
+
+       return previous;
+}
+
 void turbostat_loop()
 {
        int retval;
@@ -3354,6 +3470,11 @@ void turbostat_loop()
 
        setup_signal_handler();
 
+       /*
+        * elevate own priority for interval mode
+        */
+       set_my_sched_priority(-20);
+
 restart:
        restarted++;
 
@@ -3434,7 +3555,7 @@ void check_dev_msr()
 
        sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
        if (stat(pathname, &sb))
-               if (system("/sbin/modprobe msr > /dev/null 2>&1"))
+               if (system("/sbin/modprobe msr > /dev/null 2>&1"))
                        err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" ");
 }
 
@@ -3456,8 +3577,7 @@ int check_for_cap_sys_rawio(void)
                err(-6, "cap_get\n");
 
        if (cap_flag_value != CAP_SET) {
-               warnx("capget(CAP_SYS_RAWIO) failed,"
-                       " try \"# setcap cap_sys_rawio=ep %s\"", progname);
+               warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname);
                return 1;
        }
 
@@ -3466,6 +3586,7 @@ int check_for_cap_sys_rawio(void)
 
        return 0;
 }
+
 void check_permissions(void)
 {
        int do_exit = 0;
@@ -3551,6 +3672,10 @@ int probe_nhm_msrs(unsigned int family, unsigned int model)
                pkg_cstate_limits = skx_pkg_cstate_limits;
                has_misc_feature_control = 1;
                break;
+       case INTEL_FAM6_ICELAKE_X:      /* ICX */
+               pkg_cstate_limits = icx_pkg_cstate_limits;
+               has_misc_feature_control = 1;
+               break;
        case INTEL_FAM6_ATOM_SILVERMONT:        /* BYT */
                no_MSR_MISC_PWR_MGMT = 1;
        case INTEL_FAM6_ATOM_SILVERMONT_D:      /* AVN */
@@ -3567,7 +3692,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model)
        case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
        case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
        case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
-       case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
+       case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
                pkg_cstate_limits = glm_pkg_cstate_limits;
                break;
        default:
@@ -3583,6 +3708,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model)
        has_base_hz = 1;
        return 1;
 }
+
 /*
  * SLV client has support for unique MSRs:
  *
@@ -3603,6 +3729,7 @@ int has_slv_msrs(unsigned int family, unsigned int model)
        }
        return 0;
 }
+
 int is_dnv(unsigned int family, unsigned int model)
 {
 
@@ -3615,6 +3742,7 @@ int is_dnv(unsigned int family, unsigned int model)
        }
        return 0;
 }
+
 int is_bdx(unsigned int family, unsigned int model)
 {
 
@@ -3627,6 +3755,7 @@ int is_bdx(unsigned int family, unsigned int model)
        }
        return 0;
 }
+
 int is_skx(unsigned int family, unsigned int model)
 {
 
@@ -3639,6 +3768,20 @@ int is_skx(unsigned int family, unsigned int model)
        }
        return 0;
 }
+
+int is_icx(unsigned int family, unsigned int model)
+{
+       /* Returns 1 only for genuine Intel with model INTEL_FAM6_ICELAKE_X; family is not examined. */
+       if (!genuine_intel)
+               return 0;
+
+       switch (model) {
+       case INTEL_FAM6_ICELAKE_X:
+               return 1;
+       }
+       return 0;
+}
+
 int is_ehl(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
@@ -3650,6 +3793,7 @@ int is_ehl(unsigned int family, unsigned int model)
        }
        return 0;
 }
+
 int is_jvl(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
@@ -3668,7 +3812,7 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model)
                return 0;
 
        switch (model) {
-       /* Nehalem compatible, but do not include turbo-ratio limit support */
+               /* Nehalem compatible, but do not include turbo-ratio limit support */
        case INTEL_FAM6_NEHALEM_EX:     /* Nehalem-EX Xeon - Beckton */
        case INTEL_FAM6_XEON_PHI_KNL:   /* PHI - Knights Landing (different MSR definition) */
                return 0;
@@ -3676,6 +3820,7 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model)
                return 1;
        }
 }
+
 int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model)
 {
        if (has_slv_msrs(family, model))
@@ -3683,6 +3828,7 @@ int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model)
 
        return 0;
 }
+
 int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
@@ -3699,6 +3845,7 @@ int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model)
                return 0;
        }
 }
+
 int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
@@ -3730,6 +3877,7 @@ int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model)
                return 0;
        }
 }
+
 int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
@@ -3741,11 +3889,13 @@ int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model)
        switch (model) {
        case INTEL_FAM6_ATOM_GOLDMONT:
        case INTEL_FAM6_SKYLAKE_X:
+       case INTEL_FAM6_ICELAKE_X:
                return 1;
        default:
                return 0;
        }
 }
+
 int has_config_tdp(unsigned int family, unsigned int model)
 {
        if (!genuine_intel)
@@ -3766,6 +3916,7 @@ int has_config_tdp(unsigned int family, unsigned int model)
        case INTEL_FAM6_SKYLAKE_L:      /* SKL */
        case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
        case INTEL_FAM6_SKYLAKE_X:      /* SKX */
+       case INTEL_FAM6_ICELAKE_X:      /* ICX */
 
        case INTEL_FAM6_XEON_PHI_KNL:   /* Knights Landing */
                return 1;
@@ -3774,8 +3925,41 @@ int has_config_tdp(unsigned int family, unsigned int model)
        }
 }
 
-static void
-remove_underbar(char *s)
+/*
+ * tcc_offset_bits: width of the TCC offset field that set_temperature_target() decodes
+ * 0: Tcc Offset not supported (Default)
+ * 6: Bit 29:24 of MSR_IA32_TEMPERATURE_TARGET
+ * 4: Bit 27:24 of MSR_IA32_TEMPERATURE_TARGET
+ */
+void check_tcc_offset(int model)
+{
+       unsigned long long msr;
+
+       if (!genuine_intel)
+               return;
+       /* On the models below, MSR_PLATFORM_INFO bit 30 set selects the 6-bit offset field. */
+       switch (model) {
+       case INTEL_FAM6_SKYLAKE_L:
+       case INTEL_FAM6_SKYLAKE:
+       case INTEL_FAM6_KABYLAKE_L:
+       case INTEL_FAM6_KABYLAKE:
+       case INTEL_FAM6_ICELAKE_L:
+       case INTEL_FAM6_ICELAKE:
+       case INTEL_FAM6_TIGERLAKE_L:
+       case INTEL_FAM6_TIGERLAKE:
+       case INTEL_FAM6_COMETLAKE:
+               if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) {
+                       msr = (msr >> 30) & 1;
+                       if (msr)
+                               tcc_offset_bits = 6;
+               }
+               return;
+       default:
+               return;
+       }
+}
+
+static void remove_underbar(char *s)
 {
        char *to = s;
 
@@ -3788,8 +3972,7 @@ remove_underbar(char *s)
        *to = 0;
 }
 
-static void
-dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
+static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
 {
        if (!do_nhm_platform_info)
                return;
@@ -3834,8 +4017,8 @@ static void dump_sysfs_file(char *path)
 
        fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
 }
-static void
-dump_sysfs_cstate_config(void)
+
+static void dump_sysfs_cstate_config(void)
 {
        char path[64];
        char name_buf[16];
@@ -3855,15 +4038,14 @@ dump_sysfs_cstate_config(void)
 
        for (state = 0; state < 10; ++state) {
 
-               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name",
-                       base_cpu, state);
+               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
                input = fopen(path, "r");
                if (input == NULL)
                        continue;
                if (!fgets(name_buf, sizeof(name_buf), input))
                        err(1, "%s: failed to read file", path);
 
-                /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
+               /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
                sp = strchr(name_buf, '-');
                if (!sp)
                        sp = strchrnul(name_buf, '\n');
@@ -3872,8 +4054,7 @@ dump_sysfs_cstate_config(void)
 
                remove_underbar(name_buf);
 
-               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc",
-                       base_cpu, state);
+               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
                input = fopen(path, "r");
                if (input == NULL)
                        continue;
@@ -3884,8 +4065,8 @@ dump_sysfs_cstate_config(void)
                fclose(input);
        }
 }
-static void
-dump_sysfs_pstate_config(void)
+
+static void dump_sysfs_pstate_config(void)
 {
        char path[64];
        char driver_buf[64];
@@ -3893,8 +4074,7 @@ dump_sysfs_pstate_config(void)
        FILE *input;
        int turbo;
 
-       sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver",
-                       base_cpu);
+       sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
        input = fopen(path, "r");
        if (input == NULL) {
                fprintf(outf, "NSFOD %s\n", path);
@@ -3904,8 +4084,7 @@ dump_sysfs_pstate_config(void)
                err(1, "%s: failed to read file", path);
        fclose(input);
 
-       sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor",
-                       base_cpu);
+       sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
        input = fopen(path, "r");
        if (input == NULL) {
                fprintf(outf, "NSFOD %s\n", path);
@@ -3937,7 +4116,6 @@ dump_sysfs_pstate_config(void)
        }
 }
 
-
 /*
  * print_epb()
  * Decode the ENERGY_PERF_BIAS MSR
@@ -3983,6 +4161,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
        return 0;
 }
+
 /*
  * print_hwp()
  * Decode the MSR_HWP_CAPABILITIES
@@ -4009,8 +4188,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        if (get_msr(cpu, MSR_PM_ENABLE, &msr))
                return 0;
 
-       fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n",
-               cpu, msr, (msr & (1 << 0)) ? "" : "No-");
+       fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-");
 
        /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */
        if ((msr & (1 << 0)) == 0)
@@ -4020,25 +4198,23 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                return 0;
 
        fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
-                       "(high %d guar %d eff %d low %d)\n",
-                       cpu, msr,
-                       (unsigned int)HWP_HIGHEST_PERF(msr),
-                       (unsigned int)HWP_GUARANTEED_PERF(msr),
-                       (unsigned int)HWP_MOSTEFFICIENT_PERF(msr),
-                       (unsigned int)HWP_LOWEST_PERF(msr));
+               "(high %d guar %d eff %d low %d)\n",
+               cpu, msr,
+               (unsigned int)HWP_HIGHEST_PERF(msr),
+               (unsigned int)HWP_GUARANTEED_PERF(msr),
+               (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr));
 
        if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
                return 0;
 
        fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
-                       "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
-                       cpu, msr,
-                       (unsigned int)(((msr) >> 0) & 0xff),
-                       (unsigned int)(((msr) >> 8) & 0xff),
-                       (unsigned int)(((msr) >> 16) & 0xff),
-                       (unsigned int)(((msr) >> 24) & 0xff),
-                       (unsigned int)(((msr) >> 32) & 0xff3),
-                       (unsigned int)(((msr) >> 42) & 0x1));
+               "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
+               cpu, msr,
+               (unsigned int)(((msr) >> 0) & 0xff),
+               (unsigned int)(((msr) >> 8) & 0xff),
+               (unsigned int)(((msr) >> 16) & 0xff),
+               (unsigned int)(((msr) >> 24) & 0xff),
+               (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1));
 
        if (has_hwp_pkg) {
                if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
@@ -4050,8 +4226,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        (unsigned int)(((msr) >> 0) & 0xff),
                        (unsigned int)(((msr) >> 8) & 0xff),
                        (unsigned int)(((msr) >> 16) & 0xff),
-                       (unsigned int)(((msr) >> 24) & 0xff),
-                       (unsigned int)(((msr) >> 32) & 0xff3));
+                       (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3));
        }
        if (has_hwp_notify) {
                if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
@@ -4059,18 +4234,14 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
                fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
                        "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
-                       cpu, msr,
-                       ((msr) & 0x1) ? "EN" : "Dis",
-                       ((msr) & 0x2) ? "EN" : "Dis");
+                       cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis");
        }
        if (get_msr(cpu, MSR_HWP_STATUS, &msr))
                return 0;
 
        fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx "
-                       "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
-                       cpu, msr,
-                       ((msr) & 0x1) ? "" : "No-",
-                       ((msr) & 0x2) ? "" : "No-");
+               "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
+               cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x2) ? "" : "No-");
 
        return 0;
 }
@@ -4110,8 +4281,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 5) ? "Auto-HWP, " : "",
                        (msr & 1 << 4) ? "Graphics, " : "",
                        (msr & 1 << 2) ? "bit2, " : "",
-                       (msr & 1 << 1) ? "ThermStatus, " : "",
-                       (msr & 1 << 0) ? "PROCHOT, " : "");
+                       (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : "");
                fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
                        (msr & 1 << 31) ? "bit31, " : "",
                        (msr & 1 << 30) ? "bit30, " : "",
@@ -4125,8 +4295,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 21) ? "Auto-HWP, " : "",
                        (msr & 1 << 20) ? "Graphics, " : "",
                        (msr & 1 << 18) ? "bit18, " : "",
-                       (msr & 1 << 17) ? "ThermStatus, " : "",
-                       (msr & 1 << 16) ? "PROCHOT, " : "");
+                       (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");
 
        }
        if (do_gfx_perf_limit_reasons) {
@@ -4139,8 +4308,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 6) ? "VR-Therm, " : "",
                        (msr & 1 << 8) ? "Amps, " : "",
                        (msr & 1 << 9) ? "GFXPwr, " : "",
-                       (msr & 1 << 10) ? "PkgPwrL1, " : "",
-                       (msr & 1 << 11) ? "PkgPwrL2, " : "");
+                       (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
                fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n",
                        (msr & 1 << 16) ? "PROCHOT, " : "",
                        (msr & 1 << 17) ? "ThermStatus, " : "",
@@ -4148,8 +4316,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 22) ? "VR-Therm, " : "",
                        (msr & 1 << 24) ? "Amps, " : "",
                        (msr & 1 << 25) ? "GFXPwr, " : "",
-                       (msr & 1 << 26) ? "PkgPwrL1, " : "",
-                       (msr & 1 << 27) ? "PkgPwrL2, " : "");
+                       (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
        }
        if (do_ring_perf_limit_reasons) {
                get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
@@ -4159,21 +4326,19 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 1) ? "ThermStatus, " : "",
                        (msr & 1 << 6) ? "VR-Therm, " : "",
                        (msr & 1 << 8) ? "Amps, " : "",
-                       (msr & 1 << 10) ? "PkgPwrL1, " : "",
-                       (msr & 1 << 11) ? "PkgPwrL2, " : "");
+                       (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
                fprintf(outf, " (Logged: %s%s%s%s%s%s)\n",
                        (msr & 1 << 16) ? "PROCHOT, " : "",
                        (msr & 1 << 17) ? "ThermStatus, " : "",
                        (msr & 1 << 22) ? "VR-Therm, " : "",
                        (msr & 1 << 24) ? "Amps, " : "",
-                       (msr & 1 << 26) ? "PkgPwrL1, " : "",
-                       (msr & 1 << 27) ? "PkgPwrL2, " : "");
+                       (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
        }
        return 0;
 }
 
 #define        RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
-#define        RAPL_TIME_GRANULARITY   0x3F /* 6 bit time granularity */
+#define        RAPL_TIME_GRANULARITY   0x3F    /* 6 bit time granularity */
 
 double get_tdp_intel(unsigned int model)
 {
@@ -4202,14 +4367,14 @@ double get_tdp_amd(unsigned int family)
  * rapl_dram_energy_units_probe()
  * Energy units are either hard-coded, or come from RAPL Energy Unit MSR.
  */
-static double
-rapl_dram_energy_units_probe(int  model, double rapl_energy_units)
+static double rapl_dram_energy_units_probe(int model, double rapl_energy_units)
 {
        /* only called for genuine_intel, family 6 */
 
        switch (model) {
        case INTEL_FAM6_HASWELL_X:      /* HSX */
        case INTEL_FAM6_BROADWELL_X:    /* BDX */
+       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
        case INTEL_FAM6_XEON_PHI_KNL:   /* KNL */
                return (rapl_dram_energy_units = 15.3 / 1000000);
        default:
@@ -4254,7 +4419,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
                        BIC_PRESENT(BIC_PkgWatt);
                break;
        case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
-               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO;
+               do_rapl =
+                   RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS
+                   | RAPL_GFX | RAPL_PKG_POWER_INFO;
                if (rapl_joules) {
                        BIC_PRESENT(BIC_Pkg_J);
                        BIC_PRESENT(BIC_Cor_J);
@@ -4277,7 +4444,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
                break;
        case INTEL_FAM6_SKYLAKE_L:      /* SKL */
        case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO;
+               do_rapl =
+                   RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS
+                   | RAPL_GFX | RAPL_PKG_POWER_INFO;
                BIC_PRESENT(BIC_PKG__);
                BIC_PRESENT(BIC_RAM__);
                if (rapl_joules) {
@@ -4295,8 +4464,11 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
        case INTEL_FAM6_HASWELL_X:      /* HSX */
        case INTEL_FAM6_BROADWELL_X:    /* BDX */
        case INTEL_FAM6_SKYLAKE_X:      /* SKX */
+       case INTEL_FAM6_ICELAKE_X:      /* ICX */
        case INTEL_FAM6_XEON_PHI_KNL:   /* KNL */
-               do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
+               do_rapl =
+                   RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS |
+                   RAPL_PKG_POWER_INFO;
                BIC_PRESENT(BIC_PKG__);
                BIC_PRESENT(BIC_RAM__);
                if (rapl_joules) {
@@ -4309,7 +4481,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
                break;
        case INTEL_FAM6_SANDYBRIDGE_X:
        case INTEL_FAM6_IVYBRIDGE_X:
-               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
+               do_rapl =
+                   RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS |
+                   RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
                BIC_PRESENT(BIC_PKG__);
                BIC_PRESENT(BIC_RAM__);
                if (rapl_joules) {
@@ -4334,7 +4508,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
                }
                break;
        case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
-               do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS;
+               do_rapl =
+                   RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS |
+                   RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS;
                BIC_PRESENT(BIC_PKG__);
                BIC_PRESENT(BIC_RAM__);
                if (rapl_joules) {
@@ -4451,10 +4627,16 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model)
 
 void automatic_cstate_conversion_probe(unsigned int family, unsigned int model)
 {
-       if (is_skx(family, model) || is_bdx(family, model))
+       if (is_skx(family, model) || is_bdx(family, model) || is_icx(family, model))
                has_automatic_cstate_conversion = 1;
 }
 
+void prewake_cstate_probe(unsigned int family, unsigned int model)
+{
+       if (is_icx(family, model))      /* is_icx() is the only condition that sets the flag here */
+               dis_cstate_prewake = 1;
+}
+
 int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        unsigned long long msr;
@@ -4480,8 +4662,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                        return 0;
 
                dts = (msr >> 16) & 0x7F;
-               fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n",
-                       cpu, msr, tcc_activation_temp - dts);
+               fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);
 
                if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
                        return 0;
@@ -4489,10 +4670,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                dts = (msr >> 16) & 0x7F;
                dts2 = (msr >> 8) & 0x7F;
                fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
-                       cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2);
+                       cpu, msr, tj_max - dts, tj_max - dts2);
        }
 
-
        if (do_dts && debug) {
                unsigned int resolution;
 
@@ -4502,7 +4682,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                dts = (msr >> 16) & 0x7F;
                resolution = (msr >> 27) & 0xF;
                fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
-                       cpu, msr, tcc_activation_temp - dts, resolution);
+                       cpu, msr, tj_max - dts, resolution);
 
                if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
                        return 0;
@@ -4510,7 +4690,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                dts = (msr >> 16) & 0x7F;
                dts2 = (msr >> 8) & 0x7F;
                fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
-                       cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2);
+                       cpu, msr, tj_max - dts, tj_max - dts2);
        }
 
        return 0;
@@ -4522,7 +4702,7 @@ void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
                cpu, label,
                ((msr >> 15) & 1) ? "EN" : "DIS",
                ((msr >> 0) & 0x7FFF) * rapl_power_units,
-               (1.0 + (((msr >> 22) & 0x3)/4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
+               (1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
                (((msr >> 16) & 1) ? "EN" : "DIS"));
 
        return;
@@ -4563,12 +4743,11 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        if (do_rapl & RAPL_PKG_POWER_INFO) {
 
                if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
-                       return -5;
-
+                       return -5;
 
                fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
                        cpu, msr,
-                       ((msr >>  0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
+                       ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
@@ -4587,17 +4766,17 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        cpu,
                        ((msr >> 47) & 1) ? "EN" : "DIS",
                        ((msr >> 32) & 0x7FFF) * rapl_power_units,
-                       (1.0 + (((msr >> 54) & 0x3)/4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
+                       (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
                        ((msr >> 48) & 1) ? "EN" : "DIS");
        }
 
        if (do_rapl & RAPL_DRAM_POWER_INFO) {
                if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
-                       return -6;
+                       return -6;
 
                fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
                        cpu, msr,
-                       ((msr >>  0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
+                       ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
@@ -4606,7 +4785,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
                        return -9;
                fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
-                               cpu, msr, (msr >> 31) & 1 ? "" : "UN");
+                       cpu, msr, (msr >> 31) & 1 ? "" : "UN");
 
                print_power_limit_msr(cpu, msr, "DRAM Limit");
        }
@@ -4620,7 +4799,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
                        return -9;
                fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
-                               cpu, msr, (msr >> 31) & 1 ? "" : "UN");
+                       cpu, msr, (msr >> 31) & 1 ? "" : "UN");
                print_power_limit_msr(cpu, msr, "Cores Limit");
        }
        if (do_rapl & RAPL_GFX) {
@@ -4632,7 +4811,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr))
                        return -9;
                fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n",
-                               cpu, msr, (msr >> 31) & 1 ? "" : "UN");
+                       cpu, msr, (msr >> 31) & 1 ? "" : "UN");
                print_power_limit_msr(cpu, msr, "GFX Limit");
        }
        return 0;
@@ -4654,23 +4833,24 @@ int has_snb_msrs(unsigned int family, unsigned int model)
        switch (model) {
        case INTEL_FAM6_SANDYBRIDGE:
        case INTEL_FAM6_SANDYBRIDGE_X:
-       case INTEL_FAM6_IVYBRIDGE:              /* IVB */
-       case INTEL_FAM6_IVYBRIDGE_X:            /* IVB Xeon */
-       case INTEL_FAM6_HASWELL:                /* HSW */
-       case INTEL_FAM6_HASWELL_X:              /* HSW */
-       case INTEL_FAM6_HASWELL_L:              /* HSW */
-       case INTEL_FAM6_HASWELL_G:              /* HSW */
-       case INTEL_FAM6_BROADWELL:              /* BDW */
-       case INTEL_FAM6_BROADWELL_G:            /* BDW */
-       case INTEL_FAM6_BROADWELL_X:            /* BDX */
-       case INTEL_FAM6_SKYLAKE_L:              /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:           /* CNL */
-       case INTEL_FAM6_SKYLAKE_X:              /* SKX */
-       case INTEL_FAM6_ATOM_GOLDMONT:          /* BXT */
+       case INTEL_FAM6_IVYBRIDGE:      /* IVB */
+       case INTEL_FAM6_IVYBRIDGE_X:    /* IVB Xeon */
+       case INTEL_FAM6_HASWELL:        /* HSW */
+       case INTEL_FAM6_HASWELL_X:      /* HSW */
+       case INTEL_FAM6_HASWELL_L:      /* HSW */
+       case INTEL_FAM6_HASWELL_G:      /* HSW */
+       case INTEL_FAM6_BROADWELL:      /* BDW */
+       case INTEL_FAM6_BROADWELL_G:    /* BDW */
+       case INTEL_FAM6_BROADWELL_X:    /* BDX */
+       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
+       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
+       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
+       case INTEL_FAM6_ICELAKE_X:      /* ICX */
+       case INTEL_FAM6_ATOM_GOLDMONT:  /* BXT */
        case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
        case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
-       case INTEL_FAM6_ATOM_TREMONT:           /* EHL */
-       case INTEL_FAM6_ATOM_TREMONT_D:         /* JVL */
+       case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
+       case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
                return 1;
        }
        return 0;
@@ -4756,7 +4936,7 @@ int is_cnl(unsigned int family, unsigned int model)
                return 0;
 
        switch (model) {
-       case INTEL_FAM6_CANNONLAKE_L: /* CNL */
+       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
                return 1;
        }
 
@@ -4771,7 +4951,7 @@ unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model)
 }
 
 #define SLM_BCLK_FREQS 5
-double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0};
+double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
 
 double slm_bclk(void)
 {
@@ -4805,6 +4985,28 @@ double discover_bclk(unsigned int family, unsigned int model)
                return 133.33;
 }
 
+int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+       unsigned int eax, ebx, ecx, edx;
+       /* Sets t->is_atom from CPUID leaf 0x1a; returns -1 only if migrating to t->cpu_id fails, else 0. */
+       if (!genuine_intel)
+               return 0;
+
+       if (cpu_migrate(t->cpu_id)) {
+               fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
+               return -1;
+       }
+
+       if (max_level < 0x1a)
+               return 0;
+       /* EAX[31:24] == 0x20 marks this thread as Atom; any other value leaves is_atom untouched. */
+       __cpuid(0x1a, eax, ebx, ecx, edx);
+       eax = (eax >> 24) & 0xFF;
+       if (eax == 0x20)
+               t->is_atom = true;
+       return 0;
+}
+
 /*
  * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where
  * the Thermal Control Circuit (TCC) activates.
@@ -4817,53 +5019,69 @@ double discover_bclk(unsigned int family, unsigned int model)
  * below this value, including the Digital Thermal Sensor (DTS),
  * Package Thermal Management Sensor (PTM), and thermal event thresholds.
  */
-int read_tcc_activation_temp()
+int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        unsigned long long msr;
-       unsigned int tcc, target_c, offset_c;
+       unsigned int tcc_default, tcc_offset;
+       int cpu;
 
-       /* Temperature Target MSR is Nehalem and newer only */
-       if (!do_nhm_platform_info)
+       /* tj_max is used only for dts or ptm */
+       if (!(do_dts || do_ptm))
                return 0;
 
-       if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
+       /* this is a per-package concept */
+       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
                return 0;
 
-       target_c = (msr >> 16) & 0xFF;
+       cpu = t->cpu_id;
+       if (cpu_migrate(cpu)) {
+               fprintf(outf, "Could not migrate to CPU %d\n", cpu);
+               return -1;
+       }
 
-       offset_c = (msr >> 24) & 0xF;
+       if (tj_max_override != 0) {
+               tj_max = tj_max_override;
+               fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max);
+               return 0;
+       }
 
-       tcc = target_c - offset_c;
+       /* Temperature Target MSR is Nehalem and newer only */
+       if (!do_nhm_platform_info)
+               goto guess;
 
-       if (!quiet)
-               fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
-                       base_cpu, msr, tcc, target_c, offset_c);
+       if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
+               goto guess;
 
-       return tcc;
-}
+       tcc_default = (msr >> 16) & 0xFF;
 
-int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
-{
-       /* tcc_activation_temp is used only for dts or ptm */
-       if (!(do_dts || do_ptm))
-               return 0;
+       if (!quiet) {
+               switch (tcc_offset_bits) {
+               case 4:
+                       tcc_offset = (msr >> 24) & 0xF;
+                       fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
+                               cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
+                       break;
+               case 6:
+                       tcc_offset = (msr >> 24) & 0x3F;
+                       fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
+                               cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
+                       break;
+               default:
+                       fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
+                       break;
+               }
+       }
 
-       /* this is a per-package concept */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
-               return 0;
+       if (!tcc_default)
+               goto guess;
 
-       if (tcc_activation_temp_override != 0) {
-               tcc_activation_temp = tcc_activation_temp_override;
-               fprintf(outf, "Using cmdline TCC Target (%d C)\n", tcc_activation_temp);
-               return 0;
-       }
+       tj_max = tcc_default;
 
-       tcc_activation_temp = read_tcc_activation_temp();
-       if (tcc_activation_temp)
-               return 0;
+       return 0;
 
-       tcc_activation_temp = TJMAX_DEFAULT;
-       fprintf(outf, "Guessing tjMax %d C, Please use -T to specify\n", tcc_activation_temp);
+guess:
+       tj_max = TJMAX_DEFAULT;
+       fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);
 
        return 0;
 }
@@ -4874,9 +5092,7 @@ void decode_feature_control_msr(void)
 
        if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
                fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
-                       base_cpu, msr,
-                       msr & FEAT_CTL_LOCKED ? "" : "UN-",
-                       msr & (1 << 18) ? "SGX" : "");
+                       base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
 }
 
 void decode_misc_enable_msr(void)
@@ -4904,13 +5120,12 @@ void decode_misc_feature_control(void)
                return;
 
        if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
-               fprintf(outf, "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
-                       base_cpu, msr,
-                       msr & (0 << 0) ? "No-" : "",
-                       msr & (1 << 0) ? "No-" : "",
-                       msr & (2 << 0) ? "No-" : "",
-                       msr & (3 << 0) ? "No-" : "");
+               fprintf(outf,
+                       "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
+                       base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
+                       msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
 }
+
 /*
  * Decode MSR_MISC_PWR_MGMT
  *
@@ -4931,10 +5146,9 @@ void decode_misc_pwr_mgmt_msr(void)
        if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
                fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
                        base_cpu, msr,
-                       msr & (1 << 0) ? "DIS" : "EN",
-                       msr & (1 << 1) ? "EN" : "DIS",
-                       msr & (1 << 8) ? "EN" : "DIS");
+                       msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
 }
+
 /*
  * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG
  *
@@ -4960,10 +5174,10 @@ void decode_c6_demotion_policy_msr(void)
 unsigned int intel_model_duplicates(unsigned int model)
 {
 
-       switch(model) {
+       switch (model) {
        case INTEL_FAM6_NEHALEM_EP:     /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */
        case INTEL_FAM6_NEHALEM:        /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */
-       case 0x1F:      /* Core i7 and i5 Processor - Nehalem */
+       case 0x1F:              /* Core i7 and i5 Processor - Nehalem */
        case INTEL_FAM6_WESTMERE:       /* Westmere Client - Clarkdale, Arrandale */
        case INTEL_FAM6_WESTMERE_EP:    /* Westmere EP - Gulftown */
                return INTEL_FAM6_NEHALEM;
@@ -4994,14 +5208,15 @@ unsigned int intel_model_duplicates(unsigned int model)
        case INTEL_FAM6_ROCKETLAKE:
        case INTEL_FAM6_LAKEFIELD:
        case INTEL_FAM6_ALDERLAKE:
+       case INTEL_FAM6_ALDERLAKE_L:
                return INTEL_FAM6_CANNONLAKE_L;
 
        case INTEL_FAM6_ATOM_TREMONT_L:
                return INTEL_FAM6_ATOM_TREMONT;
 
-       case INTEL_FAM6_ICELAKE_X:
+       case INTEL_FAM6_ICELAKE_D:
        case INTEL_FAM6_SAPPHIRERAPIDS_X:
-               return INTEL_FAM6_SKYLAKE_X;
+               return INTEL_FAM6_ICELAKE_X;
        }
        return model;
 }
@@ -5025,17 +5240,36 @@ void print_dev_latency(void)
                close(fd);
                return;
        }
-       fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n",
-               value, value == 2000000000 ? "default" : "constrained");
+       fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained");
 
        close(fd);
 }
 
+/*
+ * Linux-perf manages the HW instructions-retired counter
+ * by enabling when requested, and hiding rollover
+ */
+void linux_perf_init(void)
+{
+       if (!BIC_IS_ENABLED(BIC_IPC))
+               return;
+
+       if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
+               return;
+
+       fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
+       if (fd_instr_count_percpu == NULL)
+               err(-1, "calloc fd_instr_count_percpu");
+
+       BIC_PRESENT(BIC_IPC);
+}
+
 void process_cpuid()
 {
        unsigned int eax, ebx, ecx, edx;
        unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
        unsigned int has_turbo;
+       unsigned long long ucode_patch = 0;
 
        eax = ebx = ecx = edx = 0;
 
@@ -5049,8 +5283,8 @@ void process_cpuid()
                hygon_genuine = 1;
 
        if (!quiet)
-               fprintf(outf, "CPUID(0): %.4s%.4s%.4s ",
-                       (char *)&ebx, (char *)&edx, (char *)&ecx);
+               fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n",
+                       (char *)&ebx, (char *)&edx, (char *)&ecx, max_level);
 
        __cpuid(1, fms, ebx, ecx, edx);
        family = (fms >> 8) & 0xf;
@@ -5063,6 +5297,9 @@ void process_cpuid()
        ecx_flags = ecx;
        edx_flags = edx;
 
+       if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
+               warnx("get_msr(UCODE)\n");
+
        /*
         * check max extended function levels of CPUID.
         * This is needed to check for invariant TSC.
@@ -5072,8 +5309,10 @@ void process_cpuid()
        __cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
 
        if (!quiet) {
-               fprintf(outf, "0x%x CPUID levels; 0x%x xlevels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n",
-                       max_level, max_extended_level, family, model, stepping, family, model, stepping);
+               fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n",
+                       family, model, stepping, family, model, stepping,
+                       (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
+               fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
                fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
                        ecx_flags & (1 << 0) ? "SSE3" : "-",
                        ecx_flags & (1 << 3) ? "MONITOR" : "-",
@@ -5083,11 +5322,12 @@ void process_cpuid()
                        edx_flags & (1 << 4) ? "TSC" : "-",
                        edx_flags & (1 << 5) ? "MSR" : "-",
                        edx_flags & (1 << 22) ? "ACPI-TM" : "-",
-                       edx_flags & (1 << 28) ? "HT" : "-",
-                       edx_flags & (1 << 29) ? "TM" : "-");
+                       edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
        }
-       if (genuine_intel)
+       if (genuine_intel) {
+               model_orig = model;
                model = intel_model_duplicates(model);
+       }
 
        if (!(edx_flags & (1 << 5)))
                errx(1, "CPUID: no MSR");
@@ -5138,14 +5378,11 @@ void process_cpuid()
                        has_hwp ? "" : "No-",
                        has_hwp_notify ? "" : "No-",
                        has_hwp_activity_window ? "" : "No-",
-                       has_hwp_epp ? "" : "No-",
-                       has_hwp_pkg ? "" : "No-",
-                       has_epb ? "" : "No-");
+                       has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-");
 
        if (!quiet)
                decode_misc_enable_msr();
 
-
        if (max_level >= 0x7 && !quiet) {
                int has_sgx;
 
@@ -5177,7 +5414,7 @@ void process_cpuid()
                                        eax_crystal, ebx_tsc, crystal_hz);
 
                        if (crystal_hz == 0)
-                               switch(model) {
+                               switch (model) {
                                case INTEL_FAM6_SKYLAKE_L:      /* SKL */
                                        crystal_hz = 24000000;  /* 24.0 MHz */
                                        break;
@@ -5190,13 +5427,13 @@ void process_cpuid()
                                        break;
                                default:
                                        crystal_hz = 0;
-                       }
+                               }
 
                        if (crystal_hz) {
-                               tsc_hz =  (unsigned long long) crystal_hz * ebx_tsc / eax_crystal;
+                               tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
                                if (!quiet)
                                        fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
-                                               tsc_hz / 1000000, crystal_hz, ebx_tsc,  eax_crystal);
+                                               tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
                        }
                }
        }
@@ -5265,7 +5502,7 @@ void process_cpuid()
                BIC_NOT_PRESENT(BIC_Pkgpc7);
                use_c1_residency_msr = 1;
        }
-       if (is_skx(family, model)) {
+       if (is_skx(family, model) || is_icx(family, model)) {
                BIC_NOT_PRESENT(BIC_CPU_c3);
                BIC_NOT_PRESENT(BIC_Pkgpc3);
                BIC_NOT_PRESENT(BIC_CPU_c7);
@@ -5291,10 +5528,9 @@ void process_cpuid()
                BIC_PRESENT(BIC_CPUGFX);
        }
        do_slm_cstates = is_slm(family, model);
-       do_knl_cstates  = is_knl(family, model);
+       do_knl_cstates = is_knl(family, model);
 
-       if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) ||
-           is_ehl(family, model))
+       if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model))
                BIC_NOT_PRESENT(BIC_CPU_c3);
 
        if (!quiet)
@@ -5307,6 +5543,8 @@ void process_cpuid()
        perf_limit_reasons_probe(family, model);
        automatic_cstate_conversion_probe(family, model);
 
+       check_tcc_offset(model_orig);
+
        if (!quiet)
                dump_cstate_pstate_config_info(family, model);
 
@@ -5317,7 +5555,7 @@ void process_cpuid()
        if (!quiet)
                dump_sysfs_pstate_config();
 
-       if (has_skl_msrs(family, model))
+       if (has_skl_msrs(family, model) || is_ehl(family, model))
                calculate_tsc_tweak();
 
        if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
@@ -5386,7 +5624,7 @@ void topology_probe()
        if (debug > 1)
                fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);
 
-       cpus = calloc(1, (topo.max_cpu_num  + 1) * sizeof(struct cpu_topology));
+       cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
        if (cpus == NULL)
                err(1, "calloc cpus");
 
@@ -5465,22 +5703,19 @@ void topology_probe()
 
        topo.cores_per_node = max_core_id + 1;
        if (debug > 1)
-               fprintf(outf, "max_core_id %d, sizing for %d cores per package\n",
-                       max_core_id, topo.cores_per_node);
+               fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node);
        if (!summary_only && topo.cores_per_node > 1)
                BIC_PRESENT(BIC_Core);
 
        topo.num_die = max_die_id + 1;
        if (debug > 1)
-               fprintf(outf, "max_die_id %d, sizing for %d die\n",
-                               max_die_id, topo.num_die);
+               fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die);
        if (!summary_only && topo.num_die > 1)
                BIC_PRESENT(BIC_Die);
 
        topo.num_packages = max_package_id + 1;
        if (debug > 1)
-               fprintf(outf, "max_package_id %d, sizing for %d packages\n",
-                       max_package_id, topo.num_packages);
+               fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
        if (!summary_only && topo.num_packages > 1)
                BIC_PRESENT(BIC_Package);
 
@@ -5503,21 +5738,15 @@ void topology_probe()
                fprintf(outf,
                        "cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n",
                        i, cpus[i].physical_package_id, cpus[i].die_id,
-                       cpus[i].physical_node_id,
-                       cpus[i].logical_node_id,
-                       cpus[i].physical_core_id,
-                       cpus[i].thread_id);
+                       cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
        }
 
 }
 
-void
-allocate_counters(struct thread_data **t, struct core_data **c,
-                 struct pkg_data **p)
+void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
 {
        int i;
-       int num_cores = topo.cores_per_node * topo.nodes_per_pkg *
-                       topo.num_packages;
+       int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
        int num_threads = topo.threads_per_core * num_cores;
 
        *t = calloc(num_threads, sizeof(struct thread_data));
@@ -5545,13 +5774,13 @@ allocate_counters(struct thread_data **t, struct core_data **c,
 error:
        err(1, "calloc counters");
 }
+
 /*
  * init_counter()
  *
  * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
  */
-void init_counter(struct thread_data *thread_base, struct core_data *core_base,
-       struct pkg_data *pkg_base, int cpu_id)
+void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
 {
        int pkg_id = cpus[cpu_id].physical_package_id;
        int node_id = cpus[cpu_id].logical_node_id;
@@ -5561,7 +5790,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
        struct core_data *c;
        struct pkg_data *p;
 
-
        /* Workaround for systems where physical_node_id==-1
         * and logical_node_id==(-1 - topo.num_cpus)
         */
@@ -5583,7 +5811,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
        p->package_id = pkg_id;
 }
 
-
 int initialize_counters(int cpu_id)
 {
        init_counter(EVEN_COUNTERS, cpu_id);
@@ -5598,12 +5825,14 @@ void allocate_output_buffer()
        if (outp == NULL)
                err(-1, "calloc output buffer");
 }
+
 void allocate_fd_percpu(void)
 {
        fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
        if (fd_percpu == NULL)
                err(-1, "calloc fd_percpu");
 }
+
 void allocate_irq_buffers(void)
 {
        irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int));
@@ -5614,6 +5843,7 @@ void allocate_irq_buffers(void)
        if (irqs_per_cpu == NULL)
                err(-1, "calloc %d", topo.max_cpu_num + 1);
 }
+
 void setup_all_buffers(void)
 {
        topology_probe();
@@ -5642,7 +5872,7 @@ void turbostat_init()
        check_dev_msr();
        check_permissions();
        process_cpuid();
-
+       linux_perf_init();
 
        if (!quiet)
                for_all_cpus(print_hwp, ODD_COUNTERS);
@@ -5658,6 +5888,9 @@ void turbostat_init()
 
        for_all_cpus(set_temperature_target, ODD_COUNTERS);
 
+       for_all_cpus(get_cpu_type, ODD_COUNTERS);
+       for_all_cpus(get_cpu_type, EVEN_COUNTERS);
+
        if (!quiet)
                for_all_cpus(print_thermal, ODD_COUNTERS);
 
@@ -5713,7 +5946,7 @@ int fork_it(char **argv)
                format_all_counters(EVEN_COUNTERS);
        }
 
-       fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0);
+       fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0);
 
        flush_output_stderr();
 
@@ -5738,14 +5971,14 @@ int get_and_dump_counters(void)
        return status;
 }
 
-void print_version() {
-       fprintf(outf, "turbostat version 20.09.30"
-               " - Len Brown <lenb@kernel.org>\n");
+void print_version()
+{
+       fprintf(outf, "turbostat version 21.05.04" " - Len Brown <lenb@kernel.org>\n");
 }
 
 int add_counter(unsigned int msr_num, char *path, char *name,
-       unsigned int width, enum counter_scope scope,
-       enum counter_type type, enum counter_format format, int flags)
+               unsigned int width, enum counter_scope scope,
+               enum counter_type type, enum counter_format format, int flags)
 {
        struct msr_counter *msrp;
 
@@ -5771,8 +6004,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
                sys.tp = msrp;
                sys.added_thread_counters++;
                if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) {
-                       fprintf(stderr, "exceeded max %d added thread counters\n",
-                               MAX_ADDED_COUNTERS);
+                       fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS);
                        exit(-1);
                }
                break;
@@ -5782,8 +6014,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
                sys.cp = msrp;
                sys.added_core_counters++;
                if (sys.added_core_counters > MAX_ADDED_COUNTERS) {
-                       fprintf(stderr, "exceeded max %d added core counters\n",
-                               MAX_ADDED_COUNTERS);
+                       fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS);
                        exit(-1);
                }
                break;
@@ -5793,8 +6024,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
                sys.pp = msrp;
                sys.added_package_counters++;
                if (sys.added_package_counters > MAX_ADDED_COUNTERS) {
-                       fprintf(stderr, "exceeded max %d added package counters\n",
-                               MAX_ADDED_COUNTERS);
+                       fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS);
                        exit(-1);
                }
                break;
@@ -5931,15 +6161,14 @@ void probe_sysfs(void)
 
        for (state = 10; state >= 0; --state) {
 
-               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name",
-                       base_cpu, state);
+               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
                input = fopen(path, "r");
                if (input == NULL)
                        continue;
                if (!fgets(name_buf, sizeof(name_buf), input))
                        err(1, "%s: failed to read file", path);
 
-                /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
+               /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
                sp = strchr(name_buf, '-');
                if (!sp)
                        sp = strchrnul(name_buf, '\n');
@@ -5955,20 +6184,18 @@ void probe_sysfs(void)
                if (is_deferred_skip(name_buf))
                        continue;
 
-               add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC,
-                               FORMAT_PERCENT, SYSFS_PERCPU);
+               add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU);
        }
 
        for (state = 10; state >= 0; --state) {
 
-               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name",
-                       base_cpu, state);
+               sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
                input = fopen(path, "r");
                if (input == NULL)
                        continue;
                if (!fgets(name_buf, sizeof(name_buf), input))
                        err(1, "%s: failed to read file", path);
-                /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
+               /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
                sp = strchr(name_buf, '-');
                if (!sp)
                        sp = strchrnul(name_buf, '\n');
@@ -5982,13 +6209,11 @@ void probe_sysfs(void)
                if (is_deferred_skip(name_buf))
                        continue;
 
-               add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS,
-                               FORMAT_DELTA, SYSFS_PERCPU);
+               add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU);
        }
 
 }
 
-
 /*
  * parse cpuset with following syntax
  * 1,2,4..6,8-10 and set bits in cpu_subset
@@ -6075,36 +6300,35 @@ error:
        exit(-1);
 }
 
-
 void cmdline(int argc, char **argv)
 {
        int opt;
        int option_index = 0;
        static struct option long_options[] = {
-               {"add",         required_argument,      0, 'a'},
-               {"cpu",         required_argument,      0, 'c'},
-               {"Dump",        no_argument,            0, 'D'},
-               {"debug",       no_argument,            0, 'd'},        /* internal, not documented */
-               {"enable",      required_argument,      0, 'e'},
-               {"interval",    required_argument,      0, 'i'},
-               {"num_iterations",      required_argument,      0, 'n'},
-               {"help",        no_argument,            0, 'h'},
-               {"hide",        required_argument,      0, 'H'},        // meh, -h taken by --help
-               {"Joules",      no_argument,            0, 'J'},
-               {"list",        no_argument,            0, 'l'},
-               {"out",         required_argument,      0, 'o'},
-               {"quiet",       no_argument,            0, 'q'},
-               {"show",        required_argument,      0, 's'},
-               {"Summary",     no_argument,            0, 'S'},
-               {"TCC",         required_argument,      0, 'T'},
-               {"version",     no_argument,            0, 'v' },
-               {0,             0,                      0,  0 }
+               { "add", required_argument, 0, 'a' },
+               { "cpu", required_argument, 0, 'c' },
+               { "Dump", no_argument, 0, 'D' },
+               { "debug", no_argument, 0, 'd' },       /* internal, not documented */
+               { "enable", required_argument, 0, 'e' },
+               { "interval", required_argument, 0, 'i' },
+               { "IPC", no_argument, 0, 'I' },
+               { "num_iterations", required_argument, 0, 'n' },
+               { "help", no_argument, 0, 'h' },
+               { "hide", required_argument, 0, 'H' },  // meh, -h taken by --help
+               { "Joules", no_argument, 0, 'J' },
+               { "list", no_argument, 0, 'l' },
+               { "out", required_argument, 0, 'o' },
+               { "quiet", no_argument, 0, 'q' },
+               { "show", required_argument, 0, 's' },
+               { "Summary", no_argument, 0, 'S' },
+               { "TCC", required_argument, 0, 'T' },
+               { "version", no_argument, 0, 'v' },
+               { 0, 0, 0, 0 }
        };
 
        progname = argv[0];
 
-       while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v",
-                               long_options, &option_index)) != -1) {
+       while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) {
                switch (opt) {
                case 'a':
                        parse_add_command(optarg);
@@ -6139,8 +6363,7 @@ void cmdline(int argc, char **argv)
                                double interval = strtod(optarg, NULL);
 
                                if (interval < 0.001) {
-                                       fprintf(outf, "interval %f seconds is too small\n",
-                                               interval);
+                                       fprintf(outf, "interval %f seconds is too small\n", interval);
                                        exit(2);
                                }
 
@@ -6167,8 +6390,7 @@ void cmdline(int argc, char **argv)
                        num_iterations = strtod(optarg, NULL);
 
                        if (num_iterations <= 0) {
-                               fprintf(outf, "iterations %d should be positive number\n",
-                                       num_iterations);
+                               fprintf(outf, "iterations %d should be positive number\n", num_iterations);
                                exit(2);
                        }
                        break;
@@ -6188,7 +6410,7 @@ void cmdline(int argc, char **argv)
                        summary_only++;
                        break;
                case 'T':
-                       tcc_activation_temp_override = atoi(optarg);
+                       tj_max_override = atoi(optarg);
                        break;
                case 'v':
                        print_version();
index 25adfec..f9271f3 100644 (file)
@@ -38,6 +38,7 @@ EXTRA_WARNINGS += -Wswitch-enum
 EXTRA_WARNINGS += -Wundef
 EXTRA_WARNINGS += -Wwrite-strings
 EXTRA_WARNINGS += -Wformat
+EXTRA_WARNINGS += -Wno-type-limits
 
 # Makefiles suck: This macro sets a default value of $(2) for the
 # variable named by $(1), unless the variable has been set by
diff --git a/tools/testing/ktest/examples/vmware.conf b/tools/testing/ktest/examples/vmware.conf
new file mode 100644 (file)
index 0000000..6195816
--- /dev/null
@@ -0,0 +1,137 @@
+#
+# This config is an example usage of ktest.pl with a vmware guest
+#
+# VMware Setup:
+# -------------
+# - Edit the Virtual Machine ("Edit virtual machine settings")
+# - Add a Serial Port
+#   - You almost certainly want it set "Connect at power on"
+#   - Select "Use socket (named pipe)"
+#   - Select a name that you'll recognize, like 'ktestserialpipe'
+#   - From: Server
+#   - To: A Virtual Machine
+#   - Save
+# - Make sure you note the name; it will be in the base directory of the
+#   virtual machine (where the "disks" are stored).  The default
+#   is /var/lib/vmware/<virtual machine name>/<the name you entered above>
+#
+# - Make note of the path to the VM
+# </End VMware setup>
+#
+# The guest is called 'Guest' and this would be something that
+# could be run on the host to test a virtual machine target.
+
+MACHINE = Guest
+
+# Name of the serial pipe you set in the VMware settings
+VMWARE_SERIAL_NAME = <the name you entered above>
+
+# Define a variable of the name of the VM
+# Note that this needs to be the name of the kmx file and, usually, the
+# name of the directory that it's in.  If the directory and name
+# differ, change the VMWARE_VM_DIR accordingly.
+# Please omit the .kmx extension
+VMWARE_VM_NAME = <virtual machine name>
+
+# VM dir name.  This is usually the same as the virtual machine's name,
+# but not always the case.  Change if they differ
+VMWARE_VM_DIR = ${VMWARE_VM_NAME}
+
+# Base directory that the Virtual machine is contained in
+# /var/lib/vmware is the default on Linux
+VMWARE_VM_BASE_DIR = /var/lib/vmware/${VMWARE_VM_DIR}
+
+# Use ncat to read the unix pipe.  Anything that can read the Unix Pipe
+# and output its contents to stdout will work
+CONSOLE = /usr/bin/ncat -U ${VMWARE_VM_BASE_DIR}/${VMWARE_SERIAL_NAME}
+
+# Define what version of Workstation you are using
+# This is used by vmrun to use the appropriate pieces to
+# test this.  In all likelihood you want 'ws' or 'player'
+# Valid options:
+#      ws - Workstation (Windows or Linux host)
+#      fusion - Fusion (Mac host)
+#      player - Using VMware Player (Windows or Linux host)
+# Note: vmrun has to run directly on the host machine
+VMWARE_HOST_TYPE = ws
+
+# VMware provides `vmrun` to allow you to do certain things to the virtual machine
+# This should hard reset the VM and force a boot
+VMWARE_POWER_CYCLE = /usr/bin/vmrun -T ${VMWARE_HOST_TYPE} reset ${VMWARE_VM_BASE_DIR}/${VMWARE_VM_NAME}.kmx nogui
+
+#*************************************#
+# This part is the same as test.conf  #
+#*************************************#
+
+# The include files will set up the type of test to run. Just set TEST to
+# which test you want to run.
+#
+# TESTS = patchcheck, randconfig, boot, test, config-bisect, bisect, min-config
+#
+# See the include/*.conf files that define these tests
+#
+TEST := patchcheck
+
+# Some tests may have more than one test to run. Define MULTI := 1 to run
+# the extra tests.
+MULTI := 0
+
+# In case you want to differentiate which type of system you are testing
+BITS := 64
+
+# REBOOT = none, error, fail, empty
+#  See include/defaults.conf
+REBOOT := empty
+
+
+# The defaults file will set up various settings that can be used by all
+# machine configs.
+INCLUDE include/defaults.conf
+
+
+#*************************************#
+# Now we are different from test.conf #
+#*************************************#
+
+
+# The example here assumes that Guest is running a Fedora release
+# that uses dracut for its initfs. The POST_INSTALL will be executed
+# after the install of the kernel and modules are complete.
+#
+POST_INSTALL = ${SSH} /sbin/dracut -f /boot/initramfs-test.img $KERNEL_VERSION
+
+# Guests sometimes get stuck on reboot. We wait 3 seconds after running
+# the reboot command and then do a full power-cycle of the guest.
+# This forces the guest to restart.
+#
+POWERCYCLE_AFTER_REBOOT = 3
+
+# We do the same after the halt command, but this time we wait 20 seconds.
+POWEROFF_AFTER_HALT = 20
+
+
+# As the defaults.conf file has a POWER_CYCLE option already defined,
+# and options can not be defined in the same section more than once
+# (all DEFAULTS sections are considered the same), we use the
+# DEFAULTS OVERRIDE to tell ktest.pl to ignore the previously defined
+# options, for the options set in the OVERRIDE section.
+#
+DEFAULTS OVERRIDE
+
+# Instead of using the default POWER_CYCLE option defined in
+# defaults.conf, we use vmrun to cycle it: VMWARE_POWER_CYCLE (defined
+# above) hard resets the guest and forces a boot.
+# Crude, but effective.
+#
+POWER_CYCLE = ${VMWARE_POWER_CYCLE}
+
+
+DEFAULTS
+
+# The following files each handle a different test case.
+# Having them included allows you to set up more than one machine and share
+# the same tests.
+INCLUDE include/patchcheck.conf
+INCLUDE include/tests.conf
+INCLUDE include/bisect.conf
+INCLUDE include/min-config.conf
index 4e24509..09d1578 100755 (executable)
@@ -24,7 +24,7 @@ my %evals;
 
 #default opts
 my %default = (
-    "MAILER"                   => "sendmail",  # default mailer
+    "MAILER"                   => "sendmail",  # default mailer
     "EMAIL_ON_ERROR"           => 1,
     "EMAIL_WHEN_FINISHED"      => 1,
     "EMAIL_WHEN_CANCELED"      => 0,
@@ -36,15 +36,15 @@ my %default = (
     "CLOSE_CONSOLE_SIGNAL"     => "INT",
     "TIMEOUT"                  => 120,
     "TMP_DIR"                  => "/tmp/ktest/\${MACHINE}",
-    "SLEEP_TIME"               => 60,  # sleep time between tests
+    "SLEEP_TIME"               => 60,          # sleep time between tests
     "BUILD_NOCLEAN"            => 0,
     "REBOOT_ON_ERROR"          => 0,
     "POWEROFF_ON_ERROR"                => 0,
     "REBOOT_ON_SUCCESS"                => 1,
     "POWEROFF_ON_SUCCESS"      => 0,
     "BUILD_OPTIONS"            => "",
-    "BISECT_SLEEP_TIME"                => 60,   # sleep time between bisects
-    "PATCHCHECK_SLEEP_TIME"    => 60, # sleep time between patch checks
+    "BISECT_SLEEP_TIME"                => 60,          # sleep time between bisects
+    "PATCHCHECK_SLEEP_TIME"    => 60,          # sleep time between patch checks
     "CLEAR_LOG"                        => 0,
     "BISECT_MANUAL"            => 0,
     "BISECT_SKIP"              => 1,
@@ -512,6 +512,69 @@ $config_help{"REBOOT_SCRIPT"} = << "EOF"
 EOF
     ;
 
+# used with process_expression()
+my $d = 0;
+
+# defined before get_test_name()
+my $in_die = 0;
+
+# defined before process_warning_line()
+my $check_build_re = ".*:.*(warning|error|Error):.*";
+my $utf8_quote = "\\x{e2}\\x{80}(\\x{98}|\\x{99})";
+
+# defined before child_finished()
+my $child_done;
+
+# config_ignore holds the configs that were set (or unset) for
+# a good config and we will ignore these configs for the rest
+# of a config bisect. These configs stay as they were.
+my %config_ignore;
+
+# config_set holds what all configs were set as.
+my %config_set;
+
+# config_off holds the set of configs that the bad config had disabled.
+# We need to record them and set them in the .config when running
+# olddefconfig, because olddefconfig keeps the defaults.
+my %config_off;
+
+# config_off_tmp holds a set of configs to turn off for now
+my @config_off_tmp;
+
+# config_list is the set of configs that are being tested
+my %config_list;
+my %null_config;
+
+my %dependency;
+
+# found above run_config_bisect()
+my $pass = 1;
+
+# found above add_dep()
+
+my %depends;
+my %depcount;
+my $iflevel = 0;
+my @ifdeps;
+
+# prevent recursion
+my %read_kconfigs;
+
+# found above test_this_config()
+my %min_configs;
+my %keep_configs;
+my %save_configs;
+my %processed_configs;
+my %nochange_config;
+
+#
+# These subs are only forward-declared here; they are defined later on
+#
+sub run_command;
+sub start_monitor;
+sub end_monitor;
+sub wait_for_monitor;
+
 sub _logit {
     if (defined($opt{"LOG_FILE"})) {
        print LOG @_;
@@ -537,7 +600,7 @@ sub read_prompt {
     my $ans;
 
     for (;;) {
-       if ($cancel) {
+        if ($cancel) {
            print "$prompt [y/n/C] ";
        } else {
            print "$prompt [Y/n] ";
@@ -760,7 +823,7 @@ sub process_variables {
     # remove the space added in the beginning
     $retval =~ s/ //;
 
-    return "$retval"
+    return "$retval";
 }
 
 sub set_value {
@@ -863,7 +926,6 @@ sub value_defined {
        defined($opt{$2});
 }
 
-my $d = 0;
 sub process_expression {
     my ($name, $val) = @_;
 
@@ -978,7 +1040,6 @@ sub __read_config {
            $override = 0;
 
            if ($type eq "TEST_START") {
-
                if ($num_tests_set) {
                    die "$name: $.: Can not specify both NUM_TESTS and TEST_START\n";
                }
@@ -1048,7 +1109,6 @@ sub __read_config {
                $test_num = $old_test_num;
                $repeat = $old_repeat;
            }
-
        } elsif (/^\s*ELSE\b(.*)$/) {
            if (!$if) {
                die "$name: $.: ELSE found with out matching IF section\n$_";
@@ -1095,7 +1155,7 @@ sub __read_config {
                    }
                }
            }
-               
+
            if ( ! -r $file ) {
                die "$name: $.: Can't read file $file\n$_";
            }
@@ -1186,13 +1246,13 @@ sub __read_config {
 }
 
 sub get_test_case {
-       print "What test case would you like to run?\n";
-       print " (build, install or boot)\n";
-       print " Other tests are available but require editing ktest.conf\n";
-       print " (see tools/testing/ktest/sample.conf)\n";
-       my $ans = <STDIN>;
-       chomp $ans;
-       $default{"TEST_TYPE"} = $ans;
+    print "What test case would you like to run?\n";
+    print " (build, install or boot)\n";
+    print " Other tests are available but require editing ktest.conf\n";
+    print " (see tools/testing/ktest/sample.conf)\n";
+    my $ans = <STDIN>;
+    chomp $ans;
+    $default{"TEST_TYPE"} = $ans;
 }
 
 sub read_config {
@@ -1368,11 +1428,6 @@ sub eval_option {
     return $option;
 }
 
-sub run_command;
-sub start_monitor;
-sub end_monitor;
-sub wait_for_monitor;
-
 sub reboot {
     my ($time) = @_;
     my $powercycle = 0;
@@ -1457,8 +1512,6 @@ sub do_not_reboot {
        ($test_type eq "config_bisect" && $opt{"CONFIG_BISECT_TYPE[$i]"} eq "build");
 }
 
-my $in_die = 0;
-
 sub get_test_name() {
     my $name;
 
@@ -1471,7 +1524,6 @@ sub get_test_name() {
 }
 
 sub dodie {
-
     # avoid recursion
     return if ($in_die);
     $in_die = 1;
@@ -1481,10 +1533,8 @@ sub dodie {
     doprint "CRITICAL FAILURE... [TEST $i] ", @_, "\n";
 
     if ($reboot_on_error && !do_not_reboot) {
-
        doprint "REBOOTING\n";
        reboot_to_good;
-
     } elsif ($poweroff_on_error && defined($power_off)) {
        doprint "POWERING OFF\n";
        `$power_off`;
@@ -1519,13 +1569,14 @@ sub dodie {
            close O;
            close L;
        }
-        send_email("KTEST: critical failure for test $i [$name]",
-                "Your test started at $script_start_time has failed with:\n@_\n", $log_file);
+
+       send_email("KTEST: critical failure for test $i [$name]",
+               "Your test started at $script_start_time has failed with:\n@_\n", $log_file);
     }
 
     if ($monitor_cnt) {
-           # restore terminal settings
-           system("stty $stty_orig");
+       # restore terminal settings
+       system("stty $stty_orig");
     }
 
     if (defined($post_test)) {
@@ -1709,81 +1760,81 @@ sub wait_for_monitor {
 }
 
 sub save_logs {
-       my ($result, $basedir) = @_;
-       my @t = localtime;
-       my $date = sprintf "%04d%02d%02d%02d%02d%02d",
-               1900+$t[5],$t[4],$t[3],$t[2],$t[1],$t[0];
+    my ($result, $basedir) = @_;
+    my @t = localtime;
+    my $date = sprintf "%04d%02d%02d%02d%02d%02d",
+       1900+$t[5],$t[4],$t[3],$t[2],$t[1],$t[0];
 
-       my $type = $build_type;
-       if ($type =~ /useconfig/) {
-           $type = "useconfig";
-       }
+    my $type = $build_type;
+    if ($type =~ /useconfig/) {
+       $type = "useconfig";
+    }
 
-       my $dir = "$machine-$test_type-$type-$result-$date";
+    my $dir = "$machine-$test_type-$type-$result-$date";
 
-       $dir = "$basedir/$dir";
+    $dir = "$basedir/$dir";
 
-       if (!-d $dir) {
-           mkpath($dir) or
-               dodie "can't create $dir";
-       }
+    if (!-d $dir) {
+       mkpath($dir) or
+           dodie "can't create $dir";
+    }
 
-       my %files = (
-               "config" => $output_config,
-               "buildlog" => $buildlog,
-               "dmesg" => $dmesg,
-               "testlog" => $testlog,
-       );
+    my %files = (
+       "config" => $output_config,
+       "buildlog" => $buildlog,
+       "dmesg" => $dmesg,
+       "testlog" => $testlog,
+    );
 
-       while (my ($name, $source) = each(%files)) {
-               if (-f "$source") {
-                       cp "$source", "$dir/$name" or
-                               dodie "failed to copy $source";
-               }
+    while (my ($name, $source) = each(%files)) {
+       if (-f "$source") {
+           cp "$source", "$dir/$name" or
+               dodie "failed to copy $source";
        }
+    }
 
-       doprint "*** Saved info to $dir ***\n";
+    doprint "*** Saved info to $dir ***\n";
 }
 
 sub fail {
 
-       if ($die_on_failure) {
-               dodie @_;
-       }
+    if ($die_on_failure) {
+       dodie @_;
+    }
 
-       doprint "FAILED\n";
+    doprint "FAILED\n";
 
-       my $i = $iteration;
+    my $i = $iteration;
 
-       # no need to reboot for just building.
-       if (!do_not_reboot) {
-           doprint "REBOOTING\n";
-           reboot_to_good $sleep_time;
-       }
+    # no need to reboot for just building.
+    if (!do_not_reboot) {
+       doprint "REBOOTING\n";
+       reboot_to_good $sleep_time;
+    }
 
-       my $name = "";
+    my $name = "";
 
-       if (defined($test_name)) {
-           $name = " ($test_name)";
-       }
+    if (defined($test_name)) {
+       $name = " ($test_name)";
+    }
 
-       print_times;
+    print_times;
 
-       doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
-       doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
-       doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n";
-       doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
-       doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
+    doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
+    doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
+    doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n";
+    doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
+    doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
 
-       if (defined($store_failures)) {
-           save_logs "fail", $store_failures;
-        }
+    if (defined($store_failures)) {
+       save_logs "fail", $store_failures;
+    }
 
-       if (defined($post_test)) {
-               run_command $post_test;
-       }
+    if (defined($post_test)) {
+       run_command $post_test;
+    }
 
-       return 1;
+    return 1;
 }
 
 sub run_command {
@@ -1915,8 +1966,8 @@ sub _get_grub_index {
     my ($command, $target, $skip) = @_;
 
     return if (defined($grub_number) && defined($last_grub_menu) &&
-              $last_grub_menu eq $grub_menu && defined($last_machine) &&
-              $last_machine eq $machine);
+       $last_grub_menu eq $grub_menu && defined($last_machine) &&
+       $last_machine eq $machine);
 
     doprint "Find $reboot_type menu ... ";
     $grub_number = -1;
@@ -1924,8 +1975,8 @@ sub _get_grub_index {
     my $ssh_grub = $ssh_exec;
     $ssh_grub =~ s,\$SSH_COMMAND,$command,g;
 
-    open(IN, "$ssh_grub |")
-       or dodie "unable to execute $command";
+    open(IN, "$ssh_grub |") or
+       dodie "unable to execute $command";
 
     my $found = 0;
 
@@ -1969,9 +2020,9 @@ sub get_grub_index {
        $target = '^menuentry.*' . $grub_menu_qt;
        $skip = '^menuentry\s|^submenu\s';
     } elsif ($reboot_type eq "grub2bls") {
-        $command = $grub_bls_get;
-        $target = '^title=.*' . $grub_menu_qt;
-        $skip = '^title=';
+       $command = $grub_bls_get;
+       $target = '^title=.*' . $grub_menu_qt;
+       $skip = '^title=';
     } else {
        return;
     }
@@ -1979,8 +2030,7 @@ sub get_grub_index {
     _get_grub_index($command, $target, $skip);
 }
 
-sub wait_for_input
-{
+sub wait_for_input {
     my ($fp, $time) = @_;
     my $start_time;
     my $rin;
@@ -2096,7 +2146,6 @@ sub monitor {
     my $version_found = 0;
 
     while (!$done) {
-
        if ($bug && defined($stop_after_failure) &&
            $stop_after_failure >= 0) {
            my $time = $stop_after_failure - (time - $failure_start);
@@ -2349,9 +2398,6 @@ sub start_monitor_and_install {
     return monitor;
 }
 
-my $check_build_re = ".*:.*(warning|error|Error):.*";
-my $utf8_quote = "\\x{e2}\\x{80}(\\x{98}|\\x{99})";
-
 sub process_warning_line {
     my ($line) = @_;
 
@@ -2394,7 +2440,7 @@ sub check_buildlog {
        while (<IN>) {
            if (/$check_build_re/) {
                my $warning = process_warning_line $_;
-               
+
                $warnings_list{$warning} = 1;
            }
        }
@@ -2571,7 +2617,6 @@ sub build {
            run_command "mv $outputdir/config_temp $output_config" or
                dodie "moving config_temp";
        }
-
     } elsif (!$noclean) {
        unlink "$output_config";
        run_command "$make mrproper" or
@@ -2594,6 +2639,9 @@ sub build {
     # Run old config regardless, to enforce min configurations
     make_oldconfig;
 
+    if (not defined($build_options)){
+       $build_options = "";
+    }
     my $build_ret = run_command "$make $build_options", $buildlog;
 
     if (defined($post_build)) {
@@ -2649,14 +2697,15 @@ sub success {
 
     print_times;
 
-    doprint "\n\n*******************************************\n";
-    doprint     "*******************************************\n";
-    doprint     "KTEST RESULT: TEST $i$name SUCCESS!!!!         **\n";
-    doprint     "*******************************************\n";
-    doprint     "*******************************************\n";
+    doprint "\n\n";
+    doprint "*******************************************\n";
+    doprint "*******************************************\n";
+    doprint "KTEST RESULT: TEST $i$name SUCCESS!!!!   **\n";
+    doprint "*******************************************\n";
+    doprint "*******************************************\n";
 
     if (defined($store_successes)) {
-        save_logs "success", $store_successes;
+       save_logs "success", $store_successes;
     }
 
     if ($i != $opt{"NUM_TESTS"} && !do_not_reboot) {
@@ -2698,8 +2747,6 @@ sub child_run_test {
     exit $run_command_status;
 }
 
-my $child_done;
-
 sub child_finished {
     $child_done = 1;
 }
@@ -3031,7 +3078,6 @@ sub bisect {
     }
 
     if ($do_check) {
-
        # get current HEAD
        my $head = get_sha1("HEAD");
 
@@ -3071,13 +3117,11 @@ sub bisect {
        run_command "git bisect replay $replay" or
            dodie "failed to run replay";
     } else {
-
        run_command "git bisect good $good" or
            dodie "could not set bisect good to $good";
 
        run_git_bisect "git bisect bad $bad" or
            dodie "could not set bisect bad to $bad";
-
     }
 
     if (defined($start)) {
@@ -3103,35 +3147,13 @@ sub bisect {
     success $i;
 }
 
-# config_ignore holds the configs that were set (or unset) for
-# a good config and we will ignore these configs for the rest
-# of a config bisect. These configs stay as they were.
-my %config_ignore;
-
-# config_set holds what all configs were set as.
-my %config_set;
-
-# config_off holds the set of configs that the bad config had disabled.
-# We need to record them and set them in the .config when running
-# olddefconfig, because olddefconfig keeps the defaults.
-my %config_off;
-
-# config_off_tmp holds a set of configs to turn off for now
-my @config_off_tmp;
-
-# config_list is the set of configs that are being tested
-my %config_list;
-my %null_config;
-
-my %dependency;
-
 sub assign_configs {
     my ($hash, $config) = @_;
 
     doprint "Reading configs from $config\n";
 
-    open (IN, $config)
-       or dodie "Failed to read $config";
+    open (IN, $config) or
+       dodie "Failed to read $config";
 
     while (<IN>) {
        chomp;
@@ -3219,8 +3241,6 @@ sub config_bisect_end {
     doprint "***************************************\n\n";
 }
 
-my $pass = 1;
-
 sub run_config_bisect {
     my ($good, $bad, $last_result) = @_;
     my $reset = "";
@@ -3243,13 +3263,13 @@ sub run_config_bisect {
 
     $ret = run_config_bisect_test $config_bisect_type;
     if ($ret) {
-        doprint "NEW GOOD CONFIG ($pass)\n";
+       doprint "NEW GOOD CONFIG ($pass)\n";
        system("cp $output_config $tmpdir/good_config.tmp.$pass");
        $pass++;
        # Return 3 for good config
        return 3;
     } else {
-        doprint "NEW BAD CONFIG ($pass)\n";
+       doprint "NEW BAD CONFIG ($pass)\n";
        system("cp $output_config $tmpdir/bad_config.tmp.$pass");
        $pass++;
        # Return 4 for bad config
@@ -3284,10 +3304,11 @@ sub config_bisect {
 
     if (!defined($config_bisect_exec)) {
        # First check the location that ktest.pl ran
-       my @locations = ( "$pwd/config-bisect.pl",
-                         "$dirname/config-bisect.pl",
-                         "$builddir/tools/testing/ktest/config-bisect.pl",
-                         undef );
+       my @locations = (
+               "$pwd/config-bisect.pl",
+               "$dirname/config-bisect.pl",
+               "$builddir/tools/testing/ktest/config-bisect.pl",
+               undef );
        foreach my $loc (@locations) {
            doprint "loc = $loc\n";
            $config_bisect_exec = $loc;
@@ -3368,7 +3389,7 @@ sub config_bisect {
     } while ($ret == 3 || $ret == 4);
 
     if ($ret == 2) {
-        config_bisect_end "$good_config.tmp", "$bad_config.tmp";
+       config_bisect_end "$good_config.tmp", "$bad_config.tmp";
     }
 
     return $ret if ($ret < 0);
@@ -3511,14 +3532,6 @@ sub patchcheck {
     return 1;
 }
 
-my %depends;
-my %depcount;
-my $iflevel = 0;
-my @ifdeps;
-
-# prevent recursion
-my %read_kconfigs;
-
 sub add_dep {
     # $config depends on $dep
     my ($config, $dep) = @_;
@@ -3548,7 +3561,6 @@ sub read_kconfig {
     my $cont = 0;
     my $line;
 
-
     if (! -f $kconfig) {
        doprint "file $kconfig does not exist, skipping\n";
        return;
@@ -3630,8 +3642,8 @@ sub read_kconfig {
 
 sub read_depends {
     # find out which arch this is by the kconfig file
-    open (IN, $output_config)
-       or dodie "Failed to read $output_config";
+    open (IN, $output_config) or
+       dodie "Failed to read $output_config";
     my $arch;
     while (<IN>) {
        if (m,Linux/(\S+)\s+\S+\s+Kernel Configuration,) {
@@ -3657,7 +3669,7 @@ sub read_depends {
 
     if (! -f $kconfig && $arch =~ /\d$/) {
        my $orig = $arch;
-       # some subarchs have numbers, truncate them
+       # some subarchs have numbers, truncate them
        $arch =~ s/\d*$//;
        $kconfig = "$builddir/arch/$arch/Kconfig";
        if (! -f $kconfig) {
@@ -3706,7 +3718,6 @@ sub get_depends {
     my @configs;
 
     while ($dep =~ /[$valid]/) {
-
        if ($dep =~ /^[^$valid]*([$valid]+)/) {
            my $conf = "CONFIG_" . $1;
 
@@ -3721,12 +3732,6 @@ sub get_depends {
     return @configs;
 }
 
-my %min_configs;
-my %keep_configs;
-my %save_configs;
-my %processed_configs;
-my %nochange_config;
-
 sub test_this_config {
     my ($config) = @_;
 
@@ -3852,7 +3857,7 @@ sub make_min_config {
     foreach my $config (@config_keys) {
        my $kconfig = chomp_config $config;
        if (!defined $depcount{$kconfig}) {
-               $depcount{$kconfig} = 0;
+           $depcount{$kconfig} = 0;
        }
     }
 
@@ -3887,7 +3892,6 @@ sub make_min_config {
     my $take_two = 0;
 
     while (!$done) {
-
        my $config;
        my $found;
 
@@ -3898,7 +3902,7 @@ sub make_min_config {
 
        # Sort keys by who is most dependent on
        @test_configs = sort  { $depcount{chomp_config($b)} <=> $depcount{chomp_config($a)} }
-                         @test_configs ;
+           @test_configs ;
 
        # Put configs that did not modify the config at the end.
        my $reset = 1;
@@ -3954,13 +3958,13 @@ sub make_min_config {
        my $failed = 0;
        build "oldconfig" or $failed = 1;
        if (!$failed) {
-               start_monitor_and_install or $failed = 1;
+           start_monitor_and_install or $failed = 1;
 
-               if ($type eq "test" && !$failed) {
-                   do_run_test or $failed = 1;
-               }
+           if ($type eq "test" && !$failed) {
+               do_run_test or $failed = 1;
+           }
 
-               end_monitor;
+           end_monitor;
        }
 
        $in_bisect = 0;
@@ -3974,8 +3978,8 @@ sub make_min_config {
 
            # update new ignore configs
            if (defined($ignore_config)) {
-               open (OUT, ">$temp_config")
-                   or dodie "Can't write to $temp_config";
+               open (OUT, ">$temp_config") or
+                   dodie "Can't write to $temp_config";
                foreach my $config (keys %save_configs) {
                    print OUT "$save_configs{$config}\n";
                }
@@ -4002,8 +4006,8 @@ sub make_min_config {
            }
 
            # Save off all the current mandatory configs
-           open (OUT, ">$temp_config")
-               or dodie "Can't write to $temp_config";
+           open (OUT, ">$temp_config") or
+               dodie "Can't write to $temp_config";
            foreach my $config (keys %keep_configs) {
                print OUT "$keep_configs{$config}\n";
            }
@@ -4041,7 +4045,6 @@ sub make_warnings_file {
 
     open(IN, $buildlog) or dodie "Can't open $buildlog";
     while (<IN>) {
-
        # Some compilers use UTF-8 extended for quotes
        # for distcc heterogeneous systems, this causes issues
        s/$utf8_quote/'/g;
@@ -4057,98 +4060,6 @@ sub make_warnings_file {
     success $i;
 }
 
-$#ARGV < 1 or die "ktest.pl version: $VERSION\n   usage: ktest.pl [config-file]\n";
-
-if ($#ARGV == 0) {
-    $ktest_config = $ARGV[0];
-    if (! -f $ktest_config) {
-       print "$ktest_config does not exist.\n";
-       if (!read_yn "Create it?") {
-           exit 0;
-       }
-    }
-}
-
-if (! -f $ktest_config) {
-    $newconfig = 1;
-    get_test_case;
-    open(OUT, ">$ktest_config") or die "Can not create $ktest_config";
-    print OUT << "EOF"
-# Generated by ktest.pl
-#
-
-# PWD is a ktest.pl variable that will result in the process working
-# directory that ktest.pl is executed in.
-
-# THIS_DIR is automatically assigned the PWD of the path that generated
-# the config file. It is best to use this variable when assigning other
-# directory paths within this directory. This allows you to easily
-# move the test cases to other locations or to other machines.
-#
-THIS_DIR := $variable{"PWD"}
-
-# Define each test with TEST_START
-# The config options below it will override the defaults
-TEST_START
-TEST_TYPE = $default{"TEST_TYPE"}
-
-DEFAULTS
-EOF
-;
-    close(OUT);
-}
-read_config $ktest_config;
-
-if (defined($opt{"LOG_FILE"})) {
-    $opt{"LOG_FILE"} = eval_option("LOG_FILE", $opt{"LOG_FILE"}, -1);
-}
-
-# Append any configs entered in manually to the config file.
-my @new_configs = keys %entered_configs;
-if ($#new_configs >= 0) {
-    print "\nAppending entered in configs to $ktest_config\n";
-    open(OUT, ">>$ktest_config") or die "Can not append to $ktest_config";
-    foreach my $config (@new_configs) {
-       print OUT "$config = $entered_configs{$config}\n";
-       $opt{$config} = process_variables($entered_configs{$config});
-    }
-}
-
-if (defined($opt{"LOG_FILE"})) {
-    if ($opt{"CLEAR_LOG"}) {
-       unlink $opt{"LOG_FILE"};
-    }
-    open(LOG, ">> $opt{LOG_FILE}") or die "Can't write to $opt{LOG_FILE}";
-    LOG->autoflush(1);
-}
-
-doprint "\n\nSTARTING AUTOMATED TESTS\n\n";
-
-for (my $i = 0, my $repeat = 1; $i <= $opt{"NUM_TESTS"}; $i += $repeat) {
-
-    if (!$i) {
-       doprint "DEFAULT OPTIONS:\n";
-    } else {
-       doprint "\nTEST $i OPTIONS";
-       if (defined($repeat_tests{$i})) {
-           $repeat = $repeat_tests{$i};
-           doprint " ITERATE $repeat";
-       }
-       doprint "\n";
-    }
-
-    foreach my $option (sort keys %opt) {
-
-       if ($option =~ /\[(\d+)\]$/) {
-           next if ($i != $1);
-       } else {
-           next if ($i);
-       }
-
-       doprint "$option = $opt{$option}\n";
-    }
-}
-
 sub option_defined {
     my ($option) = @_;
 
@@ -4261,7 +4172,6 @@ sub do_send_mail {
 }
 
 sub send_email {
-
     if (defined($mailto)) {
        if (!defined($mailer)) {
            doprint "No email sent: email or mailer not specified in config.\n";
@@ -4274,12 +4184,103 @@ sub send_email {
 sub cancel_test {
     if ($email_when_canceled) {
        my $name = get_test_name;
-        send_email("KTEST: Your [$name] test was cancelled",
-                "Your test started at $script_start_time was cancelled: sig int");
+       send_email("KTEST: Your [$name] test was cancelled",
+           "Your test started at $script_start_time was cancelled: sig int");
     }
     die "\nCaught Sig Int, test interrupted: $!\n"
 }
 
+$#ARGV < 1 or die "ktest.pl version: $VERSION\n   usage: ktest.pl [config-file]\n";
+
+if ($#ARGV == 0) {
+    $ktest_config = $ARGV[0];
+    if (! -f $ktest_config) {
+       print "$ktest_config does not exist.\n";
+       if (!read_yn "Create it?") {
+           exit 0;
+       }
+    }
+}
+
+if (! -f $ktest_config) {
+    $newconfig = 1;
+    get_test_case;
+    open(OUT, ">$ktest_config") or die "Can not create $ktest_config";
+    print OUT << "EOF"
+# Generated by ktest.pl
+#
+
+# PWD is a ktest.pl variable that will result in the process working
+# directory that ktest.pl is executed in.
+
+# THIS_DIR is automatically assigned the PWD of the path that generated
+# the config file. It is best to use this variable when assigning other
+# directory paths within this directory. This allows you to easily
+# move the test cases to other locations or to other machines.
+#
+THIS_DIR := $variable{"PWD"}
+
+# Define each test with TEST_START
+# The config options below it will override the defaults
+TEST_START
+TEST_TYPE = $default{"TEST_TYPE"}
+
+DEFAULTS
+EOF
+;
+    close(OUT);
+}
+read_config $ktest_config;
+
+if (defined($opt{"LOG_FILE"})) {
+    $opt{"LOG_FILE"} = eval_option("LOG_FILE", $opt{"LOG_FILE"}, -1);
+}
+
+# Append any configs entered in manually to the config file.
+my @new_configs = keys %entered_configs;
+if ($#new_configs >= 0) {
+    print "\nAppending entered in configs to $ktest_config\n";
+    open(OUT, ">>$ktest_config") or die "Can not append to $ktest_config";
+    foreach my $config (@new_configs) {
+       print OUT "$config = $entered_configs{$config}\n";
+       $opt{$config} = process_variables($entered_configs{$config});
+    }
+}
+
+if (defined($opt{"LOG_FILE"})) {
+    if ($opt{"CLEAR_LOG"}) {
+       unlink $opt{"LOG_FILE"};
+    }
+    open(LOG, ">> $opt{LOG_FILE}") or die "Can't write to $opt{LOG_FILE}";
+    LOG->autoflush(1);
+}
+
+doprint "\n\nSTARTING AUTOMATED TESTS\n\n";
+
+for (my $i = 0, my $repeat = 1; $i <= $opt{"NUM_TESTS"}; $i += $repeat) {
+
+    if (!$i) {
+       doprint "DEFAULT OPTIONS:\n";
+    } else {
+       doprint "\nTEST $i OPTIONS";
+       if (defined($repeat_tests{$i})) {
+           $repeat = $repeat_tests{$i};
+           doprint " ITERATE $repeat";
+       }
+       doprint "\n";
+    }
+
+    foreach my $option (sort keys %opt) {
+       if ($option =~ /\[(\d+)\]$/) {
+           next if ($i != $1);
+       } else {
+           next if ($i);
+       }
+
+       doprint "$option = $opt{$option}\n";
+    }
+}
+
 $SIG{INT} = qw(cancel_test);
 
 # First we need to do is the builds
@@ -4323,15 +4324,15 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
 
     # The first test may override the PRE_KTEST option
     if ($i == 1) {
-        if (defined($pre_ktest)) {
-            doprint "\n";
-            run_command $pre_ktest;
-        }
-        if ($email_when_started) {
+       if (defined($pre_ktest)) {
+           doprint "\n";
+           run_command $pre_ktest;
+       }
+       if ($email_when_started) {
            my $name = get_test_name;
-            send_email("KTEST: Your [$name] test was started",
-                "Your test was started on $script_start_time");
-        }
+           send_email("KTEST: Your [$name] test was started",
+               "Your test was started on $script_start_time");
+       }
     }
 
     # Any test can override the POST_KTEST option
@@ -4409,7 +4410,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
        my $ret = run_command $pre_test;
        if (!$ret && defined($pre_test_die) &&
            $pre_test_die) {
-           dodie "failed to pre_test\n";
+               dodie "failed to pre_test\n";
        }
     }
 
@@ -4503,12 +4504,11 @@ if ($opt{"POWEROFF_ON_SUCCESS"}) {
     run_command $switch_to_good;
 }
 
-
 doprint "\n    $successes of $opt{NUM_TESTS} tests were successful\n\n";
 
 if ($email_when_finished) {
     send_email("KTEST: Your test has finished!",
-            "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!");
+       "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!");
 }
 
 if (defined($opt{"LOG_FILE"})) {
@@ -4517,3 +4517,12 @@ if (defined($opt{"LOG_FILE"})) {
 }
 
 exit 0;
+
+##
+# The following are here to standardize tabs/spaces/etc across the most likely editors
+###
+
+# Local Variables:
+# mode: perl
+# End:
+# vim: softtabstop=4
index 6c575cf..bc3299a 100644 (file)
@@ -25,6 +25,7 @@ TARGETS += ir
 TARGETS += kcmp
 TARGETS += kexec
 TARGETS += kvm
+TARGETS += landlock
 TARGETS += lib
 TARGETS += livepatch
 TARGETS += lkdtm
index a958c22..dffbcaa 100644 (file)
@@ -43,6 +43,8 @@ void test_snprintf_positive(void)
        if (!ASSERT_OK_PTR(skel, "skel_open"))
                return;
 
+       skel->bss->pid = getpid();
+
        if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach"))
                goto cleanup;
 
index 951a030..e35129b 100644 (file)
@@ -5,6 +5,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
+__u32 pid = 0;
+
 char num_out[64] = {};
 long num_ret = 0;
 
@@ -42,6 +44,9 @@ int handler(const void *ctx)
        static const char str1[] = "str1";
        static const char longstr[] = "longstr";
 
+       if ((int)bpf_get_current_pid_tgid() != pid)
+               return 0;
+
        /* Integer types */
        num_ret  = BPF_SNPRINTF(num_out, sizeof(num_out),
                                "%d %u %x %li %llu %lX",
index fb23ce9..485dff5 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2020 Hisilicon Limited.
+ * Copyright (C) 2020 HiSilicon Limited.
  */
 
 #include <fcntl.h>
@@ -40,7 +40,8 @@ struct map_benchmark {
        __u32 dma_bits; /* DMA addressing capability */
        __u32 dma_dir; /* DMA data direction */
        __u32 dma_trans_ns; /* time for DMA transmission in ns */
-       __u8 expansion[80];     /* For future use */
+       __u32 granule; /* number of PAGE_SIZE units to map/unmap at a time */
+       __u8 expansion[76];     /* For future use */
 };
 
 int main(int argc, char **argv)
@@ -51,11 +52,13 @@ int main(int argc, char **argv)
        int threads = 1, seconds = 20, node = -1;
        /* default dma mask 32bit, bidirectional DMA */
        int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL;
+       /* default granule: 1 PAGE_SIZE */
+       int granule = 1;
 
        int cmd = DMA_MAP_BENCHMARK;
        char *p;
 
-       while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) {
+       while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) {
                switch (opt) {
                case 't':
                        threads = atoi(optarg);
@@ -75,6 +78,9 @@ int main(int argc, char **argv)
                case 'x':
                        xdelay = atoi(optarg);
                        break;
+               case 'g':
+                       granule = atoi(optarg);
+                       break;
                default:
                        return -1;
                }
@@ -110,6 +116,11 @@ int main(int argc, char **argv)
                exit(1);
        }
 
+       if (granule < 1 || granule > 1024) {
+               fprintf(stderr, "invalid granule size\n");
+               exit(1);
+       }
+
        fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
        if (fd == -1) {
                perror("open");
@@ -123,14 +134,15 @@ int main(int argc, char **argv)
        map.dma_bits = bits;
        map.dma_dir = dir;
        map.dma_trans_ns = xdelay;
+       map.granule = granule;
 
        if (ioctl(fd, cmd, &map)) {
                perror("ioctl");
                exit(1);
        }
 
-       printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n",
-                       threads, seconds, node, dir[directions]);
+       printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n",
+                       threads, seconds, node, dir[directions], granule);
        printf("average map latency(us):%.1f standard deviation:%.1f\n",
                        map.avg_map_100ns/10.0, map.map_stddev/10.0);
        printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
index ad7fabd..65ede50 100644 (file)
@@ -3449,4 +3449,48 @@ TEST(epoll63)
        close(sfd[1]);
 }
 
+/*
+ *        t0    t1
+ *     (ew) \  / (ew)
+ *           e0
+ *            | (lt)
+ *           s0
+ */
+TEST(epoll64)
+{
+       pthread_t waiter[2];
+       struct epoll_event e;
+       struct epoll_mtcontext ctx = { 0 };
+
+       signal(SIGUSR1, signal_handler);
+
+       ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+       ctx.efd[0] = epoll_create(1);
+       ASSERT_GE(ctx.efd[0], 0);
+
+       e.events = EPOLLIN;
+       ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+       /*
+        * main will act as the emitter once both waiter threads are
+        * blocked and expects both to be awoken upon the ready event.
+        */
+       ctx.main = pthread_self();
+       ASSERT_EQ(pthread_create(&waiter[0], NULL, waiter_entry1a, &ctx), 0);
+       ASSERT_EQ(pthread_create(&waiter[1], NULL, waiter_entry1a, &ctx), 0);
+
+       usleep(100000);
+       ASSERT_EQ(write(ctx.sfd[1], "w", 1), 1);
+
+       ASSERT_EQ(pthread_join(waiter[0], NULL), 0);
+       ASSERT_EQ(pthread_join(waiter[1], NULL), 0);
+
+       EXPECT_EQ(ctx.count, 2);
+
+       close(ctx.efd[0]);
+       close(ctx.sfd[0]);
+       close(ctx.sfd[1]);
+}
+
 TEST_HARNESS_MAIN
index 7bd7e77..bd83158 100644 (file)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /aarch64/get-reg-list
 /aarch64/get-reg-list-sve
+/aarch64/vgic_init
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
@@ -38,6 +39,7 @@
 /dirty_log_perf_test
 /hardware_disable_test
 /kvm_create_max_vcpus
+/kvm_page_table_test
 /memslot_modification_stress_test
 /set_memory_region_test
 /steal_time
index cb95b5b..e439d02 100644 (file)
@@ -72,16 +72,19 @@ TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 
@@ -91,6 +94,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c
new file mode 100644 (file)
index 0000000..623f31a
--- /dev/null
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic init sequence tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS               4
+
+/*
+ * Build the 64-bit value exchanged with KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+ * count from bit 52 upwards, the base with its low 16 bits cleared (64kB
+ * aligned), flags from bit 12 and the region index in the low bits.
+ * Every argument is parenthesized so that expressions can be passed safely.
+ */
+#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) << 52) | \
+       ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | (index))
+/* Redistributor register attribute: vcpu in the upper 32 bits, offset below */
+#define REG_OFFSET(vcpu, offset) (((uint64_t)(vcpu) << 32) | (offset))
+
+/* Register offset handed to access_redist_reg() to select GICR_TYPER */
+#define GICR_TYPER 0x8
+
+/* A test VM together with the file descriptor of its GICv3 KVM device */
+struct vm_gic {
+       struct kvm_vm *vm;
+       int gic_fd;
+};
+
+/*
+ * Value of KVM_CAP_ARM_VM_IPA_SIZE, read once in main().  A value of 0 is
+ * treated as "unknown" and the out-of-range address checks that test it
+ * are skipped.
+ */
+static int max_ipa_bits;
+
+/*
+ * Read (write == false) or write (write == true) the redistributor register
+ * at @offset for @vcpu through the KVM_DEV_ARM_VGIC_GRP_REDIST_REGS group
+ * of the GICv3 device @gicv3_fd.  The value travels through @val.
+ *
+ * Returns the result of _kvm_device_access() unchanged; no assertion is
+ * made here so callers can test failure cases.
+ */
+static int access_redist_reg(int gicv3_fd, int vcpu, int offset,
+                            uint32_t *val, bool write)
+{
+       return _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
+                                 REG_OFFSET(vcpu, offset), val, write);
+}
+
+/*
+ * Dummy guest code: issues three GUEST_SYNC ucalls (stages 0, 1 and 2)
+ * followed by GUEST_DONE.  The tests only care about whether KVM_RUN can
+ * be entered, not about what the guest computes.
+ */
+static void guest_code(void)
+{
+       GUEST_SYNC(0);
+       GUEST_SYNC(1);
+       GUEST_SYNC(2);
+       GUEST_DONE();
+}
+
+/*
+ * Issue KVM_RUN on @vcpuid without asserting on the outcome, so that the
+ * callers can check for expected failures themselves.
+ *
+ * Returns 0 on success, or -errno when the ioctl fails.
+ */
+static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       ucall_init(vm, NULL);
+
+       return _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL) ? -errno : 0;
+}
+
+/*
+ * Create a default VM with NR_VCPUS VCPUs running guest_code, then create
+ * a GICv3 KVM device on it.  Both handles are returned by value.
+ */
+static struct vm_gic vm_gic_create(void)
+{
+       struct kvm_vm *vm;
+
+       vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
+
+       return (struct vm_gic) {
+               .vm = vm,
+               .gic_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false),
+       };
+}
+
+/* Release what a struct vm_gic holds: close the GIC device fd, free the VM */
+static void vm_gic_destroy(struct vm_gic *v)
+{
+       close(v->gic_fd);
+       kvm_vm_free(v->vm);
+}
+
+/**
+ * Helper routine that performs KVM device tests in general and
+ * especially ARM_VGIC_V3 ones. Eventually the ARM_VGIC_V3
+ * device gets created, a legacy RDIST region is set at @0x0
+ * and a DIST region is set @0x60000
+ *
+ * @v: VM and already created GICv3 device to operate on.
+ *
+ * Calls through the underscore-prefixed helpers are expected to fail and
+ * have their errno checked here; the other calls are expected to succeed.
+ * On return the DIST base has been programmed at an address that the
+ * callers expect to be rejected as an overlap on the first vcpu run.
+ */
+static void subtest_dist_rdist(struct vm_gic *v)
+{
+       int ret;
+       uint64_t addr;
+
+       /* Check existing group/attributes */
+       kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                             KVM_VGIC_V3_ADDR_TYPE_DIST);
+
+       kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                             KVM_VGIC_V3_ADDR_TYPE_REDIST);
+
+       /* check non existing attribute */
+       ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0);
+       TEST_ASSERT(ret && errno == ENXIO, "attribute not supported");
+
+       /* misaligned DIST and REDIST address settings */
+       addr = 0x1000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "GICv3 dist base not 64kB aligned");
+
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "GICv3 redist base not 64kB aligned");
+
+       /* out of range address (skipped when the IPA size is unknown, i.e. 0) */
+       if (max_ipa_bits) {
+               addr = 1ULL << max_ipa_bits;
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit");
+
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit");
+       }
+
+       /* set REDIST base address @0x0*/
+       addr = 0x00000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+
+       /* Attempt to create a second legacy redistributor region */
+       addr = 0xE0000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EEXIST, "GICv3 redist base set again");
+
+       /* Attempt to mix legacy and new redistributor regions */
+       addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION");
+
+       /*
+        * Set overlapping DIST / REDIST, cannot be detected here. Will be detected
+        * on first vcpu run instead.
+        */
+       addr = 3 * 2 * 0x10000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST,
+                         &addr, true);
+}
+
+/*
+ * Test the new REDIST region API
+ *
+ * @v: VM and already created GICv3 device to operate on.
+ *
+ * Calls through _kvm_device_access() are expected to fail (or, for the
+ * reads, to return a known value) and are checked here; calls through
+ * kvm_device_access() are expected to succeed.  On return two redist
+ * regions are registered (index 0 @ 0x200000 with 2 redists, index 1
+ * @ 0x240000 with 1 redist) and the DIST base is set to 0x260000.
+ */
+static void subtest_redist_regions(struct vm_gic *v)
+{
+       uint64_t addr, expected_addr;
+       int ret;
+
+       /*
+        * NOTE(review): the message talks about multiple redist regions but
+        * the attribute probed is ..._TYPE_REDIST; presumably
+        * ..._TYPE_REDIST_REGION was meant - confirm before changing.
+        */
+       ret = kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                    KVM_VGIC_V3_ADDR_TYPE_REDIST);
+       TEST_ASSERT(!ret, "Multiple redist regions advertised");
+
+       addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "attempt to register the first rdist region with index != 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address");
+
+       /* First valid registration: region 0 @ 0x200000, 2 redists */
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "register an rdist region overlapping with another one");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1");
+
+       /* Second valid registration: region 1 @ 0x240000, 1 redist */
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1);
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       /*
+        * Only meaningful when the IPA size is known: with max_ipa_bits == 0
+        * the base below would collapse to 0 and no longer be out of range.
+        * This mirrors the guard used for the legacy DIST/REDIST checks.
+        */
+       if (max_ipa_bits) {
+               addr = REDIST_REGION_ATTR_ADDR(1, 1ULL << max_ipa_bits, 0, 2);
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG,
+                           "register redist region with base address beyond IPA range");
+       }
+
+       addr = 0x260000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION");
+
+       /*
+        * Now there are 2 redist regions:
+        * region 0 @ 0x200000 2 redists
+        * region 1 @ 0x240000 1 redist
+        * Attempt to read their characteristics
+        */
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0);
+       expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1);
+       expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region");
+
+       /* Place the distributor, then try to put a redist region on top of it */
+       addr = 0x260000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist");
+}
+
+/*
+ * VGIC KVM device is created and initialized before the secondary CPUs
+ * get created
+ *
+ * The VM starts with the single VCPU that vm_create_default() is asked for
+ * (id 0); VCPUs 1..NR_VCPUS-1 are added only after the DIST/RDIST subtest
+ * has run.  Running VCPU 3 is then expected to fail with -EINVAL because
+ * of the overlap left behind by subtest_dist_rdist().
+ */
+static void test_vgic_then_vcpus(void)
+{
+       struct vm_gic v;
+       int ret, i;
+
+       v.vm = vm_create_default(0, 0, guest_code);
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       subtest_dist_rdist(&v);
+
+       /* Add the rest of the VCPUs */
+       for (i = 1; i < NR_VCPUS; ++i)
+               vm_vcpu_add_default(v.vm, i, guest_code);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+/*
+ * Counterpart of the "vgic then vcpus" ordering: every VCPU already exists
+ * when the VGIC KVM device is created.  The first run of VCPU 3 must still
+ * report the DIST/RDIST overlap with -EINVAL.
+ */
+static void test_vcpus_then_vgic(void)
+{
+       struct vm_gic gic = vm_gic_create();
+       int ret;
+
+       subtest_dist_rdist(&gic);
+
+       ret = run_vcpu(gic.vm, 3);
+       TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run");
+
+       vm_gic_destroy(&gic);
+}
+
+/*
+ * Exercise the REDIST_REGION API end to end, in three independent VMs:
+ *
+ * step 1: only the regions left by subtest_redist_regions() (3 redists for
+ *         NR_VCPUS VCPUs) plus an explicit init; the vcpu run must fail
+ *         with -ENXIO.
+ * step 2: a third region is added but the VGIC is not explicitly
+ *         initialized; the vcpu run must fail with -EBUSY.
+ * step 3: a NULL user pointer must be rejected with EFAULT, then the third
+ *         region is added, the VGIC initialized and the vcpu run succeeds.
+ */
+static void test_new_redist_regions(void)
+{
+       void *dummy = NULL;
+       struct vm_gic v;
+       uint64_t addr;
+       int ret;
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists");
+       vm_gic_destroy(&v);
+
+       /* step2 */
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init");
+
+       vm_gic_destroy(&v);
+
+       /* step 3 */
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+
+       /*
+        * The result must be captured: otherwise the assertion below would
+        * be evaluated against the stale value of the previous run_vcpu().
+        */
+       ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true);
+       TEST_ASSERT(ret && errno == EFAULT,
+                   "register a third region allowing to cover the 4 vcpus");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(!ret, "vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+/*
+ * Check GICR_TYPER reads at the various stages of the VGIC life cycle:
+ * for a VCPU that does not exist yet, before the VGIC is initialized,
+ * after init but before any redistributor region exists, and while
+ * regions are progressively registered.
+ *
+ * VCPUs are created in the order 0, 3, 1, 2.  The values asserted below
+ * are the vcpu id shifted left by 8, with 0x10 additionally set where the
+ * assertion messages expect the "last" indication to be returned.
+ */
+static void test_typer_accesses(void)
+{
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret, i;
+
+       v.vm = vm_create_default(0, 0, guest_code);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       vm_vcpu_add_default(v.vm, 3, guest_code);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu");
+
+       vm_vcpu_add_default(v.vm, 1, guest_code);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized");
+
+       vm_vcpu_add_default(v.vm, 2, guest_code);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       /*
+        * Every VCPU is probed, not just VCPU 0.  With no region registered
+        * yet, only the vcpu id (shifted by 8) is expected in the value,
+        * consistent with the region-less reads of VCPUs 1 and 2 below.
+        */
+       for (i = 0; i < NR_VCPUS ; i++) {
+               ret = access_redist_reg(v.gic_fd, i, GICR_TYPER, &val, false);
+               TEST_ASSERT(!ret && val == i * 0x100,
+                           "read GICR_TYPER before rdist region setting");
+       }
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       /* The 2 first rdists should be put there (vcpu 0 and 3) */
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && !val, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1");
+
+       addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1);
+       ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100,
+                   "no redist region attached to vcpu #1 yet, last cannot be returned");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x200,
+                   "no redist region attached to vcpu #2, last cannot be returned");
+
+       addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x210,
+                   "read typer of rdist #1, last properly returned");
+
+       vm_gic_destroy(&v);
+}
+
+/**
+ * Test GICR_TYPER last bit with new redist regions
+ * rdist regions #1 and #2 are contiguous
+ * rdist region #0 @0x100000 2 rdist capacity
+ *     rdists: 0, 3 (Last)
+ * rdist region #1 @0x240000 2 rdist capacity
+ *     rdists:  5, 4 (Last)
+ * rdist region #2 @0x200000 2 rdist capacity
+ *     rdists: 1, 2
+ *
+ * Six VCPUs are created with the ids listed in vcpuids[], in that order.
+ * In the expected values, the vcpu id sits at bit 8 and upwards and 0x10
+ * is set only for the rdists marked "(Last)" above.
+ */
+static void test_last_bit_redist_regions(void)
+{
+       uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 };
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret;
+
+       v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       /* Register the three regions described in the header comment */
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3");
+
+       ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5");
+
+       ret = access_redist_reg(v.gic_fd, 4, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4");
+
+       vm_gic_destroy(&v);
+}
+
+/*
+ * Test last bit with legacy region
+ *
+ * Six VCPUs are created with the ids listed in vcpuids[], in that order,
+ * and a single legacy redistributor region is set at 0x10000.  Of the
+ * VCPUs read back, only VCPU 2 (the last entry of vcpuids[]) is expected
+ * to have 0x10 set.
+ *
+ * NOTE(review): VCPU 4 is never read back, and the "rdist #N" labels in
+ * the messages do not follow the vcpu ids ("rdist #3" appears twice);
+ * presumably an oversight - confirm before relying on the labels.
+ */
+static void test_last_bit_single_rdist(void)
+{
+       uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 };
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret;
+
+       v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       addr = 0x10000;
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #2");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #3");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #3");
+
+       vm_gic_destroy(&v);
+}
+
+/*
+ * Generic KVM device creation checks, run first by main().
+ *
+ * Verifies that an unknown device type is rejected with ENODEV, that a
+ * GICv3 device can be created once but not twice (EEXIST), and - when the
+ * GICv2 trial creation succeeds - that a GICv2 cannot be created next to
+ * an existing GICv3 (EINVAL).
+ *
+ * If the GICv3 trial creation fails, the whole program exits with
+ * KSFT_SKIP.
+ */
+void test_kvm_device(void)
+{
+       struct vm_gic v;
+       int ret, fd;
+
+       v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
+
+       /* try to create a non existing KVM device */
+       ret = _kvm_create_device(v.vm, 0, true, &fd);
+       TEST_ASSERT(ret && errno == ENODEV, "unsupported device");
+
+       /* trial mode with VGIC_V3 device */
+       ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true, &fd);
+       if (ret) {
+               print_skip("GICv3 not supported");
+               exit(KSFT_SKIP);
+       }
+       /* Real creation; this is the fd released by vm_gic_destroy() below */
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false, &fd);
+       TEST_ASSERT(ret && errno == EEXIST, "create GICv3 device twice");
+
+       /* The trial mode is expected to keep working once the device exists */
+       kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true);
+
+       if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true, &fd)) {
+               ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false, &fd);
+               TEST_ASSERT(ret && errno == EINVAL, "create GICv2 while v3 exists");
+       }
+
+       vm_gic_destroy(&v);
+}
+
+/*
+ * Entry point: record the IPA size capability, then run every test in
+ * sequence.  test_kvm_device() goes first since it is the one that exits
+ * with KSFT_SKIP when GICv3 cannot be created.  The command line arguments
+ * are not used.
+ */
+int main(int ac, char **av)
+{
+       max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+       test_kvm_device();
+       test_vcpus_then_vgic();
+       test_vgic_then_vcpus();
+       test_new_redist_regions();
+       test_typer_accesses();
+       test_last_bit_redist_regions();
+       test_last_bit_single_rdist();
+
+       return 0;
+}
index bb2752d..81edbd2 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <asm/barrier.h>
+#include <linux/atomic.h>
 
 #include "kvm_util.h"
 #include "test_util.h"
@@ -137,12 +138,20 @@ static uint64_t host_clear_count;
 static uint64_t host_track_next_count;
 
 /* Whether dirty ring reset is requested, or finished */
-static sem_t dirty_ring_vcpu_stop;
-static sem_t dirty_ring_vcpu_cont;
+static sem_t sem_vcpu_stop;
+static sem_t sem_vcpu_cont;
+/*
+ * This is only set by main thread, and only cleared by vcpu thread.  It is
+ * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC
+ * is the only place that we'll guarantee both "dirty bit" and "dirty data"
+ * will match.  E.g., SIG_IPI won't guarantee that if the vcpu is interrupted
+ * after setting dirty bit but before the data is written.
+ */
+static atomic_t vcpu_sync_stop_requested;
 /*
  * This is updated by the vcpu thread to tell the host whether it's a
  * ring-full event.  It should only be read until a sem_wait() of
- * dirty_ring_vcpu_stop and before vcpu continues to run.
+ * sem_vcpu_stop and before vcpu continues to run.
  */
 static bool dirty_ring_vcpu_ring_full;
 /*
@@ -234,6 +243,17 @@ static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
        kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
 }
 
+/*
+ * Park the vcpu thread if the main thread has requested a stop.
+ * Should only be called after a GUEST_SYNC.
+ */
+static void vcpu_handle_sync_stop(void)
+{
+       if (!atomic_read(&vcpu_sync_stop_requested))
+               return;
+
+       /* The main thread is sleeping on sem_vcpu_stop: ack and wake it */
+       atomic_set(&vcpu_sync_stop_requested, false);
+       sem_post(&sem_vcpu_stop);
+
+       /* Stay parked until the main thread lets the vcpu continue */
+       sem_wait_until(&sem_vcpu_cont);
+}
+
 static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct kvm_run *run = vcpu_state(vm, VCPU_ID);
@@ -244,6 +264,8 @@ static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
        TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
                    "Invalid guest sync status: exit_reason=%s\n",
                    exit_reason_str(run->exit_reason));
+
+       vcpu_handle_sync_stop();
 }
 
 static bool dirty_ring_supported(void)
@@ -301,13 +323,13 @@ static void dirty_ring_wait_vcpu(void)
 {
        /* This makes sure that hardware PML cache flushed */
        vcpu_kick();
-       sem_wait_until(&dirty_ring_vcpu_stop);
+       sem_wait_until(&sem_vcpu_stop);
 }
 
 static void dirty_ring_continue_vcpu(void)
 {
        pr_info("Notifying vcpu to continue\n");
-       sem_post(&dirty_ring_vcpu_cont);
+       sem_post(&sem_vcpu_cont);
 }
 
 static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
@@ -361,11 +383,11 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
                /* Update the flag first before pause */
                WRITE_ONCE(dirty_ring_vcpu_ring_full,
                           run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
-               sem_post(&dirty_ring_vcpu_stop);
+               sem_post(&sem_vcpu_stop);
                pr_info("vcpu stops because %s...\n",
                        dirty_ring_vcpu_ring_full ?
                        "dirty ring is full" : "vcpu is kicked out");
-               sem_wait_until(&dirty_ring_vcpu_cont);
+               sem_wait_until(&sem_vcpu_cont);
                pr_info("vcpu continues now.\n");
        } else {
                TEST_ASSERT(false, "Invalid guest sync status: "
@@ -377,7 +399,7 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 static void dirty_ring_before_vcpu_join(void)
 {
        /* Kick another round of vcpu just to make sure it will quit */
-       sem_post(&dirty_ring_vcpu_cont);
+       sem_post(&sem_vcpu_cont);
 }
 
 struct log_mode {
@@ -505,9 +527,8 @@ static void *vcpu_worker(void *data)
         */
        sigmask->len = 8;
        pthread_sigmask(0, NULL, sigset);
+       sigdelset(sigset, SIG_IPI);
        vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
-       sigaddset(sigset, SIG_IPI);
-       pthread_sigmask(SIG_BLOCK, sigset, NULL);
 
        sigemptyset(sigset);
        sigaddset(sigset, SIG_IPI);
@@ -768,7 +789,25 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                usleep(p->interval * 1000);
                log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
                                             bmap, host_num_pages);
+
+               /*
+                * See vcpu_sync_stop_requested definition for details on why
+                * we need to stop vcpu when verify data.
+                */
+               atomic_set(&vcpu_sync_stop_requested, true);
+               sem_wait_until(&sem_vcpu_stop);
+               /*
+                * NOTE: for dirty ring, it's possible that we didn't stop at
+                * GUEST_SYNC but instead we stopped because ring is full;
+                * that's okay too because ring full means we're only missing
+                * the flush of the last page, and since we handle the last
+                * page specially verification will succeed anyway.
+                */
+               assert(host_log_mode == LOG_MODE_DIRTY_RING ||
+                      atomic_read(&vcpu_sync_stop_requested) == false);
                vm_dirty_log_verify(mode, bmap);
+               sem_post(&sem_vcpu_cont);
+
                iteration++;
                sync_global_to_guest(vm, iteration);
        }
@@ -818,9 +857,10 @@ int main(int argc, char *argv[])
                .interval = TEST_HOST_LOOP_INTERVAL,
        };
        int opt, i;
+       sigset_t sigset;
 
-       sem_init(&dirty_ring_vcpu_stop, 0, 0);
-       sem_init(&dirty_ring_vcpu_cont, 0, 0);
+       sem_init(&sem_vcpu_stop, 0, 0);
+       sem_init(&sem_vcpu_cont, 0, 0);
 
        guest_modes_append_default();
 
@@ -876,6 +916,11 @@ int main(int argc, char *argv[])
 
        srandom(time(0));
 
+       /* Ensure that vCPU threads start with SIG_IPI blocked.  */
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIG_IPI);
+       pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
        if (host_log_mode_option == LOG_MODE_ALL) {
                /* Run each log mode */
                for (i = 0; i < LOG_MODE_NUM; i++) {
index 0f4258e..a8f0227 100644 (file)
@@ -69,9 +69,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE          (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE      ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
        unsigned int pa_bits;
        unsigned int va_bits;
@@ -85,6 +82,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
                    struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
 void kvm_vm_free(struct kvm_vm *vmp);
@@ -225,6 +223,15 @@ int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
 #endif
 void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
 
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd);
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test);
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                      void *val, bool write);
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write);
+
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
index b7f4139..fade313 100644 (file)
@@ -71,13 +71,32 @@ enum vm_mem_backing_src_type {
        VM_MEM_SRC_ANONYMOUS,
        VM_MEM_SRC_ANONYMOUS_THP,
        VM_MEM_SRC_ANONYMOUS_HUGETLB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+       NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
        const char *name;
-       enum vm_mem_backing_src_type type;
+       uint32_t flag;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644 (file)
index 0000000..1c4753f
--- /dev/null
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX             1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE          (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM         0xc0000000
+
+/*
+ * Different guest memory accessing stages. The host stores the stage to run
+ * through current_stage and guest_code() switches on it. NUM_TEST_STAGES is
+ * not a runnable stage: guest_code() asserts on any value it does not know.
+ */
+enum test_stage {
+       KVM_BEFORE_MAPPINGS,    /* vCPU threads start, guest accesses nothing */
+       KVM_CREATE_MAPPINGS,    /* guest touches each large-page-sized chunk */
+       KVM_UPDATE_MAPPINGS,    /* guest accesses memory with dirty logging on */
+       KVM_ADJUST_MAPPINGS,    /* guest reads every host page, logging off */
+       NUM_TEST_STAGES,
+};
+
+/* Printable name of each runnable stage, indexed by enum test_stage */
+static const char * const test_stage_string[] = {
+       [KVM_BEFORE_MAPPINGS] = "KVM_BEFORE_MAPPINGS",
+       [KVM_CREATE_MAPPINGS] = "KVM_CREATE_MAPPINGS",
+       [KVM_UPDATE_MAPPINGS] = "KVM_UPDATE_MAPPINGS",
+       [KVM_ADJUST_MAPPINGS] = "KVM_ADJUST_MAPPINGS",
+};
+
+/* Per-vCPU arguments, stored in test_args.vcpu_args[] and read by the guest */
+struct vcpu_args {
+       int vcpu_id;            /* index of this vCPU in test_args.vcpu_args[] */
+       bool vcpu_write;        /* KVM_CREATE_MAPPINGS: true = write, false = read */
+};
+
+/*
+ * Test configuration filled in by pre_init_before_test() and copied to the
+ * guest with sync_global_to_guest().
+ */
+struct test_args {
+       struct kvm_vm *vm;
+       uint64_t guest_test_virt_mem;   /* guest virtual base of the test memory */
+       uint64_t host_page_size;        /* getpagesize() of the host */
+       uint64_t host_num_pages;        /* test memory size / host_page_size */
+       uint64_t large_page_size;       /* page size of the backing src type */
+       uint64_t large_num_pages;       /* test memory size / large_page_size */
+       uint64_t host_pages_per_lpage;  /* large_page_size / host_page_size */
+       enum vm_mem_backing_src_type src_type;
+       struct vcpu_args vcpu_args[KVM_MAX_VCPUS];
+};
+
+/*
+ * Guest variables. Use addr_gva2hva() if these variables need
+ * to be changed in host.
+ */
+static enum test_stage guest_test_stage;
+
+/* Host variables */
+static uint32_t nr_vcpus = 1;
+static struct test_args test_args;
+static enum test_stage *current_stage;
+static bool host_quit;
+
+/* Whether the test stage is updated, or completed */
+static sem_t test_stage_updated;
+static sem_t test_stage_completed;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+/*
+ * Guest entry point of every vCPU.
+ *
+ * Loops forever: reads the current test stage, performs the memory accesses
+ * that belong to that stage over the test memory starting at
+ * test_args.guest_test_virt_mem, then reports back to the host with
+ * GUEST_SYNC(1). vcpu_id must match the ID stored in the vCPU's slot of
+ * test_args.vcpu_args[].
+ */
+static void guest_code(int vcpu_id)
+{
+       struct test_args *p = &test_args;
+       struct vcpu_args *vcpu_args = &p->vcpu_args[vcpu_id];
+       enum test_stage *current_stage = &guest_test_stage;
+       uint64_t addr;
+       /* Same type as the page counts they are compared against */
+       uint64_t i, j;
+
+       /* Make sure vCPU args data structure is not corrupt */
+       GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
+
+       while (true) {
+               addr = p->guest_test_virt_mem;
+
+               switch (READ_ONCE(*current_stage)) {
+               /*
+                * All vCPU threads will be started in this stage,
+                * where guest code of each vCPU will do nothing.
+                */
+               case KVM_BEFORE_MAPPINGS:
+                       break;
+
+               /*
+                * Before dirty logging, vCPUs concurrently access the first
+                * 8 bytes of each page (host page/large page) within the same
+                * memory region with different accessing types (read/write).
+                * Then KVM will create normal page mappings or huge block
+                * mappings for them.
+                */
+               case KVM_CREATE_MAPPINGS:
+                       for (i = 0; i < p->large_num_pages; i++) {
+                               if (vcpu_args->vcpu_write)
+                                       *(uint64_t *)addr = 0x0123456789ABCDEF;
+                               else
+                                       READ_ONCE(*(uint64_t *)addr);
+
+                               addr += p->large_page_size;
+                       }
+                       break;
+
+               /*
+                * During dirty logging, KVM will only update attributes of the
+                * normal page mappings from RO to RW if memory backing src type
+                * is anonymous. In other cases, KVM will split the huge block
+                * mappings into normal page mappings if memory backing src type
+                * is THP or HUGETLB.
+                */
+               case KVM_UPDATE_MAPPINGS:
+                       if (p->src_type == VM_MEM_SRC_ANONYMOUS) {
+                               for (i = 0; i < p->host_num_pages; i++) {
+                                       *(uint64_t *)addr = 0x0123456789ABCDEF;
+                                       addr += p->host_page_size;
+                               }
+                               break;
+                       }
+
+                       for (i = 0; i < p->large_num_pages; i++) {
+                               /*
+                                * Write to the first host page in each large
+                                * page region, and trigger break of large pages.
+                                */
+                               *(uint64_t *)addr = 0x0123456789ABCDEF;
+
+                               /*
+                                * Access the middle host pages in each large
+                                * page region. Since dirty logging is enabled,
+                                * this will create new mappings at the smallest
+                                * granularity.
+                                */
+                               addr += p->large_page_size / 2;
+                               for (j = 0; j < p->host_pages_per_lpage / 2; j++) {
+                                       READ_ONCE(*(uint64_t *)addr);
+                                       addr += p->host_page_size;
+                               }
+                       }
+                       break;
+
+               /*
+                * After dirty logging is stopped, vCPUs concurrently read
+                * from every single host page. Then KVM will coalesce the
+                * split page mappings back to block mappings. And a TLB
+                * conflict abort could occur here if TLB entries of the
+                * page mappings are not fully invalidated.
+                */
+               case KVM_ADJUST_MAPPINGS:
+                       for (i = 0; i < p->host_num_pages; i++) {
+                               READ_ONCE(*(uint64_t *)addr);
+                               addr += p->host_page_size;
+                       }
+                       break;
+
+               default:
+                       GUEST_ASSERT(0);
+               }
+
+               GUEST_SYNC(1);
+       }
+}
+
+/*
+ * Host thread driving one vCPU.
+ *
+ * data points to the vCPU's struct vcpu_args. For every stage the thread
+ * waits on test_stage_updated, runs the vCPU until it exits back to the
+ * host, checks that the exit is a UCALL_SYNC, and then posts
+ * test_stage_completed. The thread returns NULL once host_quit is set.
+ */
+static void *vcpu_worker(void *data)
+{
+       int ret;
+       struct vcpu_args *vcpu_args = data;
+       struct kvm_vm *vm = test_args.vm;
+       int vcpu_id = vcpu_args->vcpu_id;
+       struct kvm_run *run;
+       struct timespec start;
+       struct timespec ts_diff;
+       enum test_stage stage;
+
+       /* Presumably hands vcpu_id to guest_code() as its argument */
+       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+       run = vcpu_state(vm, vcpu_id);
+
+       while (!READ_ONCE(host_quit)) {
+               /* Block until the host announces a new stage (or quit) */
+               ret = sem_wait(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+               /* The wake-up may have been the quit request */
+               if (READ_ONCE(host_quit))
+                       return NULL;
+
+               clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+               ret = _vcpu_run(vm, vcpu_id);
+               ts_diff = timespec_elapsed(start);
+
+               TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+               TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
+                           "Invalid guest sync status: exit_reason=%s\n",
+                           exit_reason_str(run->exit_reason));
+
+               pr_debug("Got sync event from vCPU %d\n", vcpu_id);
+               stage = READ_ONCE(*current_stage);
+
+               /*
+                * Here we can know the execution time of every
+                * single vcpu running in different test stages.
+                */
+               pr_debug("vCPU %d has completed stage %s\n"
+                        "execution time is: %ld.%.9lds\n\n",
+                        vcpu_id, test_stage_string[stage],
+                        ts_diff.tv_sec, ts_diff.tv_nsec);
+
+               /* Tell the host this vCPU is done with the stage */
+               ret = sem_post(&test_stage_completed);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+
+       return NULL;
+}
+
+/* Command line parameters handed to run_test() for every guest mode */
+struct test_params {
+       uint64_t phys_offset;           /* -p: test memory GPA, 0 = pick automatically */
+       uint64_t test_mem_size;         /* -b: requested test memory size in bytes */
+       enum vm_mem_backing_src_type src_type;  /* -s: backing src of test memory */
+};
+
+/*
+ * Create the VM and set up everything the test stages rely on.
+ *
+ * mode is the guest mode to test and arg points to a struct test_params.
+ * The function rounds the test memory size up to the larger of the backing
+ * page size and the guest page size, creates a VM with nr_vcpus vCPUs running
+ * guest_code(), adds and maps the test memory slot, fills in test_args and
+ * copies it to the guest, initializes both semaphores and parks
+ * *current_stage at NUM_TEST_STAGES. Returns the new VM.
+ */
+static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
+{
+       int ret;
+       struct test_params *p = arg;
+       struct vcpu_args *vcpu_args;
+       enum vm_mem_backing_src_type src_type = p->src_type;
+       uint64_t large_page_size = get_backing_src_pagesz(src_type);
+       uint64_t guest_page_size = vm_guest_mode_params[mode].page_size;
+       uint64_t host_page_size = getpagesize();
+       uint64_t test_mem_size = p->test_mem_size;
+       uint64_t guest_num_pages;
+       uint64_t alignment;
+       void *host_test_mem;
+       struct kvm_vm *vm;
+       int vcpu_id;
+
+       /* Align up the test memory size */
+       alignment = max(large_page_size, guest_page_size);
+       test_mem_size = (test_mem_size + alignment - 1) & ~(alignment - 1);
+
+       /* Create a VM with enough guest pages */
+       guest_num_pages = test_mem_size / guest_page_size;
+       vm = vm_create_with_vcpus(mode, nr_vcpus,
+                                 guest_num_pages, 0, guest_code, NULL);
+
+       /* Align down GPA of the testing memslot */
+       if (!p->phys_offset)
+               guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+                                      guest_page_size;
+       else
+               guest_test_phys_mem = p->phys_offset;
+#ifdef __s390x__
+       /* s390x: use an alignment of at least 1M */
+       alignment = max(0x100000, alignment);
+#endif
+       guest_test_phys_mem &= ~(alignment - 1);
+
+       /* Set up the shared data structure test_args */
+       test_args.vm = vm;
+       test_args.guest_test_virt_mem = guest_test_virt_mem;
+       test_args.host_page_size = host_page_size;
+       test_args.host_num_pages = test_mem_size / host_page_size;
+       test_args.large_page_size = large_page_size;
+       test_args.large_num_pages = test_mem_size / large_page_size;
+       test_args.host_pages_per_lpage = large_page_size / host_page_size;
+       test_args.src_type = src_type;
+
+       /* vCPUs with an even ID write, vCPUs with an odd ID read */
+       for (vcpu_id = 0; vcpu_id < KVM_MAX_VCPUS; vcpu_id++) {
+               vcpu_args = &test_args.vcpu_args[vcpu_id];
+               vcpu_args->vcpu_id = vcpu_id;
+               vcpu_args->vcpu_write = !(vcpu_id % 2);
+       }
+
+       /* Add an extra memory slot with specified backing src type */
+       vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem,
+                                   TEST_MEM_SLOT_INDEX, guest_num_pages, 0);
+
+       /* Do mapping(GVA->GPA) for the testing memory slot */
+       virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+       /* Cache the HVA pointer of the region */
+       host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
+
+       /* Export shared structure test_args to guest */
+       ucall_init(vm, NULL);
+       sync_global_to_guest(vm, test_args);
+
+       ret = sem_init(&test_stage_updated, 0, 0);
+       TEST_ASSERT(ret == 0, "Error in sem_init");
+
+       ret = sem_init(&test_stage_completed, 0, 0);
+       TEST_ASSERT(ret == 0, "Error in sem_init");
+
+       /* Host-side pointer to the guest's stage variable; no valid stage yet */
+       current_stage = addr_gva2hva(vm, (vm_vaddr_t)(&guest_test_stage));
+       *current_stage = NUM_TEST_STAGES;
+
+       pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+       pr_info("Testing memory backing src type: %s\n",
+               vm_mem_backing_src_alias(src_type)->name);
+       pr_info("Testing memory backing src granularity: 0x%lx\n",
+               large_page_size);
+       pr_info("Testing memory size(aligned): 0x%lx\n", test_mem_size);
+       pr_info("Guest physical test memory offset: 0x%lx\n",
+               guest_test_phys_mem);
+       pr_info("Host  virtual  test memory offset: 0x%lx\n",
+               (uint64_t)host_test_mem);
+       pr_info("Number of testing vCPUs: %d\n", nr_vcpus);
+
+       return vm;
+}
+
+/*
+ * Run one test stage on all vCPUs: release every vCPU worker once, then
+ * block until each of them has reported completion of the stage.
+ */
+static void vcpus_complete_new_stage(enum test_stage stage)
+{
+       int nr_woken;
+       int nr_done = 0;
+       int ret;
+
+       /* One post per vCPU so that every worker runs the new stage */
+       for (nr_woken = 0; nr_woken < nr_vcpus; nr_woken++) {
+               ret = sem_post(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+       pr_debug("All vcpus have been notified to continue\n");
+
+       /* Collect one completion per vCPU */
+       while (nr_done < nr_vcpus) {
+               ret = sem_wait(&test_stage_completed);
+               TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+               nr_done++;
+               pr_debug("%d vcpus have completed stage %s\n",
+                        nr_done, test_stage_string[stage]);
+       }
+
+       pr_debug("All vcpus have completed stage %s\n",
+                test_stage_string[stage]);
+}
+
+/*
+ * Run the whole test for one guest mode.
+ *
+ * mode is the guest mode and arg points to the struct test_params built in
+ * main(). The function creates the VM, starts one worker thread per vCPU,
+ * drives the stages KVM_BEFORE_MAPPINGS, KVM_CREATE_MAPPINGS,
+ * KVM_UPDATE_MAPPINGS (dirty logging enabled on the test slot) and
+ * KVM_ADJUST_MAPPINGS (dirty logging disabled again), prints the total time
+ * of each timed stage, and finally stops the workers and frees everything.
+ */
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+       int ret;
+       pthread_t *vcpu_threads;
+       struct kvm_vm *vm;
+       int vcpu_id;
+       struct timespec start;
+       struct timespec ts_diff;
+
+       /* Create VM with vCPUs and make some pre-initialization */
+       vm = pre_init_before_test(mode, arg);
+
+       vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+       TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+       host_quit = false;
+       *current_stage = KVM_BEFORE_MAPPINGS;
+
+       /*
+        * A worker that was never created would never post
+        * test_stage_completed and the host would wait forever, so fail
+        * right away. pthread functions return the error number directly.
+        */
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               ret = pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+                                    &test_args.vcpu_args[vcpu_id]);
+               TEST_ASSERT(ret == 0, "pthread_create failed for vCPU %d, rc: %d",
+                           vcpu_id, ret);
+       }
+
+       vcpus_complete_new_stage(*current_stage);
+       pr_info("Started all vCPUs successfully\n");
+
+       /* Test the stage of KVM creating mappings */
+       *current_stage = KVM_CREATE_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_CREATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Test the stage of KVM updating mappings */
+       vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
+                               KVM_MEM_LOG_DIRTY_PAGES);
+
+       *current_stage = KVM_UPDATE_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_UPDATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Test the stage of KVM adjusting mappings */
+       vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
+
+       *current_stage = KVM_ADJUST_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_ADJUST_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Tell the vcpu thread to quit */
+       host_quit = true;
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               ret = sem_post(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               ret = pthread_join(vcpu_threads[vcpu_id], NULL);
+               TEST_ASSERT(ret == 0, "pthread_join failed for vCPU %d, rc: %d",
+                           vcpu_id, ret);
+       }
+
+       ret = sem_destroy(&test_stage_updated);
+       TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+       ret = sem_destroy(&test_stage_completed);
+       TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+       free(vcpu_threads);
+       ucall_uninit(vm);
+       kvm_vm_free(vm);
+}
+
+/* Print the usage text; name is the program name shown in the usage line */
+static void help(char *name)
+{
+       printf("\nusage: %s [-h] [-p offset] [-m mode] "
+              "[-b mem-size] [-v vcpus] [-s mem-type]\n\n", name);
+       printf(" -p: specify guest physical test memory offset\n"
+              "     Warning: a low offset can conflict with the loaded test code.\n");
+       guest_modes_help();
+       printf(" -b: specify size of the memory region for testing. e.g. 10M or 3G.\n"
+              "     (default: 1G)\n"
+              " -v: specify the number of vCPUs to run\n"
+              "     (default: 1)\n"
+              " -s: specify the type of memory that should be used to\n"
+              "     back the guest data region.\n"
+              "     (default: anonymous)\n\n");
+       backing_src_help();
+       printf("\n");
+}
+
+/*
+ * Parse the command line and run the test once for every selected guest
+ * mode. Options: -p GPA of the test memory, -m guest mode, -b test memory
+ * size, -v number of vCPUs, -s memory backing src type, -h help.
+ */
+int main(int argc, char *argv[])
+{
+       int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+       struct test_params p = {
+               .test_mem_size = DEFAULT_TEST_MEM_SIZE,
+               .src_type = VM_MEM_SRC_ANONYMOUS,
+       };
+       char *end;
+       long val;
+       int opt;
+
+       guest_modes_append_default();
+
+       while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) {
+               switch (opt) {
+               case 'p':
+                       p.phys_offset = strtoull(optarg, NULL, 0);
+                       break;
+               case 'm':
+                       guest_modes_cmdline(optarg);
+                       break;
+               case 'b':
+                       p.test_mem_size = parse_size(optarg);
+                       break;
+               case 'v':
+                       /*
+                        * strtol() lets us reject empty input, trailing
+                        * garbage and out-of-range values, which atoi()
+                        * would silently accept or wrap.
+                        */
+                       val = strtol(optarg, &end, 10);
+                       TEST_ASSERT(end != optarg && *end == '\0' &&
+                                   val > 0 && val <= max_vcpus,
+                                   "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+                       nr_vcpus = val;
+                       break;
+               case 's':
+                       p.src_type = parse_backing_src_type(optarg);
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       exit(0);
+               }
+       }
+
+       for_each_guest_mode(run_test, &p);
+
+       return 0;
+}
index 5ebbd0d..71ade61 100644 (file)
@@ -71,9 +71,9 @@ test_assert(bool exp, const char *exp_str,
 
                fprintf(stderr, "==== Test Assertion Failure ====\n"
                        "  %s:%u: %s\n"
-                       "  pid=%d tid=%d - %s\n",
+                       "  pid=%d tid=%d errno=%d - %s\n",
                        file, line, exp_str, getpid(), _gettid(),
-                       strerror(errno));
+                       errno, strerror(errno));
                test_dump_stack();
                if (fmt) {
                        fputs("  ", stderr);
index b8849a1..fc83f6c 100644 (file)
@@ -18,7 +18,6 @@
 #include <unistd.h>
 #include <linux/kernel.h>
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN       2
 
 static int vcpu_mmap_sz(void);
@@ -143,17 +142,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
                "rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-       "PA-bits:52,  VA-bits:48,  4K pages",
-       "PA-bits:52,  VA-bits:48, 64K pages",
-       "PA-bits:48,  VA-bits:48,  4K pages",
-       "PA-bits:48,  VA-bits:48, 64K pages",
-       "PA-bits:40,  VA-bits:48,  4K pages",
-       "PA-bits:40,  VA-bits:48, 64K pages",
-       "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-              "Missing new mode strings?");
+/*
+ * Return the human readable description of guest mode i.
+ * Asserts that i is a valid mode ID (i < NUM_VM_MODES).
+ */
+const char *vm_guest_mode_string(uint32_t i)
+{
+       static const char * const strings[] = {
+               [VM_MODE_P52V48_4K]     = "PA-bits:52,  VA-bits:48,  4K pages",
+               [VM_MODE_P52V48_64K]    = "PA-bits:52,  VA-bits:48, 64K pages",
+               [VM_MODE_P48V48_4K]     = "PA-bits:48,  VA-bits:48,  4K pages",
+               [VM_MODE_P48V48_64K]    = "PA-bits:48,  VA-bits:48, 64K pages",
+               [VM_MODE_P40V48_4K]     = "PA-bits:40,  VA-bits:48,  4K pages",
+               [VM_MODE_P40V48_64K]    = "PA-bits:40,  VA-bits:48, 64K pages",
+               [VM_MODE_PXXV48_4K]     = "PA-bits:ANY, VA-bits:48,  4K pages",
+       };
+       _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+                      "Missing new mode strings?");
+
+       /* i is unsigned, so print it with %u */
+       TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %u too big", i);
+
+       return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
        { 52, 48,  0x1000, 12 },
@@ -514,7 +520,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu)
        ret = munmap(vcpu->state, vcpu_mmap_sz());
        TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
                "errno: %i", ret, errno);
-       close(vcpu->fd);
+       ret = close(vcpu->fd);
        TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
                "errno: %i", ret, errno);
 
@@ -534,7 +540,7 @@ void kvm_vm_release(struct kvm_vm *vmp)
        TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
                "  vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
 
-       close(vmp->kvm_fd);
+       ret = close(vmp->kvm_fd);
        TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n"
                "  vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
 }
@@ -681,7 +687,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
        int ret;
        struct userspace_mem_region *region;
-       size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+       size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
        size_t alignment;
 
        TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -743,7 +749,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
        if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-               alignment = max(huge_page_size, alignment);
+               alignment = max(backing_src_pagesz, alignment);
 
        /* Add enough memory to align up if necessary */
        if (alignment > 1)
@@ -752,7 +758,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region->mmap_start = mmap(NULL, region->mmap_size,
                                  PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS
-                                 | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+                                 | vm_mem_backing_src_alias(src_type)->flag,
                                  -1, 0);
        TEST_ASSERT(region->mmap_start != MAP_FAILED,
                    "test_malloc failed, mmap_start: %p errno: %i",
@@ -762,22 +768,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region->host_mem = align(region->mmap_start, alignment);
 
        /* As needed perform madvise */
-       if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
-               struct stat statbuf;
-
-               ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-               TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-                           "stat /sys/kernel/mm/transparent_hugepage");
-
-               TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-                           "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
-
-               if (ret == 0) {
-                       ret = madvise(region->host_mem, npages * vm->page_size,
-                                     src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-                       TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
-                                   region->host_mem, npages * vm->page_size, src_type);
-               }
+       if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+            src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+               ret = madvise(region->host_mem, npages * vm->page_size,
+                             src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+               TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
+                           region->host_mem, npages * vm->page_size,
+                           vm_mem_backing_src_alias(src_type)->name);
        }
 
        region->unused_phy_pages = sparsebit_alloc();
@@ -1733,6 +1730,81 @@ int _kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
        return ioctl(vm->kvm_fd, cmd, arg);
 }
 
+/*
+ * Device Ioctl
+ */
+
+/*
+ * Issue KVM_HAS_DEVICE_ATTR on dev_fd for the attribute (group, attr) and
+ * return the raw ioctl() result without asserting on it.
+ */
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
+{
+       struct kvm_device_attr query = { 0 };
+
+       query.group = group;
+       query.attr = attr;
+
+       return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &query);
+}
+
+/* Same as _kvm_device_check_attr(), but asserts that the ioctl succeeded */
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
+{
+       int ret;
+
+       ret = _kvm_device_check_attr(dev_fd, group, attr);
+       TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno);
+
+       return ret;
+}
+
+/*
+ * Issue KVM_CREATE_DEVICE on the VM for a device of the given type. When
+ * test is true, KVM_CREATE_DEVICE_TEST is passed as flag. The fd field of
+ * the request (preset to -1) is stored in *fd; the raw ioctl() result is
+ * returned.
+ */
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd)
+{
+       struct kvm_create_device request = {
+               .type = type,
+               .fd = -1,
+               .flags = test ? KVM_CREATE_DEVICE_TEST : 0,
+       };
+       int rc = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &request);
+
+       *fd = request.fd;
+       return rc;
+}
+
+/*
+ * Create a device of the given type. In test mode the raw ioctl result is
+ * returned unchecked; otherwise success is asserted and the new device fd
+ * is returned.
+ */
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test)
+{
+       int ret;
+       int fd;
+
+       ret = _kvm_create_device(vm, type, test, &fd);
+       if (test)
+               return ret;
+
+       TEST_ASSERT(ret >= 0,
+                   "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno);
+       return fd;
+}
+
+/*
+ * Read or write the device attribute (group, attr) through val: issues
+ * KVM_SET_DEVICE_ATTR when write is true, KVM_GET_DEVICE_ATTR otherwise.
+ * Returns the raw ioctl() result.
+ */
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                      void *val, bool write)
+{
+       unsigned long cmd = write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR;
+       struct kvm_device_attr kvmattr = {
+               .group = group,
+               .attr = attr,
+               .flags = 0,
+               .addr = (uintptr_t)val,
+       };
+
+       return ioctl(dev_fd, cmd, &kvmattr);
+}
+
+/* Same as _kvm_device_access(), but asserts that the ioctl succeeded */
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write)
+{
+       int ret;
+
+       ret = _kvm_device_access(dev_fd, group, attr, val, write);
+       TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno);
+
+       return ret;
+}
+
 /*
  * VM Dump
  *
index 031ba3c..a0d0c83 100644 (file)
@@ -1890,7 +1890,6 @@ void sparsebit_validate_internal(struct sparsebit *s)
  */
 
 #include <stdlib.h>
-#include <assert.h>
 
 struct range {
        sparsebit_idx_t first, last;
index 906c955..63d2bc7 100644 (file)
@@ -10,6 +10,8 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <time.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -111,28 +113,169 @@ void print_skip(const char *fmt, ...)
        puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-       {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-       {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-       {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
+/*
+ * Report whether /sys/kernel/mm/transparent_hugepage exists. Any stat()
+ * failure other than ENOENT is treated as a test failure.
+ */
+bool thp_configured(void)
+{
+       struct stat statbuf;
+       int ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+
+       TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+                   "Error in stating /sys/kernel/mm/transparent_hugepage");
+
+       return !ret;
+}
+
+/*
+ * Return the value read from
+ * /sys/kernel/mm/transparent_hugepage/hpage_pmd_size.
+ *
+ * Asserts that THP is configured, that the file can be opened and that a
+ * number could actually be parsed from it.
+ */
+size_t get_trans_hugepagesz(void)
+{
+       size_t size = 0;
+       FILE *f;
+       int ret;
+
+       TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+       f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+       TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");
+
+       /* %zu is the conversion matching size_t; a failed parse must not go unnoticed */
+       ret = fscanf(f, "%zu", &size);
+       fclose(f);
+       TEST_ASSERT(ret == 1, "Error in reading transparent_hugepage/hpage_pmd_size");
+
+       return size;
+}
+
+/*
+ * Return the "Hugepagesize:" value of /proc/meminfo converted from kB to
+ * bytes. Fails the test if the file cannot be opened or has no such line.
+ */
+size_t get_def_hugetlb_pagesz(void)
+{
+       static const char tag[] = "Hugepagesize:";
+       const size_t tag_len = sizeof(tag) - 1;
+       unsigned long long kb;
+       char buf[64];
+       FILE *f;
+
+       f = fopen("/proc/meminfo", "r");
+       TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo");
+
+       while (fgets(buf, sizeof(buf), f) != NULL) {
+               /* Only a line that starts with the tag is of interest */
+               if (strncmp(buf, tag, tag_len) != 0)
+                       continue;
+
+               kb = strtoull(buf + tag_len, NULL, 10);
+               fclose(f);
+               return kb << 10;
+       }
+
+       if (!feof(f))
+               TEST_FAIL("Error in reading /proc/meminfo");
+       else
+               TEST_FAIL("HUGETLB is not configured in host kernel");
+
+       fclose(f);
+       return 0;
+}
+
+/*
+ * Return the alias entry of backing src type i: its command line name and
+ * the extra flag that is OR-ed into the mmap() flags for that type.
+ * Asserts that i is a valid type ID (i < NUM_SRC_TYPES).
+ */
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+       static const struct vm_mem_backing_src_alias aliases[] = {
+               [VM_MEM_SRC_ANONYMOUS] = {
+                       .name = "anonymous",
+                       .flag = 0,
+               },
+               [VM_MEM_SRC_ANONYMOUS_THP] = {
+                       .name = "anonymous_thp",
+                       .flag = 0,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
+                       .name = "anonymous_hugetlb",
+                       .flag = MAP_HUGETLB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
+                       .name = "anonymous_hugetlb_16kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
+                       .name = "anonymous_hugetlb_64kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_64KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
+                       .name = "anonymous_hugetlb_512kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_512KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
+                       .name = "anonymous_hugetlb_1mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_1MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
+                       .name = "anonymous_hugetlb_2mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_2MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
+                       .name = "anonymous_hugetlb_8mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_8MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
+                       .name = "anonymous_hugetlb_16mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
+                       .name = "anonymous_hugetlb_32mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_32MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
+                       .name = "anonymous_hugetlb_256mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_256MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
+                       .name = "anonymous_hugetlb_512mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_512MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
+                       .name = "anonymous_hugetlb_1gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_1GB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
+                       .name = "anonymous_hugetlb_2gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_2GB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
+                       .name = "anonymous_hugetlb_16gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16GB,
+               },
+       };
+       _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
+                      "Missing new backing src types?");
+
+       /* i is unsigned, so print it with %u */
+       TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %u too big", i);
+
+       return &aliases[i];
+}
+
+/*
+ * Decode the page size from mmap() flags: the field selected by
+ * MAP_HUGE_SHIFT/MAP_HUGE_MASK is used as the power of two of the size.
+ * The argument is parenthesized so that expressions such as (a | b) can be
+ * passed safely.
+ */
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << (((x) >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
+/*
+ * Return the page size in bytes of backing src type i: the host page size
+ * for anonymous memory, the THP size for anonymous_thp, the default HUGETLB
+ * page size for anonymous_hugetlb, and the size encoded in the mmap() flag
+ * for all HUGETLB types with an explicit size.
+ */
+size_t get_backing_src_pagesz(uint32_t i)
+{
+       uint32_t flag = vm_mem_backing_src_alias(i)->flag;
+
+       switch (i) {
+       case VM_MEM_SRC_ANONYMOUS:
+               return getpagesize();
+       case VM_MEM_SRC_ANONYMOUS_THP:
+               return get_trans_hugepagesz();
+       case VM_MEM_SRC_ANONYMOUS_HUGETLB:
+               return get_def_hugetlb_pagesz();
+       default:
+               return MAP_HUGE_PAGE_SIZE(flag);
+       }
+}
 
 void backing_src_help(void)
 {
        int i;
 
        printf("Available backing src types:\n");
-       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
-               printf("\t%s\n", backing_src_aliases[i].name);
+       for (i = 0; i < NUM_SRC_TYPES; i++)
+               printf("\t%s\n", vm_mem_backing_src_alias(i)->name);
 }
 
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
 {
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
-               if (!strcmp(type_name, backing_src_aliases[i].name))
-                       return backing_src_aliases[i].type;
+       for (i = 0; i < NUM_SRC_TYPES; i++)
+               if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name))
+                       return i;
 
        backing_src_help();
        TEST_FAIL("Unknown backing src type: %s", type_name);
index f127ed3..978f5b5 100644 (file)
@@ -329,6 +329,22 @@ static void test_zero_memory_regions(void)
 }
 #endif /* __x86_64__ */
 
+/* Registers one memslot through a raw KVM_SET_USER_MEMORY_REGION ioctl. */
+static int test_memory_region_add(struct kvm_vm *vm, void *mem, uint32_t slot,
+                                  uint32_t size, uint64_t guest_addr)
+{
+       struct kvm_userspace_memory_region region = {
+               .slot = slot,
+               .flags = 0,
+               .guest_phys_addr = guest_addr,
+               .memory_size = size,
+               .userspace_addr = (uintptr_t) mem,
+       };
+
+       /* The ioctl() result is handed back unchanged for the caller to check. */
+       return ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, &region);
+}
+
 /*
  * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any
  * tentative to add further slots should fail.
@@ -339,9 +355,15 @@ static void test_add_max_memory_regions(void)
        struct kvm_vm *vm;
        uint32_t max_mem_slots;
        uint32_t slot;
-       uint64_t guest_addr = 0x0;
-       uint64_t mem_reg_npages;
-       void *mem;
+       void *mem, *mem_aligned, *mem_extra;
+       size_t alignment;
+
+#ifdef __s390x__
+       /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
+       alignment = 0x100000;
+#else
+       alignment = 1;
+#endif
 
        max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
        TEST_ASSERT(max_mem_slots > 0,
@@ -350,30 +372,37 @@ static void test_add_max_memory_regions(void)
 
        vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
 
-       mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE);
-
        /* Check it can be added memory slots up to the maximum allowed */
        pr_info("Adding slots 0..%i, each memory region with %dK size\n",
                (max_mem_slots - 1), MEM_REGION_SIZE >> 10);
+
+       mem = mmap(NULL, MEM_REGION_SIZE * max_mem_slots + alignment,
+                  PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
+       mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1));
+
        for (slot = 0; slot < max_mem_slots; slot++) {
-               vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
-                                           guest_addr, slot, mem_reg_npages,
-                                           0);
-               guest_addr += MEM_REGION_SIZE;
+               ret = test_memory_region_add(vm, mem_aligned +
+                                            ((uint64_t)slot * MEM_REGION_SIZE),
+                                            slot, MEM_REGION_SIZE,
+                                            (uint64_t)slot * MEM_REGION_SIZE);
+               TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+                           "  rc: %i errno: %i slot: %i\n",
+                           ret, errno, slot);
        }
 
        /* Check it cannot be added memory slots beyond the limit */
-       mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE,
-                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
+       mem_extra = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       TEST_ASSERT(mem_extra != MAP_FAILED, "Failed to mmap() host");
 
-       ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION,
-                   &(struct kvm_userspace_memory_region) {slot, 0, guest_addr,
-                   MEM_REGION_SIZE, (uint64_t) mem});
+       ret = test_memory_region_add(vm, mem_extra, max_mem_slots, MEM_REGION_SIZE,
+                                    (uint64_t)max_mem_slots * MEM_REGION_SIZE);
        TEST_ASSERT(ret == -1 && errno == EINVAL,
                    "Adding one more memory slot should fail with EINVAL");
 
-       munmap(mem, MEM_REGION_SIZE);
+       munmap(mem, MEM_REGION_SIZE * max_mem_slots + alignment);
+       munmap(mem_extra, MEM_REGION_SIZE);
        kvm_vm_free(vm);
 }
 
index 804ff5f..1f4a059 100644 (file)
@@ -186,7 +186,7 @@ int main(int argc, char *argv[])
                vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
        }
 
-       struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);;
+       struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
        rs->state = 0x5a;
 
        for (;;) {
diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore
new file mode 100644 (file)
index 0000000..470203a
--- /dev/null
@@ -0,0 +1,2 @@
+/*_test
+/true
diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
new file mode 100644 (file)
index 0000000..a99596c
--- /dev/null
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -Wall -O2
+
+src_test := $(wildcard *_test.c)
+
+TEST_GEN_PROGS := $(src_test:.c=)
+
+TEST_GEN_PROGS_EXTENDED := true
+
+KSFT_KHDR_INSTALL := 1
+OVERRIDE_TARGETS := 1
+include ../lib.mk
+
+khdr_dir = $(top_srcdir)/usr/include
+
+$(khdr_dir)/linux/landlock.h: khdr
+       @:
+
+$(OUTPUT)/true: true.c
+       $(LINK.c) $< $(LDLIBS) -o $@ -static
+
+$(OUTPUT)/%_test: %_test.c $(khdr_dir)/linux/landlock.h ../kselftest_harness.h common.h
+       $(LINK.c) $< $(LDLIBS) -o $@ -lcap -I$(khdr_dir)
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
new file mode 100644 (file)
index 0000000..ca40abe
--- /dev/null
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - Common user space base
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019-2020 ANSSI
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/landlock.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "common.h"
+
+#ifndef O_PATH
+#define O_PATH         010000000
+#endif
+
+TEST(inconsistent_attr) {
+       const long page_size = sysconf(_SC_PAGESIZE);
+       /* Zeroed: the ENOMSG checks below depend on the buffer content. */
+       char *const buf = calloc(1, page_size + 1);
+       struct landlock_ruleset_attr *const ruleset_attr = (void *)buf;
+
+       ASSERT_NE(NULL, buf);
+
+       /* Checks copy_from_user(). */
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, 0, 0));
+       /* The size is less than sizeof(struct landlock_ruleset_attr). */
+       ASSERT_EQ(EINVAL, errno);
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, 1, 0));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(NULL, 1, 0));
+       /* A NULL pointer is reported before the too-small size. */
+       ASSERT_EQ(EFAULT, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(NULL,
+                               sizeof(struct landlock_ruleset_attr), 0));
+       ASSERT_EQ(EFAULT, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size + 1, 0));
+       ASSERT_EQ(E2BIG, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr,
+                               sizeof(struct landlock_ruleset_attr), 0));
+       ASSERT_EQ(ENOMSG, errno);
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size, 0));
+       ASSERT_EQ(ENOMSG, errno);
+
+       /* Checks non-zero value. */
+       buf[page_size - 2] = '.';
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size, 0));
+       ASSERT_EQ(E2BIG, errno);
+       ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size + 1, 0));
+       ASSERT_EQ(E2BIG, errno);
+
+       free(buf);
+}
+
+TEST(abi_version) {
+       const struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
+       };
+       ASSERT_EQ(1, landlock_create_ruleset(NULL, 0,
+                               LANDLOCK_CREATE_RULESET_VERSION));
+       /* The version query is refused with a non-NULL attr or non-zero size. */
+       ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
+                               LANDLOCK_CREATE_RULESET_VERSION));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(NULL, sizeof(ruleset_attr),
+                               LANDLOCK_CREATE_RULESET_VERSION));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr,
+                               sizeof(ruleset_attr),
+                               LANDLOCK_CREATE_RULESET_VERSION));
+       ASSERT_EQ(EINVAL, errno);
+       /* Extra flag bit; 1U keeps the shift into bit 31 well-defined. */
+       ASSERT_EQ(-1, landlock_create_ruleset(NULL, 0,
+                               LANDLOCK_CREATE_RULESET_VERSION | 1U << 31));
+       ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(inval_create_ruleset_flags) {
+       const int last_flag = LANDLOCK_CREATE_RULESET_VERSION;
+       const int invalid_flag = last_flag << 1;
+       const struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
+       };
+
+       ASSERT_EQ(-1, landlock_create_ruleset(NULL, 0, invalid_flag));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, invalid_flag));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(NULL, sizeof(ruleset_attr),
+                               invalid_flag));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr,
+                               sizeof(ruleset_attr), invalid_flag));
+       ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(empty_path_beneath_attr) {
+       const struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_EXECUTE,
+       };
+       const int ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Similar to struct landlock_path_beneath_attr.parent_fd = 0 */
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               NULL, 0));
+       ASSERT_EQ(EFAULT, errno);
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+TEST(inval_fd_enforce) {
+       ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+
+       ASSERT_EQ(-1, landlock_restrict_self(-1, 0));
+       ASSERT_EQ(EBADF, errno);
+}
+
+TEST(unpriv_enforce_without_no_new_privs) {
+       /* All capabilities dropped and no_new_privs not set. */
+       drop_caps(_metadata);
+
+       /* Check the return value first: errno is only valid on failure. */
+       ASSERT_EQ(-1, landlock_restrict_self(-1, 0));
+       ASSERT_EQ(EPERM, errno);
+}
+
+TEST(ruleset_fd_io)
+{
+       struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
+       };
+       int ruleset_fd;
+       char buf;
+
+       drop_caps(_metadata);
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_LE(0, ruleset_fd);
+
+       ASSERT_EQ(-1, write(ruleset_fd, ".", 1));
+       ASSERT_EQ(EINVAL, errno);
+       ASSERT_EQ(-1, read(ruleset_fd, &buf, 1));
+       ASSERT_EQ(EINVAL, errno);
+
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/* Tests enforcement of a ruleset FD transferred through a UNIX socket. */
+TEST(ruleset_fd_transfer)
+{
+       struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR,
+       };
+       struct landlock_path_beneath_attr path_beneath_attr = {
+               .allowed_access = LANDLOCK_ACCESS_FS_READ_DIR,
+       };
+       int ruleset_fd_tx, dir_fd;
+       union {
+               /* Aligned ancillary data buffer. */
+               char buf[CMSG_SPACE(sizeof(ruleset_fd_tx))];
+               struct cmsghdr _align;
+       } cmsg_tx = {};
+       char data_tx = '.';
+       struct iovec io = {
+               .iov_base = &data_tx,
+               .iov_len = sizeof(data_tx),
+       };
+       struct msghdr msg = {
+               .msg_iov = &io,
+               .msg_iovlen = 1,
+               .msg_control = &cmsg_tx.buf,
+               .msg_controllen = sizeof(cmsg_tx.buf),
+       };
+       struct cmsghdr *cmsg;
+       int socket_fds[2];
+       pid_t child;
+       int status;
+
+       drop_caps(_metadata);
+
+       /* Creates a test ruleset with a simple rule. */
+       ruleset_fd_tx = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_LE(0, ruleset_fd_tx);
+       path_beneath_attr.parent_fd = open("/tmp", O_PATH | O_NOFOLLOW |
+                       O_DIRECTORY | O_CLOEXEC);
+       ASSERT_LE(0, path_beneath_attr.parent_fd);
+       ASSERT_EQ(0, landlock_add_rule(ruleset_fd_tx, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath_attr, 0));
+       ASSERT_EQ(0, close(path_beneath_attr.parent_fd));
+
+       cmsg = CMSG_FIRSTHDR(&msg);
+       ASSERT_NE(NULL, cmsg);
+       cmsg->cmsg_len = CMSG_LEN(sizeof(ruleset_fd_tx));
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_RIGHTS;
+       memcpy(CMSG_DATA(cmsg), &ruleset_fd_tx, sizeof(ruleset_fd_tx));
+
+       /* Sends the ruleset FD over a socketpair and then closes it. */
+       ASSERT_EQ(0, socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socket_fds));
+       ASSERT_EQ(sizeof(data_tx), sendmsg(socket_fds[0], &msg, 0));
+       ASSERT_EQ(0, close(socket_fds[0]));
+       ASSERT_EQ(0, close(ruleset_fd_tx));
+
+       child = fork();
+       ASSERT_LE(0, child);
+       if (child == 0) {
+               int ruleset_fd_rx;
+
+               *(char *)msg.msg_iov->iov_base = '\0';
+               ASSERT_EQ(sizeof(data_tx), recvmsg(socket_fds[1], &msg, MSG_CMSG_CLOEXEC));
+               ASSERT_EQ('.', *(char *)msg.msg_iov->iov_base);
+               ASSERT_EQ(0, close(socket_fds[1]));
+               cmsg = CMSG_FIRSTHDR(&msg);
+               ASSERT_NE(NULL, cmsg);
+               ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(ruleset_fd_tx)));
+               memcpy(&ruleset_fd_rx, CMSG_DATA(cmsg), sizeof(ruleset_fd_tx));
+
+               /* Enforces the received ruleset on the child. */
+               ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+               ASSERT_EQ(0, landlock_restrict_self(ruleset_fd_rx, 0));
+               ASSERT_EQ(0, close(ruleset_fd_rx));
+
+               /* Checks the ruleset enforcement. */
+               ASSERT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+               ASSERT_EQ(EACCES, errno);
+               dir_fd = open("/tmp", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+               ASSERT_LE(0, dir_fd);
+               ASSERT_EQ(0, close(dir_fd));
+               _exit(_metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
+       }
+
+       ASSERT_EQ(0, close(socket_fds[1]));
+
+       /* Checks that the parent is unrestricted. */
+       dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+       ASSERT_LE(0, dir_fd);
+       ASSERT_EQ(0, close(dir_fd));
+       dir_fd = open("/tmp", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+       ASSERT_LE(0, dir_fd);
+       ASSERT_EQ(0, close(dir_fd));
+
+       ASSERT_EQ(child, waitpid(child, &status, 0));
+       ASSERT_EQ(1, WIFEXITED(status));
+       ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
new file mode 100644 (file)
index 0000000..20e2a92
--- /dev/null
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Landlock test helpers
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019-2020 ANSSI
+ * Copyright © 2021 Microsoft Corporation
+ */
+
+#include <errno.h>
+#include <linux/landlock.h>
+#include <sys/capability.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/*
+ * TEST_F_FORK() is useful when a test drops privileges but the corresponding
+ * FIXTURE_TEARDOWN() requires them (e.g. to remove files from a directory
+ * where write actions are denied).  For convenience, FIXTURE_TEARDOWN() is
+ * also called when the test failed, but not when FIXTURE_SETUP() failed.  For
+ * this to be possible, we must not call abort() but instead exit smoothly
+ * (hence the step print).
+ */
+#define TEST_F_FORK(fixture_name, test_name) \
+       static void fixture_name##_##test_name##_child( \
+               struct __test_metadata *_metadata, \
+               FIXTURE_DATA(fixture_name) *self, \
+               const FIXTURE_VARIANT(fixture_name) *variant); \
+       TEST_F(fixture_name, test_name) \
+       { \
+               int status; \
+               const pid_t child = fork(); \
+               if (child < 0) \
+                       abort(); \
+               if (child == 0) { \
+                       _metadata->no_print = 1; \
+                       fixture_name##_##test_name##_child(_metadata, self, variant); \
+                       if (_metadata->skip) \
+                               _exit(255); \
+                       if (_metadata->passed) \
+                               _exit(0); \
+                       _exit(_metadata->step); \
+               } \
+               if (child != waitpid(child, &status, 0)) \
+                       abort(); \
+               if (WIFSIGNALED(status) || !WIFEXITED(status)) { \
+                       _metadata->passed = 0; \
+                       _metadata->step = 1; \
+                       return; \
+               } \
+               switch (WEXITSTATUS(status)) { \
+               case 0: \
+                       _metadata->passed = 1; \
+                       break; \
+               case 255: \
+                       _metadata->passed = 1; \
+                       _metadata->skip = 1; \
+                       break; \
+               default: \
+                       _metadata->passed = 0; \
+                       _metadata->step = WEXITSTATUS(status); \
+                       break; \
+               } \
+       } \
+       static void fixture_name##_##test_name##_child( \
+               struct __test_metadata __attribute__((unused)) *_metadata, \
+               FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \
+               const FIXTURE_VARIANT(fixture_name) \
+                       __attribute__((unused)) *variant)
+
+#ifndef landlock_create_ruleset
+static inline int landlock_create_ruleset(
+               const struct landlock_ruleset_attr *const attr,
+               const size_t size, const __u32 flags)
+{
+       return syscall(__NR_landlock_create_ruleset, attr, size, flags);
+}
+#endif
+
+#ifndef landlock_add_rule
+static inline int landlock_add_rule(const int ruleset_fd,
+               const enum landlock_rule_type rule_type,
+               const void *const rule_attr, const __u32 flags)
+{
+       return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type,
+                       rule_attr, flags);
+}
+#endif
+
+#ifndef landlock_restrict_self
+static inline int landlock_restrict_self(const int ruleset_fd,
+               const __u32 flags)
+{
+       return syscall(__NR_landlock_restrict_self, ruleset_fd, flags);
+}
+#endif
+
+static void _init_caps(struct __test_metadata *const _metadata, bool drop_all)
+{
+       cap_t cap_p;
+       /* Only these four capabilities are useful for the tests. */
+       const cap_value_t caps[] = {
+               CAP_DAC_OVERRIDE,
+               CAP_MKNOD,
+               CAP_SYS_ADMIN,
+               CAP_SYS_CHROOT,
+       };
+
+       cap_p = cap_get_proc();
+       EXPECT_NE(NULL, cap_p) {
+               TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
+       }
+       EXPECT_NE(-1, cap_clear(cap_p)) {
+               TH_LOG("Failed to cap_clear: %s", strerror(errno));
+       }
+       if (!drop_all) {
+               EXPECT_NE(-1, cap_set_flag(cap_p, CAP_PERMITTED,
+                                       ARRAY_SIZE(caps), caps, CAP_SET)) {
+                       TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
+               }
+       }
+       EXPECT_NE(-1, cap_set_proc(cap_p)) {
+               TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
+       }
+       EXPECT_NE(-1, cap_free(cap_p)) {
+               TH_LOG("Failed to cap_free: %s", strerror(errno));
+       }
+}
+
+/* We cannot put such helpers in a library because of kselftest_harness.h . */
+__attribute__((__unused__))
+static void disable_caps(struct __test_metadata *const _metadata)
+{
+       _init_caps(_metadata, false);
+}
+
+__attribute__((__unused__))
+static void drop_caps(struct __test_metadata *const _metadata)
+{
+       _init_caps(_metadata, true);
+}
+
+static void _effective_cap(struct __test_metadata *const _metadata,
+               const cap_value_t caps, const cap_flag_value_t value)
+{
+       cap_t cap_p;
+
+       cap_p = cap_get_proc();
+       EXPECT_NE(NULL, cap_p) {
+               TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
+       }
+       EXPECT_NE(-1, cap_set_flag(cap_p, CAP_EFFECTIVE, 1, &caps, value)) {
+               TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
+       }
+       EXPECT_NE(-1, cap_set_proc(cap_p)) {
+               TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
+       }
+       EXPECT_NE(-1, cap_free(cap_p)) {
+               TH_LOG("Failed to cap_free: %s", strerror(errno));
+       }
+}
+
+__attribute__((__unused__))
+static void set_cap(struct __test_metadata *const _metadata,
+               const cap_value_t caps)
+{
+       _effective_cap(_metadata, caps, CAP_SET);
+}
+
+__attribute__((__unused__))
+static void clear_cap(struct __test_metadata *const _metadata,
+               const cap_value_t caps)
+{
+       _effective_cap(_metadata, caps, CAP_CLEAR);
+}
diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config
new file mode 100644 (file)
index 0000000..0f0a652
--- /dev/null
@@ -0,0 +1,7 @@
+CONFIG_OVERLAY_FS=y
+CONFIG_SECURITY_LANDLOCK=y
+CONFIG_SECURITY_PATH=y
+CONFIG_SECURITY=y
+CONFIG_SHMEM=y
+CONFIG_TMPFS_XATTR=y
+CONFIG_TMPFS=y
diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
new file mode 100644 (file)
index 0000000..10c9a1e
--- /dev/null
@@ -0,0 +1,2791 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - Filesystem
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2020 ANSSI
+ * Copyright © 2020-2021 Microsoft Corporation
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <linux/landlock.h>
+#include <sched.h>
+#include <string.h>
+#include <sys/capability.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define TMP_DIR                "tmp"
+#define BINARY_PATH    "./true"
+
+/* Paths (sibling number and depth) */
+static const char dir_s1d1[] = TMP_DIR "/s1d1";
+static const char file1_s1d1[] = TMP_DIR "/s1d1/f1";
+static const char file2_s1d1[] = TMP_DIR "/s1d1/f2";
+static const char dir_s1d2[] = TMP_DIR "/s1d1/s1d2";
+static const char file1_s1d2[] = TMP_DIR "/s1d1/s1d2/f1";
+static const char file2_s1d2[] = TMP_DIR "/s1d1/s1d2/f2";
+static const char dir_s1d3[] = TMP_DIR "/s1d1/s1d2/s1d3";
+static const char file1_s1d3[] = TMP_DIR "/s1d1/s1d2/s1d3/f1";
+static const char file2_s1d3[] = TMP_DIR "/s1d1/s1d2/s1d3/f2";
+
+static const char dir_s2d1[] = TMP_DIR "/s2d1";
+static const char file1_s2d1[] = TMP_DIR "/s2d1/f1";
+static const char dir_s2d2[] = TMP_DIR "/s2d1/s2d2";
+static const char file1_s2d2[] = TMP_DIR "/s2d1/s2d2/f1";
+static const char dir_s2d3[] = TMP_DIR "/s2d1/s2d2/s2d3";
+static const char file1_s2d3[] = TMP_DIR "/s2d1/s2d2/s2d3/f1";
+static const char file2_s2d3[] = TMP_DIR "/s2d1/s2d2/s2d3/f2";
+
+static const char dir_s3d1[] = TMP_DIR "/s3d1";
+/* dir_s3d2 is a mount point. */
+static const char dir_s3d2[] = TMP_DIR "/s3d1/s3d2";
+static const char dir_s3d3[] = TMP_DIR "/s3d1/s3d2/s3d3";
+
+/*
+ * layout1 hierarchy:
+ *
+ * tmp
+ * ├── s1d1
+ * │   ├── f1
+ * │   ├── f2
+ * │   └── s1d2
+ * │       ├── f1
+ * │       ├── f2
+ * │       └── s1d3
+ * │           ├── f1
+ * │           └── f2
+ * ├── s2d1
+ * │   ├── f1
+ * │   └── s2d2
+ * │       ├── f1
+ * │       └── s2d3
+ * │           ├── f1
+ * │           └── f2
+ * └── s3d1
+ *     └── s3d2
+ *         └── s3d3
+ */
+
+static void mkdir_parents(struct __test_metadata *const _metadata,
+               const char *const path)
+{
+       char *walker;
+       const char *parent;
+       int i, err;
+
+       ASSERT_NE(path[0], '\0');
+       walker = strdup(path);
+       ASSERT_NE(NULL, walker);
+       parent = walker;
+       for (i = 1; walker[i]; i++) {
+               if (walker[i] != '/')
+                       continue;
+               walker[i] = '\0';
+               err = mkdir(parent, 0700);
+               ASSERT_FALSE(err && errno != EEXIST) {
+                       TH_LOG("Failed to create directory \"%s\": %s",
+                                       parent, strerror(errno));
+               }
+               walker[i] = '/';
+       }
+       free(walker);
+}
+
+static void create_directory(struct __test_metadata *const _metadata,
+               const char *const path)
+{
+       mkdir_parents(_metadata, path);
+       ASSERT_EQ(0, mkdir(path, 0700)) {
+               TH_LOG("Failed to create directory \"%s\": %s", path,
+                               strerror(errno));
+       }
+}
+
+static void create_file(struct __test_metadata *const _metadata,
+               const char *const path)
+{
+       mkdir_parents(_metadata, path);
+       ASSERT_EQ(0, mknod(path, S_IFREG | 0700, 0)) {
+               TH_LOG("Failed to create file \"%s\": %s", path,
+                               strerror(errno));
+       }
+}
+
+static int remove_path(const char *const path)
+{
+       char *walker;
+       int i, ret, err = 0;
+
+       walker = strdup(path);
+       if (!walker) {
+               err = ENOMEM;
+               goto out;
+       }
+       if (unlink(path) && rmdir(path)) {
+               if (errno != ENOENT)
+                       err = errno;
+               goto out;
+       }
+       for (i = strlen(walker); i > 0; i--) {
+               if (walker[i] != '/')
+                       continue;
+               walker[i] = '\0';
+               ret = rmdir(walker);
+               if (ret) {
+                       if (errno != ENOTEMPTY && errno != EBUSY)
+                               err = errno;
+                       goto out;
+               }
+               if (strcmp(walker, TMP_DIR) == 0)
+                       goto out;
+       }
+
+out:
+       free(walker);
+       return err;
+}
+
+static void prepare_layout(struct __test_metadata *const _metadata)
+{
+       disable_caps(_metadata);
+       umask(0077);
+       create_directory(_metadata, TMP_DIR);
+
+       /*
+        * Do not pollute the rest of the system: creates a private mount point
+        * for tests relying on pivot_root(2) and move_mount(2).
+        */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, unshare(CLONE_NEWNS));
+       ASSERT_EQ(0, mount("tmp", TMP_DIR, "tmpfs", 0, "size=4m,mode=700"));
+       ASSERT_EQ(0, mount(NULL, TMP_DIR, NULL, MS_PRIVATE | MS_REC, NULL));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+}
+
+static void cleanup_layout(struct __test_metadata *const _metadata)
+{
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, umount(TMP_DIR));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, remove_path(TMP_DIR));
+}
+
+static void create_layout1(struct __test_metadata *const _metadata)
+{
+       create_file(_metadata, file1_s1d1);
+       create_file(_metadata, file1_s1d2);
+       create_file(_metadata, file1_s1d3);
+       create_file(_metadata, file2_s1d1);
+       create_file(_metadata, file2_s1d2);
+       create_file(_metadata, file2_s1d3);
+
+       create_file(_metadata, file1_s2d1);
+       create_file(_metadata, file1_s2d2);
+       create_file(_metadata, file1_s2d3);
+       create_file(_metadata, file2_s2d3);
+
+       create_directory(_metadata, dir_s3d2);
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, mount("tmp", dir_s3d2, "tmpfs", 0, "size=4m,mode=700"));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+
+       ASSERT_EQ(0, mkdir(dir_s3d3, 0700));
+}
+
+static void remove_layout1(struct __test_metadata *const _metadata)
+{
+       EXPECT_EQ(0, remove_path(file2_s1d3));
+       EXPECT_EQ(0, remove_path(file2_s1d2));
+       EXPECT_EQ(0, remove_path(file2_s1d1));
+       EXPECT_EQ(0, remove_path(file1_s1d3));
+       EXPECT_EQ(0, remove_path(file1_s1d2));
+       EXPECT_EQ(0, remove_path(file1_s1d1));
+
+       EXPECT_EQ(0, remove_path(file2_s2d3));
+       EXPECT_EQ(0, remove_path(file1_s2d3));
+       EXPECT_EQ(0, remove_path(file1_s2d2));
+       EXPECT_EQ(0, remove_path(file1_s2d1));
+
+       EXPECT_EQ(0, remove_path(dir_s3d3));
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       umount(dir_s3d2);
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, remove_path(dir_s3d2));
+}
+
+FIXTURE(layout1) {
+};
+
+FIXTURE_SETUP(layout1)
+{
+       prepare_layout(_metadata);
+
+       create_layout1(_metadata);
+}
+
+FIXTURE_TEARDOWN(layout1)
+{
+       remove_layout1(_metadata);
+
+       cleanup_layout(_metadata);
+}
+
+/*
+ * This helper lets the caller use the ASSERT_* macros, so that the printed
+ * line number points to the test caller.
+ */
+static int test_open_rel(const int dirfd, const char *const path, const int flags)
+{
+       int fd;
+
+       /* Works with file and directories. */
+       fd = openat(dirfd, path, flags | O_CLOEXEC);
+       if (fd < 0)
+               return errno;
+       /*
+        * Mixing error codes from close(2) and open(2) should not lead to any
+        * (access type) confusion for this test.
+        */
+       if (close(fd) != 0)
+               return errno;
+       return 0;
+}
+
+static int test_open(const char *const path, const int flags)
+{
+       return test_open_rel(AT_FDCWD, path, flags);
+}
+
+TEST_F_FORK(layout1, no_restriction)
+{
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(file2_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file2_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+
+       ASSERT_EQ(0, test_open(dir_s2d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s2d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s2d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s2d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s2d3, O_RDONLY));
+
+       ASSERT_EQ(0, test_open(dir_s3d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s3d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s3d3, O_RDONLY));
+}
+
+/*
+ * Checks that landlock_add_rule() rejects invalid ruleset FDs, invalid parent
+ * FDs and invalid access masks, and that the ruleset built from the accepted
+ * rules can still be enforced.
+ */
+TEST_F_FORK(layout1, inval)
+{
+       struct landlock_path_beneath_attr path_beneath = {
+               .allowed_access = LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_WRITE_FILE,
+               .parent_fd = -1,
+       };
+       struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_WRITE_FILE,
+       };
+       int ruleset_fd;
+
+       path_beneath.parent_fd = open(dir_s1d2, O_PATH | O_DIRECTORY |
+                       O_CLOEXEC);
+       ASSERT_LE(0, path_beneath.parent_fd);
+
+       ruleset_fd = open(dir_s1d1, O_PATH | O_DIRECTORY | O_CLOEXEC);
+       ASSERT_LE(0, ruleset_fd);
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       /* Returns EBADF because ruleset_fd is not a landlock-ruleset FD. */
+       ASSERT_EQ(EBADF, errno);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ruleset_fd = open(dir_s1d1, O_DIRECTORY | O_CLOEXEC);
+       ASSERT_LE(0, ruleset_fd);
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       /* Returns EBADFD because ruleset_fd is not a valid ruleset. */
+       ASSERT_EQ(EBADFD, errno);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Gets a real ruleset. */
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_LE(0, ruleset_fd);
+       ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(0, close(path_beneath.parent_fd));
+
+       /* Tests without O_PATH. */
+       path_beneath.parent_fd = open(dir_s1d2, O_DIRECTORY | O_CLOEXEC);
+       ASSERT_LE(0, path_beneath.parent_fd);
+       ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(0, close(path_beneath.parent_fd));
+
+       /* Tests with a ruleset FD used as parent_fd. */
+       path_beneath.parent_fd = ruleset_fd;
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(EBADFD, errno);
+
+       /* Checks unhandled allowed_access. */
+       path_beneath.parent_fd = open(dir_s1d2, O_PATH | O_DIRECTORY |
+                       O_CLOEXEC);
+       ASSERT_LE(0, path_beneath.parent_fd);
+
+       /* Test with a known access right that the ruleset does not handle. */
+       path_beneath.allowed_access |= LANDLOCK_ACCESS_FS_EXECUTE;
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(EINVAL, errno);
+       path_beneath.allowed_access &= ~LANDLOCK_ACCESS_FS_EXECUTE;
+
+       /* Test with unknown (64-bits) value. */
+       path_beneath.allowed_access |= (1ULL << 60);
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(EINVAL, errno);
+       path_beneath.allowed_access &= ~(1ULL << 60);
+
+       /* Test with no access. */
+       path_beneath.allowed_access = 0;
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(ENOMSG, errno);
+
+       ASSERT_EQ(0, close(path_beneath.parent_fd));
+
+       /* Enforces the ruleset. */
+       ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+       ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0));
+
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Access rights that the file_access_rights test expects landlock_add_rule()
+ * to accept for a rule tied to a file.
+ */
+#define ACCESS_FILE ( \
+       LANDLOCK_ACCESS_FS_EXECUTE | \
+       LANDLOCK_ACCESS_FS_WRITE_FILE | \
+       LANDLOCK_ACCESS_FS_READ_FILE)
+
+/* Upper bound used when iterating over every single access right bit. */
+#define ACCESS_LAST LANDLOCK_ACCESS_FS_MAKE_SYM
+
+/* Union of all the access rights listed here, ACCESS_LAST included. */
+#define ACCESS_ALL ( \
+       ACCESS_FILE | \
+       LANDLOCK_ACCESS_FS_READ_DIR | \
+       LANDLOCK_ACCESS_FS_REMOVE_DIR | \
+       LANDLOCK_ACCESS_FS_REMOVE_FILE | \
+       LANDLOCK_ACCESS_FS_MAKE_CHAR | \
+       LANDLOCK_ACCESS_FS_MAKE_DIR | \
+       LANDLOCK_ACCESS_FS_MAKE_REG | \
+       LANDLOCK_ACCESS_FS_MAKE_SOCK | \
+       LANDLOCK_ACCESS_FS_MAKE_FIFO | \
+       LANDLOCK_ACCESS_FS_MAKE_BLOCK | \
+       ACCESS_LAST)
+
+/*
+ * Checks, one access bit at a time, that a rule tied to a file is accepted
+ * only for the access rights in ACCESS_FILE and rejected with EINVAL for every
+ * other right up to ACCESS_LAST.
+ */
+TEST_F_FORK(layout1, file_access_rights)
+{
+       __u64 access;
+       int err;
+       struct landlock_path_beneath_attr path_beneath = {};
+       struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = ACCESS_ALL,
+       };
+       const int ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Tests access rights for files. */
+       path_beneath.parent_fd = open(file1_s1d2, O_PATH | O_CLOEXEC);
+       ASSERT_LE(0, path_beneath.parent_fd);
+       for (access = 1; access <= ACCESS_LAST; access <<= 1) {
+               path_beneath.allowed_access = access;
+               err = landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0);
+               /* A single bit is file-compatible iff it is part of ACCESS_FILE. */
+               if ((access | ACCESS_FILE) == ACCESS_FILE) {
+                       ASSERT_EQ(0, err);
+               } else {
+                       ASSERT_EQ(-1, err);
+                       ASSERT_EQ(EINVAL, errno);
+               }
+       }
+       ASSERT_EQ(0, close(path_beneath.parent_fd));
+       /* Releases the ruleset FD, as the other tests do. */
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Opens path with O_PATH, adds to ruleset_fd a path-beneath rule granting
+ * allowed_access on it, then closes the path FD.  Failures are reported
+ * through the ASSERT_*() checks, which log the path and strerror(errno).
+ */
+static void add_path_beneath(struct __test_metadata *const _metadata,
+               const int ruleset_fd, const __u64 allowed_access,
+               const char *const path)
+{
+       struct landlock_path_beneath_attr path_beneath = {
+               .allowed_access = allowed_access,
+       };
+
+       path_beneath.parent_fd = open(path, O_PATH | O_CLOEXEC);
+       ASSERT_LE(0, path_beneath.parent_fd) {
+               TH_LOG("Failed to open directory \"%s\": %s", path,
+                               strerror(errno));
+       }
+       ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0)) {
+               TH_LOG("Failed to update the ruleset with \"%s\": %s", path,
+                               strerror(errno));
+       }
+       ASSERT_EQ(0, close(path_beneath.parent_fd));
+}
+
+/*
+ * One entry of a rule list given to create_ruleset(): the access rights to
+ * allow beneath a path.  A list ends with an entry whose path is NULL.
+ */
+struct rule {
+       const char *path;
+       __u64 access;
+};
+
+/* Read-only access: reading files and reading directories. */
+#define ACCESS_RO ( \
+       LANDLOCK_ACCESS_FS_READ_FILE | \
+       LANDLOCK_ACCESS_FS_READ_DIR)
+
+/* Read-write access: ACCESS_RO plus writing to files. */
+#define ACCESS_RW ( \
+       ACCESS_RO | \
+       LANDLOCK_ACCESS_FS_WRITE_FILE)
+
+/*
+ * Creates a ruleset handling handled_access_fs and adds one path-beneath rule
+ * per entry of rules, which must be non-NULL, non-empty and terminated by an
+ * entry with a NULL path.  Returns the new ruleset FD; failures are reported
+ * through the ASSERT_*() checks.
+ */
+static int create_ruleset(struct __test_metadata *const _metadata,
+               const __u64 handled_access_fs, const struct rule rules[])
+{
+       int ruleset_fd, i;
+       struct landlock_ruleset_attr ruleset_attr = {
+               .handled_access_fs = handled_access_fs,
+       };
+
+       ASSERT_NE(NULL, rules) {
+               TH_LOG("No rule list");
+       }
+       ASSERT_NE(NULL, rules[0].path) {
+               TH_LOG("Empty rule list");
+       }
+
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_LE(0, ruleset_fd) {
+               TH_LOG("Failed to create a ruleset: %s", strerror(errno));
+       }
+
+       /* Walks the list until the NULL-path terminator. */
+       for (i = 0; rules[i].path; i++) {
+               add_path_beneath(_metadata, ruleset_fd, rules[i].access,
+                               rules[i].path);
+       }
+       return ruleset_fd;
+}
+
+/*
+ * Sets PR_SET_NO_NEW_PRIVS with prctl(), then enforces ruleset_fd on the
+ * caller with landlock_restrict_self().  The ruleset FD is left open.
+ */
+static void enforce_ruleset(struct __test_metadata *const _metadata,
+               const int ruleset_fd)
+{
+       ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+       ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)) {
+               TH_LOG("Failed to enforce ruleset: %s", strerror(errno));
+       }
+}
+
+/*
+ * Checks that, once sandboxed, /proc/self/ns/mnt can still be opened even
+ * though no rule covers it, and that such a file cannot be used as the parent
+ * of a new rule.
+ */
+TEST_F_FORK(layout1, proc_nsfs)
+{
+       const struct rule rules[] = {
+               {
+                       .path = "/dev/null",
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       struct landlock_path_beneath_attr path_beneath;
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access |
+                       LANDLOCK_ACCESS_FS_READ_DIR, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       ASSERT_EQ(0, test_open("/proc/self/ns/mnt", O_RDONLY));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+
+       ASSERT_EQ(EACCES, test_open("/", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open("/dev", O_RDONLY));
+       ASSERT_EQ(0, test_open("/dev/null", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open("/dev/full", O_RDONLY));
+
+       ASSERT_EQ(EACCES, test_open("/proc", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open("/proc/self", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open("/proc/self/ns", O_RDONLY));
+       /*
+        * Because nsfs is an internal filesystem, /proc/self/ns/mnt is a
+        * disconnected path.  Such path cannot be identified and must then be
+        * allowed.
+        */
+       ASSERT_EQ(0, test_open("/proc/self/ns/mnt", O_RDONLY));
+
+       /*
+        * Checks that it is not possible to add nsfs-like filesystem
+        * references to a ruleset.
+        */
+       path_beneath.allowed_access = LANDLOCK_ACCESS_FS_READ_FILE |
+               LANDLOCK_ACCESS_FS_WRITE_FILE;
+       path_beneath.parent_fd = open("/proc/self/ns/mnt", O_PATH | O_CLOEXEC);
+       ASSERT_LE(0, path_beneath.parent_fd);
+       ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH,
+                               &path_beneath, 0));
+       ASSERT_EQ(EBADFD, errno);
+       ASSERT_EQ(0, close(path_beneath.parent_fd));
+       /* The ruleset FD was kept open only for the add_rule check above. */
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Checks that, after drop_caps() (presumably dropping the test's
+ * capabilities -- confirm against its definition), landlock_restrict_self()
+ * fails with EPERM until no_new_privs is set.
+ */
+TEST_F_FORK(layout1, unpriv)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       int ruleset_fd;
+
+       drop_caps(_metadata);
+
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RO, rules);
+       ASSERT_LE(0, ruleset_fd);
+       ASSERT_EQ(-1, landlock_restrict_self(ruleset_fd, 0));
+       ASSERT_EQ(EPERM, errno);
+
+       /* enforce_ruleset() calls prctl(no_new_privs). */
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Checks the access actually granted by a ruleset allowing read-only access
+ * beneath dir_s1d2 and read-write access to file1_s2d2 alone, including real
+ * read() and write() calls on the latter.
+ */
+TEST_F_FORK(layout1, effective_access)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = ACCESS_RO,
+               },
+               {
+                       .path = file1_s2d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+       char buf;
+       int reg_fd;
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Tests on a directory. */
+       ASSERT_EQ(EACCES, test_open("/", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+
+       /* Tests on a file. */
+       ASSERT_EQ(EACCES, test_open(dir_s2d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s2d2, O_RDONLY));
+
+       /* Checks effective read and write actions. */
+       reg_fd = open(file1_s2d2, O_RDWR | O_CLOEXEC);
+       ASSERT_LE(0, reg_fd);
+       ASSERT_EQ(1, write(reg_fd, ".", 1));
+       ASSERT_LE(0, lseek(reg_fd, 0, SEEK_SET));
+       ASSERT_EQ(1, read(reg_fd, &buf, 1));
+       ASSERT_EQ('.', buf);
+       ASSERT_EQ(0, close(reg_fd));
+
+       /* Just in case, double-checks effective actions. */
+       reg_fd = open(file1_s2d2, O_RDONLY | O_CLOEXEC);
+       ASSERT_LE(0, reg_fd);
+       /* The FD was opened read-only, so write() must fail with EBADF. */
+       ASSERT_EQ(-1, write(reg_fd, &buf, 1));
+       ASSERT_EQ(EBADF, errno);
+       ASSERT_EQ(0, close(reg_fd));
+}
+
+/*
+ * Checks that an access type left out of handled_access_fs (here, writing to
+ * files) is not restricted by the enforced ruleset.
+ */
+TEST_F_FORK(layout1, unhandled_access)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       /* Here, we only handle read accesses, not write accesses. */
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RO, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /*
+        * Because the policy does not handle LANDLOCK_ACCESS_FS_WRITE_FILE,
+        * opening for write-only should be allowed, but not read-write.
+        */
+       ASSERT_EQ(0, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDWR));
+
+       /* Beneath dir_s1d2, reading is allowed too, so read-write succeeds. */
+       ASSERT_EQ(0, test_open(file1_s1d2, O_WRONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDWR));
+}
+
+/*
+ * Checks that two rules on the same directory within one ruleset are ORed:
+ * files beneath dir_s1d2 can be read and written and its directories listed.
+ */
+TEST_F_FORK(layout1, ruleset_overlap)
+{
+       const struct rule rules[] = {
+               /* These rules should be ORed among them. */
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_READ_DIR,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks s1d1 hierarchy. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDWR));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       /* Checks s1d2 hierarchy. */
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_WRONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDWR));
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* Checks s1d3 hierarchy. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d3, O_WRONLY));
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR));
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+}
+
+/*
+ * Stacks two layers handling disjoint access types (file creation, then file
+ * removal) and checks that the second layer leaves the restrictions of the
+ * first one unchanged.
+ */
+TEST_F_FORK(layout1, non_overlapping_accesses)
+{
+       const struct rule layer1[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_MAKE_REG,
+               },
+               {}
+       };
+       const struct rule layer2[] = {
+               {
+                       .path = dir_s1d3,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_FILE,
+               },
+               {}
+       };
+       int ruleset_fd;
+
+       /* Removes the files that the mknod() calls below try to create. */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file1_s1d2));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_MAKE_REG,
+                       layer1);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(-1, mknod(file1_s1d1, S_IFREG | 0700, 0));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(0, mknod(file1_s1d2, S_IFREG | 0700, 0));
+       ASSERT_EQ(0, unlink(file1_s1d2));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_REMOVE_FILE,
+                       layer2);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Unchanged accesses for file creation. */
+       ASSERT_EQ(-1, mknod(file1_s1d1, S_IFREG | 0700, 0));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(0, mknod(file1_s1d2, S_IFREG | 0700, 0));
+
+       /* Checks file removing. */
+       ASSERT_EQ(-1, unlink(file1_s1d2));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(0, unlink(file1_s1d3));
+}
+
+/*
+ * Stacks seven rulesets one after the other and, after each enforcement,
+ * checks the read and write access to file1_s1d3 and file2_s1d3.
+ */
+TEST_F_FORK(layout1, interleaved_masked_accesses)
+{
+       /*
+        * Checks overly restrictive rules:
+        * layer 1: allows R   s1d1/s1d2/s1d3/file1
+        * layer 2: allows RW  s1d1/s1d2/s1d3
+        *          allows  W  s1d1/s1d2
+        *          denies R   s1d1/s1d2
+        * layer 3: allows R   s1d1
+        * layer 4: allows R   s1d1/s1d2
+        *          denies  W  s1d1/s1d2
+        * layer 5: allows R   s1d1/s1d2
+        * layer 6: allows   X ----
+        * layer 7: allows  W  s1d1/s1d2
+        *          denies R   s1d1/s1d2
+        */
+       const struct rule layer1_read[] = {
+               /* Allows read access to file1_s1d3 with the first layer. */
+               {
+                       .path = file1_s1d3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       /* First rule with write restrictions. */
+       const struct rule layer2_read_write[] = {
+               /* Start by granting read-write access via its parent directory... */
+               {
+                       .path = dir_s1d3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               /* ...but also denies read access via its grandparent directory. */
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       const struct rule layer3_read[] = {
+               /* Allows read access via its great-grandparent directory. */
+               {
+                       .path = dir_s1d1,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       const struct rule layer4_read_write[] = {
+               /*
+                * Try to confuse the deny access by denying write (but not
+                * read) access via its grandparent directory.
+                */
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       const struct rule layer5_read[] = {
+               /*
+                * Try to override layer2's deny read access by explicitly
+                * allowing read access via file1_s1d3's grandparent.
+                */
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       const struct rule layer6_execute[] = {
+               /*
+                * Restricts an unrelated file hierarchy with a new access
+                * (non-overlapping) type.
+                */
+               {
+                       .path = dir_s2d1,
+                       .access = LANDLOCK_ACCESS_FS_EXECUTE,
+               },
+               {}
+       };
+       const struct rule layer7_read_write[] = {
+               /*
+                * Finally, denies read access to file1_s1d3 via its
+                * grandparent.
+                */
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       int ruleset_fd;
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE,
+                       layer1_read);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks that read access is granted for file1_s1d3 with layer 1. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file2_s1d3, O_WRONLY));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_WRITE_FILE, layer2_read_write);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks that previous access rights are unchanged with layer 2. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file2_s1d3, O_WRONLY));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE,
+                       layer3_read);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks that previous access rights are unchanged with layer 3. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file2_s1d3, O_WRONLY));
+
+       /* This time, denies write access for the file hierarchy. */
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_WRITE_FILE, layer4_read_write);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /*
+        * Checks that the only change with layer 4 is that write access is
+        * denied.
+        */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE,
+                       layer5_read);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks that previous access rights are unchanged with layer 5. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_EXECUTE,
+                       layer6_execute);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks that previous access rights are unchanged with layer 6. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+
+       ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_WRITE_FILE, layer7_read_write);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks read access is now denied with layer 7. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY));
+}
+
+/*
+ * Enforces the same ruleset FD several times, extending it with new rules
+ * between enforcements, and checks that no enforcement ever grants more access
+ * than the first one did.
+ */
+TEST_F_FORK(layout1, inherit_subset)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_READ_DIR,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       /* Write access is forbidden. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       /* Readdir access is allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* Write access is forbidden. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       /* Readdir access is allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+
+       /*
+        * Tests shared rule extension: the following rules should not grant
+        * any new access, only remove some.  Once enforced, these rules are
+        * ANDed with the previous ones.
+        */
+       add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_WRITE_FILE,
+                       dir_s1d2);
+       /*
+        * According to ruleset_fd, dir_s1d2 should now have the
+        * LANDLOCK_ACCESS_FS_READ_FILE and LANDLOCK_ACCESS_FS_WRITE_FILE
+        * access rights (even if this directory is opened a second time).
+        * However, when enforcing this updated ruleset, the ruleset tied to
+        * the current process (i.e. its domain) will still only have the
+        * dir_s1d2 with LANDLOCK_ACCESS_FS_READ_FILE and
+        * LANDLOCK_ACCESS_FS_READ_DIR accesses, but
+        * LANDLOCK_ACCESS_FS_WRITE_FILE must not be allowed because it would
+        * be a privilege escalation.
+        */
+       enforce_ruleset(_metadata, ruleset_fd);
+
+       /* Same tests and results as above. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       /* It is still forbidden to write in file1_s1d2. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       /* Readdir access is still allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* It is still forbidden to write in file1_s1d3. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       /* Readdir access is still allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+
+       /*
+        * Try to get more privileges by adding new access rights to the parent
+        * directory: dir_s1d1.
+        */
+       add_path_beneath(_metadata, ruleset_fd, ACCESS_RW, dir_s1d1);
+       enforce_ruleset(_metadata, ruleset_fd);
+
+       /* Same tests and results as above. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       /* It is still forbidden to write in file1_s1d2. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       /* Readdir access is still allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* It is still forbidden to write in file1_s1d3. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       /* Readdir access is still allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+
+       /*
+        * Now, dir_s1d3 gets a new rule tied to it, only allowing
+        * LANDLOCK_ACCESS_FS_WRITE_FILE.  The (kernel internal) difference is
+        * that there was no rule tied to it before.
+        */
+       add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_WRITE_FILE,
+                       dir_s1d3);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /*
+        * Same tests and results as above, including open(dir_s1d3): the new
+        * rule tied to dir_s1d3 does not mask the accesses inherited from
+        * dir_s1d2 within the same layer.
+        */
+
+       /* Same tests and results as above. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       /* It is still forbidden to write in file1_s1d2. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       /* Readdir access is still allowed. */
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* It is still forbidden to write in file1_s1d3. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       /*
+        * Readdir of dir_s1d3 is still allowed because of the OR policy inside
+        * the same layer.
+        */
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+}
+
+/*
+ * Enforces a ruleset allowing read-only access beneath dir_s1d3, then extends
+ * the same ruleset with a rule on dir_s1d2 and enforces it again: the access
+ * to dir_s1d2 must stay denied.
+ */
+TEST_F_FORK(layout1, inherit_superset)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d3,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+
+       /* Readdir access is denied for dir_s1d2. */
+       ASSERT_EQ(EACCES, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+       /* Readdir access is allowed for dir_s1d3. */
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+       /* File access is allowed for file1_s1d3. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+
+       /* Now dir_s1d2, parent of dir_s1d3, gets a new rule tied to it. */
+       add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_READ_DIR, dir_s1d2);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Readdir access is still denied for dir_s1d2. */
+       ASSERT_EQ(EACCES, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+       /* Readdir access is still allowed for dir_s1d3. */
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+       /* File access is still allowed for file1_s1d3. */
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+}
+
+/*
+ * Enforces the same ruleset 64 times, then checks that further calls to
+ * landlock_restrict_self() fail with E2BIG.
+ */
+TEST_F_FORK(layout1, max_layers)
+{
+       int i, err;
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       /*
+        * NOTE(review): 64 is presumably the kernel's maximum number of
+        * stacked layers -- confirm against the Landlock limits.
+        */
+       for (i = 0; i < 64; i++)
+               enforce_ruleset(_metadata, ruleset_fd);
+
+       /* Tries twice to check that the failure is repeatable. */
+       for (i = 0; i < 2; i++) {
+               err = landlock_restrict_self(ruleset_fd, 0);
+               ASSERT_EQ(-1, err);
+               ASSERT_EQ(E2BIG, errno);
+       }
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Checks that a ruleset handling no access cannot be created, that rulesets
+ * without any rule deny every handled access, and that the same ruleset can be
+ * enforced twice.
+ */
+TEST_F_FORK(layout1, empty_or_same_ruleset)
+{
+       struct landlock_ruleset_attr ruleset_attr = {};
+       int ruleset_fd;
+
+       /* Tests empty handled_access_fs: the creation must fail. */
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_EQ(-1, ruleset_fd);
+       ASSERT_EQ(ENOMSG, errno);
+
+       /* Enforces policy which deny read access to all files. */
+       ruleset_attr.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE;
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       /* Releases this FD before ruleset_fd is reused for the next ruleset. */
+       ASSERT_EQ(0, close(ruleset_fd));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+
+       /* Nests a policy which deny read access to all directories. */
+       ruleset_attr.handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR;
+       ruleset_fd = landlock_create_ruleset(&ruleset_attr,
+                       sizeof(ruleset_attr), 0);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY));
+
+       /* Enforces a second time with the same ruleset. */
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/*
+ * Checks that a rule tied to dir_s3d2 (a mount point, per the comment in the
+ * rule list) grants access to dir_s3d2 and dir_s3d3, while dir_s3d1 and
+ * dir_s2d1 stay denied.
+ */
+TEST_F_FORK(layout1, rule_on_mountpoint)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d1,
+                       .access = ACCESS_RO,
+               },
+               {
+                       /* dir_s3d2 is a mount point. */
+                       .path = dir_s3d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+
+       ASSERT_EQ(EACCES, test_open(dir_s2d1, O_RDONLY));
+
+       ASSERT_EQ(EACCES, test_open(dir_s3d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s3d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s3d3, O_RDONLY));
+}
+
+/*
+ * Checks a rule tied to the parent of a mount point: with an ACCESS_RO rule on
+ * dir_s3d1, opening dir_s3d1, the mount point dir_s3d2 and dir_s3d3 beneath it
+ * all succeed, whereas dir_s2d1 (no rule) is denied.
+ */
+TEST_F_FORK(layout1, rule_over_mountpoint)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d1,
+                       .access = ACCESS_RO,
+               },
+               {
+                       /* dir_s3d1 is the parent of the dir_s3d2 mount point. */
+                       .path = dir_s3d1,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+
+       /* No rule covers dir_s2d1. */
+       ASSERT_EQ(EACCES, test_open(dir_s2d1, O_RDONLY));
+
+       /* The access granted on dir_s3d1 extends across the mount point. */
+       ASSERT_EQ(0, test_open(dir_s3d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s3d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s3d3, O_RDONLY));
+}
+
+/*
+ * This test verifies that we can apply a Landlock rule on the root directory
+ * (which might require special handling).  A first ruleset grants ACCESS_RO on
+ * "/" and directory opens succeed; a second ruleset, enforced on top, only
+ * grants LANDLOCK_ACCESS_FS_READ_FILE on "/" and directory opens are then
+ * denied.
+ */
+TEST_F_FORK(layout1, rule_over_root_allow_then_deny)
+{
+       /* Not const: the access of the root rule is changed for the second ruleset. */
+       struct rule rules[] = {
+               {
+                       .path = "/",
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks allowed access. */
+       ASSERT_EQ(0, test_open("/", O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+
+       /* Enforces a second ruleset which only allows reading files. */
+       rules[0].access = LANDLOCK_ACCESS_FS_READ_FILE;
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks denied access (on a directory). */
+       ASSERT_EQ(EACCES, test_open("/", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY));
+}
+
+/*
+ * Checks that a single ruleset granting only LANDLOCK_ACCESS_FS_READ_FILE on
+ * the root directory denies opening directories, including "/" itself.
+ */
+TEST_F_FORK(layout1, rule_over_root_deny)
+{
+       const struct rule rules[] = {
+               {
+                       .path = "/",
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks denied access (on a directory). */
+       ASSERT_EQ(EACCES, test_open("/", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY));
+}
+
+/*
+ * Checks a rule created after pivot_root(2): the root is pivoted to dir_s3d2
+ * (with dir_s3d3 as put_old), then a rule on the relative path "s3d3" grants
+ * ACCESS_RO on it while the new root itself stays denied.
+ */
+TEST_F_FORK(layout1, rule_inside_mount_ns)
+{
+       const struct rule rules[] = {
+               {
+                       /* Relative to the new root, see chdir("/") below. */
+                       .path = "s3d3",
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       int ruleset_fd;
+
+       /* CAP_SYS_ADMIN is only set around the pivot_root(2) call. */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, syscall(SYS_pivot_root, dir_s3d2, dir_s3d3)) {
+               TH_LOG("Failed to pivot root: %s", strerror(errno));
+       };
+       ASSERT_EQ(0, chdir("/"));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+
+       /* The ruleset is created after the pivot, so paths use the new root. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(0, test_open("s3d3", O_RDONLY));
+       ASSERT_EQ(EACCES, test_open("/", O_RDONLY));
+}
+
+/*
+ * Checks that, once a ruleset is enforced, mount(2) and pivot_root(2) fail
+ * with EPERM even though CAP_SYS_ADMIN is set for the calls.
+ */
+TEST_F_FORK(layout1, mount_and_pivot)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s3d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Both calls target dir_s3d2, which the ruleset has a rule for. */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(-1, mount(NULL, dir_s3d2, NULL, MS_RDONLY, NULL));
+       ASSERT_EQ(EPERM, errno);
+       ASSERT_EQ(-1, syscall(SYS_pivot_root, dir_s3d2, dir_s3d3));
+       ASSERT_EQ(EPERM, errno);
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+}
+
+/*
+ * Checks move_mount(2): moving the dir_s3d2 mount to dir_s1d2 and back works
+ * before the ruleset is enforced, and fails with EPERM afterwards.
+ */
+TEST_F_FORK(layout1, move_mount)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s3d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* The ruleset exists but is not enforced yet: moving is allowed. */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, syscall(SYS_move_mount, AT_FDCWD, dir_s3d2, AT_FDCWD,
+                               dir_s1d2, 0)) {
+               TH_LOG("Failed to move mount: %s", strerror(errno));
+       }
+
+       /* Moves the mount back to its initial location. */
+       ASSERT_EQ(0, syscall(SYS_move_mount, AT_FDCWD, dir_s1d2, AT_FDCWD,
+                               dir_s3d2, 0));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* The same move is now denied, even with CAP_SYS_ADMIN set. */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(-1, syscall(SYS_move_mount, AT_FDCWD, dir_s3d2, AT_FDCWD,
+                               dir_s1d2, 0));
+       ASSERT_EQ(EPERM, errno);
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+}
+
+/*
+ * Checks that a ruleset can still be enforced after a file hierarchy it
+ * references (the dir_s3d2 mount) has been unmounted: the rule on dir_s1d1
+ * keeps working, dir_s3d2 is then denied and dir_s3d3 no longer exists.
+ */
+TEST_F_FORK(layout1, release_inodes)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d1,
+                       .access = ACCESS_RO,
+               },
+               {
+                       .path = dir_s3d2,
+                       .access = ACCESS_RO,
+               },
+               {
+                       .path = dir_s3d3,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       /* Unmount a file hierarchy while it is being used by a ruleset. */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, umount(dir_s3d2));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+
+       /* Enforcement happens after the unmount. */
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s3d2, O_RDONLY));
+       /* This dir_s3d3 would not be allowed and does not exist anyway. */
+       ASSERT_EQ(ENOENT, test_open(dir_s3d3, O_RDONLY));
+}
+
+/* Selects how test_relative_path() sets up the reference directory. */
+enum relative_access {
+       /* Opens dir_s1d2 and resolves paths relative to that FD. */
+       REL_OPEN,
+       /* Changes the working directory to dir_s1d2 and uses AT_FDCWD. */
+       REL_CHDIR,
+       /* Stays in dir_s2d2 and chroots into dir_s1d2 with a relative path. */
+       REL_CHROOT_ONLY,
+       /* Changes the working directory to dir_s1d2 and chroots into ".". */
+       REL_CHROOT_CHDIR,
+};
+
+/*
+ * Enforces two layers (ACCESS_RO on TMP_DIR, then ACCESS_RO on dir_s1d2 and
+ * dir_s2d2) and checks the resolution of relative paths against them.
+ *
+ * @rel: how the reference directory is set up (opened FD, working directory
+ * or chroot), see enum relative_access.
+ */
+static void test_relative_path(struct __test_metadata *const _metadata,
+               const enum relative_access rel)
+{
+       /*
+        * Common layer to check that chroot doesn't ignore it (i.e. a chroot
+        * is not a disconnected root directory).
+        */
+       const struct rule layer1_base[] = {
+               {
+                       .path = TMP_DIR,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       const struct rule layer2_subs[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = ACCESS_RO,
+               },
+               {
+                       .path = dir_s2d2,
+                       .access = ACCESS_RO,
+               },
+               {}
+       };
+       int dirfd, ruleset_fd;
+
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_base);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer2_subs);
+
+       ASSERT_LE(0, ruleset_fd);
+       /*
+        * For the chroot cases, sets the working directory before the second
+        * layer is enforced.
+        */
+       switch (rel) {
+       case REL_OPEN:
+       case REL_CHDIR:
+               break;
+       case REL_CHROOT_ONLY:
+               ASSERT_EQ(0, chdir(dir_s2d2));
+               break;
+       case REL_CHROOT_CHDIR:
+               ASSERT_EQ(0, chdir(dir_s1d2));
+               break;
+       default:
+               /* Unknown value: fails the test. */
+               ASSERT_TRUE(false);
+               return;
+       }
+
+       /* Set for the chroot(2) cases below. */
+       set_cap(_metadata, CAP_SYS_CHROOT);
+       enforce_ruleset(_metadata, ruleset_fd);
+
+       /* Picks the dirfd used by every test_open_rel() call below. */
+       switch (rel) {
+       case REL_OPEN:
+               dirfd = open(dir_s1d2, O_DIRECTORY);
+               ASSERT_LE(0, dirfd);
+               break;
+       case REL_CHDIR:
+               ASSERT_EQ(0, chdir(dir_s1d2));
+               dirfd = AT_FDCWD;
+               break;
+       case REL_CHROOT_ONLY:
+               /* Do chroot into dir_s1d2 (relative to dir_s2d2). */
+               ASSERT_EQ(0, chroot("../../s1d1/s1d2")) {
+                       TH_LOG("Failed to chroot: %s", strerror(errno));
+               }
+               dirfd = AT_FDCWD;
+               break;
+       case REL_CHROOT_CHDIR:
+               /* Do chroot into dir_s1d2. */
+               ASSERT_EQ(0, chroot(".")) {
+                       TH_LOG("Failed to chroot: %s", strerror(errno));
+               }
+               dirfd = AT_FDCWD;
+               break;
+       }
+
+       /*
+        * With REL_CHROOT_CHDIR the working directory is the new root, so ".."
+        * is expected to be allowed; otherwise it is a parent directory which
+        * is not covered by the second layer.
+        */
+       ASSERT_EQ((rel == REL_CHROOT_CHDIR) ? 0 : EACCES,
+                       test_open_rel(dirfd, "..", O_RDONLY));
+       ASSERT_EQ(0, test_open_rel(dirfd, ".", O_RDONLY));
+
+       if (rel == REL_CHROOT_ONLY) {
+               /* The current directory is dir_s2d2. */
+               ASSERT_EQ(0, test_open_rel(dirfd, "./s2d3", O_RDONLY));
+       } else {
+               /* The current directory is dir_s1d2. */
+               ASSERT_EQ(0, test_open_rel(dirfd, "./s1d3", O_RDONLY));
+       }
+
+       if (rel == REL_CHROOT_ONLY || rel == REL_CHROOT_CHDIR) {
+               /* Checks the root dir_s1d2. */
+               ASSERT_EQ(0, test_open_rel(dirfd, "/..", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "/", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "/f1", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "/s1d3", O_RDONLY));
+       }
+
+       /* Walks up and back down to both hierarchies handled by the second layer. */
+       if (rel != REL_CHROOT_CHDIR) {
+               ASSERT_EQ(EACCES, test_open_rel(dirfd, "../../s1d1", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "../../s1d1/s1d2", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "../../s1d1/s1d2/s1d3", O_RDONLY));
+
+               ASSERT_EQ(EACCES, test_open_rel(dirfd, "../../s2d1", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "../../s2d1/s2d2", O_RDONLY));
+               ASSERT_EQ(0, test_open_rel(dirfd, "../../s2d1/s2d2/s2d3", O_RDONLY));
+       }
+
+       /* Only REL_OPEN holds a real file descriptor in dirfd. */
+       if (rel == REL_OPEN)
+               ASSERT_EQ(0, close(dirfd));
+       ASSERT_EQ(0, close(ruleset_fd));
+}
+
+/* Runs test_relative_path() with paths relative to an opened directory FD. */
+TEST_F_FORK(layout1, relative_open)
+{
+       test_relative_path(_metadata, REL_OPEN);
+}
+
+/* Runs test_relative_path() with paths relative to the working directory. */
+TEST_F_FORK(layout1, relative_chdir)
+{
+       test_relative_path(_metadata, REL_CHDIR);
+}
+
+/* Runs test_relative_path() after a chroot(2) without a matching chdir(2). */
+TEST_F_FORK(layout1, relative_chroot_only)
+{
+       test_relative_path(_metadata, REL_CHROOT_ONLY);
+}
+
+/* Runs test_relative_path() after a chdir(2) followed by chroot("."). */
+TEST_F_FORK(layout1, relative_chroot_chdir)
+{
+       test_relative_path(_metadata, REL_CHROOT_CHDIR);
+}
+
+/*
+ * Replaces the content of the file at @dst_path with the content of
+ * BINARY_PATH.  The destination is opened without O_CREAT, so it must already
+ * exist.
+ */
+static void copy_binary(struct __test_metadata *const _metadata,
+               const char *const dst_path)
+{
+       int dst_fd, src_fd;
+       struct stat statbuf;
+
+       dst_fd = open(dst_path, O_WRONLY | O_TRUNC | O_CLOEXEC);
+       ASSERT_LE(0, dst_fd) {
+               TH_LOG("Failed to open \"%s\": %s", dst_path,
+                               strerror(errno));
+       }
+       src_fd = open(BINARY_PATH, O_RDONLY | O_CLOEXEC);
+       ASSERT_LE(0, src_fd) {
+               TH_LOG("Failed to open \"" BINARY_PATH "\": %s",
+                               strerror(errno));
+       }
+       ASSERT_EQ(0, fstat(src_fd, &statbuf));
+       /* The whole source is expected to be transferred in a single call. */
+       ASSERT_EQ(statbuf.st_size, sendfile(dst_fd, src_fd, 0,
+                               statbuf.st_size));
+       ASSERT_EQ(0, close(src_fd));
+       ASSERT_EQ(0, close(dst_fd));
+}
+
+/*
+ * Tries to execute @path in a child process and checks the outcome.
+ *
+ * @err: 0 if execve(2) is expected to succeed, otherwise the errno value it
+ * is expected to fail with.
+ * @path: file to execute, also passed as argv[0].
+ *
+ * When execve(2) fails as expected, the child exits with 2 (or 1 if one of its
+ * own checks failed); when it succeeds, the exit status is the one of the
+ * executed program, which is expected to be 0.
+ */
+static void test_execute(struct __test_metadata *const _metadata,
+               const int err, const char *const path)
+{
+       int status;
+       char *const argv[] = {(char *)path, NULL};
+       const pid_t child = fork();
+
+       ASSERT_LE(0, child);
+       if (child == 0) {
+               /* Only reached past this point if execve(2) returned. */
+               ASSERT_EQ(err ? -1 : 0, execve(path, argv, NULL)) {
+                       TH_LOG("Failed to execute \"%s\": %s", path,
+                                       strerror(errno));
+               };
+               ASSERT_EQ(err, errno);
+               _exit(_metadata->passed ? 2 : 1);
+       }
+       ASSERT_EQ(child, waitpid(child, &status, 0));
+       ASSERT_EQ(1, WIFEXITED(status));
+       ASSERT_EQ(err ? 2 : 0, WEXITSTATUS(status)) {
+               /*
+                * errno is unrelated to the child's exit status in the parent:
+                * report the status itself.
+                */
+               TH_LOG("Unexpected return code for \"%s\": %d", path,
+                               WEXITSTATUS(status));
+       };
+}
+
+/*
+ * Checks LANDLOCK_ACCESS_FS_EXECUTE granted on dir_s1d2: executing a file is
+ * denied (EACCES) in dir_s1d1 and allowed in dir_s1d2 and dir_s1d3.  Only the
+ * execute right is handled by the ruleset, and read-only opens succeed
+ * everywhere.
+ */
+TEST_F_FORK(layout1, execute)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_EXECUTE,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       /* Installs the test binary at each level before enforcing. */
+       copy_binary(_metadata, file1_s1d1);
+       copy_binary(_metadata, file1_s1d2);
+       copy_binary(_metadata, file1_s1d3);
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY));
+       test_execute(_metadata, EACCES, file1_s1d1);
+
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       test_execute(_metadata, 0, file1_s1d2);
+
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+       test_execute(_metadata, 0, file1_s1d3);
+}
+
+/*
+ * Checks link(2) with LANDLOCK_ACCESS_FS_MAKE_REG granted on dir_s1d2:
+ * linking inside dir_s1d1 fails with EACCES, linking between different
+ * directories fails with EXDEV, and linking inside dir_s1d2 or inside
+ * dir_s1d3 succeeds.
+ */
+TEST_F_FORK(layout1, link)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_MAKE_REG,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Frees the names used as link destinations below. */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file1_s1d2));
+       ASSERT_EQ(0, unlink(file1_s1d3));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(-1, link(file2_s1d1, file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+       /* Denies linking because of reparenting. */
+       ASSERT_EQ(-1, link(file1_s2d1, file1_s1d2));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, link(file2_s1d2, file1_s1d3));
+       ASSERT_EQ(EXDEV, errno);
+
+       /* Source and destination share the same parent directory. */
+       ASSERT_EQ(0, link(file2_s1d2, file1_s1d2));
+       ASSERT_EQ(0, link(file2_s1d3, file1_s1d3));
+}
+
+/*
+ * Checks rename(2) and renameat2(2) with RENAME_EXCHANGE on files, with
+ * LANDLOCK_ACCESS_FS_REMOVE_FILE granted on dir_s1d3 and dir_s2d2: operations
+ * between different parent directories fail with EXDEV, operations in a
+ * directory without the right fail with EACCES, and operations within an
+ * allowed directory succeed.
+ */
+TEST_F_FORK(layout1, rename_file)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d3,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_FILE,
+               },
+               {
+                       .path = dir_s2d2,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_FILE,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Removes these files while it is still allowed everywhere. */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file1_s1d2));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /*
+        * Tries to replace a file, from a directory that allows file removal,
+        * but to a different directory (which also allows file removal).
+        */
+       ASSERT_EQ(-1, rename(file1_s2d3, file1_s1d3));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d3, AT_FDCWD, file1_s1d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d3, AT_FDCWD, dir_s1d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EXDEV, errno);
+
+       /*
+        * Tries to replace a file, from a directory that denies file removal,
+        * to a different directory (which allows file removal).
+        */
+       ASSERT_EQ(-1, rename(file1_s2d1, file1_s1d3));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d1, AT_FDCWD, file1_s1d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s2d2, AT_FDCWD, file1_s1d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EXDEV, errno);
+
+       /* Exchanges files and directories that partially allow removal. */
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s2d2, AT_FDCWD, file1_s2d1,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d1, AT_FDCWD, dir_s2d2,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EACCES, errno);
+
+       /* Renames files with different parents. */
+       ASSERT_EQ(-1, rename(file1_s2d2, file1_s1d2));
+       ASSERT_EQ(EXDEV, errno);
+       /* Same check as above, but without an existing destination file. */
+       ASSERT_EQ(0, unlink(file1_s1d3));
+       ASSERT_EQ(-1, rename(file1_s2d1, file1_s1d3));
+       ASSERT_EQ(EXDEV, errno);
+
+       /* Exchanges and renames files with same parent. */
+       ASSERT_EQ(0, renameat2(AT_FDCWD, file2_s2d3, AT_FDCWD, file1_s2d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(0, rename(file2_s2d3, file1_s2d3));
+
+       /* Exchanges files and directories with same parent, twice. */
+       ASSERT_EQ(0, renameat2(AT_FDCWD, file1_s2d2, AT_FDCWD, dir_s2d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(0, renameat2(AT_FDCWD, file1_s2d2, AT_FDCWD, dir_s2d3,
+                               RENAME_EXCHANGE));
+}
+
+/*
+ * Checks rename(2) and renameat2(2) with RENAME_EXCHANGE on directories, with
+ * LANDLOCK_ACCESS_FS_REMOVE_DIR granted on dir_s1d2 and dir_s2d1: operations
+ * between different parent directories fail with EXDEV, operations in a parent
+ * without the right fail with EACCES, and operations within dir_s1d2 succeed.
+ */
+TEST_F_FORK(layout1, rename_dir)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_DIR,
+               },
+               {
+                       .path = dir_s2d1,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_DIR,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Empties dir_s1d3 to allow renaming. */
+       ASSERT_EQ(0, unlink(file1_s1d3));
+       ASSERT_EQ(0, unlink(file2_s1d3));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Exchanges and renames directory to a different parent. */
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s2d3, AT_FDCWD, dir_s1d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, rename(dir_s2d3, dir_s1d3));
+       ASSERT_EQ(EXDEV, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d2, AT_FDCWD, dir_s1d3,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EXDEV, errno);
+
+       /*
+        * Exchanges directory to the same parent, which doesn't allow
+        * directory removal.
+        */
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s1d1, AT_FDCWD, dir_s2d1,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s1d1, AT_FDCWD, dir_s1d2,
+                               RENAME_EXCHANGE));
+       ASSERT_EQ(EACCES, errno);
+
+       /*
+        * Exchanges and renames directory to the same parent, which allows
+        * directory removal.
+        */
+       ASSERT_EQ(0, renameat2(AT_FDCWD, dir_s1d3, AT_FDCWD, file1_s1d2,
+                               RENAME_EXCHANGE));
+       /* After the exchange, the dir_s1d3 path names the regular file. */
+       ASSERT_EQ(0, unlink(dir_s1d3));
+       ASSERT_EQ(0, mkdir(dir_s1d3, 0700));
+       /* The file1_s1d2 path names the (empty) directory. */
+       ASSERT_EQ(0, rename(file1_s1d2, dir_s1d3));
+       ASSERT_EQ(0, rmdir(dir_s1d3));
+}
+
+/*
+ * Checks LANDLOCK_ACCESS_FS_REMOVE_DIR granted on dir_s1d2: dir_s1d3 (inside
+ * dir_s1d2) can be removed with rmdir(2) and unlinkat(2), whereas dir_s1d2
+ * itself and dir_s1d1 cannot (EACCES).
+ */
+TEST_F_FORK(layout1, remove_dir)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_DIR,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Removes files before enforcing; this leaves dir_s1d3 empty. */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file1_s1d2));
+       ASSERT_EQ(0, unlink(file1_s1d3));
+       ASSERT_EQ(0, unlink(file2_s1d3));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Removes dir_s1d3 twice, once per syscall. */
+       ASSERT_EQ(0, rmdir(dir_s1d3));
+       ASSERT_EQ(0, mkdir(dir_s1d3, 0700));
+       ASSERT_EQ(0, unlinkat(AT_FDCWD, dir_s1d3, AT_REMOVEDIR));
+
+       /* dir_s1d2 itself cannot be removed. */
+       ASSERT_EQ(-1, rmdir(dir_s1d2));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, unlinkat(AT_FDCWD, dir_s1d2, AT_REMOVEDIR));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, rmdir(dir_s1d1));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, unlinkat(AT_FDCWD, dir_s1d1, AT_REMOVEDIR));
+       ASSERT_EQ(EACCES, errno);
+}
+
+/*
+ * Checks LANDLOCK_ACCESS_FS_REMOVE_FILE granted on dir_s1d2: unlink(2) and
+ * unlinkat(2) are denied (EACCES) in dir_s1d1 and allowed in dir_s1d2 and
+ * dir_s1d3.
+ */
+TEST_F_FORK(layout1, remove_file)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_REMOVE_FILE,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(-1, unlink(file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, unlinkat(AT_FDCWD, file1_s1d1, 0));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(0, unlink(file1_s1d2));
+       ASSERT_EQ(0, unlinkat(AT_FDCWD, file1_s1d3, 0));
+}
+
+/*
+ * Checks a file-creation access right granted on dir_s1d2: creating a file
+ * with mknod(2), link(2) or rename(2) is denied (EACCES) in dir_s1d1 and
+ * allowed in dir_s1d2 and dir_s1d3.
+ *
+ * @access: access right which is both handled by the ruleset and granted on
+ * dir_s1d2.
+ * @mode: file type bits passed to mknod(2), OR-ed with 0400.
+ * @dev: device number passed to mknod(2).
+ */
+static void test_make_file(struct __test_metadata *const _metadata,
+               const __u64 access, const mode_t mode, const dev_t dev)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = access,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, access, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /*
+        * Before enforcing: frees the names used below and recreates
+        * file2_s1d1 with the tested type, as a source for link and rename.
+        */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file2_s1d1));
+       ASSERT_EQ(0, mknod(file2_s1d1, mode | 0400, dev)) {
+               TH_LOG("Failed to make file \"%s\": %s",
+                               file2_s1d1, strerror(errno));
+       };
+
+       ASSERT_EQ(0, unlink(file1_s1d2));
+       ASSERT_EQ(0, unlink(file2_s1d2));
+
+       ASSERT_EQ(0, unlink(file1_s1d3));
+       ASSERT_EQ(0, unlink(file2_s1d3));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* dir_s1d1 is not covered by the rule. */
+       ASSERT_EQ(-1, mknod(file1_s1d1, mode | 0400, dev));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, link(file2_s1d1, file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, rename(file2_s1d1, file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+
+       /* dir_s1d2 is the directory the rule is tied to. */
+       ASSERT_EQ(0, mknod(file1_s1d2, mode | 0400, dev)) {
+               TH_LOG("Failed to make file \"%s\": %s",
+                               file1_s1d2, strerror(errno));
+       };
+       ASSERT_EQ(0, link(file1_s1d2, file2_s1d2));
+       ASSERT_EQ(0, unlink(file2_s1d2));
+       ASSERT_EQ(0, rename(file1_s1d2, file2_s1d2));
+
+       /* dir_s1d3 is beneath dir_s1d2. */
+       ASSERT_EQ(0, mknod(file1_s1d3, mode | 0400, dev));
+       ASSERT_EQ(0, link(file1_s1d3, file2_s1d3));
+       ASSERT_EQ(0, unlink(file2_s1d3));
+       ASSERT_EQ(0, rename(file1_s1d3, file2_s1d3));
+}
+
+/* Runs test_make_file() for character devices (LANDLOCK_ACCESS_FS_MAKE_CHAR). */
+TEST_F_FORK(layout1, make_char)
+{
+       /* Creates a /dev/null device. */
+       set_cap(_metadata, CAP_MKNOD);
+       test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_CHAR, S_IFCHR,
+                       makedev(1, 3));
+}
+
+/* Runs test_make_file() for block devices (LANDLOCK_ACCESS_FS_MAKE_BLOCK). */
+TEST_F_FORK(layout1, make_block)
+{
+       /* Creates a /dev/loop0 device. */
+       set_cap(_metadata, CAP_MKNOD);
+       test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_BLOCK, S_IFBLK,
+                       makedev(7, 0));
+}
+
+/* Runs test_make_file() for regular files, with an explicit S_IFREG type. */
+TEST_F_FORK(layout1, make_reg_1)
+{
+       test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_REG, S_IFREG, 0);
+}
+
+/*
+ * Runs test_make_file() for regular files without any file type bit in the
+ * mode (presumably relying on mknod(2) treating a zero file type as a regular
+ * file).
+ */
+TEST_F_FORK(layout1, make_reg_2)
+{
+       test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_REG, 0, 0);
+}
+
+/* Runs test_make_file() for sockets (LANDLOCK_ACCESS_FS_MAKE_SOCK). */
+TEST_F_FORK(layout1, make_sock)
+{
+       test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_SOCK, S_IFSOCK, 0);
+}
+
+/* Runs test_make_file() for named pipes (LANDLOCK_ACCESS_FS_MAKE_FIFO). */
+TEST_F_FORK(layout1, make_fifo)
+{
+       test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_FIFO, S_IFIFO, 0);
+}
+
+/*
+ * Checks LANDLOCK_ACCESS_FS_MAKE_SYM granted on dir_s1d2: creating a symbolic
+ * link with symlink(2), link(2) or rename(2) is denied (EACCES) in dir_s1d1
+ * and allowed in dir_s1d2 and dir_s1d3.
+ */
+TEST_F_FORK(layout1, make_sym)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_MAKE_SYM,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /*
+        * Before enforcing: frees the names used below and recreates
+        * file2_s1d1 as a symbolic link, as a source for link and rename.
+        */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file2_s1d1));
+       ASSERT_EQ(0, symlink("none", file2_s1d1));
+
+       ASSERT_EQ(0, unlink(file1_s1d2));
+       ASSERT_EQ(0, unlink(file2_s1d2));
+
+       ASSERT_EQ(0, unlink(file1_s1d3));
+       ASSERT_EQ(0, unlink(file2_s1d3));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* dir_s1d1 is not covered by the rule. */
+       ASSERT_EQ(-1, symlink("none", file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, link(file2_s1d1, file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(-1, rename(file2_s1d1, file1_s1d1));
+       ASSERT_EQ(EACCES, errno);
+
+       /* dir_s1d2 is the directory the rule is tied to. */
+       ASSERT_EQ(0, symlink("none", file1_s1d2));
+       ASSERT_EQ(0, link(file1_s1d2, file2_s1d2));
+       ASSERT_EQ(0, unlink(file2_s1d2));
+       ASSERT_EQ(0, rename(file1_s1d2, file2_s1d2));
+
+       /* dir_s1d3 is beneath dir_s1d2. */
+       ASSERT_EQ(0, symlink("none", file1_s1d3));
+       ASSERT_EQ(0, link(file1_s1d3, file2_s1d3));
+       ASSERT_EQ(0, unlink(file2_s1d3));
+       ASSERT_EQ(0, rename(file1_s1d3, file2_s1d3));
+}
+
+/*
+ * Checks LANDLOCK_ACCESS_FS_MAKE_DIR granted on dir_s1d2: mkdir(2) is denied
+ * (EACCES) in dir_s1d1 and allowed in dir_s1d2 and dir_s1d3.
+ */
+TEST_F_FORK(layout1, make_dir)
+{
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_MAKE_DIR,
+               },
+               {}
+       };
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+
+       /* Frees the names reused as directory names below. */
+       ASSERT_EQ(0, unlink(file1_s1d1));
+       ASSERT_EQ(0, unlink(file1_s1d2));
+       ASSERT_EQ(0, unlink(file1_s1d3));
+
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Uses file_* as directory names. */
+       ASSERT_EQ(-1, mkdir(file1_s1d1, 0700));
+       ASSERT_EQ(EACCES, errno);
+       ASSERT_EQ(0, mkdir(file1_s1d2, 0700));
+       ASSERT_EQ(0, mkdir(file1_s1d3, 0700));
+}
+
+/*
+ * Opens the file referenced by @fd again, through its /proc/self/fd/<fd>
+ * path.
+ *
+ * @fd: file descriptor number used to build the path.
+ * @open_flags: flags passed to open(2).
+ *
+ * Returns the result of open(2): a new file descriptor, or -1 with errno set.
+ */
+static int open_proc_fd(struct __test_metadata *const _metadata, const int fd,
+               const int open_flags)
+{
+       static const char path_template[] = "/proc/self/fd/%d";
+       /* Room for the template plus 10 extra bytes for the FD digits. */
+       char procfd_path[sizeof(path_template) + 10];
+       const int procfd_path_size = snprintf(procfd_path, sizeof(procfd_path),
+                       path_template, fd);
+
+       /* Fails the test if the path was truncated. */
+       ASSERT_LT(procfd_path_size, sizeof(procfd_path));
+       return open(procfd_path, open_flags);
+}
+
+/*
+ * Checks that access rights still apply to an unlinked file reopened through
+ * /proc/self/fd: with only LANDLOCK_ACCESS_FS_READ_FILE granted on
+ * file1_s1d2, reopening it read-only succeeds whereas reopening it read-write
+ * fails with EACCES.
+ */
+TEST_F_FORK(layout1, proc_unlinked_file)
+{
+       const struct rule rules[] = {
+               {
+                       /* The rule is tied to a file, not to a directory. */
+                       .path = file1_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       int reg_fd, proc_fd;
+       const int ruleset_fd = create_ruleset(_metadata,
+                       LANDLOCK_ACCESS_FS_READ_FILE |
+                       LANDLOCK_ACCESS_FS_WRITE_FILE, rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_RDWR));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       /* Keeps the file opened, then removes its name. */
+       reg_fd = open(file1_s1d2, O_RDONLY | O_CLOEXEC);
+       ASSERT_LE(0, reg_fd);
+       ASSERT_EQ(0, unlink(file1_s1d2));
+
+       proc_fd = open_proc_fd(_metadata, reg_fd, O_RDONLY | O_CLOEXEC);
+       ASSERT_LE(0, proc_fd);
+       ASSERT_EQ(0, close(proc_fd));
+
+       proc_fd = open_proc_fd(_metadata, reg_fd, O_RDWR | O_CLOEXEC);
+       ASSERT_EQ(-1, proc_fd) {
+               TH_LOG("Successfully opened /proc/self/fd/%d: %s",
+                               reg_fd, strerror(errno));
+       }
+       ASSERT_EQ(EACCES, errno);
+
+       ASSERT_EQ(0, close(reg_fd));
+}
+
+/*
+ * Checks that a ruleset handling LANDLOCK_ACCESS_FS_READ_FILE and
+ * LANDLOCK_ACCESS_FS_WRITE_FILE does not restrict pipes: both ends of a pipe
+ * can still be used directly and reopened through /proc/self/fd, while
+ * regular files outside of dir_s1d2 are denied.
+ */
+TEST_F_FORK(layout1, proc_pipe)
+{
+       int proc_fd;
+       int pipe_fds[2];
+       char buf = '\0';
+       const struct rule rules[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       /* Limits read and write access to files tied to the filesystem. */
+       const int ruleset_fd = create_ruleset(_metadata, rules[0].access,
+                       rules);
+
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks enforcement for normal files. */
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDWR));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDWR));
+
+       /* Checks access to pipes through FD. */
+       ASSERT_EQ(0, pipe2(pipe_fds, O_CLOEXEC));
+       ASSERT_EQ(1, write(pipe_fds[1], ".", 1)) {
+               TH_LOG("Failed to write in pipe: %s", strerror(errno));
+       }
+       ASSERT_EQ(1, read(pipe_fds[0], &buf, 1));
+       ASSERT_EQ('.', buf);
+
+       /* Checks write access to pipe through /proc/self/fd . */
+       proc_fd = open_proc_fd(_metadata, pipe_fds[1], O_WRONLY | O_CLOEXEC);
+       ASSERT_LE(0, proc_fd);
+       ASSERT_EQ(1, write(proc_fd, ".", 1)) {
+               TH_LOG("Failed to write through /proc/self/fd/%d: %s",
+                               pipe_fds[1], strerror(errno));
+       }
+       ASSERT_EQ(0, close(proc_fd));
+
+       /* Checks read access to pipe through /proc/self/fd . */
+       proc_fd = open_proc_fd(_metadata, pipe_fds[0], O_RDONLY | O_CLOEXEC);
+       ASSERT_LE(0, proc_fd);
+       buf = '\0';
+       ASSERT_EQ(1, read(proc_fd, &buf, 1)) {
+               /* The proc path was built from the read end of the pipe. */
+               TH_LOG("Failed to read through /proc/self/fd/%d: %s",
+                               pipe_fds[0], strerror(errno));
+       }
+       /* The byte written through /proc/self/fd above must come back. */
+       ASSERT_EQ('.', buf);
+       ASSERT_EQ(0, close(proc_fd));
+
+       ASSERT_EQ(0, close(pipe_fds[0]));
+       ASSERT_EQ(0, close(pipe_fds[1]));
+}
+
+/* Fixture without data: the state lives in the filesystem and mount setup. */
+FIXTURE(layout1_bind) {
+};
+
+/* Creates the layout1 hierarchy, then bind mounts dir_s1d2 on dir_s2d2. */
+FIXTURE_SETUP(layout1_bind)
+{
+       prepare_layout(_metadata);
+
+       create_layout1(_metadata);
+
+       /* CAP_SYS_ADMIN is only set around the mount(2) call. */
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, mount(dir_s1d2, dir_s2d2, NULL, MS_BIND, NULL));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+}
+
+/* Undoes FIXTURE_SETUP(layout1_bind): unmounts dir_s2d2, then removes the layout. */
+FIXTURE_TEARDOWN(layout1_bind)
+{
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       /* EXPECT (not ASSERT), so the cleanup below still runs on failure. */
+       EXPECT_EQ(0, umount(dir_s2d2));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+
+       remove_layout1(_metadata);
+
+       cleanup_layout(_metadata);
+}
+
+/* Paths of s1d3 and its f1 file as seen through the s2d1/s2d2 hierarchy. */
+static const char bind_dir_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3";
+static const char bind_file1_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f1";
+
+/*
+ * layout1_bind hierarchy:
+ *
+ * tmp
+ * ├── s1d1
+ * │   ├── f1
+ * │   ├── f2
+ * │   └── s1d2
+ * │       ├── f1
+ * │       ├── f2
+ * │       └── s1d3
+ * │           ├── f1
+ * │           └── f2
+ * ├── s2d1
+ * │   ├── f1
+ * │   └── s2d2
+ * │       ├── f1
+ * │       ├── f2
+ * │       └── s1d3
+ * │           ├── f1
+ * │           └── f2
+ * └── s3d1
+ *     └── s3d2
+ *         └── s3d3
+ */
+
+/*
+ * Sanity check of the layout1_bind hierarchy: no ruleset is enforced here, so
+ * every open only depends on the layout built by the fixture.
+ */
+TEST_F_FORK(layout1_bind, no_restriction)
+{
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+
+       ASSERT_EQ(0, test_open(dir_s2d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s2d1, O_RDONLY));
+       ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY));
+       ASSERT_EQ(0, test_open(file1_s2d2, O_RDONLY));
+       /*
+        * NOTE(review): dir_s2d3 and file1_s2d3 presumably live under
+        * dir_s2d2, whose original content is covered by the bind mount (see
+        * the layout1_bind hierarchy) -- confirm against their definitions.
+        */
+       ASSERT_EQ(ENOENT, test_open(dir_s2d3, O_RDONLY));
+       ASSERT_EQ(ENOENT, test_open(file1_s2d3, O_RDONLY));
+
+       /* The content of dir_s1d2 is visible through the mount point. */
+       ASSERT_EQ(0, test_open(bind_dir_s1d3, O_RDONLY));
+       ASSERT_EQ(0, test_open(bind_file1_s1d3, O_RDONLY));
+
+       ASSERT_EQ(0, test_open(dir_s3d1, O_RDONLY));
+}
+
+/*
+ * Stacks four rulesets on a bind-mounted hierarchy and checks, after each new
+ * layer, the accesses allowed through the source path (s1d1/s1d2) and through
+ * the destination path (s2d1/s2d2).  The checks show that a rule set on one
+ * side of the bind mount also applies to the same content reached through the
+ * other side.
+ */
+TEST_F_FORK(layout1_bind, same_content_same_file)
+{
+       /*
+        * Sets access right on parent directories of both source and
+        * destination mount points.
+        */
+       const struct rule layer1_parent[] = {
+               {
+                       .path = dir_s1d1,
+                       .access = ACCESS_RO,
+               },
+               {
+                       .path = dir_s2d1,
+                       .access = ACCESS_RW,
+               },
+               {}
+       };
+       /*
+        * Sets access rights on the same bind-mounted directories.  The result
+        * should be ACCESS_RW for both directories, but not both hierarchies
+        * because of the first layer.
+        */
+       const struct rule layer2_mount_point[] = {
+               {
+                       .path = dir_s1d2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = dir_s2d2,
+                       .access = ACCESS_RW,
+               },
+               {}
+       };
+       /* Only allow read-access to the s1d3 hierarchies. */
+       const struct rule layer3_source[] = {
+               {
+                       .path = dir_s1d3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {}
+       };
+       /*
+        * Only grants write access to the bind-mounted file.  Stacked on top
+        * of the read-only third layer, this removes all access rights.
+        */
+       const struct rule layer4_destination[] = {
+               {
+                       .path = bind_file1_s1d3,
+                       .access = LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       int ruleset_fd;
+
+       /* Sets rules for the parent directories. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_parent);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks source hierarchy: read-only because of ACCESS_RO. */
+       ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* Checks destination hierarchy: read-write because of ACCESS_RW. */
+       ASSERT_EQ(0, test_open(file1_s2d1, O_RDWR));
+       ASSERT_EQ(0, test_open(dir_s2d1, O_RDONLY | O_DIRECTORY));
+
+       ASSERT_EQ(0, test_open(file1_s2d2, O_RDWR));
+       ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY | O_DIRECTORY));
+
+       /* Sets rules for the mount points. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer2_mount_point);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /*
+        * Checks source hierarchy: the parent directory is not covered by the
+        * second layer, and write access is still denied below s1d2.
+        */
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY));
+
+       ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       /* Checks destination hierarchy. */
+       ASSERT_EQ(EACCES, test_open(file1_s2d1, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s2d1, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s2d1, O_RDONLY | O_DIRECTORY));
+
+       ASSERT_EQ(0, test_open(file1_s2d2, O_RDWR));
+       ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY | O_DIRECTORY));
+       ASSERT_EQ(0, test_open(bind_dir_s1d3, O_RDONLY | O_DIRECTORY));
+
+       /* Sets a (shared) rule only on the source. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer3_source);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks source hierarchy. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY));
+
+       ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY));
+
+       /*
+        * Checks destination hierarchy: the rule set on dir_s1d3 also applies
+        * to the same directory seen through the bind mount.
+        */
+       ASSERT_EQ(EACCES, test_open(file1_s2d2, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s2d2, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(dir_s2d2, O_RDONLY | O_DIRECTORY));
+
+       ASSERT_EQ(0, test_open(bind_file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(bind_file1_s1d3, O_WRONLY));
+       ASSERT_EQ(EACCES, test_open(bind_dir_s1d3, O_RDONLY | O_DIRECTORY));
+
+       /* Sets a (shared) rule only on the destination. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer4_destination);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks source hierarchy. */
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY));
+
+       /* Checks destination hierarchy. */
+       ASSERT_EQ(EACCES, test_open(bind_file1_s1d3, O_RDONLY));
+       ASSERT_EQ(EACCES, test_open(bind_file1_s1d3, O_WRONLY));
+}
+
+/*
+ * Paths of the lower overlayfs layer.  Entries named with an "l" (fl*, dl*)
+ * have no counterpart in the upper layer, whereas entries named with an "o"
+ * (fo*, do*) also exist with the same name in the upper layer.
+ */
+#define LOWER_BASE     TMP_DIR "/lower"
+#define LOWER_DATA     LOWER_BASE "/data"
+static const char lower_fl1[] = LOWER_DATA "/fl1";
+static const char lower_dl1[] = LOWER_DATA "/dl1";
+static const char lower_dl1_fl2[] = LOWER_DATA "/dl1/fl2";
+static const char lower_fo1[] = LOWER_DATA "/fo1";
+static const char lower_do1[] = LOWER_DATA "/do1";
+static const char lower_do1_fo2[] = LOWER_DATA "/do1/fo2";
+static const char lower_do1_fl3[] = LOWER_DATA "/do1/fl3";
+
+/* NULL-terminated list of the files directly in LOWER_DATA. */
+static const char (*lower_base_files[])[] = {
+       &lower_fl1,
+       &lower_fo1,
+       NULL
+};
+/* NULL-terminated list of the directories directly in LOWER_DATA. */
+static const char (*lower_base_directories[])[] = {
+       &lower_dl1,
+       &lower_do1,
+       NULL
+};
+/* NULL-terminated list of the files in sub-directories of LOWER_DATA. */
+static const char (*lower_sub_files[])[] = {
+       &lower_dl1_fl2,
+       &lower_do1_fo2,
+       &lower_do1_fl3,
+       NULL
+};
+
+/*
+ * Paths of the upper overlayfs layer.  Entries named with a "u" (fu*, du*)
+ * have no counterpart in the lower layer, whereas entries named with an "o"
+ * (fo*, do*) also exist with the same name in the lower layer.  UPPER_WORK is
+ * passed as the overlayfs workdir.
+ */
+#define UPPER_BASE     TMP_DIR "/upper"
+#define UPPER_DATA     UPPER_BASE "/data"
+#define UPPER_WORK     UPPER_BASE "/work"
+static const char upper_fu1[] = UPPER_DATA "/fu1";
+static const char upper_du1[] = UPPER_DATA "/du1";
+static const char upper_du1_fu2[] = UPPER_DATA "/du1/fu2";
+static const char upper_fo1[] = UPPER_DATA "/fo1";
+static const char upper_do1[] = UPPER_DATA "/do1";
+static const char upper_do1_fo2[] = UPPER_DATA "/do1/fo2";
+static const char upper_do1_fu3[] = UPPER_DATA "/do1/fu3";
+
+/* NULL-terminated list of the files directly in UPPER_DATA. */
+static const char (*upper_base_files[])[] = {
+       &upper_fu1,
+       &upper_fo1,
+       NULL
+};
+/* NULL-terminated list of the directories directly in UPPER_DATA. */
+static const char (*upper_base_directories[])[] = {
+       &upper_du1,
+       &upper_do1,
+       NULL
+};
+/* NULL-terminated list of the files in sub-directories of UPPER_DATA. */
+static const char (*upper_sub_files[])[] = {
+       &upper_du1_fu2,
+       &upper_do1_fo2,
+       &upper_do1_fu3,
+       NULL
+};
+
+/*
+ * Paths seen through the overlayfs mount point (MERGE_DATA), which exposes
+ * the entries of both the lower and the upper layers.
+ */
+#define MERGE_BASE     TMP_DIR "/merge"
+#define MERGE_DATA     MERGE_BASE "/data"
+static const char merge_fl1[] = MERGE_DATA "/fl1";
+static const char merge_dl1[] = MERGE_DATA "/dl1";
+static const char merge_dl1_fl2[] = MERGE_DATA "/dl1/fl2";
+static const char merge_fu1[] = MERGE_DATA "/fu1";
+static const char merge_du1[] = MERGE_DATA "/du1";
+static const char merge_du1_fu2[] = MERGE_DATA "/du1/fu2";
+static const char merge_fo1[] = MERGE_DATA "/fo1";
+static const char merge_do1[] = MERGE_DATA "/do1";
+static const char merge_do1_fo2[] = MERGE_DATA "/do1/fo2";
+static const char merge_do1_fl3[] = MERGE_DATA "/do1/fl3";
+static const char merge_do1_fu3[] = MERGE_DATA "/do1/fu3";
+
+/* NULL-terminated list of the files directly in MERGE_DATA. */
+static const char (*merge_base_files[])[] = {
+       &merge_fl1,
+       &merge_fu1,
+       &merge_fo1,
+       NULL
+};
+/* NULL-terminated list of the directories directly in MERGE_DATA. */
+static const char (*merge_base_directories[])[] = {
+       &merge_dl1,
+       &merge_du1,
+       &merge_do1,
+       NULL
+};
+/* NULL-terminated list of the files in sub-directories of MERGE_DATA. */
+static const char (*merge_sub_files[])[] = {
+       &merge_dl1_fl2,
+       &merge_du1_fu2,
+       &merge_do1_fo2,
+       &merge_do1_fl3,
+       &merge_do1_fu3,
+       NULL
+};
+
+/*
+ * layout2_overlay hierarchy:
+ *
+ * tmp
+ * ├── lower
+ * │   └── data
+ * │       ├── dl1
+ * │       │   └── fl2
+ * │       ├── do1
+ * │       │   ├── fl3
+ * │       │   └── fo2
+ * │       ├── fl1
+ * │       └── fo1
+ * ├── merge
+ * │   └── data
+ * │       ├── dl1
+ * │       │   └── fl2
+ * │       ├── do1
+ * │       │   ├── fl3
+ * │       │   ├── fo2
+ * │       │   └── fu3
+ * │       ├── du1
+ * │       │   └── fu2
+ * │       ├── fl1
+ * │       ├── fo1
+ * │       └── fu1
+ * └── upper
+ *     ├── data
+ *     │   ├── do1
+ *     │   │   ├── fo2
+ *     │   │   └── fu3
+ *     │   ├── du1
+ *     │   │   └── fu2
+ *     │   ├── fo1
+ *     │   └── fu1
+ *     └── work
+ *         └── work
+ */
+
+FIXTURE(layout2_overlay) {
+};
+
+/*
+ * Builds the layout2_overlay hierarchy: two tmpfs mounts holding the lower
+ * and upper layers, and an overlayfs mount on MERGE_DATA combining them.
+ */
+FIXTURE_SETUP(layout2_overlay)
+{
+       prepare_layout(_metadata);
+
+       /* Lower layer: a dedicated tmpfs populated with the lower_* files. */
+       create_directory(_metadata, LOWER_BASE);
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       /* Creates tmpfs mount points to get deterministic overlayfs. */
+       ASSERT_EQ(0, mount("tmp", LOWER_BASE, "tmpfs", 0, "size=4m,mode=700"));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       create_file(_metadata, lower_fl1);
+       create_file(_metadata, lower_dl1_fl2);
+       create_file(_metadata, lower_fo1);
+       create_file(_metadata, lower_do1_fo2);
+       create_file(_metadata, lower_do1_fl3);
+
+       /* Upper layer: another tmpfs, which also hosts the work directory. */
+       create_directory(_metadata, UPPER_BASE);
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       ASSERT_EQ(0, mount("tmp", UPPER_BASE, "tmpfs", 0, "size=4m,mode=700"));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       create_file(_metadata, upper_fu1);
+       create_file(_metadata, upper_du1_fu2);
+       create_file(_metadata, upper_fo1);
+       create_file(_metadata, upper_do1_fo2);
+       create_file(_metadata, upper_do1_fu3);
+       ASSERT_EQ(0, mkdir(UPPER_WORK, 0700));
+
+       /*
+        * Merge point: overlayfs mounted on MERGE_DATA.
+        * NOTE(review): CAP_DAC_OVERRIDE is presumably required by overlayfs
+        * to set up its work directory -- confirm.
+        */
+       create_directory(_metadata, MERGE_DATA);
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       set_cap(_metadata, CAP_DAC_OVERRIDE);
+       ASSERT_EQ(0, mount("overlay", MERGE_DATA, "overlay", 0,
+                               "lowerdir=" LOWER_DATA
+                               ",upperdir=" UPPER_DATA
+                               ",workdir=" UPPER_WORK));
+       clear_cap(_metadata, CAP_DAC_OVERRIDE);
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+}
+
+/*
+ * Removes the files created by FIXTURE_SETUP(layout2_overlay) and unmounts
+ * the lower tmpfs, the upper tmpfs and the overlayfs, in that order.  Every
+ * step uses EXPECT_EQ() so that a failure does not stop the cleanup.
+ */
+FIXTURE_TEARDOWN(layout2_overlay)
+{
+       EXPECT_EQ(0, remove_path(lower_do1_fl3));
+       EXPECT_EQ(0, remove_path(lower_dl1_fl2));
+       EXPECT_EQ(0, remove_path(lower_fl1));
+       EXPECT_EQ(0, remove_path(lower_do1_fo2));
+       EXPECT_EQ(0, remove_path(lower_fo1));
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, umount(LOWER_BASE));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, remove_path(LOWER_BASE));
+
+       EXPECT_EQ(0, remove_path(upper_do1_fu3));
+       EXPECT_EQ(0, remove_path(upper_du1_fu2));
+       EXPECT_EQ(0, remove_path(upper_fu1));
+       EXPECT_EQ(0, remove_path(upper_do1_fo2));
+       EXPECT_EQ(0, remove_path(upper_fo1));
+       /*
+        * NOTE(review): the nested "work" directory is not created by the
+        * setup; it is presumably created by overlayfs at mount time (see the
+        * layout2_overlay hierarchy) -- confirm.
+        */
+       EXPECT_EQ(0, remove_path(UPPER_WORK "/work"));
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, umount(UPPER_BASE));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, remove_path(UPPER_BASE));
+
+       set_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, umount(MERGE_DATA));
+       clear_cap(_metadata, CAP_SYS_ADMIN);
+       EXPECT_EQ(0, remove_path(MERGE_DATA));
+
+       cleanup_layout(_metadata);
+}
+
+/*
+ * Sanity check of the layout2_overlay hierarchy: no ruleset is enforced here,
+ * so every file and directory of the lower, upper and merge views must be
+ * readable.
+ */
+TEST_F_FORK(layout2_overlay, no_restriction)
+{
+       /* Lower layer. */
+       ASSERT_EQ(0, test_open(lower_fl1, O_RDONLY));
+       ASSERT_EQ(0, test_open(lower_dl1, O_RDONLY));
+       ASSERT_EQ(0, test_open(lower_dl1_fl2, O_RDONLY));
+       ASSERT_EQ(0, test_open(lower_fo1, O_RDONLY));
+       ASSERT_EQ(0, test_open(lower_do1, O_RDONLY));
+       ASSERT_EQ(0, test_open(lower_do1_fo2, O_RDONLY));
+       ASSERT_EQ(0, test_open(lower_do1_fl3, O_RDONLY));
+
+       /* Upper layer. */
+       ASSERT_EQ(0, test_open(upper_fu1, O_RDONLY));
+       ASSERT_EQ(0, test_open(upper_du1, O_RDONLY));
+       ASSERT_EQ(0, test_open(upper_du1_fu2, O_RDONLY));
+       ASSERT_EQ(0, test_open(upper_fo1, O_RDONLY));
+       ASSERT_EQ(0, test_open(upper_do1, O_RDONLY));
+       ASSERT_EQ(0, test_open(upper_do1_fo2, O_RDONLY));
+       ASSERT_EQ(0, test_open(upper_do1_fu3, O_RDONLY));
+
+       /* Merge point: entries of both layers are visible. */
+       ASSERT_EQ(0, test_open(merge_fl1, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_dl1, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_dl1_fl2, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_fu1, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_du1, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_du1_fu2, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_fo1, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_do1, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_do1_fo2, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_do1_fl3, O_RDONLY));
+       ASSERT_EQ(0, test_open(merge_do1_fu3, O_RDONLY));
+}
+
+/*
+ * Iterates over a NULL-terminated array of pointers to path strings (e.g.
+ * lower_base_files).  In the loop body, path_entry points to the current path
+ * and i holds its index.
+ *
+ * The entry is tested against NULL before being dereferenced, so the
+ * terminator is never dereferenced.  Once the loop is over, i is the index of
+ * the terminator and path_entry keeps the value of the last iteration.
+ */
+#define for_each_path(path_list, path_entry, i)                        \
+       for ((i) = 0;                                                   \
+                       (path_list)[i] &&                               \
+                       ((path_entry) = *(path_list)[i], 1);            \
+                       (i)++)
+
+/*
+ * Stacks five rulesets on the layout2_overlay hierarchy and checks, after
+ * each new layer, which files and directories can be opened through the lower
+ * layer, the upper layer and the merge point.  The checks show that the access
+ * rights of the merge point are independent from the ones of the layers
+ * backing it.
+ */
+TEST_F_FORK(layout2_overlay, same_content_different_file)
+{
+       /* Sets access right on parent directories of both layers. */
+       const struct rule layer1_base[] = {
+               {
+                       .path = LOWER_BASE,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = UPPER_BASE,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = MERGE_BASE,
+                       .access = ACCESS_RW,
+               },
+               {}
+       };
+       /* Sets the same access rights one level deeper, on the data directories. */
+       const struct rule layer2_data[] = {
+               {
+                       .path = LOWER_DATA,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = UPPER_DATA,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = MERGE_DATA,
+                       .access = ACCESS_RW,
+               },
+               {}
+       };
+       /* Sets access right on directories inside both layers. */
+       const struct rule layer3_subdirs[] = {
+               {
+                       .path = lower_dl1,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = lower_do1,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = upper_du1,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = upper_do1,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = merge_dl1,
+                       .access = ACCESS_RW,
+               },
+               {
+                       .path = merge_du1,
+                       .access = ACCESS_RW,
+               },
+               {
+                       .path = merge_do1,
+                       .access = ACCESS_RW,
+               },
+               {}
+       };
+       /* Tighten access rights to the files. */
+       const struct rule layer4_files[] = {
+               {
+                       .path = lower_dl1_fl2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = lower_do1_fo2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = lower_do1_fl3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = upper_du1_fu2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = upper_do1_fo2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = upper_do1_fu3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE,
+               },
+               {
+                       .path = merge_dl1_fl2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {
+                       .path = merge_du1_fu2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {
+                       .path = merge_do1_fo2,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {
+                       .path = merge_do1_fl3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {
+                       .path = merge_do1_fu3,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       /* Only keeps read and write access to files below the merge point. */
+       const struct rule layer5_merge_only[] = {
+               {
+                       .path = MERGE_DATA,
+                       .access = LANDLOCK_ACCESS_FS_READ_FILE |
+                               LANDLOCK_ACCESS_FS_WRITE_FILE,
+               },
+               {}
+       };
+       int ruleset_fd;
+       size_t i;
+       const char *path_entry;
+
+       /* Sets rules on base directories (i.e. outside overlay scope). */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_base);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks lower layer. */
+       for_each_path(lower_base_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY));
+               ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY));
+       }
+       for_each_path(lower_base_directories, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(lower_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY));
+               ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY));
+       }
+       /* Checks upper layer. */
+       for_each_path(upper_base_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY));
+               ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY));
+       }
+       for_each_path(upper_base_directories, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(upper_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY));
+               ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY));
+       }
+       /*
+        * Checks that access rights are independent from the lower and upper
+        * layers: write access to upper files viewed through the merge point
+        * is still allowed, and write access to lower file viewed (and copied)
+        * through the merge point is still allowed.
+        */
+       for_each_path(merge_base_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+       for_each_path(merge_base_directories, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(merge_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+
+       /* Sets rules on data directories (i.e. inside overlay scope). */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer2_data);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks merge. */
+       for_each_path(merge_base_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+       for_each_path(merge_base_directories, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(merge_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+
+       /* Same checks with tighter rules. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer3_subdirs);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks changes for lower layer. */
+       for_each_path(lower_base_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY));
+       }
+       /* Checks changes for upper layer. */
+       for_each_path(upper_base_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY));
+       }
+       /* Checks all merge accesses. */
+       for_each_path(merge_base_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDWR));
+       }
+       for_each_path(merge_base_directories, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(merge_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+
+       /* Sets rules directly on overlayed files. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer4_files);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks unchanged accesses on lower layer. */
+       for_each_path(lower_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY));
+               ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY));
+       }
+       /* Checks unchanged accesses on upper layer. */
+       for_each_path(upper_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDONLY));
+               ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY));
+       }
+       /* Checks all merge accesses. */
+       for_each_path(merge_base_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDWR));
+       }
+       for_each_path(merge_base_directories, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(merge_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+
+       /* Only allows access to the merge hierarchy. */
+       ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer5_merge_only);
+       ASSERT_LE(0, ruleset_fd);
+       enforce_ruleset(_metadata, ruleset_fd);
+       ASSERT_EQ(0, close(ruleset_fd));
+
+       /* Checks new accesses on lower layer. */
+       for_each_path(lower_sub_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY));
+       }
+       /* Checks new accesses on upper layer. */
+       for_each_path(upper_sub_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY));
+       }
+       /* Checks all merge accesses. */
+       for_each_path(merge_base_files, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDWR));
+       }
+       for_each_path(merge_base_directories, path_entry, i) {
+               ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY));
+       }
+       for_each_path(merge_sub_files, path_entry, i) {
+               ASSERT_EQ(0, test_open(path_entry, O_RDWR));
+       }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c
new file mode 100644 (file)
index 0000000..15fbef9
--- /dev/null
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - Ptrace
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019-2020 ANSSI
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/landlock.h>
+#include <signal.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "common.h"
+
+/*
+ * Puts the calling process in a new Landlock domain.
+ *
+ * The ruleset only handles LANDLOCK_ACCESS_FS_MAKE_BLOCK and no rule is added
+ * to it.  Every step is checked with EXPECT_*(): a failure is recorded in
+ * _metadata but the remaining steps are still attempted.
+ */
+static void create_domain(struct __test_metadata *const _metadata)
+{
+       struct landlock_ruleset_attr attr = {
+               .handled_access_fs = LANDLOCK_ACCESS_FS_MAKE_BLOCK,
+       };
+       const int domain_fd = landlock_create_ruleset(&attr, sizeof(attr), 0);
+
+       EXPECT_LE(0, domain_fd) {
+               TH_LOG("Failed to create a ruleset: %s", strerror(errno));
+       }
+
+       /* no_new_privs is set before the ruleset is enforced. */
+       EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+       EXPECT_EQ(0, landlock_restrict_self(domain_fd, 0));
+
+       /* The file descriptor is not needed once the domain is in place. */
+       EXPECT_EQ(0, close(domain_fd));
+}
+
+/*
+ * Tries to open /proc/<pid>/environ read-only and closes it right away.
+ *
+ * Returns 0 on success, E2BIG if the path cannot be built in the local
+ * buffer, or the errno value set by the failing open(2) or close(2) call.
+ */
+static int test_ptrace_read(const pid_t pid)
+{
+       static const char path_template[] = "/proc/%d/environ";
+       char path[sizeof(path_template) + 10];
+       const int path_len = snprintf(path, sizeof(path), path_template, pid);
+       int proc_fd;
+
+       /* Rejects a formatting error as well as a truncated path. */
+       if (path_len < 0 || (size_t)path_len >= sizeof(path))
+               return E2BIG;
+
+       proc_fd = open(path, O_RDONLY | O_CLOEXEC);
+       if (proc_fd < 0)
+               return errno;
+
+       /*
+        * Mixing error codes from close(2) and open(2) should not lead to any
+        * (access type) confusion for this test.
+        */
+       return close(proc_fd) == 0 ? 0 : errno;
+}
+
+FIXTURE(hierarchy) { };
+
+/*
+ * Selects which processes of the trace test enter a new Landlock domain (see
+ * create_domain()).
+ */
+FIXTURE_VARIANT(hierarchy) {
+       /* A domain is created before fork(), i.e. for parent and child. */
+       const bool domain_both;
+       /* The parent creates a domain after fork(). */
+       const bool domain_parent;
+       /* The child creates a domain after fork(). */
+       const bool domain_child;
+};
+
+/*
+ * Test multiple tracing combinations between a parent process P1 and a child
+ * process P2.
+ *
+ * Yama's scoped ptrace is presumed disabled.  If enabled, this optional
+ * restriction is enforced in addition to any Landlock check, which means that
+ * all P2 requests to trace P1 would be denied.
+ */
+
+/*
+ *        No domain
+ *
+ *   P1-.               P1 -> P2 : allow
+ *       \              P2 -> P1 : allow
+ *        'P2
+ */
+FIXTURE_VARIANT_ADD(hierarchy, allow_without_domain) {
+       .domain_both = false,
+       .domain_parent = false,
+       .domain_child = false,
+};
+
+/*
+ *        Child domain
+ *
+ *   P1--.              P1 -> P2 : allow
+ *        \             P2 -> P1 : deny
+ *        .'-----.
+ *        |  P2  |
+ *        '------'
+ */
+FIXTURE_VARIANT_ADD(hierarchy, allow_with_one_domain) {
+       .domain_both = false,
+       .domain_parent = false,
+       .domain_child = true,
+};
+
+/*
+ *        Parent domain
+ * .------.
+ * |  P1  --.           P1 -> P2 : deny
+ * '------'  \          P2 -> P1 : allow
+ *            '
+ *            P2
+ */
+FIXTURE_VARIANT_ADD(hierarchy, deny_with_parent_domain) {
+       .domain_both = false,
+       .domain_parent = true,
+       .domain_child = false,
+};
+
+/*
+ *        Parent + child domain (siblings)
+ * .------.
+ * |  P1  ---.          P1 -> P2 : deny
+ * '------'   \         P2 -> P1 : deny
+ *         .---'--.
+ *         |  P2  |
+ *         '------'
+ */
+FIXTURE_VARIANT_ADD(hierarchy, deny_with_sibling_domain) {
+       .domain_both = false,
+       .domain_parent = true,
+       .domain_child = true,
+};
+
+/*
+ *         Same domain (inherited)
+ * .-------------.
+ * | P1----.     |      P1 -> P2 : allow
+ * |        \    |      P2 -> P1 : allow
+ * |         '   |
+ * |         P2  |
+ * '-------------'
+ */
+FIXTURE_VARIANT_ADD(hierarchy, allow_sibling_domain) {
+       .domain_both = true,
+       .domain_parent = false,
+       .domain_child = false,
+};
+
+/*
+ *         Inherited + child domain
+ * .-----------------.
+ * |  P1----.        |  P1 -> P2 : allow
+ * |         \       |  P2 -> P1 : deny
+ * |        .-'----. |
+ * |        |  P2  | |
+ * |        '------' |
+ * '-----------------'
+ */
+FIXTURE_VARIANT_ADD(hierarchy, allow_with_nested_domain) {
+       .domain_both = true,
+       .domain_parent = false,
+       .domain_child = true,
+};
+
+/*
+ *         Inherited + parent domain
+ * .-----------------.
+ * |.------.         |  P1 -> P2 : deny
+ * ||  P1  ----.     |  P2 -> P1 : allow
+ * |'------'    \    |
+ * |             '   |
+ * |             P2  |
+ * '-----------------'
+ */
+FIXTURE_VARIANT_ADD(hierarchy, deny_with_nested_and_parent_domain) {
+       .domain_both = true,
+       .domain_parent = true,
+       .domain_child = false,
+};
+
+/*
+ *         Inherited + parent and child domain (siblings)
+ * .-----------------.
+ * | .------.        |  P1 -> P2 : deny
+ * | |  P1  .        |  P2 -> P1 : deny
+ * | '------'\       |
+ * |          \      |
+ * |        .--'---. |
+ * |        |  P2  | |
+ * |        '------' |
+ * '-----------------'
+ */
+FIXTURE_VARIANT_ADD(hierarchy, deny_with_forked_domain) {
+       .domain_both = true,
+       .domain_parent = true,
+       .domain_child = true,
+};
+
+FIXTURE_SETUP(hierarchy)
+{ }
+
+FIXTURE_TEARDOWN(hierarchy)
+{ }
+
+/*
+ * Test PTRACE_TRACEME and PTRACE_ATTACH for parent and child.
+ *
+ * The parent (P1) and the child (P2) are synchronized with two pipes:
+ * pipe_parent is written by the parent and read by the child, pipe_child is
+ * written by the child and read by the parent.  The sequence is:
+ * 1. both processes enter their domain, if any, and the parent signals it;
+ * 2. the child tries to read /proc/<parent>/environ and to attach to the
+ *    parent, then calls PTRACE_TRACEME and signals the parent;
+ * 3. the parent checks PTRACE_TRACEME, then tries to read
+ *    /proc/<child>/environ and to attach to the child, and signals the child;
+ * 4. the child exits with a status reflecting its own checks, which the
+ *    parent folds into the test result.
+ */
+TEST_F(hierarchy, trace)
+{
+       pid_t child, parent;
+       int status, err_proc_read;
+       int pipe_child[2], pipe_parent[2];
+       char buf_parent;
+       long ret;
+
+       /*
+        * Removes all effective and permitted capabilities to not interfere
+        * with cap_ptrace_access_check() in case of PTRACE_MODE_FSCREDS.
+        */
+       drop_caps(_metadata);
+
+       parent = getpid();
+       ASSERT_EQ(0, pipe2(pipe_child, O_CLOEXEC));
+       ASSERT_EQ(0, pipe2(pipe_parent, O_CLOEXEC));
+       if (variant->domain_both) {
+               create_domain(_metadata);
+               if (!_metadata->passed)
+                       /* Aborts before forking. */
+                       return;
+       }
+
+       child = fork();
+       ASSERT_LE(0, child);
+       if (child == 0) {
+               char buf_child;
+
+               /* Only keeps the pipe ends used by the child. */
+               ASSERT_EQ(0, close(pipe_parent[1]));
+               ASSERT_EQ(0, close(pipe_child[0]));
+               if (variant->domain_child)
+                       create_domain(_metadata);
+
+               /* Waits for the parent to be in a domain, if any. */
+               ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1));
+
+               /* Tests PTRACE_ATTACH and PTRACE_MODE_READ on the parent. */
+               err_proc_read = test_ptrace_read(parent);
+               ret = ptrace(PTRACE_ATTACH, parent, NULL, 0);
+               if (variant->domain_child) {
+                       EXPECT_EQ(-1, ret);
+                       EXPECT_EQ(EPERM, errno);
+                       EXPECT_EQ(EACCES, err_proc_read);
+               } else {
+                       EXPECT_EQ(0, ret);
+                       EXPECT_EQ(0, err_proc_read);
+               }
+               /* Releases the parent if the attach succeeded. */
+               if (ret == 0) {
+                       ASSERT_EQ(parent, waitpid(parent, &status, 0));
+                       ASSERT_EQ(1, WIFSTOPPED(status));
+                       ASSERT_EQ(0, ptrace(PTRACE_DETACH, parent, NULL, 0));
+               }
+
+               /* Tests child PTRACE_TRACEME. */
+               ret = ptrace(PTRACE_TRACEME);
+               if (variant->domain_parent) {
+                       EXPECT_EQ(-1, ret);
+                       EXPECT_EQ(EPERM, errno);
+               } else {
+                       EXPECT_EQ(0, ret);
+               }
+
+               /*
+                * Signals that the PTRACE_ATTACH test is done and the
+                * PTRACE_TRACEME test is ongoing.
+                */
+               ASSERT_EQ(1, write(pipe_child[1], ".", 1));
+
+               /*
+                * Stops itself so that the parent, expected to be the tracer
+                * in this case, can observe the stop with waitpid(2).
+                */
+               if (!variant->domain_parent) {
+                       ASSERT_EQ(0, raise(SIGSTOP));
+               }
+
+               /* Waits for the parent PTRACE_ATTACH test. */
+               ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1));
+               /* Reports the result of the child checks to the parent. */
+               _exit(_metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
+               return;
+       }
+
+       /* Only keeps the pipe ends used by the parent. */
+       ASSERT_EQ(0, close(pipe_child[1]));
+       ASSERT_EQ(0, close(pipe_parent[0]));
+       if (variant->domain_parent)
+               create_domain(_metadata);
+
+       /* Signals that the parent is in a domain, if any. */
+       ASSERT_EQ(1, write(pipe_parent[1], ".", 1));
+
+       /*
+        * Waits for the child to test PTRACE_ATTACH on the parent and start
+        * testing PTRACE_TRACEME.
+        */
+       ASSERT_EQ(1, read(pipe_child[0], &buf_parent, 1));
+
+       /* Tests child PTRACE_TRACEME. */
+       if (!variant->domain_parent) {
+               ASSERT_EQ(child, waitpid(child, &status, 0));
+               ASSERT_EQ(1, WIFSTOPPED(status));
+               ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
+       } else {
+               /* The child should not be traced by the parent. */
+               EXPECT_EQ(-1, ptrace(PTRACE_DETACH, child, NULL, 0));
+               EXPECT_EQ(ESRCH, errno);
+       }
+
+       /* Tests PTRACE_ATTACH and PTRACE_MODE_READ on the child. */
+       err_proc_read = test_ptrace_read(child);
+       ret = ptrace(PTRACE_ATTACH, child, NULL, 0);
+       if (variant->domain_parent) {
+               EXPECT_EQ(-1, ret);
+               EXPECT_EQ(EPERM, errno);
+               EXPECT_EQ(EACCES, err_proc_read);
+       } else {
+               EXPECT_EQ(0, ret);
+               EXPECT_EQ(0, err_proc_read);
+       }
+       /* Releases the child if the attach succeeded. */
+       if (ret == 0) {
+               ASSERT_EQ(child, waitpid(child, &status, 0));
+               ASSERT_EQ(1, WIFSTOPPED(status));
+               ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
+       }
+
+       /* Signals that the parent PTRACE_ATTACH test is done. */
+       ASSERT_EQ(1, write(pipe_parent[1], ".", 1));
+       ASSERT_EQ(child, waitpid(child, &status, 0));
+       /* Any abnormal termination of the child fails the whole test. */
+       if (WIFSIGNALED(status) || !WIFEXITED(status) ||
+                       WEXITSTATUS(status) != EXIT_SUCCESS)
+               _metadata->passed = 0;
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/true.c b/tools/testing/selftests/landlock/true.c
new file mode 100644 (file)
index 0000000..3f9ccbf
--- /dev/null
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Minimal program: performs no work and exits with status 0. */
+int main(void)
+{
+       return 0;
+}
index 5a1e85f..e541066 100644 (file)
@@ -14,7 +14,6 @@
 #include <sys/mman.h>
 #include <string.h>
 #include <fcntl.h>
-#include <string.h>
 
 #include "../kselftest.h"
 #include "../kselftest_harness.h"
index f85a093..48344a7 100644 (file)
@@ -33,7 +33,6 @@
 #include <sched.h>
 #include <time.h>
 #include <stdarg.h>
-#include <sched.h>
 #include <pthread.h>
 #include <signal.h>
 #include <sys/prctl.h>
index 8be8a03..1054e40 100644 (file)
@@ -12,6 +12,7 @@ TEST_GEN_PROGS += proc-self-map-files-001
 TEST_GEN_PROGS += proc-self-map-files-002
 TEST_GEN_PROGS += proc-self-syscall
 TEST_GEN_PROGS += proc-self-wchan
+TEST_GEN_PROGS += proc-subset-pid
 TEST_GEN_PROGS += proc-uptime-001
 TEST_GEN_PROGS += proc-uptime-002
 TEST_GEN_PROGS += read
diff --git a/tools/testing/selftests/proc/proc-subset-pid.c b/tools/testing/selftests/proc/proc-subset-pid.c
new file mode 100644 (file)
index 0000000..d1052bc
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that "mount -t proc -o subset=pid" hides everything but pids,
+ * /proc/self and /proc/thread-self.
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdio.h>
+
+/* Returns true when the two NUL-terminated strings compare equal. */
+static inline bool streq(const char *a, const char *b)
+{
+       return !strcmp(a, b);
+}
+
+/*
+ * Enters a new mount namespace, recursively marks "/" as MS_PRIVATE and
+ * mounts a procfs with the "subset=pid" option on /proc.
+ *
+ * Exits with status 4 when unshare() fails with ENOSYS or EPERM and with
+ * status 1 on any other failure.
+ */
+static void make_private_proc(void)
+{
+       if (unshare(CLONE_NEWNS) == -1)
+               exit((errno == ENOSYS || errno == EPERM) ? 4 : 1);
+
+       if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1 ||
+           mount(NULL, "/proc", "proc", 0, "subset=pid") == -1)
+               exit(1);
+}
+
+/*
+ * Returns true if every character of s is a decimal digit.
+ * An empty string also yields true.
+ */
+static bool string_is_pid(const char *s)
+{
+       const char *p;
+
+       for (p = s; *p != '\0'; p++) {
+               if (*p < '0' || *p > '9')
+                       return false;
+       }
+       return true;
+}
+
+/*
+ * Mounts a private "subset=pid" procfs, then checks that readdir() on /proc
+ * returns only ".", "..", "self", "thread-self" and all-digit names, each
+ * with the expected d_type, and that /proc/cpuinfo is reported as ENOENT by
+ * both readlink() and open().
+ */
+int main(void)
+{
+       make_private_proc();
+
+       DIR *d = opendir("/proc");
+       assert(d);
+
+       struct dirent *de;
+
+       /* Set when the matching entry is seen; used to catch duplicates. */
+       bool dot = false;
+       bool dot_dot = false;
+       bool self = false;
+       bool thread_self = false;
+
+       while ((de = readdir(d))) {
+               if (streq(de->d_name, ".")) {
+                       assert(!dot);
+                       dot = true;
+                       assert(de->d_type == DT_DIR);
+               } else if (streq(de->d_name, "..")) {
+                       assert(!dot_dot);
+                       dot_dot = true;
+                       assert(de->d_type == DT_DIR);
+               } else if (streq(de->d_name, "self")) {
+                       assert(!self);
+                       self = true;
+                       assert(de->d_type == DT_LNK);
+               } else if (streq(de->d_name, "thread-self")) {
+                       assert(!thread_self);
+                       thread_self = true;
+                       assert(de->d_type == DT_LNK);
+               } else {
+                       /* Anything else must be an all-digit directory name. */
+                       if (!string_is_pid(de->d_name)) {
+                               fprintf(stderr, "d_name '%s'\n", de->d_name);
+                               assert(0);
+                       }
+                       assert(de->d_type == DT_DIR);
+               }
+       }
+
+       /* A non-pid entry must be unreachable by direct lookup as well. */
+       char c;
+       int rv = readlink("/proc/cpuinfo", &c, 1);
+       assert(rv == -1 && errno == ENOENT);
+
+       int fd = open("/proc/cpuinfo", O_RDONLY);
+       assert(fd == -1 && errno == ENOENT);
+
+       return 0;
+}
index b3ef9e1..35ee78d 100644 (file)
@@ -14,7 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 // Test
-// 1) read of every file in /proc
+// 1) read and lseek on every file in /proc
 // 2) readlink of every symlink in /proc
 // 3) recursively (1) + (2) for every directory in /proc
 // 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs
@@ -45,6 +45,8 @@ static void f_reg(DIR *d, const char *filename)
        fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK);
        if (fd == -1)
                return;
+       /* struct proc_ops::proc_lseek is mandatory if file is seekable. */
+       (void)lseek(fd, 0, SEEK_SET);
        rv = read(fd, buf, sizeof(buf));
        assert((0 <= rv && rv <= sizeof(buf)) || rv == -1);
        close(fd);
index 9a35c3f..1f651e8 100644 (file)
@@ -22,3 +22,4 @@ map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
 local_config.*
+split_huge_page_test
index 8b0cd42..73e1cc9 100644 (file)
@@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += split_huge_page_test
 
 ifeq ($(MACHINE),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
index 6c6336d..1e662d5 100644 (file)
@@ -13,6 +13,7 @@
 
 /* Just the flags we need, copied from mm.h: */
 #define FOLL_WRITE     0x01    /* check pte is writable */
+#define FOLL_TOUCH     0x02    /* mark page accessed */
 
 static char *cmd_to_str(unsigned long cmd)
 {
@@ -37,13 +38,13 @@ int main(int argc, char **argv)
 {
        struct gup_test gup = { 0 };
        unsigned long size = 128 * MB;
-       int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+       int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1;
        unsigned long cmd = GUP_FAST_BENCHMARK;
-       int flags = MAP_PRIVATE;
+       int flags = MAP_PRIVATE, touch = 0;
        char *file = "/dev/zero";
        char *p;
 
-       while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) {
+       while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) {
                switch (opt) {
                case 'a':
                        cmd = PIN_FAST_BENCHMARK;
@@ -65,9 +66,13 @@ int main(int argc, char **argv)
                         */
                        gup.which_pages[0] = 1;
                        break;
+               case 'p':
+                       /* works only with DUMP_USER_PAGES_TEST */
+                       gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
+                       break;
                case 'F':
                        /* strtol, so you can pass flags in hex form */
-                       gup.flags = strtol(optarg, 0, 0);
+                       gup.gup_flags = strtol(optarg, 0, 0);
                        break;
                case 'm':
                        size = atoi(optarg) * MB;
@@ -93,6 +98,9 @@ int main(int argc, char **argv)
                case 'w':
                        write = 1;
                        break;
+               case 'W':
+                       write = 0;
+                       break;
                case 'f':
                        file = optarg;
                        break;
@@ -103,6 +111,10 @@ int main(int argc, char **argv)
                case 'H':
                        flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
                        break;
+               case 'z':
+                       /* fault pages in gup, do not fault in userland */
+                       touch = 1;
+                       break;
                default:
                        return -1;
                }
@@ -140,7 +152,7 @@ int main(int argc, char **argv)
 
        gup.nr_pages_per_call = nr_pages;
        if (write)
-               gup.flags |= FOLL_WRITE;
+               gup.gup_flags |= FOLL_WRITE;
 
        fd = open("/sys/kernel/debug/gup_test", O_RDWR);
        if (fd == -1) {
@@ -160,8 +172,18 @@ int main(int argc, char **argv)
        else if (thp == 0)
                madvise(p, size, MADV_NOHUGEPAGE);
 
-       for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
-               p[0] = 0;
+       /*
+        * FOLL_TOUCH, in gup_test, is used as an either/or case: either
+        * fault pages in from the kernel via FOLL_TOUCH, or fault them
+        * in here, from user space. This allows comparison of performance
+        * between those two cases.
+        */
+       if (touch) {
+               gup.gup_flags |= FOLL_TOUCH;
+       } else {
+               for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+                       p[0] = 0;
+       }
 
        /* Only report timing information on the *_BENCHMARK commands: */
        if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
new file mode 100644 (file)
index 0000000..1af16d2
--- /dev/null
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages interface.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <malloc.h>
+#include <stdbool.h>
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define SMAP_PATH "/proc/self/smaps"
+#define INPUT_MAX 80
+
+#define PID_FMT "%d,0x%lx,0x%lx"
+#define PATH_FMT "%s,0x%lx,0x%lx"
+
+#define PFN_MASK     ((1UL<<55)-1)
+#define KPF_THP      (1UL<<22)
+
+/*
+ * Reports whether the page mapped at vaddr is flagged KPF_THP.
+ *
+ * Reads the 64-bit pagemap entry for vaddr from pagemap_file, masks it with
+ * PFN_MASK and uses the result to index kpageflags_file.  Returns 1 if
+ * KPF_THP is set in the flags word and 0 otherwise.  0 is also returned when
+ * either descriptor is 0 or when either entry cannot be read in full, so
+ * uninitialized data is never interpreted as a PFN or as page flags.
+ */
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+       uint64_t paddr;
+       uint64_t page_flags;
+
+       if (!pagemap_file || !kpageflags_file)
+               return 0;
+
+       if (pread(pagemap_file, &paddr, sizeof(paddr),
+                 ((long)vaddr >> pageshift) * sizeof(paddr)) !=
+           (ssize_t)sizeof(paddr))
+               return 0;
+
+       if (pread(kpageflags_file, &page_flags, sizeof(page_flags),
+                 (paddr & PFN_MASK) * sizeof(page_flags)) !=
+           (ssize_t)sizeof(page_flags))
+               return 0;
+
+       return !!(page_flags & KPF_THP);
+}
+
+
+/*
+ * Reads PMD_SIZE_PATH and returns its contents parsed as a base-10 unsigned
+ * integer.  Exits with EXIT_FAILURE if the file cannot be opened or if
+ * nothing can be read from it.
+ */
+static uint64_t read_pmd_pagesize(void)
+{
+       int fd;
+       char buf[20];
+       ssize_t num_read;
+
+       fd = open(PMD_SIZE_PATH, O_RDONLY);
+       if (fd == -1) {
+               perror("Open hpage_pmd_size failed");
+               exit(EXIT_FAILURE);
+       }
+       /* Leave room for the terminating NUL stored below. */
+       num_read = read(fd, buf, sizeof(buf) - 1);
+       if (num_read < 1) {
+               /* Report first: close() could overwrite read()'s errno. */
+               perror("Read hpage_pmd_size failed");
+               close(fd);
+               exit(EXIT_FAILURE);
+       }
+       buf[num_read] = '\0';
+       close(fd);
+
+       return strtoul(buf, NULL, 10);
+}
+
+/*
+ * Opens path write-only and writes the first buflen - 1 bytes of buf to it.
+ * Returns the number of bytes written, or 0 if the file could not be opened
+ * or fewer than one byte was written.
+ */
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+       ssize_t written;
+       int out = open(path, O_WRONLY);
+
+       if (out == -1)
+               return 0;
+
+       written = write(out, buf, buflen - 1);
+       close(out);
+
+       return written < 1 ? 0 : (unsigned int) written;
+}
+
+/*
+ * Formats the arguments according to fmt and writes the resulting string to
+ * SPLIT_DEBUGFS.  Exits with EXIT_FAILURE if formatting fails, if the result
+ * does not fit in INPUT_MAX bytes, or if the write fails.
+ */
+static void write_debugfs(const char *fmt, ...)
+{
+       char input[INPUT_MAX];
+       int ret;
+       va_list argp;
+
+       va_start(argp, fmt);
+       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
+       va_end(argp);
+
+       /*
+        * A negative return would otherwise become a huge length once
+        * write_file() subtracts one from its size_t buflen argument.
+        */
+       if (ret < 0) {
+               printf("%s: Failed to format debugfs input\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       if (ret >= INPUT_MAX) {
+               printf("%s: Debugfs input is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       /* ret + 1 accounts for the NUL that write_file() does not send. */
+       if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
+               perror(SPLIT_DEBUGFS);
+               exit(EXIT_FAILURE);
+       }
+}
+
+#define MAX_LINE_LENGTH 500
+
+/*
+ * Reads from fp into buf, one fgets() call of at most MAX_LINE_LENGTH - 1
+ * characters at a time, until the data read starts with pattern.  Returns
+ * true with the matching text left in buf, or false once fgets() returns
+ * NULL.
+ */
+static bool check_for_pattern(FILE *fp, const char *pattern, char *buf)
+{
+       const size_t prefix_len = strlen(pattern);
+
+       for (;;) {
+               if (!fgets(buf, MAX_LINE_LENGTH, fp))
+                       return false;
+               if (strncmp(buf, pattern, prefix_len) == 0)
+                       return true;
+       }
+}
+
+/*
+ * Searches SMAP_PATH for the line that starts with addr formatted as
+ * "%08lx-" and returns the value parsed from the next "AnonHugePages:" line
+ * that follows it.  Returns 0 when either line is not found; exits with
+ * EXIT_FAILURE if the file cannot be opened or the value cannot be parsed.
+ */
+static uint64_t check_huge(void *addr)
+{
+       uint64_t thp = 0;
+       int ret;
+       FILE *fp;
+       char buffer[MAX_LINE_LENGTH];
+       char addr_pattern[MAX_LINE_LENGTH];
+
+       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+                      (unsigned long) addr);
+       if (ret >= MAX_LINE_LENGTH) {
+               printf("%s: Pattern is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+
+       fp = fopen(SMAP_PATH, "r");
+       if (!fp) {
+               printf("%s: Failed to open file %s\n", __func__, SMAP_PATH);
+               exit(EXIT_FAILURE);
+       }
+       if (!check_for_pattern(fp, addr_pattern, buffer))
+               goto err_out;
+
+       /*
+        * Fetch the AnonHugePages: in the same block and check the number of
+        * hugepages.
+        */
+       if (!check_for_pattern(fp, "AnonHugePages:", buffer))
+               goto err_out;
+
+       /* thp is a uint64_t, so scan it with the matching SCNu64 conversion. */
+       if (sscanf(buffer, "AnonHugePages:%10" SCNu64 " kB", &thp) != 1) {
+               printf("Reading smap error\n");
+               exit(EXIT_FAILURE);
+       }
+
+err_out:
+       fclose(fp);
+       return thp;
+}
+
+/*
+ * Allocates four PMD-sized, PMD-aligned ranges, requests huge pages for them
+ * with madvise(), fills them with a byte pattern, asks the debugfs interface
+ * to split every THP in the range and then verifies that the data is intact
+ * and that check_huge() no longer reports any AnonHugePages.
+ * Any failure terminates the process with EXIT_FAILURE.
+ */
+void split_pmd_thp(void)
+{
+       char *one_page;
+       size_t len = 4 * pmd_pagesize;
+       uint64_t thp_size;
+       size_t i;
+
+       one_page = memalign(pmd_pagesize, len);
+
+       if (!one_page) {
+               printf("Fail to allocate memory\n");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(one_page, len, MADV_HUGEPAGE);
+
+       for (i = 0; i < len; i++)
+               one_page[i] = (char)i;
+
+       thp_size = check_huge(one_page);
+       if (!thp_size) {
+               printf("No THP is allocated\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /*
+        * split all THPs; PID_FMT prints the addresses with %lx, so pass
+        * them as unsigned long
+        */
+       write_debugfs(PID_FMT, getpid(), (unsigned long)one_page,
+               (unsigned long)one_page + len);
+
+       for (i = 0; i < len; i++)
+               if (one_page[i] != (char)i) {
+                       printf("%zu byte corrupted\n", i);
+                       exit(EXIT_FAILURE);
+               }
+
+
+       thp_size = check_huge(one_page);
+       if (thp_size) {
+               printf("Still %" PRIu64 " kB AnonHugePages not split\n",
+                      thp_size);
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Split huge pages successful\n");
+       free(one_page);
+}
+
+/*
+ * Maps four PMD-sized ranges (using 1 GiB as the mmap address hint), requests
+ * huge pages for them, then mremap()s one base page out of each range into a
+ * contiguous four-page window.  The window is split through the debugfs
+ * interface; the data is verified and pagemap/kpageflags are used to check
+ * that no page of the window is still flagged KPF_THP.
+ * Any failure terminates the process with EXIT_FAILURE.
+ */
+void split_pte_mapped_thp(void)
+{
+       char *one_page, *pte_mapped, *pte_mapped2;
+       size_t len = 4 * pmd_pagesize;
+       uint64_t thp_size;
+       size_t i;
+       const char *pagemap_template = "/proc/%d/pagemap";
+       const char *kpageflags_proc = "/proc/kpageflags";
+       char pagemap_proc[255];
+       int pagemap_fd;
+       int kpageflags_fd;
+
+       if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
+               perror("get pagemap proc error");
+               exit(EXIT_FAILURE);
+       }
+       pagemap_fd = open(pagemap_proc, O_RDONLY);
+
+       if (pagemap_fd == -1) {
+               perror("read pagemap:");
+               exit(EXIT_FAILURE);
+       }
+
+       kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+
+       if (kpageflags_fd == -1) {
+               perror("read kpageflags:");
+               exit(EXIT_FAILURE);
+       }
+
+       one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       /* Do not write through MAP_FAILED below. */
+       if (one_page == MAP_FAILED) {
+               perror("mmap failed");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(one_page, len, MADV_HUGEPAGE);
+
+       for (i = 0; i < len; i++)
+               one_page[i] = (char)i;
+
+       thp_size = check_huge(one_page);
+       if (!thp_size) {
+               printf("No THP is allocated\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* remap the first pagesize of first THP */
+       pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+       if (pte_mapped == MAP_FAILED) {
+               perror("mremap failed");
+               exit(EXIT_FAILURE);
+       }
+
+       /* remap the Nth pagesize of Nth THP */
+       for (i = 1; i < 4; i++) {
+               pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+                                    pagesize, pagesize,
+                                    MREMAP_MAYMOVE|MREMAP_FIXED,
+                                    pte_mapped + pagesize * i);
+               if (pte_mapped2 == (char *)-1) {
+                       perror("mremap failed");
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       /* smap does not show THPs after mremap, use kpageflags instead */
+       thp_size = 0;
+       for (i = 0; i < 4; i++)
+               if (is_backed_by_thp(&pte_mapped[i * pagesize], pagemap_fd,
+                                    kpageflags_fd))
+                       thp_size++;
+
+       if (thp_size != 4) {
+               printf("Some THPs are missing during mremap\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /*
+        * split all remapped THPs; PID_FMT prints the addresses with %lx, so
+        * pass them as unsigned long
+        */
+       write_debugfs(PID_FMT, getpid(), (unsigned long)pte_mapped,
+                     (unsigned long)pte_mapped + pagesize * 4);
+
+       /* smap does not show THPs after mremap, use kpageflags instead */
+       thp_size = 0;
+       for (i = 0; i < pagesize * 4; i++) {
+               if (pte_mapped[i] != (char)i) {
+                       printf("%zu byte corrupted\n", i);
+                       exit(EXIT_FAILURE);
+               }
+               if (i % pagesize == 0 &&
+                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+                       thp_size++;
+       }
+
+       if (thp_size) {
+               printf("Still %" PRIu64 " THPs not split\n", thp_size);
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Split PTE-mapped huge pages successful\n");
+       munmap(one_page, len);
+       close(pagemap_fd);
+       close(kpageflags_fd);
+}
+
+/*
+ * Mounts a tmpfs ("huge=always,size=4m") on a freshly created temporary
+ * directory, creates a small file in it, asks the debugfs interface to split
+ * the file's pages in the offset range [0, 1024) and then removes the file,
+ * the mount and the directory again.  The outcome of the split itself is not
+ * checked here; the final message refers the user to dmesg.
+ */
+void split_file_backed_thp(void)
+{
+       int status;
+       int fd;
+       ssize_t num_written;
+       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
+       const char *tmpfs_loc = mkdtemp(tmpfs_template);
+       char testfile[INPUT_MAX];
+       uint64_t pgoff_start = 0, pgoff_end = 1024;
+
+       if (!tmpfs_loc) {
+               perror("Cannot create temporary directory");
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
+
+       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+
+       if (status) {
+               printf("Unable to create a tmpfs for testing\n");
+               /* Do not leave the empty temporary directory behind. */
+               rmdir(tmpfs_loc);
+               exit(EXIT_FAILURE);
+       }
+
+       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
+       if (status >= INPUT_MAX) {
+               printf("Fail to create file-backed THP split testing file\n");
+               goto cleanup;
+       }
+
+       /* O_CREAT requires an explicit mode argument. */
+       fd = open(testfile, O_CREAT|O_WRONLY, 0664);
+       if (fd == -1) {
+               perror("Cannot open testing file\n");
+               goto cleanup;
+       }
+
+       /*
+        * write something to the file, so a file-backed THP can be allocated;
+        * tmpfs_loc is a pointer, so its length comes from strlen(), not sizeof
+        */
+       num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
+       close(fd);
+
+       if (num_written < 1) {
+               printf("Fail to write data to testing file\n");
+               goto cleanup;
+       }
+
+       /* split the file-backed THP; PATH_FMT prints the offsets with %lx */
+       write_debugfs(PATH_FMT, testfile, (unsigned long)pgoff_start,
+                     (unsigned long)pgoff_end);
+
+       status = unlink(testfile);
+       if (status)
+               perror("Cannot remove testing file\n");
+
+cleanup:
+       status = umount(tmpfs_loc);
+       if (status) {
+               printf("Unable to umount %s\n", tmpfs_loc);
+               exit(EXIT_FAILURE);
+       }
+       status = rmdir(tmpfs_loc);
+       if (status) {
+               perror("cannot remove tmp dir");
+               exit(EXIT_FAILURE);
+       }
+
+       printf("file-backed THP split test done, please check dmesg for more information\n");
+}
+
+/*
+ * Refuses to run unless the effective UID is 0, initializes the page-size
+ * globals and then runs the three split tests in order.
+ */
+int main(int argc, char **argv)
+{
+       if (geteuid()) {
+               printf("Please run the benchmark as root\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* pageshift is log2 of the base page size. */
+       pagesize = getpagesize();
+       pageshift = ffs(pagesize) - 1;
+
+       pmd_pagesize = read_pmd_pagesize();
+
+       split_pmd_thp();
+       split_pte_mapped_thp();
+       split_file_backed_thp();
+
+       return 0;
+}
index 92b8ec4..f5ab5e0 100644 (file)
@@ -81,6 +81,8 @@ static volatile bool test_uffdio_copy_eexist = true;
 static volatile bool test_uffdio_zeropage_eexist = true;
 /* Whether to test uffd write-protection */
 static bool test_uffdio_wp = false;
+/* Whether to test uffd minor faults */
+static bool test_uffdio_minor = false;
 
 static bool map_shared;
 static int huge_fd;
@@ -96,6 +98,7 @@ struct uffd_stats {
        int cpu;
        unsigned long missing_faults;
        unsigned long wp_faults;
+       unsigned long minor_faults;
 };
 
 /* pthread_mutex_t starts at page offset 0 */
@@ -153,17 +156,19 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats,
                uffd_stats[i].cpu = i;
                uffd_stats[i].missing_faults = 0;
                uffd_stats[i].wp_faults = 0;
+               uffd_stats[i].minor_faults = 0;
        }
 }
 
 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
 {
        int i;
-       unsigned long long miss_total = 0, wp_total = 0;
+       unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
 
        for (i = 0; i < n_cpus; i++) {
                miss_total += stats[i].missing_faults;
                wp_total += stats[i].wp_faults;
+               minor_total += stats[i].minor_faults;
        }
 
        printf("userfaults: %llu missing (", miss_total);
@@ -172,6 +177,9 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
        printf("\b), %llu wp (", wp_total);
        for (i = 0; i < n_cpus; i++)
                printf("%lu+", stats[i].wp_faults);
+       printf("\b), %llu minor (", minor_total);
+       for (i = 0; i < n_cpus; i++)
+               printf("%lu+", stats[i].minor_faults);
        printf("\b)\n");
 }
 
@@ -328,7 +336,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = {
 };
 
 static struct uffd_test_ops hugetlb_uffd_test_ops = {
-       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
        .allocate_area  = hugetlb_allocate_area,
        .release_pages  = hugetlb_release_pages,
        .alias_mapping = hugetlb_alias_mapping,
@@ -362,6 +370,22 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
        }
 }
 
+/*
+ * Issues UFFDIO_CONTINUE on ufd for the range [start, start + len) with
+ * mode 0.  Prints a message to stderr and exits with status 1 if the ioctl
+ * returns non-zero.
+ */
+static void continue_range(int ufd, __u64 start, __u64 len)
+{
+       struct uffdio_continue req;
+       int err;
+
+       req.mode = 0;
+       req.range.len = len;
+       req.range.start = start;
+
+       err = ioctl(ufd, UFFDIO_CONTINUE, &req);
+       if (!err)
+               return;
+
+       fprintf(stderr,
+               "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n",
+               (uint64_t)start);
+       exit(1);
+}
+
 static void *locking_thread(void *arg)
 {
        unsigned long cpu = (unsigned long) arg;
@@ -569,8 +593,32 @@ static void uffd_handle_page_fault(struct uffd_msg *msg,
        }
 
        if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+               /* Write protect page faults */
                wp_range(uffd, msg->arg.pagefault.address, page_size, false);
                stats->wp_faults++;
+       } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
+               uint8_t *area;
+               int b;
+
+               /*
+                * Minor page faults
+                *
+                * To prove we can modify the original range for testing
+                * purposes, we're going to bit flip this range before
+                * continuing.
+                *
+                * Note that this requires all minor page fault tests operate on
+                * area_dst (non-UFFD-registered) and area_dst_alias
+                * (UFFD-registered).
+                */
+
+               area = (uint8_t *)(area_dst +
+                                  ((char *)msg->arg.pagefault.address -
+                                   area_dst_alias));
+               for (b = 0; b < page_size; ++b)
+                       area[b] = ~area[b];
+               continue_range(uffd, msg->arg.pagefault.address, page_size);
+               stats->minor_faults++;
        } else {
                /* Missing page faults */
                if (bounces & BOUNCE_VERIFY &&
@@ -779,7 +827,7 @@ static int stress(struct uffd_stats *uffd_stats)
        return 0;
 }
 
-static int userfaultfd_open(int features)
+static int userfaultfd_open_ext(uint64_t *features)
 {
        struct uffdio_api uffdio_api;
 
@@ -792,7 +840,7 @@ static int userfaultfd_open(int features)
        uffd_flags = fcntl(uffd, F_GETFD, NULL);
 
        uffdio_api.api = UFFD_API;
-       uffdio_api.features = features;
+       uffdio_api.features = *features;
        if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
                fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
                        "run with either root or ptrace capability.\n");
@@ -804,9 +852,15 @@ static int userfaultfd_open(int features)
                return 1;
        }
 
+       *features = uffdio_api.features;
        return 0;
 }
 
+/*
+ * By-value wrapper around userfaultfd_open_ext(): the address of the local
+ * copy of features is passed on, so whatever userfaultfd_open_ext() stores
+ * through that pointer is discarded.  Returns userfaultfd_open_ext()'s result.
+ */
+static int userfaultfd_open(uint64_t features)
+{
+       return userfaultfd_open_ext(&features);
+}
+
 sigjmp_buf jbuf, *sigbuf;
 
 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
@@ -1112,7 +1166,7 @@ static int userfaultfd_events_test(void)
        }
 
        if (!pid)
-               return faulting_process(0);
+               exit(faulting_process(0));
 
        waitpid(pid, &err, 0);
        if (err) {
@@ -1215,6 +1269,102 @@ static int userfaultfd_sig_test(void)
        return userfaults != 0;
 }
 
+/*
+ * Exercises UFFD minor faults: registers area_dst_alias in
+ * UFFDIO_REGISTER_MODE_MINOR, populates the pages through area_dst, then
+ * reads them back through area_dst_alias and expects each page to hold the
+ * bit-flipped fill byte.
+ *
+ * Returns 0 on success, or when the test is disabled or the kernel does not
+ * report UFFD_FEATURE_MINOR_HUGETLBFS; returns non-zero on failure.  Several
+ * error paths call exit(1) directly instead of returning.
+ */
+static int userfaultfd_minor_test(void)
+{
+       struct uffdio_register uffdio_register;
+       unsigned long expected_ioctls;
+       unsigned long p;
+       pthread_t uffd_mon;
+       uint8_t expected_byte;
+       void *expected_page;
+       /* Only the write to the pipe matters, but give it a defined value. */
+       char c = 0;
+       struct uffd_stats stats = { 0 };
+       uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
+
+       if (!test_uffdio_minor)
+               return 0;
+
+       printf("testing minor faults: ");
+       fflush(stdout);
+
+       if (uffd_test_ops->release_pages(area_dst))
+               return 1;
+
+       if (userfaultfd_open_ext(&features))
+               return 1;
+       /* If kernel reports the feature isn't supported, skip the test. */
+       if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
+               printf("skipping test due to lack of feature support\n");
+               fflush(stdout);
+               return 0;
+       }
+
+       uffdio_register.range.start = (unsigned long)area_dst_alias;
+       uffdio_register.range.len = nr_pages * page_size;
+       uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+               fprintf(stderr, "register failure\n");
+               exit(1);
+       }
+
+       expected_ioctls = uffd_test_ops->expected_ioctls;
+       expected_ioctls |= 1 << _UFFDIO_CONTINUE;
+       if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
+               fprintf(stderr, "unexpected missing ioctl(s)\n");
+               exit(1);
+       }
+
+       /*
+        * After registering with UFFD, populate the non-UFFD-registered side of
+        * the shared mapping. This should *not* trigger any UFFD minor faults.
+        */
+       for (p = 0; p < nr_pages; ++p) {
+               memset(area_dst + (p * page_size), p % ((uint8_t)-1),
+                      page_size);
+       }
+
+       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
+               perror("uffd_poll_thread create");
+               exit(1);
+       }
+
+       /*
+        * Read each of the pages back using the UFFD-registered mapping. We
+        * expect that the first time we touch a page, it will result in a minor
+        * fault. uffd_poll_thread will resolve the fault by bit-flipping the
+        * page's contents, and then issuing a CONTINUE ioctl.
+        */
+
+       if (posix_memalign(&expected_page, page_size, page_size)) {
+               fprintf(stderr, "out of memory\n");
+               return 1;
+       }
+
+       for (p = 0; p < nr_pages; ++p) {
+               expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
+               memset(expected_page, expected_byte, page_size);
+               if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
+                           page_size)) {
+                       fprintf(stderr,
+                               "unexpected page contents after minor fault\n");
+                       exit(1);
+               }
+       }
+
+       /* The reference page is no longer needed once all pages compared. */
+       free(expected_page);
+
+       if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
+               perror("pipe write");
+               exit(1);
+       }
+       if (pthread_join(uffd_mon, NULL))
+               return 1;
+
+       close(uffd);
+
+       uffd_stats_report(&stats, 1);
+
+       return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
+}
+
 static int userfaultfd_stress(void)
 {
        void *area;
@@ -1413,7 +1563,7 @@ static int userfaultfd_stress(void)
 
        close(uffd);
        return userfaultfd_zeropage_test() || userfaultfd_sig_test()
-               || userfaultfd_events_test();
+               || userfaultfd_events_test() || userfaultfd_minor_test();
 }
 
 /*
@@ -1454,6 +1604,8 @@ static void set_test_type(const char *type)
                map_shared = true;
                test_type = TEST_HUGETLB;
                uffd_test_ops = &hugetlb_uffd_test_ops;
+               /* Minor faults require shared hugetlb; only enable here. */
+               test_uffdio_minor = true;
        } else if (!strcmp(type, "shmem")) {
                map_shared = true;
                test_type = TEST_SHMEM;
index b69de92..3a2e6bb 100644 (file)
@@ -1018,7 +1018,7 @@ static long go_to_sleep(const struct entry *req)
                cond_timedwait(&printstate.cond, &printstate.mutex, &future);
                if (time_has_passed(&future))
                        break;
-       };
+       }
 
        if (printstate_has_new_req_arrived(req))
                delay = -1;
@@ -1941,7 +1941,7 @@ static void scan_arguments(int argc, char *argv[])
                        if (value < 0) {
                                warnx("TIME must be >= 0\n");
                                show_usage();
-                               ;
+                               exit(0);
                        }
                        trace_enable = true;
                        use_random_sleep = true;
index e8cad6a..73f914d 100644 (file)
@@ -272,5 +272,3 @@ do
        echo ''
     done
 done
-
-# vim: sw=4
index 62bd908..f08f5e8 100644 (file)
@@ -174,21 +174,36 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                                           struct kvm_coalesced_mmio_zone *zone)
 {
        struct kvm_coalesced_mmio_dev *dev, *tmp;
+       int r;
 
        if (zone->pio != 1 && zone->pio != 0)
                return -EINVAL;
 
        mutex_lock(&kvm->slots_lock);
 
-       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
                if (zone->pio == dev->zone.pio &&
                    coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
-                       kvm_io_bus_unregister_dev(kvm,
+                       r = kvm_io_bus_unregister_dev(kvm,
                                zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
                        kvm_iodevice_destructor(&dev->dev);
+
+                       /*
+                        * On failure, unregister destroys all devices on the
+                        * bus _except_ the target device, i.e. coalesced_zones
+                        * has been modified.  No need to restart the walk as
+                        * there aren't any zones left.
+                        */
+                       if (r)
+                               break;
                }
+       }
 
        mutex_unlock(&kvm->slots_lock);
 
+       /*
+        * Ignore the result of kvm_io_bus_unregister_dev(); from userspace's
+        * perspective, the coalesced MMIO is most definitely unregistered.
+        */
        return 0;
 }
index 383df23..2799c66 100644 (file)
@@ -451,35 +451,170 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
-                                       struct mm_struct *mm,
-                                       unsigned long address,
-                                       pte_t pte)
+typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
+                            unsigned long end);
+
+struct kvm_hva_range {
+       unsigned long start;
+       unsigned long end;
+       pte_t pte;
+       hva_handler_t handler;
+       on_lock_fn_t on_lock;
+       bool flush_on_ret;
+       bool may_block;
+};
+
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler.  The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int idx;
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
+
+static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
+                                                 const struct kvm_hva_range *range)
+{
+       bool ret = false, locked = false;
+       struct kvm_gfn_range gfn_range;
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       int i, idx;
+
+       /* A null handler is allowed if and only if on_lock() is provided. */
+       if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
+                        IS_KVM_NULL_FN(range->handler)))
+               return 0;
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       KVM_MMU_LOCK(kvm);
+       /* The on_lock() path does not yet support lock elision. */
+       if (!IS_KVM_NULL_FN(range->on_lock)) {
+               locked = true;
+               KVM_MMU_LOCK(kvm);
 
-       kvm->mmu_notifier_seq++;
+               range->on_lock(kvm, range->start, range->end);
+
+               if (IS_KVM_NULL_FN(range->handler))
+                       goto out_unlock;
+       }
 
-       if (kvm_set_spte_hva(kvm, address, pte))
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot(slot, slots) {
+                       unsigned long hva_start, hva_end;
+
+                       hva_start = max(range->start, slot->userspace_addr);
+                       hva_end = min(range->end, slot->userspace_addr +
+                                                 (slot->npages << PAGE_SHIFT));
+                       if (hva_start >= hva_end)
+                               continue;
+
+                       /*
+                        * To optimize for the likely case where the address
+                        * range is covered by zero or one memslots, don't
+                        * bother making these conditional (to avoid writes on
+                        * the second or later invocation of the handler).
+                        */
+                       gfn_range.pte = range->pte;
+                       gfn_range.may_block = range->may_block;
+
+                       /*
+                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+                        */
+                       gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+                       gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+                       gfn_range.slot = slot;
+
+                       if (!locked) {
+                               locked = true;
+                               KVM_MMU_LOCK(kvm);
+                       }
+                       ret |= range->handler(kvm, &gfn_range);
+               }
+       }
+
+       if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);
 
-       KVM_MMU_UNLOCK(kvm);
+out_unlock:
+       if (locked)
+               KVM_MMU_UNLOCK(kvm);
+
        srcu_read_unlock(&kvm->srcu, idx);
+
+       /* The notifiers are averse to booleans. :-( */
+       return (int)ret;
 }
 
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-                                       const struct mmu_notifier_range *range)
+static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
+                                               unsigned long start,
+                                               unsigned long end,
+                                               pte_t pte,
+                                               hva_handler_t handler)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int need_tlb_flush = 0, idx;
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = pte,
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = true,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+
+static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
+                                                        unsigned long start,
+                                                        unsigned long end,
+                                                        hva_handler_t handler)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = __pte(0),
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = false,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long address,
+                                       pte_t pte)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+       trace_kvm_set_spte_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
+       /*
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
+        * and so always runs with an elevated notifier count.  This obviates
+        * the need to bump the sequence count.
+        */
+       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+
+       kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+}
+
+static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
@@ -487,8 +622,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         */
        kvm->mmu_notifier_count++;
        if (likely(kvm->mmu_notifier_count == 1)) {
-               kvm->mmu_notifier_range_start = range->start;
-               kvm->mmu_notifier_range_end = range->end;
+               kvm->mmu_notifier_range_start = start;
+               kvm->mmu_notifier_range_end = end;
        } else {
                /*
                 * Fully tracking multiple concurrent ranges has diminishing
@@ -500,28 +635,36 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                 * complete.
                 */
                kvm->mmu_notifier_range_start =
-                       min(kvm->mmu_notifier_range_start, range->start);
+                       min(kvm->mmu_notifier_range_start, start);
                kvm->mmu_notifier_range_end =
-                       max(kvm->mmu_notifier_range_end, range->end);
+                       max(kvm->mmu_notifier_range_end, end);
        }
-       need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
-                                            range->flags);
-       /* we've to flush the tlb before the pages can be freed */
-       if (need_tlb_flush || kvm->tlbs_dirty)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return 0;
 }
 
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = kvm_unmap_gfn_range,
+               .on_lock        = kvm_inc_notifier_count,
+               .flush_on_ret   = true,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
 
-       KVM_MMU_LOCK(kvm);
+       trace_kvm_unmap_hva_range(range->start, range->end);
+
+       __kvm_handle_hva_range(kvm, &hva_range);
+
+       return 0;
+}
+
+static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
@@ -535,7 +678,23 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
-       KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                       const struct mmu_notifier_range *range)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = (void *)kvm_null_fn,
+               .on_lock        = kvm_dec_notifier_count,
+               .flush_on_ret   = false,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
+
+       __kvm_handle_hva_range(kvm, &hva_range);
 
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -545,20 +704,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              unsigned long start,
                                              unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-
-       young = kvm_age_hva(kvm, start, end);
-       if (young)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -566,11 +714,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
                                        unsigned long start,
                                        unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
        /*
         * Even though we do not flush TLB, this will still adversely
         * affect performance on pre-Haswell Intel EPT, where there is
@@ -584,27 +729,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
         * cadence. If we find this inaccurate, we might come up with a
         * more sophisticated heuristic later.
         */
-       young = kvm_age_hva(kvm, start, end);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_test_age_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-       young = kvm_test_age_hva(kvm, address);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, address, address + 1,
+                                            kvm_test_age_gfn);
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -3002,6 +3137,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
        return false;
 }
 
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
        struct kvm *kvm = me->kvm;
@@ -3035,7 +3175,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                            !vcpu_dy_runnable(vcpu))
                                continue;
                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-                               !kvm_arch_vcpu_in_kernel(vcpu))
+                           !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+                           !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
@@ -3182,7 +3323,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_decrement;
 
-       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
@@ -4062,6 +4203,12 @@ static struct file_operations kvm_vm_fops = {
        KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+bool file_is_kvm(struct file *file)
+{
+       return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
        int r;
@@ -4485,24 +4632,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
        return 0;
 }
 
-/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev)
 {
        int i, j;
        struct kvm_io_bus *new_bus, *bus;
 
+       lockdep_assert_held(&kvm->slots_lock);
+
        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
-               return;
+               return 0;
 
-       for (i = 0; i < bus->dev_count; i++)
+       for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev) {
                        break;
                }
+       }
 
        if (i == bus->dev_count)
-               return;
+               return 0;
 
        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
@@ -4511,7 +4660,13 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                                flex_array_size(new_bus, range, new_bus->dev_count - i));
-       } else {
+       }
+
+       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+       synchronize_srcu_expedited(&kvm->srcu);
+
+       /* Destroy the old bus _after_ installing the (null) bus. */
+       if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                for (j = 0; j < bus->dev_count; j++) {
                        if (j == i)
@@ -4520,10 +4675,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                }
        }
 
-       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-       synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
-       return;
+       return new_bus ? 0 : -ENOMEM;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,