Merge tag 'char-misc-5.15-rc1-lkdtm' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Sep 2021 18:56:00 +0000 (11:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Sep 2021 18:56:00 +0000 (11:56 -0700)
Pull misc driver fix from Greg KH:
 "Here is a single patch for 5.15-rc1, for the lkdtm misc driver.

  It resolves a build issue that many people were hitting with your
  current tree, and Kees and others felt it would be good to get it
  merged before -rc1 comes out, so that developers don't keep hitting
  it as their development trees restart on -rc1 rather than on older
  -rc releases.

  It has NOT been in linux-next, but has passed 0-day testing and looks
  'obviously correct' when reviewing it locally :)"

* tag 'char-misc-5.15-rc1-lkdtm' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc:
  lkdtm: Use init_uts_ns.name instead of macros
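
The change above replaces lkdtm's use of the compile-time UTS_RELEASE/UTS_VERSION macros with the runtime init_uts_ns data. A minimal sketch of the idea (not the actual patch; the helper name below is made up for illustration):

	#include <linux/utsname.h>
	#include <linux/printk.h>

	/* Hypothetical helper: report the running kernel's version strings
	 * from init_uts_ns.name instead of the UTS_RELEASE/UTS_VERSION
	 * macros, whose generated headers were the source of the build
	 * breakage. */
	static void lkdtm_report_kernel_version(void)
	{
		pr_info("kernel release: %s\n", init_uts_ns.name.release);
		pr_info("kernel version: %s\n", init_uts_ns.name.version);
	}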

315 files changed:
Documentation/ABI/testing/debugfs-driver-habanalabs
Documentation/core-api/kernel-api.rst
Documentation/devicetree/bindings/display/msm/dsi-phy-7nm.yaml
Documentation/devicetree/bindings/gpio/gpio-virtio.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/i2c-virtio.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml
Documentation/devicetree/bindings/input/qcom,pm8941-pwrkey.txt [deleted file]
Documentation/devicetree/bindings/input/qcom,pm8941-pwrkey.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/regulator-haptic.txt [deleted file]
Documentation/devicetree/bindings/input/regulator-haptic.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/chipone,icn8318.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/chipone_icn8318.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/pixcir,pixcir_ts.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/pixcir_i2c_ts.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/ti,tsc2005.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt [deleted file]
Documentation/devicetree/bindings/power/reset/qcom,pon.txt [deleted file]
Documentation/devicetree/bindings/power/reset/qcom,pon.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/power/reset/reboot-mode.yaml
Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml
Documentation/devicetree/bindings/rtc/trivial-rtc.yaml
Documentation/devicetree/bindings/spi/omap-spi.yaml
Documentation/devicetree/bindings/spi/spi-xilinx.yaml
Documentation/devicetree/bindings/thermal/qcom-lmh.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/thermal-zones.yaml
Documentation/devicetree/bindings/virtio/mmio.yaml
Documentation/devicetree/bindings/virtio/virtio-device.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/watchdog/maxim,max63xx.yaml
Documentation/features/vm/ELF-ASLR/arch-support.txt
Documentation/filesystems/api-summary.rst
Documentation/gpu/drm-mm.rst
Documentation/power/energy-model.rst
Documentation/userspace-api/index.rst
Documentation/userspace-api/ioctl/ioctl-number.rst
Documentation/userspace-api/vduse.rst [new file with mode: 0644]
MAINTAINERS
arch/arm64/mm/init.c
arch/parisc/Kconfig
arch/parisc/boot/compressed/Makefile
arch/parisc/include/asm/processor.h
arch/parisc/include/asm/rt_sigframe.h
arch/parisc/include/asm/thread_info.h
arch/parisc/include/asm/uaccess.h
arch/parisc/kernel/asm-offsets.c
arch/parisc/kernel/parisc_ksyms.c
arch/parisc/kernel/setup.c
arch/parisc/kernel/signal.c
arch/parisc/kernel/signal32.h
arch/parisc/kernel/time.c
arch/parisc/lib/lusercopy.S
arch/riscv/Kconfig
arch/riscv/Makefile
arch/riscv/boot/Makefile
arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
arch/riscv/configs/defconfig
arch/riscv/include/asm/elf.h
arch/riscv/kernel/vmlinux-xip.lds.S
arch/riscv/kernel/vmlinux.lds.S
block/Makefile
block/bdev.c [new file with mode: 0644]
block/blk-mq.c
block/blk-throttle.c
block/blk.h
block/fops.c [new file with mode: 0644]
block/genhd.c
drivers/acpi/cppc_acpi.c
drivers/acpi/prmt.c
drivers/acpi/scan.c
drivers/base/power/main.c
drivers/base/power/wakeirq.c
drivers/block/n64cart.c
drivers/block/virtio_blk.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/clk/qcom/gcc-sm6350.c
drivers/cpufreq/intel_pstate.c
drivers/dma-buf/Kconfig
drivers/firewire/net.c
drivers/firmware/qcom_scm.c
drivers/firmware/qcom_scm.h
drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
drivers/gpu/drm/amd/amdgpu/vi.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
drivers/gpu/drm/amd/display/dc/dcn303/dcn303_resource.c
drivers/gpu/drm/amd/display/dc/dcn31/dcn31_panel_cntl.c
drivers/gpu/drm/amd/display/dc/dml/dcn20/display_mode_vba_20v2.c
drivers/gpu/drm/amd/pm/amdgpu_pm.c
drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c
drivers/gpu/drm/i915/gt/intel_gt_requests.h
drivers/gpu/drm/mgag200/mgag200_pll.c
drivers/gpu/drm/panfrost/panfrost_mmu.c
drivers/gpu/drm/panfrost/panfrost_regs.h
drivers/gpu/drm/ttm/ttm_bo.c
drivers/gpu/drm/ttm/ttm_bo_util.c
drivers/gpu/drm/ttm/ttm_tt.c
drivers/gpu/drm/vc4/vc4_hdmi.c
drivers/input/joystick/analog.c
drivers/input/keyboard/Kconfig
drivers/input/keyboard/adc-keys.c
drivers/input/keyboard/adp5588-keys.c
drivers/input/keyboard/adp5589-keys.c
drivers/input/keyboard/ep93xx_keypad.c
drivers/input/misc/Kconfig
drivers/input/misc/Makefile
drivers/input/misc/ixp4xx-beeper.c [deleted file]
drivers/input/misc/pm8941-pwrkey.c
drivers/input/misc/sirfsoc-onkey.c [deleted file]
drivers/input/mouse/elan_i2c.h
drivers/input/mouse/elan_i2c_core.c
drivers/input/serio/parkbd.c
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/edt-ft5x06.c
drivers/input/touchscreen/mms114.c
drivers/iommu/Kconfig
drivers/iommu/amd/init.c
drivers/iommu/intel/svm.c
drivers/iommu/iova.c
drivers/misc/habanalabs/common/Makefile
drivers/misc/habanalabs/common/command_buffer.c
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/context.c
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/habanalabs_drv.c
drivers/misc/habanalabs/common/habanalabs_ioctl.c
drivers/misc/habanalabs/common/hw_queue.c
drivers/misc/habanalabs/common/memory.c
drivers/misc/habanalabs/common/mmu/mmu_v1.c
drivers/misc/habanalabs/common/pci/pci.c
drivers/misc/habanalabs/common/state_dump.c [new file with mode: 0644]
drivers/misc/habanalabs/common/sysfs.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/gaudi/gaudiP.h
drivers/misc/habanalabs/gaudi/gaudi_coresight.c
drivers/misc/habanalabs/gaudi/gaudi_security.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/include/common/cpucp_if.h
drivers/misc/habanalabs/include/common/hl_boot_if.h
drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
drivers/misc/habanalabs/include/gaudi/gaudi_reg_map.h
drivers/nvme/host/core.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/tcp.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/passthru.c
drivers/of/property.c
drivers/parisc/dino.c
drivers/pwm/Kconfig
drivers/pwm/core.c
drivers/pwm/pwm-ab8500.c
drivers/pwm/pwm-atmel-hlcdc.c
drivers/pwm/pwm-atmel-tcb.c
drivers/pwm/pwm-atmel.c
drivers/pwm/pwm-bcm-kona.c
drivers/pwm/pwm-brcmstb.c
drivers/pwm/pwm-cros-ec.c
drivers/pwm/pwm-ep93xx.c
drivers/pwm/pwm-fsl-ftm.c
drivers/pwm/pwm-hibvt.c
drivers/pwm/pwm-img.c
drivers/pwm/pwm-imx-tpm.c
drivers/pwm/pwm-imx27.c
drivers/pwm/pwm-intel-lgm.c
drivers/pwm/pwm-iqs620a.c
drivers/pwm/pwm-jz4740.c
drivers/pwm/pwm-keembay.c
drivers/pwm/pwm-lp3943.c
drivers/pwm/pwm-lpc32xx.c
drivers/pwm/pwm-mediatek.c
drivers/pwm/pwm-mtk-disp.c
drivers/pwm/pwm-mxs.c
drivers/pwm/pwm-ntxec.c
drivers/pwm/pwm-omap-dmtimer.c
drivers/pwm/pwm-pca9685.c
drivers/pwm/pwm-pxa.c
drivers/pwm/pwm-raspberrypi-poe.c
drivers/pwm/pwm-rcar.c
drivers/pwm/pwm-renesas-tpu.c
drivers/pwm/pwm-rockchip.c
drivers/pwm/pwm-samsung.c
drivers/pwm/pwm-sifive.c
drivers/pwm/pwm-sl28cpld.c
drivers/pwm/pwm-stm32-lp.c
drivers/pwm/pwm-sun4i.c
drivers/pwm/pwm-tiecap.c
drivers/pwm/pwm-tiehrpwm.c
drivers/pwm/pwm-twl-led.c
drivers/pwm/pwm-twl.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/lib.c
drivers/rtc/lib_test.c [new file with mode: 0644]
drivers/rtc/rtc-cmos.c
drivers/rtc/rtc-rx8025.c
drivers/rtc/rtc-s5m.c
drivers/rtc/rtc-tps65910.c
drivers/thermal/intel/int340x_thermal/int3400_thermal.c
drivers/thermal/intel/intel_powerclamp.c
drivers/thermal/intel/intel_tcc_cooling.c
drivers/thermal/qcom/Kconfig
drivers/thermal/qcom/Makefile
drivers/thermal/qcom/lmh.c [new file with mode: 0644]
drivers/thermal/qcom/qcom-spmi-adc-tm5.c
drivers/thermal/rcar_gen3_thermal.c
drivers/thermal/samsung/exynos_tmu.c
drivers/thermal/tegra/Kconfig
drivers/thermal/tegra/Makefile
drivers/thermal/tegra/soctherm.c
drivers/thermal/tegra/tegra30-tsensor.c [new file with mode: 0644]
drivers/vdpa/Kconfig
drivers/vdpa/Makefile
drivers/vdpa/ifcvf/ifcvf_base.c
drivers/vdpa/ifcvf/ifcvf_base.h
drivers/vdpa/ifcvf/ifcvf_main.c
drivers/vdpa/mlx5/core/mlx5_vdpa.h
drivers/vdpa/mlx5/core/mr.c
drivers/vdpa/mlx5/core/resources.c
drivers/vdpa/mlx5/net/mlx5_vnet.c
drivers/vdpa/vdpa.c
drivers/vdpa/vdpa_sim/vdpa_sim.c
drivers/vdpa/vdpa_user/Makefile [new file with mode: 0644]
drivers/vdpa/vdpa_user/iova_domain.c [new file with mode: 0644]
drivers/vdpa/vdpa_user/iova_domain.h [new file with mode: 0644]
drivers/vdpa/vdpa_user/vduse_dev.c [new file with mode: 0644]
drivers/vdpa/virtio_pci/vp_vdpa.c
drivers/vhost/iotlb.c
drivers/vhost/scsi.c
drivers/vhost/vdpa.c
drivers/vhost/vsock.c
drivers/video/fbdev/core/fbmem.c
drivers/virtio/virtio.c
drivers/virtio/virtio_balloon.c
fs/Kconfig
fs/Makefile
fs/block_dev.c [deleted file]
fs/cifs/cifsencrypt.c
fs/cifs/cifspdu.h
fs/cifs/smb2ops.c
fs/cifs/smbencrypt.c
fs/cifs/smbfsctl.h [deleted file]
fs/cifs_common/Makefile [deleted file]
fs/cifs_common/arc4.h [deleted file]
fs/cifs_common/cifs_arc4.c [deleted file]
fs/cifs_common/cifs_md4.c [deleted file]
fs/cifs_common/md4.h [deleted file]
fs/file.c
fs/fs_parser.c
fs/internal.h
fs/io-wq.c
fs/io_uring.c
fs/namei.c
fs/notify/mark.c
fs/smbfs_common/Makefile [new file with mode: 0644]
fs/smbfs_common/arc4.h [new file with mode: 0644]
fs/smbfs_common/cifs_arc4.c [new file with mode: 0644]
fs/smbfs_common/cifs_md4.c [new file with mode: 0644]
fs/smbfs_common/md4.h [new file with mode: 0644]
fs/smbfs_common/smbfsctl.h [new file with mode: 0644]
include/acpi/cppc_acpi.h
include/drm/ttm/ttm_tt.h
include/linux/bootconfig.h
include/linux/energy_model.h
include/linux/file.h
include/linux/pwm.h
include/linux/qcom_scm.h
include/linux/rwsem.h
include/linux/thermal.h
include/linux/time64.h
include/linux/vdpa.h
include/linux/vhost_iotlb.h
include/uapi/linux/vduse.h [new file with mode: 0644]
include/uapi/linux/virtio_ids.h
include/uapi/linux/virtio_vsock.h
include/uapi/misc/habanalabs.h
kernel/futex.c
kernel/locking/rtmutex.c
kernel/locking/rwsem.c
kernel/sched/core.c
kernel/sched/idle.c
kernel/trace/trace_boot.c
lib/bootconfig.c
net/vmw_vsock/af_vsock.c
net/vmw_vsock/virtio_transport_common.c
scripts/coccinelle/api/kvmalloc.cocci
scripts/coccinelle/iterators/use_after_iter.cocci
scripts/sorttable.c
tools/testing/vsock/vsock_test.c
tools/thermal/tmon/Makefile

index a5c28f6..284e2df 100644 (file)
@@ -215,6 +215,17 @@ Description:    Sets the skip reset on timeout option for the device. Value of
                 "0" means device will be reset in case some CS has timed out,
                 otherwise it will not be reset.
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/state_dump
+Date:           Oct 2021
+KernelVersion:  5.15
+Contact:        ynudelman@habana.ai
+Description:    Gets the state dump occurring on a CS timeout or failure.
+                State dump is used for debug and is created each time in case of
+                a problem in a CS execution, before reset.
+                Reading from the node returns the newest state dump available.
+                Writing an integer X discards X state dumps, so that the
+                next read would return X+1-st newest state dump.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
 Date:           Mar 2020
 KernelVersion:  5.6
@@ -230,6 +241,14 @@ Description:    Displays a list with information about the currently user
                 pointers (user virtual addresses) that are pinned and mapped
                 to DMA addresses
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/userptr_lookup
+Date:           Aug 2021
+KernelVersion:  5.15
+Contact:        ogabbay@kernel.org
+Description:    Allows to search for specific user pointers (user virtual
+                addresses) that are pinned and mapped to DMA addresses, and see
+                their resolution to the specific dma address.
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/vm
 Date:           Jan 2019
 KernelVersion:  5.1
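
The read/write protocol of the new state_dump node documented above could be exercised from userspace roughly as follows (a sketch only; the hl0 instance name, buffer size and error handling are illustrative assumptions):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[65536];
		ssize_t n;
		/* hl0 is just an example device instance */
		int fd = open("/sys/kernel/debug/habanalabs/hl0/state_dump", O_RDWR);

		if (fd < 0)
			return 1;

		/* Reading returns the newest state dump available. */
		n = read(fd, buf, sizeof(buf));
		if (n > 0)
			fwrite(buf, 1, n, stdout);

		/* Writing an integer X discards X state dumps, so the next
		 * read returns the (X+1)-th newest one; here X = 2. */
		if (write(fd, "2", 1) < 0)
			perror("write");

		close(fd);
		return 0;
	}
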
index 2a7444e..2e71868 100644 (file)
@@ -315,6 +315,9 @@ Block Devices
 .. kernel-doc:: block/genhd.c
    :export:
 
+.. kernel-doc:: block/bdev.c
+   :export:
+
 Char devices
 ============
 
index 4265399..c851770 100644 (file)
@@ -14,10 +14,10 @@ allOf:
 
 properties:
   compatible:
-    oneOf:
-      - const: qcom,dsi-phy-7nm
-      - const: qcom,dsi-phy-7nm-8150
-      - const: qcom,sc7280-dsi-phy-7nm
+    enum:
+      - qcom,dsi-phy-7nm
+      - qcom,dsi-phy-7nm-8150
+      - qcom,sc7280-dsi-phy-7nm
 
   reg:
     items:
diff --git a/Documentation/devicetree/bindings/gpio/gpio-virtio.yaml b/Documentation/devicetree/bindings/gpio/gpio-virtio.yaml
new file mode 100644 (file)
index 0000000..601d857
--- /dev/null
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-virtio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtio GPIO controller
+
+maintainers:
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+allOf:
+  - $ref: /schemas/virtio/virtio-device.yaml#
+
+description:
+  Virtio GPIO controller, see /schemas/virtio/virtio-device.yaml for more
+  details.
+
+properties:
+  $nodename:
+    const: gpio
+
+  compatible:
+    const: virtio,device29
+
+  gpio-controller: true
+
+  "#gpio-cells":
+    const: 2
+
+  interrupt-controller: true
+
+  "#interrupt-cells":
+    const: 2
+
+required:
+  - compatible
+  - gpio-controller
+  - "#gpio-cells"
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    virtio@3000 {
+        compatible = "virtio,mmio";
+        reg = <0x3000 0x100>;
+        interrupts = <41>;
+
+        gpio {
+            compatible = "virtio,device29";
+            gpio-controller;
+            #gpio-cells = <2>;
+            interrupt-controller;
+            #interrupt-cells = <2>;
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/i2c/i2c-virtio.yaml b/Documentation/devicetree/bindings/i2c/i2c-virtio.yaml
new file mode 100644 (file)
index 0000000..7d87ed8
--- /dev/null
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i2c/i2c-virtio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtio I2C Adapter
+
+maintainers:
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+allOf:
+  - $ref: /schemas/i2c/i2c-controller.yaml#
+  - $ref: /schemas/virtio/virtio-device.yaml#
+
+description:
+  Virtio I2C device, see /schemas/virtio/virtio-device.yaml for more details.
+
+properties:
+  $nodename:
+    const: i2c
+
+  compatible:
+    const: virtio,device22
+
+required:
+  - compatible
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    virtio@3000 {
+        compatible = "virtio,mmio";
+        reg = <0x3000 0x100>;
+        interrupts = <41>;
+
+        i2c {
+            compatible = "virtio,device22";
+
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            light-sensor@20 {
+                compatible = "dynaimage,al3320a";
+                reg = <0x20>;
+            };
+        };
+    };
+
+...
index cffd020..d74f200 100644 (file)
@@ -29,6 +29,8 @@ properties:
     description:
       Regulator for the LRADC reference voltage
 
+  wakeup-source: true
+
 patternProperties:
   "^button-[0-9]+$":
     type: object
diff --git a/Documentation/devicetree/bindings/input/qcom,pm8941-pwrkey.txt b/Documentation/devicetree/bindings/input/qcom,pm8941-pwrkey.txt
deleted file mode 100644 (file)
index 6cd08bc..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-Qualcomm PM8941 PMIC Power Key
-
-PROPERTIES
-
-- compatible:
-       Usage: required
-       Value type: <string>
-       Definition: must be one of:
-                   "qcom,pm8941-pwrkey"
-                   "qcom,pm8941-resin"
-                   "qcom,pmk8350-pwrkey"
-                   "qcom,pmk8350-resin"
-
-- reg:
-       Usage: required
-       Value type: <prop-encoded-array>
-       Definition: base address of registers for block
-
-- interrupts:
-       Usage: required
-       Value type: <prop-encoded-array>
-       Definition: key change interrupt; The format of the specifier is
-                   defined by the binding document describing the node's
-                   interrupt parent.
-
-- debounce:
-       Usage: optional
-       Value type: <u32>
-       Definition: time in microseconds that key must be pressed or released
-                   for state change interrupt to trigger.
-
-- bias-pull-up:
-       Usage: optional
-       Value type: <empty>
-       Definition: presence of this property indicates that the KPDPWR_N pin
-                   should be configured for pull up.
-
-- linux,code:
-       Usage: optional
-       Value type: <u32>
-       Definition: The input key-code associated with the power key.
-                   Use the linux event codes defined in
-                   include/dt-bindings/input/linux-event-codes.h
-                   When property is omitted KEY_POWER is assumed.
-
-EXAMPLE
-
-       pwrkey@800 {
-               compatible = "qcom,pm8941-pwrkey";
-               reg = <0x800>;
-               interrupts = <0x0 0x8 0 IRQ_TYPE_EDGE_BOTH>;
-               debounce = <15625>;
-               bias-pull-up;
-               linux,code = <KEY_POWER>;
-       };
diff --git a/Documentation/devicetree/bindings/input/qcom,pm8941-pwrkey.yaml b/Documentation/devicetree/bindings/input/qcom,pm8941-pwrkey.yaml
new file mode 100644 (file)
index 0000000..62314a5
--- /dev/null
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/qcom,pm8941-pwrkey.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm PM8941 PMIC Power Key
+
+maintainers:
+  - Courtney Cavin <courtney.cavin@sonymobile.com>
+  - Vinod Koul <vkoul@kernel.org>
+
+allOf:
+  - $ref: input.yaml#
+
+properties:
+  compatible:
+    enum:
+      - qcom,pm8941-pwrkey
+      - qcom,pm8941-resin
+      - qcom,pmk8350-pwrkey
+      - qcom,pmk8350-resin
+
+  interrupts:
+    maxItems: 1
+
+  debounce:
+    description: |
+          Time in microseconds that key must be pressed or
+          released for state change interrupt to trigger.
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+  bias-pull-up:
+    description: |
+           Presence of this property indicates that the KPDPWR_N
+           pin should be configured for pull up.
+    $ref: /schemas/types.yaml#/definitions/flag
+
+  linux,code:
+    description: |
+           The input key-code associated with the power key.
+           Use the linux event codes defined in
+           include/dt-bindings/input/linux-event-codes.h
+           When property is omitted KEY_POWER is assumed.
+
+required:
+  - compatible
+  - interrupts
+
+unevaluatedProperties: false
+...
diff --git a/Documentation/devicetree/bindings/input/regulator-haptic.txt b/Documentation/devicetree/bindings/input/regulator-haptic.txt
deleted file mode 100644 (file)
index 3ed1c7e..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-* Regulator Haptic Device Tree Bindings
-
-Required Properties:
- - compatible : Should be "regulator-haptic"
- - haptic-supply : Power supply to the haptic motor.
-       [*] refer Documentation/devicetree/bindings/regulator/regulator.txt
-
- - max-microvolt : The maximum voltage value supplied to the haptic motor.
-               [The unit of the voltage is a micro]
-
- - min-microvolt : The minimum voltage value supplied to the haptic motor.
-               [The unit of the voltage is a micro]
-
-Example:
-
-       haptics {
-               compatible = "regulator-haptic";
-               haptic-supply = <&motor_regulator>;
-               max-microvolt = <2700000>;
-               min-microvolt = <1100000>;
-       };
diff --git a/Documentation/devicetree/bindings/input/regulator-haptic.yaml b/Documentation/devicetree/bindings/input/regulator-haptic.yaml
new file mode 100644 (file)
index 0000000..b1ae72f
--- /dev/null
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/input/regulator-haptic.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: Regulator Haptic Device Tree Bindings
+
+maintainers:
+  - Jaewon Kim <jaewon02.kim@samsung.com>
+
+properties:
+  compatible:
+    const: regulator-haptic
+
+  haptic-supply:
+    description: >
+      Power supply to the haptic motor
+
+  max-microvolt:
+    description: >
+      The maximum voltage value supplied to the haptic motor
+
+  min-microvolt:
+    description: >
+      The minimum voltage value supplied to the haptic motor
+
+required:
+  - compatible
+  - haptic-supply
+  - max-microvolt
+  - min-microvolt
+
+additionalProperties: false
+
+examples:
+  - |
+    haptics {
+        compatible = "regulator-haptic";
+        haptic-supply = <&motor_regulator>;
+        max-microvolt = <2700000>;
+        min-microvolt = <1100000>;
+    };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/chipone,icn8318.yaml b/Documentation/devicetree/bindings/input/touchscreen/chipone,icn8318.yaml
new file mode 100644 (file)
index 0000000..9df685b
--- /dev/null
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/chipone,icn8318.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ChipOne ICN8318 Touchscreen Controller Device Tree Bindings
+
+maintainers:
+  - Dmitry Torokhov <dmitry.torokhov@gmail.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    const: chipone,icn8318
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  wake-gpios:
+    maxItems: 1
+
+unevaluatedProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - wake-gpios
+  - touchscreen-size-x
+  - touchscreen-size-y
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        touchscreen@40 {
+            compatible = "chipone,icn8318";
+            reg = <0x40>;
+            interrupt-parent = <&pio>;
+            interrupts = <9 IRQ_TYPE_EDGE_FALLING>; /* EINT9 (PG9) */
+            pinctrl-names = "default";
+            pinctrl-0 = <&ts_wake_pin_p66>;
+            wake-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */
+            touchscreen-size-x = <800>;
+            touchscreen-size-y = <480>;
+            touchscreen-inverted-x;
+            touchscreen-swapped-x-y;
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/chipone_icn8318.txt b/Documentation/devicetree/bindings/input/touchscreen/chipone_icn8318.txt
deleted file mode 100644 (file)
index 38b0603..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-* ChipOne icn8318 I2C touchscreen controller
-
-Required properties:
- - compatible            : "chipone,icn8318"
- - reg                   : I2C slave address of the chip (0x40)
- - interrupts            : interrupt specification for the icn8318 interrupt
- - wake-gpios            : GPIO specification for the WAKE input
- - touchscreen-size-x    : horizontal resolution of touchscreen (in pixels)
- - touchscreen-size-y    : vertical resolution of touchscreen (in pixels)
-
-Optional properties:
- - pinctrl-names         : should be "default"
- - pinctrl-0:            : a phandle pointing to the pin settings for the
-                           control gpios
- - touchscreen-fuzz-x    : horizontal noise value of the absolute input
-                           device (in pixels)
- - touchscreen-fuzz-y    : vertical noise value of the absolute input
-                           device (in pixels)
- - touchscreen-inverted-x : X axis is inverted (boolean)
- - touchscreen-inverted-y : Y axis is inverted (boolean)
- - touchscreen-swapped-x-y       : X and Y axis are swapped (boolean)
-                           Swapping is done after inverting the axis
-
-Example:
-
-i2c@00000000 {
-       /* ... */
-
-       chipone_icn8318@40 {
-               compatible = "chipone,icn8318";
-               reg = <0x40>;
-               interrupt-parent = <&pio>;
-               interrupts = <9 IRQ_TYPE_EDGE_FALLING>; /* EINT9 (PG9) */
-               pinctrl-names = "default";
-               pinctrl-0 = <&ts_wake_pin_p66>;
-               wake-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */
-               touchscreen-size-x = <800>;
-               touchscreen-size-y = <480>;
-               touchscreen-inverted-x;
-               touchscreen-swapped-x-y;
-       };
-
-       /* ... */
-};
diff --git a/Documentation/devicetree/bindings/input/touchscreen/pixcir,pixcir_ts.yaml b/Documentation/devicetree/bindings/input/touchscreen/pixcir,pixcir_ts.yaml
new file mode 100644 (file)
index 0000000..f9998ed
--- /dev/null
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/pixcir,pixcir_ts.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Pixcir Touchscreen Controller Device Tree Bindings
+
+maintainers:
+  - Dmitry Torokhov <dmitry.torokhov@gmail.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+
+properties:
+  compatible:
+    enum:
+      - pixcir,pixcir_ts
+      - pixcir,pixcir_tangoc
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  attb-gpio:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  enable-gpios:
+    maxItems: 1
+
+  wake-gpios:
+    maxItems: 1
+
+unevaluatedProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - attb-gpio
+  - touchscreen-size-x
+  - touchscreen-size-y
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        touchscreen@5c {
+            compatible = "pixcir,pixcir_ts";
+            reg = <0x5c>;
+            interrupts = <2 0>;
+            attb-gpio = <&gpf 2 0 2>;
+            touchscreen-size-x = <800>;
+            touchscreen-size-y = <600>;
+        };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/pixcir_i2c_ts.txt b/Documentation/devicetree/bindings/input/touchscreen/pixcir_i2c_ts.txt
deleted file mode 100644 (file)
index 697a3e7..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-* Pixcir I2C touchscreen controllers
-
-Required properties:
-- compatible: must be "pixcir,pixcir_ts" or "pixcir,pixcir_tangoc"
-- reg: I2C address of the chip
-- interrupts: interrupt to which the chip is connected
-- attb-gpio: GPIO connected to the ATTB line of the chip
-- touchscreen-size-x: horizontal resolution of touchscreen (in pixels)
-- touchscreen-size-y: vertical resolution of touchscreen (in pixels)
-
-Optional properties:
-- reset-gpios: GPIO connected to the RESET line of the chip
-- enable-gpios: GPIO connected to the ENABLE line of the chip
-- wake-gpios: GPIO connected to the WAKE line of the chip
-
-Example:
-
-       i2c@00000000 {
-               /* ... */
-
-               pixcir_ts@5c {
-                       compatible = "pixcir,pixcir_ts";
-                       reg = <0x5c>;
-                       interrupts = <2 0>;
-                       attb-gpio = <&gpf 2 0 2>;
-                       touchscreen-size-x = <800>;
-                       touchscreen-size-y = <600>;
-               };
-
-               /* ... */
-       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/ti,tsc2005.yaml b/Documentation/devicetree/bindings/input/touchscreen/ti,tsc2005.yaml
new file mode 100644 (file)
index 0000000..938aab0
--- /dev/null
@@ -0,0 +1,128 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/ti,tsc2005.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments TSC2004 and TSC2005 touchscreen controller bindings
+
+maintainers:
+  - Marek Vasut <marex@denx.de>
+  - Michael Welling <mwelling@ieee.org>
+
+properties:
+  $nodename:
+    pattern: "^touchscreen(@.*)?$"
+
+  compatible:
+    enum:
+      - ti,tsc2004
+      - ti,tsc2005
+
+  reg:
+    maxItems: 1
+    description: |
+      I2C address when used on the I2C bus, or the SPI chip select index
+      when used on the SPI bus
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+    description: GPIO specifier for the controller reset line
+
+  spi-max-frequency:
+    description: TSC2005 SPI bus clock frequency.
+    maximum: 25000000
+
+  ti,x-plate-ohms:
+    description: resistance of the touchscreen's X plates in ohm (defaults to 280)
+
+  ti,esd-recovery-timeout-ms:
+    description: |
+        if the touchscreen does not respond after the configured time
+        (in milli seconds), the driver will reset it. This is disabled
+        by default.
+
+  vio-supply:
+    description: Regulator specifier
+
+  touchscreen-fuzz-pressure: true
+  touchscreen-fuzz-x: true
+  touchscreen-fuzz-y: true
+  touchscreen-max-pressure: true
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+
+allOf:
+  - $ref: touchscreen.yaml#
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: ti,tsc2004
+    then:
+      properties:
+        spi-max-frequency: false
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/gpio/gpio.h>
+    i2c {
+        #address-cells = <1>;
+        #size-cells = <0>;
+        touchscreen@48 {
+            compatible = "ti,tsc2004";
+            reg = <0x48>;
+            vio-supply = <&vio>;
+
+            reset-gpios = <&gpio4 8 GPIO_ACTIVE_HIGH>;
+            interrupts-extended = <&gpio1 27 IRQ_TYPE_EDGE_RISING>;
+
+            touchscreen-fuzz-x = <4>;
+            touchscreen-fuzz-y = <7>;
+            touchscreen-fuzz-pressure = <2>;
+            touchscreen-size-x = <4096>;
+            touchscreen-size-y = <4096>;
+            touchscreen-max-pressure = <2048>;
+
+            ti,x-plate-ohms = <280>;
+            ti,esd-recovery-timeout-ms = <8000>;
+        };
+    };
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/gpio/gpio.h>
+    spi {
+        #address-cells = <1>;
+        #size-cells = <0>;
+        touchscreen@0 {
+            compatible = "ti,tsc2005";
+            spi-max-frequency = <6000000>;
+            reg = <0>;
+
+            vio-supply = <&vio>;
+
+            reset-gpios = <&gpio4 8 GPIO_ACTIVE_HIGH>; /* 104 */
+            interrupts-extended = <&gpio4 4 IRQ_TYPE_EDGE_RISING>; /* 100 */
+
+            touchscreen-fuzz-x = <4>;
+            touchscreen-fuzz-y = <7>;
+            touchscreen-fuzz-pressure = <2>;
+            touchscreen-size-x = <4096>;
+            touchscreen-size-y = <4096>;
+            touchscreen-max-pressure = <2048>;
+
+            ti,x-plate-ohms = <280>;
+            ti,esd-recovery-timeout-ms = <8000>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt b/Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt
deleted file mode 100644 (file)
index b80c04b..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-* Texas Instruments tsc2004 and tsc2005 touchscreen controllers
-
-Required properties:
- - compatible                : "ti,tsc2004" or "ti,tsc2005"
- - reg                       : Device address
- - interrupts                : IRQ specifier
- - spi-max-frequency         : Maximum SPI clocking speed of the device
-                               (for tsc2005)
-
-Optional properties:
- - vio-supply                : Regulator specifier
- - reset-gpios               : GPIO specifier for the controller reset line
- - ti,x-plate-ohms           : integer, resistance of the touchscreen's X plates
-                               in ohm (defaults to 280)
- - ti,esd-recovery-timeout-ms : integer, if the touchscreen does not respond after
-                               the configured time (in milli seconds), the driver
-                               will reset it. This is disabled by default.
- - properties defined in touchscreen.txt
-
-Example:
-
-&i2c3 {
-       tsc2004@48 {
-               compatible = "ti,tsc2004";
-               reg = <0x48>;
-               vio-supply = <&vio>;
-
-               reset-gpios = <&gpio4 8 GPIO_ACTIVE_HIGH>;
-               interrupts-extended = <&gpio1 27 IRQ_TYPE_EDGE_RISING>;
-
-               touchscreen-fuzz-x = <4>;
-               touchscreen-fuzz-y = <7>;
-               touchscreen-fuzz-pressure = <2>;
-               touchscreen-size-x = <4096>;
-               touchscreen-size-y = <4096>;
-               touchscreen-max-pressure = <2048>;
-
-               ti,x-plate-ohms = <280>;
-               ti,esd-recovery-timeout-ms = <8000>;
-       };
-}
-
-&mcspi1 {
-       tsc2005@0 {
-               compatible = "ti,tsc2005";
-               spi-max-frequency = <6000000>;
-               reg = <0>;
-
-               vio-supply = <&vio>;
-
-               reset-gpios = <&gpio4 8 GPIO_ACTIVE_HIGH>; /* 104 */
-               interrupts-extended = <&gpio4 4 IRQ_TYPE_EDGE_RISING>; /* 100 */
-
-               touchscreen-fuzz-x = <4>;
-               touchscreen-fuzz-y = <7>;
-               touchscreen-fuzz-pressure = <2>;
-               touchscreen-size-x = <4096>;
-               touchscreen-size-y = <4096>;
-               touchscreen-max-pressure = <2048>;
-
-               ti,x-plate-ohms = <280>;
-               ti,esd-recovery-timeout-ms = <8000>;
-       };
-}
diff --git a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
deleted file mode 100644 (file)
index 0c0dc3a..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-Qualcomm PON Device
-
-The Power On device for Qualcomm PM8xxx is MFD supporting pwrkey
-and resin along with the Android reboot-mode.
-
-This DT node has pwrkey and resin as sub nodes.
-
-Required Properties:
--compatible: Must be one of:
-       "qcom,pm8916-pon"
-       "qcom,pms405-pon"
-       "qcom,pm8998-pon"
-
--reg: Specifies the physical address of the pon register
-
-Optional subnode:
--pwrkey: Specifies the subnode pwrkey and should follow the
- qcom,pm8941-pwrkey.txt description.
--resin: Specifies the subnode resin and should follow the
- qcom,pm8xxx-pwrkey.txt description.
-
-The rest of the properties should follow the generic reboot-mode description
-found in reboot-mode.txt
-
-Example:
-
-       pon@800 {
-               compatible = "qcom,pm8916-pon";
-
-               reg = <0x800>;
-               mode-bootloader = <0x2>;
-               mode-recovery = <0x1>;
-
-               pwrkey {
-                       compatible = "qcom,pm8941-pwrkey";
-                       interrupts = <0x0 0x8 0 IRQ_TYPE_EDGE_BOTH>;
-                       debounce = <15625>;
-                       bias-pull-up;
-                       linux,code = <KEY_POWER>;
-               };
-
-               resin {
-                       compatible = "qcom,pm8941-resin";
-                       interrupts = <0x0 0x8 1 IRQ_TYPE_EDGE_BOTH>;
-                       debounce = <15625>;
-                       bias-pull-up;
-                       linux,code = <KEY_VOLUMEDOWN>;
-               };
-       };
diff --git a/Documentation/devicetree/bindings/power/reset/qcom,pon.yaml b/Documentation/devicetree/bindings/power/reset/qcom,pon.yaml
new file mode 100644 (file)
index 0000000..353f155
--- /dev/null
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/power/reset/qcom,pon.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm PON Device
+
+maintainers:
+  - Vinod Koul <vkoul@kernel.org>
+
+description: |
+  The Power On device for Qualcomm PM8xxx is MFD supporting pwrkey
+  and resin along with the Android reboot-mode.
+
+  This DT node has pwrkey and resin as sub nodes.
+
+allOf:
+  - $ref: reboot-mode.yaml#
+
+properties:
+  compatible:
+    enum:
+      - qcom,pm8916-pon
+      - qcom,pms405-pon
+      - qcom,pm8998-pon
+
+  reg:
+    maxItems: 1
+
+  pwrkey:
+    type: object
+    $ref: "../../input/qcom,pm8941-pwrkey.yaml#"
+
+  resin:
+    type: object
+    $ref: "../../input/qcom,pm8941-pwrkey.yaml#"
+
+required:
+  - compatible
+  - reg
+
+unevaluatedProperties: false
+
+examples:
+  - |
+   #include <dt-bindings/interrupt-controller/irq.h>
+   #include <dt-bindings/input/linux-event-codes.h>
+   #include <dt-bindings/spmi/spmi.h>
+   spmi_bus: spmi@c440000 {
+     reg = <0x0c440000 0x1100>;
+     #address-cells = <2>;
+     #size-cells = <0>;
+     pmk8350: pmic@0 {
+       reg = <0x0 SPMI_USID>;
+       #address-cells = <1>;
+       #size-cells = <0>;
+       pmk8350_pon: pon_hlos@1300 {
+         reg = <0x1300>;
+         compatible = "qcom,pm8998-pon";
+
+         pwrkey {
+            compatible = "qcom,pm8941-pwrkey";
+            interrupts = < 0x0 0x8 0 IRQ_TYPE_EDGE_BOTH >;
+            debounce = <15625>;
+            bias-pull-up;
+            linux,code = <KEY_POWER>;
+         };
+
+         resin {
+            compatible = "qcom,pm8941-resin";
+            interrupts = <0x0 0x8 1 IRQ_TYPE_EDGE_BOTH>;
+            debounce = <15625>;
+            bias-pull-up;
+            linux,code = <KEY_VOLUMEDOWN>;
+         };
+       };
+     };
+   };
+...
index 9c6fda6..ad0a0b9 100644 (file)
@@ -36,7 +36,7 @@ patternProperties:
   "^mode-.*$":
     $ref: /schemas/types.yaml#/definitions/uint32
 
-additionalProperties: false
+additionalProperties: true
 
 examples:
   - |
index 5596bee..81a54a4 100644 (file)
@@ -29,6 +29,7 @@ properties:
           - enum:
               - rockchip,px30-pwm
               - rockchip,rk3308-pwm
+              - rockchip,rk3568-pwm
           - const: rockchip,rk3328-pwm
 
   reg:
index 7548d87..13925bb 100644 (file)
@@ -32,6 +32,9 @@ properties:
       - dallas,ds3232
       # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
       - epson,rx8010
+      # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
+      - epson,rx8025
+      - epson,rx8035
       # I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM
       - epson,rx8571
       # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
index e555381..9952199 100644 (file)
@@ -84,9 +84,9 @@ unevaluatedProperties: false
 if:
   properties:
     compatible:
-      oneOf:
-        - const: ti,omap2-mcspi
-        - const: ti,omap4-mcspi
+      enum:
+        - ti,omap2-mcspi
+        - ti,omap4-mcspi
 
 then:
   properties:
index 593f769..03e5dca 100644 (file)
@@ -27,13 +27,11 @@ properties:
 
   xlnx,num-ss-bits:
     description: Number of chip selects used.
-    $ref: /schemas/types.yaml#/definitions/uint32
     minimum: 1
     maximum: 32
 
   xlnx,num-transfer-bits:
     description: Number of bits per transfer. This will be 8 if not specified.
-    $ref: /schemas/types.yaml#/definitions/uint32
     enum: [8, 16, 32]
     default: 8
 
diff --git a/Documentation/devicetree/bindings/thermal/qcom-lmh.yaml b/Documentation/devicetree/bindings/thermal/qcom-lmh.yaml
new file mode 100644 (file)
index 0000000..289e9a8
--- /dev/null
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2021 Linaro Ltd.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/thermal/qcom-lmh.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Limits Management Hardware(LMh)
+
+maintainers:
+  - Thara Gopinath <thara.gopinath@linaro.org>
+
+description:
+  Limits Management Hardware(LMh) is a hardware infrastructure on some
+  Qualcomm SoCs that can enforce temperature and current limits as
+  programmed by software for certain IPs like CPU.
+
+properties:
+  compatible:
+    enum:
+      - qcom,sdm845-lmh
+
+  reg:
+    items:
+      - description: core registers
+
+  interrupts:
+    maxItems: 1
+
+  '#interrupt-cells':
+    const: 1
+
+  interrupt-controller: true
+
+  cpus:
+    description:
+      phandle of the first cpu in the LMh cluster
+    $ref: /schemas/types.yaml#/definitions/phandle
+
+  qcom,lmh-temp-arm-millicelsius:
+    description:
+      An integer expressing temperature threshold at which the LMh thermal
+      FSM is engaged.
+
+  qcom,lmh-temp-low-millicelsius:
+    description:
+      An integer expressing temperature threshold at which the state machine
+      will attempt to remove frequency throttling.
+
+  qcom,lmh-temp-high-millicelsius:
+    description:
+      An integer expressing temperature threshold at which the state machine
+      will attempt to throttle the frequency.
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - '#interrupt-cells'
+  - interrupt-controller
+  - cpus
+  - qcom,lmh-temp-arm-millicelsius
+  - qcom,lmh-temp-low-millicelsius
+  - qcom,lmh-temp-high-millicelsius
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    lmh@17d70800 {
+      compatible = "qcom,sdm845-lmh";
+      reg = <0x17d70800 0x400>;
+      interrupts = <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>;
+      cpus = <&CPU4>;
+      qcom,lmh-temp-arm-millicelsius = <65000>;
+      qcom,lmh-temp-low-millicelsius = <94500>;
+      qcom,lmh-temp-high-millicelsius = <95000>;
+      interrupt-controller;
+      #interrupt-cells = <1>;
+    };
index 164f715..a07de5e 100644 (file)
@@ -215,7 +215,7 @@ patternProperties:
       - polling-delay
       - polling-delay-passive
       - thermal-sensors
-      - trips
+
     additionalProperties: false
 
 additionalProperties: false
index d465970..4b7a027 100644 (file)
@@ -36,7 +36,8 @@ required:
   - reg
   - interrupts
 
-additionalProperties: false
+additionalProperties:
+  type: object
 
 examples:
   - |
diff --git a/Documentation/devicetree/bindings/virtio/virtio-device.yaml b/Documentation/devicetree/bindings/virtio/virtio-device.yaml
new file mode 100644 (file)
index 0000000..1778ea9
--- /dev/null
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/virtio/virtio-device.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtio device bindings
+
+maintainers:
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+description:
+  These bindings are applicable to virtio devices irrespective of the bus they
+  are bound to, like mmio or pci.
+
+# We need a select here so we don't match all nodes with 'virtio,mmio'
+properties:
+  compatible:
+    pattern: "^virtio,device[0-9a-f]{1,8}$"
+    description: Virtio device nodes.
+      "virtio,deviceID", where ID is the virtio device id. The textual
+      representation of ID shall be in lower case hexadecimal with leading
+      zeroes suppressed.
+
+required:
+  - compatible
+
+additionalProperties: true
+
+examples:
+  - |
+    virtio@3000 {
+        compatible = "virtio,mmio";
+        reg = <0x3000 0x100>;
+        interrupts = <43>;
+
+        i2c {
+            compatible = "virtio,device22";
+        };
+    };
+...
index f2105ee..ab9641e 100644 (file)
@@ -15,13 +15,13 @@ maintainers:
 
 properties:
   compatible:
-    oneOf:
-      - const: maxim,max6369
-      - const: maxim,max6370
-      - const: maxim,max6371
-      - const: maxim,max6372
-      - const: maxim,max6373
-      - const: maxim,max6374
+    enum:
+      - maxim,max6369
+      - maxim,max6370
+      - maxim,max6371
+      - maxim,max6372
+      - maxim,max6373
+      - maxim,max6374
 
   reg:
     description: This is a 1-byte memory-mapped address
index 99cb6d7..2949c99 100644 (file)
@@ -22,7 +22,7 @@
     |    openrisc: | TODO |
     |      parisc: |  ok  |
     |     powerpc: |  ok  |
-    |       riscv: | TODO |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
index 7e5c04c..98db2ea 100644 (file)
@@ -71,9 +71,6 @@ Other Functions
 .. kernel-doc:: fs/fs-writeback.c
    :export:
 
-.. kernel-doc:: fs/block_dev.c
-   :export:
-
 .. kernel-doc:: fs/anon_inodes.c
    :export:
 
index d5a73fa..8126bea 100644 (file)
@@ -37,7 +37,7 @@ TTM initialization
     This section is outdated.
 
 Drivers wishing to support TTM must pass a filled :c:type:`ttm_bo_driver
-<ttm_bo_driver>` structure to ttm_bo_device_init, together with an
+<ttm_bo_driver>` structure to ttm_device_init, together with an
 initialized global reference to the memory manager.  The ttm_bo_driver
 structure contains several fields with function pointers for
 initializing the TTM, allocating and freeing memory, waiting for command
index 60ac091..8a2788a 100644 (file)
@@ -101,8 +101,7 @@ subsystems which use EM might rely on this flag to check if all EM devices use
 the same scale. If there are different scales, these subsystems might decide
 to: return warning/error, stop working or panic.
 See Section 3. for an example of driver implementing this
-callback, and kernel/power/energy_model.c for further documentation on this
-API.
+callback, or Section 2.4 for further documentation on this API
 
 
 2.3 Accessing performance domains
@@ -123,7 +122,17 @@ em_cpu_energy() API. The estimation is performed assuming that the schedutil
 CPUfreq governor is in use in case of CPU device. Currently this calculation is
 not provided for other type of devices.
 
-More details about the above APIs can be found in include/linux/energy_model.h.
+More details about the above APIs can be found in ``<linux/energy_model.h>``
+or in Section 2.4
+
+
+2.4 Description details of this API
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. kernel-doc:: include/linux/energy_model.h
+   :internal:
+
+.. kernel-doc:: kernel/power/energy_model.c
+   :export:
 
 
 3. Example driver
index 0b5eefe..c432be0 100644 (file)
@@ -27,6 +27,7 @@ place where this information is gathered.
    iommu
    media/index
    sysfs-platform_profile
+   vduse
 
 .. only::  subproject and html
 
index b7070d7..2e81340 100644 (file)
@@ -299,6 +299,7 @@ Code  Seq#    Include File                                           Comments
 'z'   10-4F  drivers/s390/crypto/zcrypt_api.h                        conflict!
 '|'   00-7F  linux/media.h
 0x80  00-1F  linux/fb.h
+0x81  00-1F  linux/vduse.h
 0x89  00-06  arch/x86/include/asm/sockios.h
 0x89  0B-DF  linux/sockios.h
 0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
diff --git a/Documentation/userspace-api/vduse.rst b/Documentation/userspace-api/vduse.rst
new file mode 100644 (file)
index 0000000..42ef59e
--- /dev/null
@@ -0,0 +1,233 @@
+==================================
+VDUSE - "vDPA Device in Userspace"
+==================================
+
+vDPA (virtio data path acceleration) device is a device that uses a
+datapath which complies with the virtio specifications with vendor
+specific control path. vDPA devices can be both physically located on
+the hardware or emulated by software. VDUSE is a framework that makes it
+possible to implement software-emulated vDPA devices in userspace. And
+to make the device emulation more secure, the emulated vDPA device's
+control path is handled in the kernel and only the data path is
+implemented in the userspace.
+
+Note that only virtio block device is supported by VDUSE framework now,
+which can reduce security risks when the userspace process that implements
+the data path is run by an unprivileged user. The support for other device
+types can be added after the security issue of corresponding device driver
+is clarified or fixed in the future.
+
+Create/Destroy VDUSE devices
+------------------------
+
+VDUSE devices are created as follows:
+
+1. Create a new VDUSE instance with ioctl(VDUSE_CREATE_DEV) on
+   /dev/vduse/control.
+
+2. Setup each virtqueue with ioctl(VDUSE_VQ_SETUP) on /dev/vduse/$NAME.
+
+3. Begin processing VDUSE messages from /dev/vduse/$NAME. The first
+   messages will arrive while attaching the VDUSE instance to vDPA bus.
+
+4. Send the VDPA_CMD_DEV_NEW netlink message to attach the VDUSE
+   instance to vDPA bus.
+
+VDUSE devices are destroyed as follows:
+
+1. Send the VDPA_CMD_DEV_DEL netlink message to detach the VDUSE
+   instance from vDPA bus.
+
+2. Close the file descriptor referring to /dev/vduse/$NAME.
+
+3. Destroy the VDUSE instance with ioctl(VDUSE_DESTROY_DEV) on
+   /dev/vduse/control.
+
+The netlink messages can be sent via vdpa tool in iproute2 or use the
+below sample codes:
+
+.. code-block:: c
+
+       static int netlink_add_vduse(const char *name, enum vdpa_command cmd)
+       {
+               struct nl_sock *nlsock;
+               struct nl_msg *msg;
+               int famid;
+
+               nlsock = nl_socket_alloc();
+               if (!nlsock)
+                       return -ENOMEM;
+
+               if (genl_connect(nlsock))
+                       goto free_sock;
+
+               famid = genl_ctrl_resolve(nlsock, VDPA_GENL_NAME);
+               if (famid < 0)
+                       goto close_sock;
+
+               msg = nlmsg_alloc();
+               if (!msg)
+                       goto close_sock;
+
+               if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, famid, 0, 0, cmd, 0))
+                       goto nla_put_failure;
+
+               NLA_PUT_STRING(msg, VDPA_ATTR_DEV_NAME, name);
+               if (cmd == VDPA_CMD_DEV_NEW)
+                       NLA_PUT_STRING(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, "vduse");
+
+               if (nl_send_sync(nlsock, msg))
+                       goto close_sock;
+
+               nl_close(nlsock);
+               nl_socket_free(nlsock);
+
+               return 0;
+       nla_put_failure:
+               nlmsg_free(msg);
+       close_sock:
+               nl_close(nlsock);
+       free_sock:
+               nl_socket_free(nlsock);
+               return -1;
+       }
+
+How VDUSE works
+---------------
+
+As mentioned above, a VDUSE device is created by ioctl(VDUSE_CREATE_DEV) on
+/dev/vduse/control. With this ioctl, userspace can specify some basic configuration
+such as device name (uniquely identify a VDUSE device), virtio features, virtio
+configuration space, the number of virtqueues and so on for this emulated device.
+Then a char device interface (/dev/vduse/$NAME) is exported to userspace for device
+emulation. Userspace can use the VDUSE_VQ_SETUP ioctl on /dev/vduse/$NAME to
+add per-virtqueue configuration such as the max size of virtqueue to the device.
+
+After the initialization, the VDUSE device can be attached to vDPA bus via
+the VDPA_CMD_DEV_NEW netlink message. Userspace needs to read()/write() on
+/dev/vduse/$NAME to receive/reply some control messages from/to VDUSE kernel
+module as follows:
+
+.. code-block:: c
+
+       static int vduse_message_handler(int dev_fd)
+       {
+               int len;
+               struct vduse_dev_request req;
+               struct vduse_dev_response resp;
+
+               len = read(dev_fd, &req, sizeof(req));
+               if (len != sizeof(req))
+                       return -1;
+
+               resp.request_id = req.request_id;
+
+               switch (req.type) {
+
+               /* handle different types of messages */
+
+               }
+
+               len = write(dev_fd, &resp, sizeof(resp));
+               if (len != sizeof(resp))
+                       return -1;
+
+               return 0;
+       }
+
+There are now three types of messages introduced by VDUSE framework:
+
+- VDUSE_GET_VQ_STATE: Get the state for virtqueue, userspace should return
+  avail index for split virtqueue or the device/driver ring wrap counters and
+  the avail and used index for packed virtqueue.
+
+- VDUSE_SET_STATUS: Set the device status, userspace should follow
+  the virtio spec: https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.html
+  to process this message. For example, fail to set the FEATURES_OK device
+  status bit if the device can not accept the negotiated virtio features
+  get from the VDUSE_DEV_GET_FEATURES ioctl.
+
+- VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for specified
+  IOVA range, userspace should firstly remove the old mapping, then setup the new
+  mapping via the VDUSE_IOTLB_GET_FD ioctl.
+
+After DRIVER_OK status bit is set via the VDUSE_SET_STATUS message, userspace is
+able to start the dataplane processing as follows:
+
+1. Get the specified virtqueue's information with the VDUSE_VQ_GET_INFO ioctl,
+   including the size, the IOVAs of descriptor table, available ring and used ring,
+   the state and the ready status.
+
+2. Pass the above IOVAs to the VDUSE_IOTLB_GET_FD ioctl so that those IOVA regions
+   can be mapped into userspace. Some sample codes is shown below:
+
+.. code-block:: c
+
+       static int perm_to_prot(uint8_t perm)
+       {
+               int prot = 0;
+
+               switch (perm) {
+               case VDUSE_ACCESS_WO:
+                       prot |= PROT_WRITE;
+                       break;
+               case VDUSE_ACCESS_RO:
+                       prot |= PROT_READ;
+                       break;
+               case VDUSE_ACCESS_RW:
+                       prot |= PROT_READ | PROT_WRITE;
+                       break;
+               }
+
+               return prot;
+       }
+
+       static void *iova_to_va(int dev_fd, uint64_t iova, uint64_t *len)
+       {
+               int fd;
+               void *addr;
+               size_t size;
+               struct vduse_iotlb_entry entry;
+
+               entry.start = iova;
+               entry.last = iova;
+
+               /*
+                * Find the first IOVA region that overlaps with the specified
+                * range [start, last] and return the corresponding file descriptor.
+                */
+               fd = ioctl(dev_fd, VDUSE_IOTLB_GET_FD, &entry);
+               if (fd < 0)
+                       return NULL;
+
+               size = entry.last - entry.start + 1;
+               *len = entry.last - iova + 1;
+               addr = mmap(0, size, perm_to_prot(entry.perm), MAP_SHARED,
+                           fd, entry.offset);
+               close(fd);
+               if (addr == MAP_FAILED)
+                       return NULL;
+
+               /*
+                * Using some data structures such as linked list to store
+                * the iotlb mapping. The munmap(2) should be called for the
+                * cached mapping when the corresponding VDUSE_UPDATE_IOTLB
+                * message is received or the device is reset.
+                */
+
+               return addr + iova - entry.start;
+       }
+
+3. Set up the kick eventfd for the specified virtqueue with the VDUSE_VQ_SETUP_KICKFD
+   ioctl. The kick eventfd is used by the VDUSE kernel module to notify userspace to
+   consume the available ring. This is optional since userspace can choose to poll the
+   available ring instead (a sketch covering steps 3-5 follows this list).
+
+4. Listen to the kick eventfd (optional) and consume the available ring. The buffers
+   described by the descriptors in the descriptor table should also be mapped into
+   userspace via the VDUSE_IOTLB_GET_FD ioctl before they are accessed.
+
+5. Inject an interrupt for the specific virtqueue with the VDUSE_INJECT_VQ_IRQ ioctl
+   after the used ring is filled.
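+
+As a rough illustration of steps 3-5, the sketch below (same style as the other
+samples) registers a kick eventfd, waits for kicks and injects a used-ring
+interrupt. The ioctl names are the ones used above; their exact argument types,
+and the vduse_vq_eventfd field names, should be checked against
+include/uapi/linux/vduse.h. process_available_ring() is a hypothetical helper
+standing in for the real descriptor handling:
+
+.. code-block:: c
+
+       static void serve_vq(int dev_fd, uint32_t vq_index)
+       {
+               struct vduse_vq_eventfd vq_eventfd;
+               uint64_t kicks;
+               int kick_fd;
+
+               kick_fd = eventfd(0, EFD_CLOEXEC);
+               if (kick_fd < 0)
+                       return;
+
+               /* step 3: ask the VDUSE module to signal us through this eventfd */
+               vq_eventfd.index = vq_index;
+               vq_eventfd.fd = kick_fd;
+               if (ioctl(dev_fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd))
+                       return;
+
+               for (;;) {
+                       /* step 4: wait for a kick, then consume the available ring */
+                       if (read(kick_fd, &kicks, sizeof(kicks)) != sizeof(kicks))
+                               continue;
+                       process_available_ring(vq_index);
+
+                       /* step 5: notify the driver that the used ring was updated */
+                       ioctl(dev_fd, VDUSE_INJECT_VQ_IRQ, &vq_index);
+               }
+       }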
+
+For more details on the uAPI, please see include/uapi/linux/vduse.h.
index 5ec52be..7d46f03 100644 (file)
@@ -333,7 +333,7 @@ S:  Maintained
 F:     drivers/platform/x86/acer-wmi.c
 
 ACPI
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Len Brown <lenb@kernel.org>
 L:     linux-acpi@vger.kernel.org
 S:     Supported
@@ -354,7 +354,7 @@ F:  include/linux/fwnode.h
 F:     tools/power/acpi/
 
 ACPI APEI
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Len Brown <lenb@kernel.org>
 R:     James Morse <james.morse@arm.com>
 R:     Tony Luck <tony.luck@intel.com>
@@ -364,7 +364,6 @@ F:  drivers/acpi/apei/
 
 ACPI COMPONENT ARCHITECTURE (ACPICA)
 M:     Robert Moore <robert.moore@intel.com>
-M:     Erik Kaneda <erik.kaneda@intel.com>
 M:     "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
 L:     linux-acpi@vger.kernel.org
 L:     devel@acpica.org
@@ -403,7 +402,7 @@ S:  Maintained
 F:     drivers/platform/x86/i2c-multi-instantiate.c
 
 ACPI PMIC DRIVERS
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Len Brown <lenb@kernel.org>
 R:     Andy Shevchenko <andy@kernel.org>
 R:     Mika Westerberg <mika.westerberg@linux.intel.com>
@@ -3314,7 +3313,6 @@ S:        Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 F:     block/
 F:     drivers/block/
-F:     fs/block_dev.c
 F:     include/linux/blk*
 F:     kernel/trace/blktrace.c
 F:     lib/sbitmap.c
@@ -4827,7 +4825,7 @@ W:        http://www.arm.com/products/processors/technologies/biglittleprocessing.php
 F:     drivers/cpufreq/vexpress-spc-cpufreq.c
 
 CPU FREQUENCY SCALING FRAMEWORK
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Viresh Kumar <viresh.kumar@linaro.org>
 L:     linux-pm@vger.kernel.org
 S:     Maintained
@@ -4845,7 +4843,7 @@ F:        kernel/sched/cpufreq*.c
 F:     tools/testing/selftests/cpufreq/
 
 CPU IDLE TIME MANAGEMENT FRAMEWORK
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Daniel Lezcano <daniel.lezcano@linaro.org>
 L:     linux-pm@vger.kernel.org
 S:     Maintained
@@ -7591,7 +7589,7 @@ W:        ftp://ftp.openlinux.org/pub/people/hch/vxfs
 F:     fs/freevxfs/
 
 FREEZER
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Pavel Machek <pavel@ucw.cz>
 L:     linux-pm@vger.kernel.org
 S:     Supported
@@ -7844,7 +7842,7 @@ S:        Supported
 F:     drivers/i2c/muxes/i2c-demux-pinctrl.c
 
 GENERIC PM DOMAINS
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Kevin Hilman <khilman@kernel.org>
 M:     Ulf Hansson <ulf.hansson@linaro.org>
 L:     linux-pm@vger.kernel.org
@@ -8310,7 +8308,7 @@ W:        http://drama.obuda.kando.hu/~fero/cgi-bin/hgafb.shtml
 F:     drivers/video/fbdev/hgafb.c
 
 HIBERNATION (aka Software Suspend, aka swsusp)
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Pavel Machek <pavel@ucw.cz>
 L:     linux-pm@vger.kernel.org
 S:     Supported
@@ -10623,10 +10621,10 @@ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 F:     drivers/ata/sata_promise.*
 
 LIBATA SUBSYSTEM (Serial and Parallel ATA drivers)
-M:     Jens Axboe <axboe@kernel.dk>
+M:     Damien Le Moal <damien.lemoal@opensource.wdc.com>
 L:     linux-ide@vger.kernel.org
 S:     Maintained
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/libata.git
 F:     Documentation/devicetree/bindings/ata/
 F:     drivers/ata/
 F:     include/linux/ata.h
@@ -13410,7 +13408,7 @@ F:      include/linux/nvme-fc.h
 NVM EXPRESS TARGET DRIVER
 M:     Christoph Hellwig <hch@lst.de>
 M:     Sagi Grimberg <sagi@grimberg.me>
-M:     Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+M:     Chaitanya Kulkarni <kch@nvidia.com>
 L:     linux-nvme@lists.infradead.org
 S:     Supported
 W:     http://git.infradead.org/nvme.git
@@ -14969,7 +14967,7 @@ F:      kernel/time/*timer*
 F:     kernel/time/namespace.c
 
 POWER MANAGEMENT CORE
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 L:     linux-pm@vger.kernel.org
 S:     Supported
 B:     https://bugzilla.kernel.org
@@ -17947,7 +17945,7 @@ F:      arch/sh/
 F:     drivers/sh/
 
 SUSPEND TO RAM
-M:     "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M:     "Rafael J. Wysocki" <rafael@kernel.org>
 M:     Len Brown <len.brown@intel.com>
 M:     Pavel Machek <pavel@ucw.cz>
 L:     linux-pm@vger.kernel.org
@@ -18567,6 +18565,7 @@ F:      drivers/thermal/
 F:     include/linux/cpu_cooling.h
 F:     include/linux/thermal.h
 F:     include/uapi/linux/thermal.h
+F:     tools/thermal/
 
 THERMAL DRIVER FOR AMLOGIC SOCS
 M:     Guillaume La Roque <glaroque@baylibre.com>
index b16be52..37a8175 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/crash_dump.h>
 #include <linux/hugetlb.h>
 #include <linux/acpi_iort.h>
+#include <linux/kmemleak.h>
 
 #include <asm/boot.h>
 #include <asm/fixmap.h>
@@ -101,6 +102,11 @@ static void __init reserve_crashkernel(void)
        pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
                crash_base, crash_base + crash_size, crash_size >> 20);
 
+       /*
+        * The crashkernel memory will be removed from the kernel linear
+        * map. Inform kmemleak so that it won't try to access it.
+        */
+       kmemleak_ignore_phys(crash_base);
        crashk_res.start = crash_base;
        crashk_res.end = crash_base + crash_size - 1;
 }
@@ -222,7 +228,21 @@ early_param("mem", early_mem);
 
 void __init arm64_memblock_init(void)
 {
-       const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
+       s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);
+
+       /*
+        * Corner case: 52-bit VA capable systems running KVM in nVHE mode may
+        * be limited in their ability to support a linear map that exceeds 51
+        * bits of VA space, depending on the placement of the ID map. Given
+        * that the placement of the ID map may be randomized, let's simply
+        * limit the kernel's linear map to 51 bits as well if we detect this
+        * configuration.
+        */
+       if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
+           is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
+               pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
+               linear_region_size = min_t(u64, linear_region_size, BIT(51));
+       }
 
        /* Remove memory above our supported physical address size */
        memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);
index 3001a7d..4742b6f 100644 (file)
@@ -10,7 +10,6 @@ config PARISC
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_HAS_UBSAN_SANITIZE_ALL
-       select ARCH_HAS_STRNLEN_USER
        select ARCH_NO_SG_CHAIN
        select ARCH_SUPPORTS_HUGETLBFS if PA20
        select ARCH_SUPPORTS_MEMORY_FAILURE
@@ -65,7 +64,6 @@ config PARISC
        select HAVE_KPROBES_ON_FTRACE
        select HAVE_DYNAMIC_FTRACE_WITH_REGS
        select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
-       select SET_FS
        select TRACE_IRQFLAGS_SUPPORT
 
        help
index dff4536..9fe5487 100644 (file)
@@ -26,7 +26,7 @@ endif
 OBJECTS += $(obj)/head.o $(obj)/real2.o $(obj)/firmware.o $(obj)/misc.o $(obj)/piggy.o
 
 LDFLAGS_vmlinux := -X -e startup --as-needed -T
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC)
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) FORCE
        $(call if_changed,ld)
 
 sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\)$$/\#define SZ\2 0x\1/p'
@@ -34,7 +34,7 @@ sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\|parisc_kernel_start\
 quiet_cmd_sizes = GEN $@
       cmd_sizes = $(NM) $< | sed -n $(sed-sizes) > $@
 
-$(obj)/sizes.h: vmlinux
+$(obj)/sizes.h: vmlinux FORCE
        $(call if_changed,sizes)
 
 AFLAGS_head.o += -I$(objtree)/$(obj) -DBOOTLOADER
@@ -70,19 +70,19 @@ suffix-$(CONFIG_KERNEL_LZMA)  := lzma
 suffix-$(CONFIG_KERNEL_LZO)  := lzo
 suffix-$(CONFIG_KERNEL_XZ)  := xz
 
-$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,lz4)
-$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,lzma)
-$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,lzo)
-$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,xzkern)
 
 LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y)
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE
        $(call if_changed,ld)
index b5fbcd2..eeb7da0 100644 (file)
@@ -101,10 +101,6 @@ DECLARE_PER_CPU(struct cpuinfo_parisc, cpu_data);
 
 #define CPU_HVERSION ((boot_cpu_data.hversion >> 4) & 0x0FFF)
 
-typedef struct {
-       int seg;  
-} mm_segment_t;
-
 #define ARCH_MIN_TASKALIGN     8
 
 struct thread_struct {
index 2b3010a..4b9e3d7 100644 (file)
@@ -2,7 +2,7 @@
 #ifndef _ASM_PARISC_RT_SIGFRAME_H
 #define _ASM_PARISC_RT_SIGFRAME_H
 
-#define SIGRETURN_TRAMP 4
+#define SIGRETURN_TRAMP 3
 #define SIGRESTARTBLOCK_TRAMP 5 
 #define TRAMP_SIZE (SIGRETURN_TRAMP + SIGRESTARTBLOCK_TRAMP)
 
index 0bd38a9..00ad50f 100644 (file)
@@ -11,7 +11,6 @@
 struct thread_info {
        struct task_struct *task;       /* main task structure */
        unsigned long flags;            /* thread_info flags (see TIF_*) */
-       mm_segment_t addr_limit;        /* user-level address space limit */
        __u32 cpu;                      /* current CPU */
        int preempt_count;              /* 0=premptable, <0=BUG; will also serve as bh-counter */
 };
@@ -21,7 +20,6 @@ struct thread_info {
        .task           = &tsk,                 \
        .flags          = 0,                    \
        .cpu            = 0,                    \
-       .addr_limit     = KERNEL_DS,            \
        .preempt_count  = INIT_PREEMPT_COUNT,   \
 }
 
index 7c13314..192ad9e 100644 (file)
 #include <linux/bug.h>
 #include <linux/string.h>
 
-#define KERNEL_DS      ((mm_segment_t){0})
-#define USER_DS        ((mm_segment_t){1})
-
-#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
-
-#define get_fs()       (current_thread_info()->addr_limit)
-#define set_fs(x)      (current_thread_info()->addr_limit = (x))
-
 /*
  * Note that since kernel addresses are in a separate address space on
  * parisc, we don't need to do anything for access_ok().
 #define get_user __get_user
 
 #if !defined(CONFIG_64BIT)
-#define LDD_USER(val, ptr)     __get_user_asm64(val, ptr)
-#define STD_USER(x, ptr)       __put_user_asm64(x, ptr)
+#define LDD_USER(sr, val, ptr) __get_user_asm64(sr, val, ptr)
+#define STD_USER(sr, x, ptr)   __put_user_asm64(sr, x, ptr)
 #else
-#define LDD_USER(val, ptr)     __get_user_asm(val, "ldd", ptr)
-#define STD_USER(x, ptr)       __put_user_asm("std", x, ptr)
+#define LDD_USER(sr, val, ptr) __get_user_asm(sr, val, "ldd", ptr)
+#define STD_USER(sr, x, ptr)   __put_user_asm(sr, "std", x, ptr)
 #endif
 
 /*
@@ -67,28 +59,15 @@ struct exception_table_entry {
 #define ASM_EXCEPTIONTABLE_ENTRY_EFAULT( fault_addr, except_addr )\
        ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr + 1)
 
-/*
- * load_sr2() preloads the space register %%sr2 - based on the value of
- * get_fs() - with either a value of 0 to access kernel space (KERNEL_DS which
- * is 0), or with the current value of %%sr3 to access user space (USER_DS)
- * memory. The following __get_user_asm() and __put_user_asm() functions have
- * %%sr2 hard-coded to access the requested memory.
- */
-#define load_sr2() \
-       __asm__(" or,=  %0,%%r0,%%r0\n\t"       \
-               " mfsp %%sr3,%0\n\t"            \
-               " mtsp %0,%%sr2\n\t"            \
-               : : "r"(get_fs()) : )
-
-#define __get_user_internal(val, ptr)                  \
+#define __get_user_internal(sr, val, ptr)              \
 ({                                                     \
        register long __gu_err __asm__ ("r8") = 0;      \
                                                        \
        switch (sizeof(*(ptr))) {                       \
-       case 1: __get_user_asm(val, "ldb", ptr); break; \
-       case 2: __get_user_asm(val, "ldh", ptr); break; \
-       case 4: __get_user_asm(val, "ldw", ptr); break; \
-       case 8: LDD_USER(val, ptr); break;              \
+       case 1: __get_user_asm(sr, val, "ldb", ptr); break; \
+       case 2: __get_user_asm(sr, val, "ldh", ptr); break; \
+       case 4: __get_user_asm(sr, val, "ldw", ptr); break; \
+       case 8: LDD_USER(sr, val, ptr); break;          \
        default: BUILD_BUG();                           \
        }                                               \
                                                        \
@@ -97,15 +76,14 @@ struct exception_table_entry {
 
 #define __get_user(val, ptr)                           \
 ({                                                     \
-       load_sr2();                                     \
-       __get_user_internal(val, ptr);                  \
+       __get_user_internal("%%sr3,", val, ptr);        \
 })
 
-#define __get_user_asm(val, ldx, ptr)                  \
+#define __get_user_asm(sr, val, ldx, ptr)              \
 {                                                      \
        register long __gu_val;                         \
                                                        \
-       __asm__("1: " ldx " 0(%%sr2,%2),%0\n"           \
+       __asm__("1: " ldx " 0(" sr "%2),%0\n"           \
                "9:\n"                                  \
                ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
                : "=r"(__gu_val), "=r"(__gu_err)        \
@@ -114,9 +92,22 @@ struct exception_table_entry {
        (val) = (__force __typeof__(*(ptr))) __gu_val;  \
 }
 
+#define HAVE_GET_KERNEL_NOFAULT
+#define __get_kernel_nofault(dst, src, type, err_label)        \
+{                                                      \
+       type __z;                                       \
+       long __err;                                     \
+       __err = __get_user_internal("%%sr0,", __z, (type *)(src)); \
+       if (unlikely(__err))                            \
+               goto err_label;                         \
+       else                                            \
+               *(type *)(dst) = __z;                   \
+}
+
+
 #if !defined(CONFIG_64BIT)
 
-#define __get_user_asm64(val, ptr)                     \
+#define __get_user_asm64(sr, val, ptr)                 \
 {                                                      \
        union {                                         \
                unsigned long long      l;              \
@@ -124,8 +115,8 @@ struct exception_table_entry {
        } __gu_tmp;                                     \
                                                        \
        __asm__("   copy %%r0,%R0\n"                    \
-               "1: ldw 0(%%sr2,%2),%0\n"               \
-               "2: ldw 4(%%sr2,%2),%R0\n"              \
+               "1: ldw 0(" sr "%2),%0\n"               \
+               "2: ldw 4(" sr "%2),%R0\n"              \
                "9:\n"                                  \
                ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
                ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \
@@ -138,16 +129,16 @@ struct exception_table_entry {
 #endif /* !defined(CONFIG_64BIT) */
 
 
-#define __put_user_internal(x, ptr)                            \
+#define __put_user_internal(sr, x, ptr)                                \
 ({                                                             \
        register long __pu_err __asm__ ("r8") = 0;              \
         __typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x);      \
                                                                \
        switch (sizeof(*(ptr))) {                               \
-       case 1: __put_user_asm("stb", __x, ptr); break;         \
-       case 2: __put_user_asm("sth", __x, ptr); break;         \
-       case 4: __put_user_asm("stw", __x, ptr); break;         \
-       case 8: STD_USER(__x, ptr); break;                      \
+       case 1: __put_user_asm(sr, "stb", __x, ptr); break;     \
+       case 2: __put_user_asm(sr, "sth", __x, ptr); break;     \
+       case 4: __put_user_asm(sr, "stw", __x, ptr); break;     \
+       case 8: STD_USER(sr, __x, ptr); break;                  \
        default: BUILD_BUG();                                   \
        }                                                       \
                                                                \
@@ -156,10 +147,20 @@ struct exception_table_entry {
 
 #define __put_user(x, ptr)                                     \
 ({                                                             \
-       load_sr2();                                             \
-       __put_user_internal(x, ptr);                            \
+       __put_user_internal("%%sr3,", x, ptr);                  \
 })
 
+#define __put_kernel_nofault(dst, src, type, err_label)                \
+{                                                              \
+       type __z = *(type *)(src);                              \
+       long __err;                                             \
+       __err = __put_user_internal("%%sr0,", __z, (type *)(dst)); \
+       if (unlikely(__err))                                    \
+               goto err_label;                                 \
+}
+
+
+
 
 /*
  * The "__put_user/kernel_asm()" macros tell gcc they read from memory
@@ -170,26 +171,26 @@ struct exception_table_entry {
  * r8 is already listed as err.
  */
 
-#define __put_user_asm(stx, x, ptr)                         \
-       __asm__ __volatile__ (                              \
-               "1: " stx " %2,0(%%sr2,%1)\n"               \
-               "9:\n"                                      \
-               ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b)     \
-               : "=r"(__pu_err)                            \
+#define __put_user_asm(sr, stx, x, ptr)                                \
+       __asm__ __volatile__ (                                  \
+               "1: " stx " %2,0(" sr "%1)\n"                   \
+               "9:\n"                                          \
+               ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b)         \
+               : "=r"(__pu_err)                                \
                : "r"(ptr), "r"(x), "0"(__pu_err))
 
 
 #if !defined(CONFIG_64BIT)
 
-#define __put_user_asm64(__val, ptr) do {                  \
-       __asm__ __volatile__ (                              \
-               "1: stw %2,0(%%sr2,%1)\n"                   \
-               "2: stw %R2,4(%%sr2,%1)\n"                  \
-               "9:\n"                                      \
-               ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b)     \
-               ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b)     \
-               : "=r"(__pu_err)                            \
-               : "r"(ptr), "r"(__val), "0"(__pu_err));     \
+#define __put_user_asm64(sr, __val, ptr) do {                  \
+       __asm__ __volatile__ (                                  \
+               "1: stw %2,0(" sr "%1)\n"                       \
+               "2: stw %R2,4(" sr "%1)\n"                      \
+               "9:\n"                                          \
+               ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b)         \
+               ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b)         \
+               : "=r"(__pu_err)                                \
+               : "r"(ptr), "r"(__val), "0"(__pu_err));         \
 } while (0)
 
 #endif /* !defined(CONFIG_64BIT) */
@@ -200,14 +201,12 @@ struct exception_table_entry {
  */
 
 extern long strncpy_from_user(char *, const char __user *, long);
-extern unsigned lclear_user(void __user *, unsigned long);
-extern long lstrnlen_user(const char __user *, long);
+extern __must_check unsigned lclear_user(void __user *, unsigned long);
+extern __must_check long strnlen_user(const char __user *src, long n);
 /*
  * Complex access routines -- macros
  */
-#define user_addr_max() (~0UL)
 
-#define strnlen_user lstrnlen_user
 #define clear_user lclear_user
 #define __clear_user lclear_user
 
index 33113ba..22924a3 100644 (file)
@@ -230,7 +230,6 @@ int main(void)
        DEFINE(TI_TASK, offsetof(struct thread_info, task));
        DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
        DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
-       DEFINE(TI_SEGMENT, offsetof(struct thread_info, addr_limit));
        DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
        DEFINE(THREAD_SZ, sizeof(struct thread_info));
        /* THREAD_SZ_ALGN includes space for a stack frame. */
index e8a6a75..00297e8 100644 (file)
@@ -32,7 +32,6 @@ EXPORT_SYMBOL(__xchg64);
 
 #include <linux/uaccess.h>
 EXPORT_SYMBOL(lclear_user);
-EXPORT_SYMBOL(lstrnlen_user);
 
 #ifndef CONFIG_64BIT
 /* Needed so insmod can set dp value */
index 3fb86ee..cceb098 100644 (file)
@@ -150,8 +150,6 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_PA11
        dma_ops_init();
 #endif
-
-       clear_sched_clock_stable();
 }
 
 /*
index db1a47c..bbfe23c 100644 (file)
@@ -237,18 +237,22 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs,
 #endif
        
        usp = (regs->gr[30] & ~(0x01UL));
+       sigframe_size = PARISC_RT_SIGFRAME_SIZE;
 #ifdef CONFIG_64BIT
        if (is_compat_task()) {
                /* The gcc alloca implementation leaves garbage in the upper 32 bits of sp */
                usp = (compat_uint_t)usp;
+               sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
        }
 #endif
-       /*FIXME: frame_size parameter is unused, remove it. */
-       frame = get_sigframe(&ksig->ka, usp, sizeof(*frame));
+       frame = get_sigframe(&ksig->ka, usp, sigframe_size);
 
        DBG(1,"SETUP_RT_FRAME: START\n");
        DBG(1,"setup_rt_frame: frame %p info %p\n", frame, ksig->info);
 
+       start = (unsigned long) frame;
+       if (start >= user_addr_max() - sigframe_size)
+               return -EFAULT;
        
 #ifdef CONFIG_64BIT
 
@@ -284,32 +288,21 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs,
           already in userspace. The first words of tramp are used to
           save the previous sigrestartblock trampoline that might be
           on the stack. We start the sigreturn trampoline at 
-          SIGRESTARTBLOCK_TRAMP+X. */
+          SIGRESTARTBLOCK_TRAMP. */
        err |= __put_user(in_syscall ? INSN_LDI_R25_1 : INSN_LDI_R25_0,
                        &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]);
-       err |= __put_user(INSN_LDI_R20, 
-                       &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]);
        err |= __put_user(INSN_BLE_SR2_R0, 
+                       &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]);
+       err |= __put_user(INSN_LDI_R20,
                        &frame->tramp[SIGRESTARTBLOCK_TRAMP+2]);
-       err |= __put_user(INSN_NOP, &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]);
-
-#if DEBUG_SIG
-       /* Assert that we're flushing in the correct space... */
-       {
-               unsigned long sid;
-               asm ("mfsp %%sr3,%0" : "=r" (sid));
-               DBG(1,"setup_rt_frame: Flushing 64 bytes at space %#x offset %p\n",
-                      sid, frame->tramp);
-       }
-#endif
 
-       start = (unsigned long) &frame->tramp[0];
-       end = (unsigned long) &frame->tramp[TRAMP_SIZE];
+       start = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+0];
+       end = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+3];
        flush_user_dcache_range_asm(start, end);
        flush_user_icache_range_asm(start, end);
 
        /* TRAMP Words 0-4, Length 5 = SIGRESTARTBLOCK_TRAMP
-        * TRAMP Words 5-9, Length 4 = SIGRETURN_TRAMP
+        * TRAMP Words 5-7, Length 3 = SIGRETURN_TRAMP
         * So the SIGRETURN_TRAMP is at the end of SIGRESTARTBLOCK_TRAMP
         */
        rp = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP];
@@ -353,11 +346,6 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs,
 
        /* The syscall return path will create IAOQ values from r31.
         */
-       sigframe_size = PARISC_RT_SIGFRAME_SIZE;
-#ifdef CONFIG_64BIT
-       if (is_compat_task())
-               sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
-#endif
        if (in_syscall) {
                regs->gr[31] = haddr;
 #ifdef CONFIG_64BIT
@@ -501,7 +489,6 @@ syscall_restart(struct pt_regs *regs, struct k_sigaction *ka)
                DBG(1,"ERESTARTNOHAND: returning -EINTR\n");
                regs->gr[28] = -EINTR;
                break;
-
        case -ERESTARTSYS:
                if (!(ka->sa.sa_flags & SA_RESTART)) {
                        DBG(1,"ERESTARTSYS: putting -EINTR\n");
@@ -529,6 +516,10 @@ insert_restart_trampoline(struct pt_regs *regs)
                unsigned long end  = (unsigned long) &usp[5];
                long err = 0;
 
+               /* check that we don't exceed the stack */
+               if (A(&usp[0]) >= user_addr_max() - 5 * sizeof(int))
+                       return;
+
                /* Setup a trampoline to restart the syscall
                 * with __NR_restart_syscall
                 *
@@ -569,10 +560,6 @@ insert_restart_trampoline(struct pt_regs *regs)
 }
 
 /*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- *
  * We need to be able to restore the syscall arguments (r21-r26) to
  * restart syscalls.  Thus, the syscall path should save them in the
  * pt_regs structure (it's okay to do so since they are caller-save
index f166250..a5bdbb5 100644 (file)
@@ -36,7 +36,7 @@ struct compat_regfile {
         compat_int_t rf_sar;
 };
 
-#define COMPAT_SIGRETURN_TRAMP 4
+#define COMPAT_SIGRETURN_TRAMP 3
 #define COMPAT_SIGRESTARTBLOCK_TRAMP 5
 #define COMPAT_TRAMP_SIZE (COMPAT_SIGRETURN_TRAMP + \
                                COMPAT_SIGRESTARTBLOCK_TRAMP)
index 08e4d48..9fb1e79 100644 (file)
@@ -265,6 +265,9 @@ static int __init init_cr16_clocksource(void)
                            (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc))
                                continue;
 
+                       /* mark sched_clock unstable */
+                       clear_sched_clock_stable();
+
                        clocksource_cr16.name = "cr16_unstable";
                        clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE;
                        clocksource_cr16.rating = 0;
@@ -272,10 +275,6 @@ static int __init init_cr16_clocksource(void)
                }
        }
 
-       /* XXX: We may want to mark sched_clock stable here if cr16 clocks are
-        *      in sync:
-        *      (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */
-
        /* register at clocksource framework */
        clocksource_register_hz(&clocksource_cr16,
                100 * PAGE0->mem_10msec);
index 36d6a86..b428d29 100644 (file)
 #include <asm/errno.h>
 #include <linux/linkage.h>
 
-       /*
-        * get_sr gets the appropriate space value into
-        * sr1 for kernel/user space access, depending
-        * on the flag stored in the task structure.
-        */
-
-       .macro  get_sr
-       mfctl       %cr30,%r1
-       ldw         TI_SEGMENT(%r1),%r22
-       mfsp        %sr3,%r1
-       or,<>       %r22,%r0,%r0
-       copy        %r0,%r1
-       mtsp        %r1,%sr1
-       .endm
-
        /*
         * unsigned long lclear_user(void *to, unsigned long n)
         *
 
 ENTRY_CFI(lclear_user)
        comib,=,n   0,%r25,$lclu_done
-       get_sr
 $lclu_loop:
        addib,<>    -1,%r25,$lclu_loop
-1:      stbs,ma     %r0,1(%sr1,%r26)
+1:     stbs,ma     %r0,1(%sr3,%r26)
 
 $lclu_done:
        bv          %r0(%r2)
@@ -67,40 +51,6 @@ $lclu_done:
 ENDPROC_CFI(lclear_user)
 
 
-       /*
-        * long lstrnlen_user(char *s, long n)
-        *
-        * Returns 0 if exception before zero byte or reaching N,
-        *         N+1 if N would be exceeded,
-        *         else strlen + 1 (i.e. includes zero byte).
-        */
-
-ENTRY_CFI(lstrnlen_user)
-       comib,=     0,%r25,$lslen_nzero
-       copy        %r26,%r24
-       get_sr
-1:      ldbs,ma     1(%sr1,%r26),%r1
-$lslen_loop:
-       comib,=,n   0,%r1,$lslen_done
-       addib,<>    -1,%r25,$lslen_loop
-2:      ldbs,ma     1(%sr1,%r26),%r1
-$lslen_done:
-       bv          %r0(%r2)
-       sub         %r26,%r24,%r28
-
-$lslen_nzero:
-       b           $lslen_done
-       ldo         1(%r26),%r26 /* special case for N == 0 */
-
-3:      b          $lslen_done
-       copy        %r24,%r26    /* reset r26 so 0 is returned on fault */
-
-       ASM_EXCEPTIONTABLE_ENTRY(1b,3b)
-       ASM_EXCEPTIONTABLE_ENTRY(2b,3b)
-
-ENDPROC_CFI(lstrnlen_user)
-
-
 /*
  * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
  *
index c799556..c3f3fd5 100644 (file)
@@ -41,6 +41,7 @@ config RISCV
        select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
        select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
+       select BUILDTIME_TABLE_SORT if MMU
        select CLONE_BACKWARDS
        select CLINT_TIMER if !MMU
        select COMMON_CLK
index 01906a9..0eb4568 100644 (file)
@@ -132,8 +132,11 @@ $(BOOT_TARGETS): vmlinux
 Image.%: Image
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-zinstall install:
-       $(Q)$(MAKE) $(build)=$(boot) $@
+install: install-image = Image
+zinstall: install-image = Image.gz
+install zinstall:
+       $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh $(KERNELRELEASE) \
+       $(boot)/$(install-image) System.map "$(INSTALL_PATH)"
 
 archclean:
        $(Q)$(MAKE) $(clean)=$(boot)
index 6bf299f..becd062 100644 (file)
@@ -58,11 +58,3 @@ $(obj)/Image.lzo: $(obj)/Image FORCE
 
 $(obj)/loader.bin: $(obj)/loader FORCE
        $(call if_changed,objcopy)
-
-install:
-       $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
-       $(obj)/Image System.map "$(INSTALL_PATH)"
-
-zinstall:
-       $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
-       $(obj)/Image.gz System.map "$(INSTALL_PATH)"
index baea7d2..b254c60 100644 (file)
 
        aliases {
                ethernet0 = &emac1;
+               serial0 = &serial0;
+               serial1 = &serial1;
+               serial2 = &serial2;
+               serial3 = &serial3;
        };
 
        chosen {
-               stdout-path = &serial0;
+               stdout-path = "serial0:115200n8";
        };
 
        cpus {
index bc68231..4ebc803 100644 (file)
@@ -39,10 +39,12 @@ CONFIG_PCI=y
 CONFIG_PCIEPORTBUS=y
 CONFIG_PCI_HOST_GENERIC=y
 CONFIG_PCIE_XILINX=y
+CONFIG_PCIE_FU740=y
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_NVME=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_BLK_DEV_SR=y
 CONFIG_SCSI_VIRTIO=y
@@ -108,6 +110,8 @@ CONFIG_NFS_V4_1=y
 CONFIG_NFS_V4_2=y
 CONFIG_ROOT_NFS=y
 CONFIG_9P_FS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=m
 CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_CRYPTO_DEV_VIRTIO=y
 CONFIG_PRINTK_TIME=y
index f4b490c..f53c400 100644 (file)
@@ -42,6 +42,9 @@
  */
 #define ELF_ET_DYN_BASE                ((TASK_SIZE / 3) * 2)
 
+#ifdef CONFIG_64BIT
+#define STACK_RND_MASK         (0x3ffff >> (PAGE_SHIFT - 12))
+#endif
 /*
  * This yields a mask that user programs can use to figure out what
  * instruction set this CPU supports.  This could be done in user space,
index af77655..9c9f350 100644 (file)
@@ -121,7 +121,6 @@ SECTIONS
        }
 
        BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
-       EXCEPTION_TABLE(0x10)
 
        .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) {
                *(.rel.dyn*)
index 502d082..5104f3a 100644 (file)
@@ -4,6 +4,8 @@
  * Copyright (C) 2017 SiFive
  */
 
+#define RO_EXCEPTION_TABLE_ALIGN       16
+
 #ifdef CONFIG_XIP_KERNEL
 #include "vmlinux-xip.lds.S"
 #else
@@ -112,8 +114,6 @@ SECTIONS
                *(.srodata*)
        }
 
-       EXCEPTION_TABLE(0x10)
-
        . = ALIGN(SECTION_ALIGN);
        _data = .;
 
index 6cf4027..41aa1ba 100644 (file)
@@ -3,7 +3,7 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-timeout.o \
                        blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
diff --git a/block/bdev.c b/block/bdev.c
new file mode 100644 (file)
index 0000000..cf2780c
--- /dev/null
@@ -0,0 +1,1058 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright (C) 2016 - 2020 Christoph Hellwig
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/major.h>
+#include <linux/device_cgroup.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/blkpg.h>
+#include <linux/magic.h>
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/cleancache.h>
+#include <linux/part_stat.h>
+#include <linux/uaccess.h>
+#include "../fs/internal.h"
+#include "blk.h"
+
+struct bdev_inode {
+       struct block_device bdev;
+       struct inode vfs_inode;
+};
+
+static inline struct bdev_inode *BDEV_I(struct inode *inode)
+{
+       return container_of(inode, struct bdev_inode, vfs_inode);
+}
+
+struct block_device *I_BDEV(struct inode *inode)
+{
+       return &BDEV_I(inode)->bdev;
+}
+EXPORT_SYMBOL(I_BDEV);
+
+static void bdev_write_inode(struct block_device *bdev)
+{
+       struct inode *inode = bdev->bd_inode;
+       int ret;
+
+       spin_lock(&inode->i_lock);
+       while (inode->i_state & I_DIRTY) {
+               spin_unlock(&inode->i_lock);
+               ret = write_inode_now(inode, true);
+               if (ret) {
+                       char name[BDEVNAME_SIZE];
+                       pr_warn_ratelimited("VFS: Dirty inode writeback failed "
+                                           "for block device %s (err=%d).\n",
+                                           bdevname(bdev, name), ret);
+               }
+               spin_lock(&inode->i_lock);
+       }
+       spin_unlock(&inode->i_lock);
+}
+
+/* Kill _all_ buffers and pagecache, dirty or not. */
+static void kill_bdev(struct block_device *bdev)
+{
+       struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+       if (mapping_empty(mapping))
+               return;
+
+       invalidate_bh_lrus();
+       truncate_inode_pages(mapping, 0);
+}
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+       struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+       if (mapping->nrpages) {
+               invalidate_bh_lrus();
+               lru_add_drain_all();    /* make sure all lru add caches are flushed */
+               invalidate_mapping_pages(mapping, 0, -1);
+       }
+       /* 99% of the time, we don't need to flush the cleancache on the bdev.
+        * But, for the strange corners, let's be cautious.
+        */
+       cleancache_invalidate_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
+
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+                       loff_t lstart, loff_t lend)
+{
+       /*
+        * If we don't hold exclusive handle for the device, upgrade to it
+        * while we discard the buffer cache to avoid discarding buffers
+        * under live filesystem.
+        */
+       if (!(mode & FMODE_EXCL)) {
+               int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
+               if (err)
+                       goto invalidate;
+       }
+
+       truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+       if (!(mode & FMODE_EXCL))
+               bd_abort_claiming(bdev, truncate_bdev_range);
+       return 0;
+
+invalidate:
+       /*
+        * Someone else has the handle exclusively open. Try invalidating instead.
+        * The 'end' argument is inclusive so the rounding is safe.
+        */
+       return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+                                            lstart >> PAGE_SHIFT,
+                                            lend >> PAGE_SHIFT);
+}
+
+static void set_init_blocksize(struct block_device *bdev)
+{
+       unsigned int bsize = bdev_logical_block_size(bdev);
+       loff_t size = i_size_read(bdev->bd_inode);
+
+       while (bsize < PAGE_SIZE) {
+               if (size & bsize)
+                       break;
+               bsize <<= 1;
+       }
+       bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
+int set_blocksize(struct block_device *bdev, int size)
+{
+       /* Size must be a power of two, and between 512 and PAGE_SIZE */
+       if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
+               return -EINVAL;
+
+       /* Size cannot be smaller than the size supported by the device */
+       if (size < bdev_logical_block_size(bdev))
+               return -EINVAL;
+
+       /* Don't change the size if it is same as current */
+       if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
+               sync_blockdev(bdev);
+               bdev->bd_inode->i_blkbits = blksize_bits(size);
+               kill_bdev(bdev);
+       }
+       return 0;
+}
+
+EXPORT_SYMBOL(set_blocksize);
+
+int sb_set_blocksize(struct super_block *sb, int size)
+{
+       if (set_blocksize(sb->s_bdev, size))
+               return 0;
+       /* If we get here, we know size is power of two
+        * and it's value is between 512 and PAGE_SIZE */
+       sb->s_blocksize = size;
+       sb->s_blocksize_bits = blksize_bits(size);
+       return sb->s_blocksize;
+}
+
+EXPORT_SYMBOL(sb_set_blocksize);
+
+int sb_min_blocksize(struct super_block *sb, int size)
+{
+       int minsize = bdev_logical_block_size(sb->s_bdev);
+       if (size < minsize)
+               size = minsize;
+       return sb_set_blocksize(sb, size);
+}
+
+EXPORT_SYMBOL(sb_min_blocksize);
+
+int __sync_blockdev(struct block_device *bdev, int wait)
+{
+       if (!bdev)
+               return 0;
+       if (!wait)
+               return filemap_flush(bdev->bd_inode->i_mapping);
+       return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
+
+/*
+ * Write out and wait upon all the dirty data associated with a block
+ * device via its mapping.  Does not take the superblock lock.
+ */
+int sync_blockdev(struct block_device *bdev)
+{
+       return __sync_blockdev(bdev, 1);
+}
+EXPORT_SYMBOL(sync_blockdev);
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * device.   Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_bdev(struct block_device *bdev)
+{
+       struct super_block *sb = get_super(bdev);
+       if (sb) {
+               int res = sync_filesystem(sb);
+               drop_super(sb);
+               return res;
+       }
+       return sync_blockdev(bdev);
+}
+EXPORT_SYMBOL(fsync_bdev);
+
+/**
+ * freeze_bdev  --  lock a filesystem and force it into a consistent state
+ * @bdev:      blockdevice to lock
+ *
+ * If a superblock is found on this device, we take the s_umount semaphore
+ * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can actually unfreeze the frozen filesystem when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * counts down in thaw_bdev(). When it becomes 0, thaw_bdev() will actually
+ * unfreeze the filesystem.
+ */
+int freeze_bdev(struct block_device *bdev)
+{
+       struct super_block *sb;
+       int error = 0;
+
+       mutex_lock(&bdev->bd_fsfreeze_mutex);
+       if (++bdev->bd_fsfreeze_count > 1)
+               goto done;
+
+       sb = get_active_super(bdev);
+       if (!sb)
+               goto sync;
+       if (sb->s_op->freeze_super)
+               error = sb->s_op->freeze_super(sb);
+       else
+               error = freeze_super(sb);
+       deactivate_super(sb);
+
+       if (error) {
+               bdev->bd_fsfreeze_count--;
+               goto done;
+       }
+       bdev->bd_fsfreeze_sb = sb;
+
+sync:
+       sync_blockdev(bdev);
+done:
+       mutex_unlock(&bdev->bd_fsfreeze_mutex);
+       return error;
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+/**
+ * thaw_bdev  -- unlock filesystem
+ * @bdev:      blockdevice to unlock
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ */
+int thaw_bdev(struct block_device *bdev)
+{
+       struct super_block *sb;
+       int error = -EINVAL;
+
+       mutex_lock(&bdev->bd_fsfreeze_mutex);
+       if (!bdev->bd_fsfreeze_count)
+               goto out;
+
+       error = 0;
+       if (--bdev->bd_fsfreeze_count > 0)
+               goto out;
+
+       sb = bdev->bd_fsfreeze_sb;
+       if (!sb)
+               goto out;
+
+       if (sb->s_op->thaw_super)
+               error = sb->s_op->thaw_super(sb);
+       else
+               error = thaw_super(sb);
+       if (error)
+               bdev->bd_fsfreeze_count++;
+       else
+               bdev->bd_fsfreeze_sb = NULL;
+out:
+       mutex_unlock(&bdev->bd_fsfreeze_mutex);
+       return error;
+}
+EXPORT_SYMBOL(thaw_bdev);
+
+/**
+ * bdev_read_page() - Start reading a page from a block device
+ * @bdev: The device to read the page from
+ * @sector: The offset on the device to read the page to (need not be aligned)
+ * @page: The page to read
+ *
+ * On entry, the page should be locked.  It will be unlocked when the page
+ * has been read.  If the block driver implements rw_page synchronously,
+ * that will be true on exit from this function, but it need not be.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to read this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_read_page(struct block_device *bdev, sector_t sector,
+                       struct page *page)
+{
+       const struct block_device_operations *ops = bdev->bd_disk->fops;
+       int result = -EOPNOTSUPP;
+
+       if (!ops->rw_page || bdev_get_integrity(bdev))
+               return result;
+
+       result = blk_queue_enter(bdev->bd_disk->queue, 0);
+       if (result)
+               return result;
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_READ);
+       blk_queue_exit(bdev->bd_disk->queue);
+       return result;
+}
+
+/**
+ * bdev_write_page() - Start writing a page to a block device
+ * @bdev: The device to write the page to
+ * @sector: The offset on the device to write the page to (need not be aligned)
+ * @page: The page to write
+ * @wbc: The writeback_control for the write
+ *
+ * On entry, the page should be locked and not currently under writeback.
+ * On exit, if the write started successfully, the page will be unlocked and
+ * under writeback.  If the write failed already (eg the driver failed to
+ * queue the page to the device), the page will still be locked.  If the
+ * caller is a ->writepage implementation, it will need to unlock the page.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to write this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_write_page(struct block_device *bdev, sector_t sector,
+                       struct page *page, struct writeback_control *wbc)
+{
+       int result;
+       const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+       if (!ops->rw_page || bdev_get_integrity(bdev))
+               return -EOPNOTSUPP;
+       result = blk_queue_enter(bdev->bd_disk->queue, 0);
+       if (result)
+               return result;
+
+       set_page_writeback(page);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_WRITE);
+       if (result) {
+               end_page_writeback(page);
+       } else {
+               clean_page_buffers(page);
+               unlock_page(page);
+       }
+       blk_queue_exit(bdev->bd_disk->queue);
+       return result;
+}
+
+/*
+ * pseudo-fs
+ */
+
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
+static struct kmem_cache * bdev_cachep __read_mostly;
+
+static struct inode *bdev_alloc_inode(struct super_block *sb)
+{
+       struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
+
+       if (!ei)
+               return NULL;
+       memset(&ei->bdev, 0, sizeof(ei->bdev));
+       return &ei->vfs_inode;
+}
+
+static void bdev_free_inode(struct inode *inode)
+{
+       struct block_device *bdev = I_BDEV(inode);
+
+       free_percpu(bdev->bd_stats);
+       kfree(bdev->bd_meta_info);
+
+       if (!bdev_is_partition(bdev)) {
+               if (bdev->bd_disk && bdev->bd_disk->bdi)
+                       bdi_put(bdev->bd_disk->bdi);
+               kfree(bdev->bd_disk);
+       }
+
+       if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
+               blk_free_ext_minor(MINOR(bdev->bd_dev));
+
+       kmem_cache_free(bdev_cachep, BDEV_I(inode));
+}
+
+static void init_once(void *data)
+{
+       struct bdev_inode *ei = data;
+
+       inode_init_once(&ei->vfs_inode);
+}
+
+static void bdev_evict_inode(struct inode *inode)
+{
+       truncate_inode_pages_final(&inode->i_data);
+       invalidate_inode_buffers(inode); /* is it needed here? */
+       clear_inode(inode);
+}
+
+static const struct super_operations bdev_sops = {
+       .statfs = simple_statfs,
+       .alloc_inode = bdev_alloc_inode,
+       .free_inode = bdev_free_inode,
+       .drop_inode = generic_delete_inode,
+       .evict_inode = bdev_evict_inode,
+};
+
+static int bd_init_fs_context(struct fs_context *fc)
+{
+       struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+       if (!ctx)
+               return -ENOMEM;
+       fc->s_iflags |= SB_I_CGROUPWB;
+       ctx->ops = &bdev_sops;
+       return 0;
+}
+
+static struct file_system_type bd_type = {
+       .name           = "bdev",
+       .init_fs_context = bd_init_fs_context,
+       .kill_sb        = kill_anon_super,
+};
+
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
+
+void __init bdev_cache_init(void)
+{
+       int err;
+       static struct vfsmount *bd_mnt;
+
+       bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
+                       0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+                               SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
+                       init_once);
+       err = register_filesystem(&bd_type);
+       if (err)
+               panic("Cannot register bdev pseudo-fs");
+       bd_mnt = kern_mount(&bd_type);
+       if (IS_ERR(bd_mnt))
+               panic("Cannot create bdev pseudo-fs");
+       blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
+}
+
+struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
+{
+       struct block_device *bdev;
+       struct inode *inode;
+
+       inode = new_inode(blockdev_superblock);
+       if (!inode)
+               return NULL;
+       inode->i_mode = S_IFBLK;
+       inode->i_rdev = 0;
+       inode->i_data.a_ops = &def_blk_aops;
+       mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+
+       bdev = I_BDEV(inode);
+       mutex_init(&bdev->bd_fsfreeze_mutex);
+       spin_lock_init(&bdev->bd_size_lock);
+       bdev->bd_disk = disk;
+       bdev->bd_partno = partno;
+       bdev->bd_inode = inode;
+       bdev->bd_stats = alloc_percpu(struct disk_stats);
+       if (!bdev->bd_stats) {
+               iput(inode);
+               return NULL;
+       }
+       return bdev;
+}
+
+void bdev_add(struct block_device *bdev, dev_t dev)
+{
+       bdev->bd_dev = dev;
+       bdev->bd_inode->i_rdev = dev;
+       bdev->bd_inode->i_ino = dev;
+       insert_inode_hash(bdev->bd_inode);
+}
+
+long nr_blockdev_pages(void)
+{
+       struct inode *inode;
+       long ret = 0;
+
+       spin_lock(&blockdev_superblock->s_inode_list_lock);
+       list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+               ret += inode->i_mapping->nrpages;
+       spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
+       return ret;
+}
+
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whether @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+                        void *holder)
+{
+       if (bdev->bd_holder == holder)
+               return true;     /* already a holder */
+       else if (bdev->bd_holder != NULL)
+               return false;    /* held by someone else */
+       else if (whole == bdev)
+               return true;     /* is a whole device which isn't held */
+
+       else if (whole->bd_holder == bd_may_claim)
+               return true;     /* is a partition of a device that is being partitioned */
+       else if (whole->bd_holder != NULL)
+               return false;    /* is a partition of a held device */
+       else
+               return true;     /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - claim a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * Claim @bdev.  This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. On successful return,
+ * the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+int bd_prepare_to_claim(struct block_device *bdev, void *holder)
+{
+       struct block_device *whole = bdev_whole(bdev);
+
+       if (WARN_ON_ONCE(!holder))
+               return -EINVAL;
+retry:
+       spin_lock(&bdev_lock);
+       /* if someone else claimed, fail */
+       if (!bd_may_claim(bdev, whole, holder)) {
+               spin_unlock(&bdev_lock);
+               return -EBUSY;
+       }
+
+       /* if claiming is already in progress, wait for it to finish */
+       if (whole->bd_claiming) {
+               wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+               DEFINE_WAIT(wait);
+
+               prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+               spin_unlock(&bdev_lock);
+               schedule();
+               finish_wait(wq, &wait);
+               goto retry;
+       }
+
+       /* yay, all mine */
+       whole->bd_claiming = holder;
+       spin_unlock(&bdev_lock);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
+
+static void bd_clear_claiming(struct block_device *whole, void *holder)
+{
+       lockdep_assert_held(&bdev_lock);
+       /* tell others that we're done */
+       BUG_ON(whole->bd_claiming != holder);
+       whole->bd_claiming = NULL;
+       wake_up_bit(&whole->bd_claiming, 0);
+}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @holder: holder that has claimed @bdev
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+static void bd_finish_claiming(struct block_device *bdev, void *holder)
+{
+       struct block_device *whole = bdev_whole(bdev);
+
+       spin_lock(&bdev_lock);
+       BUG_ON(!bd_may_claim(bdev, whole, holder));
+       /*
+        * Note that for a whole device bd_holders will be incremented twice,
+        * and bd_holder will be set to bd_may_claim before being set to holder
+        */
+       whole->bd_holders++;
+       whole->bd_holder = bd_may_claim;
+       bdev->bd_holders++;
+       bdev->bd_holder = holder;
+       bd_clear_claiming(whole, holder);
+       spin_unlock(&bdev_lock);
+}
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can be
+ * also used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, void *holder)
+{
+       spin_lock(&bdev_lock);
+       bd_clear_claiming(bdev_whole(bdev), holder);
+       spin_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
+
+static void blkdev_flush_mapping(struct block_device *bdev)
+{
+       WARN_ON_ONCE(bdev->bd_holders);
+       sync_blockdev(bdev);
+       kill_bdev(bdev);
+       bdev_write_inode(bdev);
+}
+
+static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
+{
+       struct gendisk *disk = bdev->bd_disk;
+       int ret = 0;
+
+       if (disk->fops->open) {
+               ret = disk->fops->open(bdev, mode);
+               if (ret) {
+                       /* avoid ghost partitions on a removed medium */
+                       if (ret == -ENOMEDIUM &&
+                            test_bit(GD_NEED_PART_SCAN, &disk->state))
+                               bdev_disk_changed(disk, true);
+                       return ret;
+               }
+       }
+
+       if (!bdev->bd_openers)
+               set_init_blocksize(bdev);
+       if (test_bit(GD_NEED_PART_SCAN, &disk->state))
+               bdev_disk_changed(disk, false);
+       bdev->bd_openers++;
+       return 0;
+}
+
+static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
+{
+       if (!--bdev->bd_openers)
+               blkdev_flush_mapping(bdev);
+       if (bdev->bd_disk->fops->release)
+               bdev->bd_disk->fops->release(bdev->bd_disk, mode);
+}
+
+static int blkdev_get_part(struct block_device *part, fmode_t mode)
+{
+       struct gendisk *disk = part->bd_disk;
+       int ret;
+
+       if (part->bd_openers)
+               goto done;
+
+       ret = blkdev_get_whole(bdev_whole(part), mode);
+       if (ret)
+               return ret;
+
+       ret = -ENXIO;
+       if (!bdev_nr_sectors(part))
+               goto out_blkdev_put;
+
+       disk->open_partitions++;
+       set_init_blocksize(part);
+done:
+       part->bd_openers++;
+       return 0;
+
+out_blkdev_put:
+       blkdev_put_whole(bdev_whole(part), mode);
+       return ret;
+}
+
+static void blkdev_put_part(struct block_device *part, fmode_t mode)
+{
+       struct block_device *whole = bdev_whole(part);
+
+       if (--part->bd_openers)
+               return;
+       blkdev_flush_mapping(part);
+       whole->bd_disk->open_partitions--;
+       blkdev_put_whole(whole, mode);
+}
+
+struct block_device *blkdev_get_no_open(dev_t dev)
+{
+       struct block_device *bdev;
+       struct inode *inode;
+
+       inode = ilookup(blockdev_superblock, dev);
+       if (!inode) {
+               blk_request_module(dev);
+               inode = ilookup(blockdev_superblock, dev);
+               if (!inode)
+                       return NULL;
+       }
+
+       /* switch from the inode reference to a device model reference: */
+       bdev = &BDEV_I(inode)->bdev;
+       if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
+               bdev = NULL;
+       iput(inode);
+
+       if (!bdev)
+               return NULL;
+       if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
+           !try_module_get(bdev->bd_disk->fops->owner)) {
+               put_device(&bdev->bd_device);
+               return NULL;
+       }
+
+       return bdev;
+}
+
+void blkdev_put_no_open(struct block_device *bdev)
+{
+       module_put(bdev->bd_disk->fops->owner);
+       put_device(&bdev->bd_device);
+}
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the block device described by device number @dev. If @mode includes
+ * %FMODE_EXCL, the block device is opened with exclusive access.  Specifying
+ * %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may nest for
+ * the same @holder.
+ *
+ * Use this interface ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a device
+ * number.  Everything else should use blkdev_get_by_path().
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+       bool unblock_events = true;
+       struct block_device *bdev;
+       struct gendisk *disk;
+       int ret;
+
+       ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
+                       MAJOR(dev), MINOR(dev),
+                       ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
+                       ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
+       if (ret)
+               return ERR_PTR(ret);
+
+       bdev = blkdev_get_no_open(dev);
+       if (!bdev)
+               return ERR_PTR(-ENXIO);
+       disk = bdev->bd_disk;
+
+       if (mode & FMODE_EXCL) {
+               ret = bd_prepare_to_claim(bdev, holder);
+               if (ret)
+                       goto put_blkdev;
+       }
+
+       disk_block_events(disk);
+
+       mutex_lock(&disk->open_mutex);
+       ret = -ENXIO;
+       if (!disk_live(disk))
+               goto abort_claiming;
+       if (bdev_is_partition(bdev))
+               ret = blkdev_get_part(bdev, mode);
+       else
+               ret = blkdev_get_whole(bdev, mode);
+       if (ret)
+               goto abort_claiming;
+       if (mode & FMODE_EXCL) {
+               bd_finish_claiming(bdev, holder);
+
+               /*
+                * Block event polling for write claims if requested.  Any write
+                * holder makes the write_holder state stick until all are
+                * released.  This is good enough and tracking individual
+                * writable references is too fragile given the way @mode is
+                * used in blkdev_get/put().
+                */
+               if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+                   (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+                       bdev->bd_write_holder = true;
+                       unblock_events = false;
+               }
+       }
+       mutex_unlock(&disk->open_mutex);
+
+       if (unblock_events)
+               disk_unblock_events(disk);
+       return bdev;
+
+abort_claiming:
+       if (mode & FMODE_EXCL)
+               bd_abort_claiming(bdev, holder);
+       mutex_unlock(&disk->open_mutex);
+       disk_unblock_events(disk);
+put_blkdev:
+       blkdev_put_no_open(bdev);
+       return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
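+
+A minimal sketch of an exclusive open by device number, as described in the
+comment above; my_open_by_devt() and the claim cookie are hypothetical names,
+and the headers providing the declarations (<linux/fs.h> or <linux/blkdev.h>
+depending on the tree) are an assumption:
+
+    #include <linux/blkdev.h>
+    #include <linux/err.h>
+    #include <linux/fs.h>
+
+    static int my_claim_cookie;
+
+    static int my_open_by_devt(dev_t devt)
+    {
+            const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+            struct block_device *bdev;
+
+            bdev = blkdev_get_by_dev(devt, mode, &my_claim_cookie);
+            if (IS_ERR(bdev))
+                    return PTR_ERR(bdev);
+
+            /* ... use bdev; blkdev_put() must be called with the same mode ... */
+            blkdev_put(bdev, mode);
+            return 0;
+    }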
+
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the block device described by the device file at @path.  If @mode
+ * includes %FMODE_EXCL, the block device is opened with exclusive access.
+ * Specifying %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may
+ * nest for the same @holder.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+                                       void *holder)
+{
+       struct block_device *bdev;
+       dev_t dev;
+       int error;
+
+       error = lookup_bdev(path, &dev);
+       if (error)
+               return ERR_PTR(error);
+
+       bdev = blkdev_get_by_dev(dev, mode, holder);
+       if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+               blkdev_put(bdev, mode);
+               return ERR_PTR(-EACCES);
+       }
+
+       return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
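+
+For callers that only have a path, the wrapper above adds the name lookup and
+the read-only check for writable opens.  A short sketch, with hypothetical
+names and the same header assumptions as the previous example:
+
+    static struct block_device *my_open_backing_dev(const char *path,
+                                                    void *holder)
+    {
+            /* -EACCES is returned if FMODE_WRITE is set on a read-only device */
+            return blkdev_get_by_path(path,
+                                      FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+                                      holder);
+    }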
+
+void blkdev_put(struct block_device *bdev, fmode_t mode)
+{
+       struct gendisk *disk = bdev->bd_disk;
+
+       /*
+        * Sync early if it looks like we're the last one.  If someone else
+        * opens the block device between now and the decrement of bd_openers
+        * then we did a sync that we didn't need to, but that's not the end
+        * of the world and we want to avoid long (could be several minutes)
+        * syncs while holding the mutex.
+        */
+       if (bdev->bd_openers == 1)
+               sync_blockdev(bdev);
+
+       mutex_lock(&disk->open_mutex);
+       if (mode & FMODE_EXCL) {
+               struct block_device *whole = bdev_whole(bdev);
+               bool bdev_free;
+
+               /*
+                * Release a claim on the device.  The holder fields
+                * are protected with bdev_lock.  open_mutex is to
+                * synchronize disk_holder unlinking.
+                */
+               spin_lock(&bdev_lock);
+
+               WARN_ON_ONCE(--bdev->bd_holders < 0);
+               WARN_ON_ONCE(--whole->bd_holders < 0);
+
+               if ((bdev_free = !bdev->bd_holders))
+                       bdev->bd_holder = NULL;
+               if (!whole->bd_holders)
+                       whole->bd_holder = NULL;
+
+               spin_unlock(&bdev_lock);
+
+               /*
+                * If this was the last claim, remove holder link and
+                * unblock event polling if it was a write holder.
+                */
+               if (bdev_free && bdev->bd_write_holder) {
+                       disk_unblock_events(disk);
+                       bdev->bd_write_holder = false;
+               }
+       }
+
+       /*
+        * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+        * event.  This is to ensure detection of media removal commanded
+        * from userland - e.g. eject(1).
+        */
+       disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
+
+       if (bdev_is_partition(bdev))
+               blkdev_put_part(bdev, mode);
+       else
+               blkdev_put_whole(bdev, mode);
+       mutex_unlock(&disk->open_mutex);
+
+       blkdev_put_no_open(bdev);
+}
+EXPORT_SYMBOL(blkdev_put);
+
+/**
+ * lookup_bdev  - lookup a struct block_device by name
+ * @pathname:  special file representing the block device
+ * @dev:       return value of the block device's dev_t
+ *
+ * Look up the block device's dev_t at @pathname in the current
+ * namespace if possible and return it in @dev.  Return 0 on success,
+ * negative errno otherwise.
+ */
+int lookup_bdev(const char *pathname, dev_t *dev)
+{
+       struct inode *inode;
+       struct path path;
+       int error;
+
+       if (!pathname || !*pathname)
+               return -EINVAL;
+
+       error = kern_path(pathname, LOOKUP_FOLLOW, &path);
+       if (error)
+               return error;
+
+       inode = d_backing_inode(path.dentry);
+       error = -ENOTBLK;
+       if (!S_ISBLK(inode->i_mode))
+               goto out_path_put;
+       error = -EACCES;
+       if (!may_open_dev(&path))
+               goto out_path_put;
+
+       *dev = inode->i_rdev;
+       error = 0;
+out_path_put:
+       path_put(&path);
+       return error;
+}
+EXPORT_SYMBOL(lookup_bdev);
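+
+lookup_bdev() only resolves the path to a dev_t; it neither opens nor pins the
+device.  A sketch of a typical comparison-style caller, with hypothetical
+names:
+
+    /* Return true if @path names the block device with device number @known. */
+    static bool my_path_is_device(const char *path, dev_t known)
+    {
+            dev_t devt;
+
+            if (lookup_bdev(path, &devt))
+                    return false;   /* not found or not a block special file */
+            return devt == known;
+    }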
+
+int __invalidate_device(struct block_device *bdev, bool kill_dirty)
+{
+       struct super_block *sb = get_super(bdev);
+       int res = 0;
+
+       if (sb) {
+               /*
+                * no need to lock the super, get_super holds the
+                * read mutex so the filesystem cannot go away
+                * under us (->put_super runs with the write lock
+                * held).
+                */
+               shrink_dcache_sb(sb);
+               res = invalidate_inodes(sb, kill_dirty);
+               drop_super(sb);
+       }
+       invalidate_bdev(bdev);
+       return res;
+}
+EXPORT_SYMBOL(__invalidate_device);
+
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+       struct inode *inode, *old_inode = NULL;
+
+       spin_lock(&blockdev_superblock->s_inode_list_lock);
+       list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+               struct address_space *mapping = inode->i_mapping;
+               struct block_device *bdev;
+
+               spin_lock(&inode->i_lock);
+               if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+                   mapping->nrpages == 0) {
+                       spin_unlock(&inode->i_lock);
+                       continue;
+               }
+               __iget(inode);
+               spin_unlock(&inode->i_lock);
+               spin_unlock(&blockdev_superblock->s_inode_list_lock);
+               /*
+                * We hold a reference to 'inode' so it couldn't have been
+                * removed from s_inodes list while we dropped the
+                * s_inode_list_lock.  We cannot iput the inode now as we can
+                * be holding the last reference and we cannot iput it under
+                * s_inode_list_lock. So we keep the reference and iput it
+                * later.
+                */
+               iput(old_inode);
+               old_inode = inode;
+               bdev = I_BDEV(inode);
+
+               mutex_lock(&bdev->bd_disk->open_mutex);
+               if (bdev->bd_openers)
+                       func(bdev, arg);
+               mutex_unlock(&bdev->bd_disk->open_mutex);
+
+               spin_lock(&blockdev_superblock->s_inode_list_lock);
+       }
+       spin_unlock(&blockdev_superblock->s_inode_list_lock);
+       iput(old_inode);
+}
index 65d3a63..108a352 100644 (file)
@@ -2135,6 +2135,18 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
        }
 }
 
+/*
+ * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * queues. This is important for md arrays to benefit from merging
+ * requests.
+ */
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
+{
+       if (plug->multiple_queues)
+               return BLK_MAX_REQUEST_COUNT * 4;
+       return BLK_MAX_REQUEST_COUNT;
+}
+
 /**
  * blk_mq_submit_bio - Create and send a request to block device.
  * @bio: Bio pointer.
@@ -2231,7 +2243,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                else
                        last = list_entry_rq(plug->mq_list.prev);
 
-               if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+               if (request_count >= blk_plug_max_rq_count(plug) || (last &&
                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                        blk_flush_plug_list(plug, false);
                        trace_block_plug(q);
index 55c4901..7c4e799 100644 (file)
@@ -2458,6 +2458,7 @@ int blk_throtl_init(struct request_queue *q)
 void blk_throtl_exit(struct request_queue *q)
 {
        BUG_ON(!q->td);
+       del_timer_sync(&q->td->service_queue.pending_timer);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
        free_percpu(q->td->latency_buckets[READ]);
index 8c96b0c..7d2a0ba 100644 (file)
@@ -373,4 +373,6 @@ static inline void bio_clear_hipri(struct bio *bio)
        bio->bi_opf &= ~REQ_HIPRI;
 }
 
+extern const struct address_space_operations def_blk_aops;
+
 #endif /* BLK_INTERNAL_H */
diff --git a/block/fops.c b/block/fops.c
new file mode 100644 (file)
index 0000000..ffce6f6
--- /dev/null
@@ -0,0 +1,640 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992  Linus Torvalds
+ * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2016 - 2020 Christoph Hellwig
+ */
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/falloc.h>
+#include <linux/suspend.h>
+#include "blk.h"
+
+static struct inode *bdev_file_inode(struct file *file)
+{
+       return file->f_mapping->host;
+}
+
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+               struct buffer_head *bh, int create)
+{
+       bh->b_bdev = I_BDEV(inode);
+       bh->b_blocknr = iblock;
+       set_buffer_mapped(bh);
+       return 0;
+}
+
+static unsigned int dio_bio_write_op(struct kiocb *iocb)
+{
+       unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+       /* avoid the need for an I/O completion work item */
+       if (iocb->ki_flags & IOCB_DSYNC)
+               op |= REQ_FUA;
+       return op;
+}
+
+#define DIO_INLINE_BIO_VECS 4
+
+static void blkdev_bio_end_io_simple(struct bio *bio)
+{
+       struct task_struct *waiter = bio->bi_private;
+
+       WRITE_ONCE(bio->bi_private, NULL);
+       blk_wake_io_task(waiter);
+}
+
+static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
+               struct iov_iter *iter, unsigned int nr_pages)
+{
+       struct file *file = iocb->ki_filp;
+       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+       struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
+       loff_t pos = iocb->ki_pos;
+       bool should_dirty = false;
+       struct bio bio;
+       ssize_t ret;
+       blk_qc_t qc;
+
+       if ((pos | iov_iter_alignment(iter)) &
+           (bdev_logical_block_size(bdev) - 1))
+               return -EINVAL;
+
+       if (nr_pages <= DIO_INLINE_BIO_VECS)
+               vecs = inline_vecs;
+       else {
+               vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+                                    GFP_KERNEL);
+               if (!vecs)
+                       return -ENOMEM;
+       }
+
+       bio_init(&bio, vecs, nr_pages);
+       bio_set_dev(&bio, bdev);
+       bio.bi_iter.bi_sector = pos >> 9;
+       bio.bi_write_hint = iocb->ki_hint;
+       bio.bi_private = current;
+       bio.bi_end_io = blkdev_bio_end_io_simple;
+       bio.bi_ioprio = iocb->ki_ioprio;
+
+       ret = bio_iov_iter_get_pages(&bio, iter);
+       if (unlikely(ret))
+               goto out;
+       ret = bio.bi_iter.bi_size;
+
+       if (iov_iter_rw(iter) == READ) {
+               bio.bi_opf = REQ_OP_READ;
+               if (iter_is_iovec(iter))
+                       should_dirty = true;
+       } else {
+               bio.bi_opf = dio_bio_write_op(iocb);
+               task_io_account_write(ret);
+       }
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               bio.bi_opf |= REQ_NOWAIT;
+       if (iocb->ki_flags & IOCB_HIPRI)
+               bio_set_polled(&bio, iocb);
+
+       qc = submit_bio(&bio);
+       for (;;) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (!READ_ONCE(bio.bi_private))
+                       break;
+               if (!(iocb->ki_flags & IOCB_HIPRI) ||
+                   !blk_poll(bdev_get_queue(bdev), qc, true))
+                       blk_io_schedule();
+       }
+       __set_current_state(TASK_RUNNING);
+
+       bio_release_pages(&bio, should_dirty);
+       if (unlikely(bio.bi_status))
+               ret = blk_status_to_errno(bio.bi_status);
+
+out:
+       if (vecs != inline_vecs)
+               kfree(vecs);
+
+       bio_uninit(&bio);
+
+       return ret;
+}
+
+struct blkdev_dio {
+       union {
+               struct kiocb            *iocb;
+               struct task_struct      *waiter;
+       };
+       size_t                  size;
+       atomic_t                ref;
+       bool                    multi_bio : 1;
+       bool                    should_dirty : 1;
+       bool                    is_sync : 1;
+       struct bio              bio;
+};
+
+static struct bio_set blkdev_dio_pool;
+
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+       struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
+static void blkdev_bio_end_io(struct bio *bio)
+{
+       struct blkdev_dio *dio = bio->bi_private;
+       bool should_dirty = dio->should_dirty;
+
+       if (bio->bi_status && !dio->bio.bi_status)
+               dio->bio.bi_status = bio->bi_status;
+
+       if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
+               if (!dio->is_sync) {
+                       struct kiocb *iocb = dio->iocb;
+                       ssize_t ret;
+
+                       if (likely(!dio->bio.bi_status)) {
+                               ret = dio->size;
+                               iocb->ki_pos += ret;
+                       } else {
+                               ret = blk_status_to_errno(dio->bio.bi_status);
+                       }
+
+                       dio->iocb->ki_complete(iocb, ret, 0);
+                       if (dio->multi_bio)
+                               bio_put(&dio->bio);
+               } else {
+                       struct task_struct *waiter = dio->waiter;
+
+                       WRITE_ONCE(dio->waiter, NULL);
+                       blk_wake_io_task(waiter);
+               }
+       }
+
+       if (should_dirty) {
+               bio_check_pages_dirty(bio);
+       } else {
+               bio_release_pages(bio, false);
+               bio_put(bio);
+       }
+}
+
+static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+               unsigned int nr_pages)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = bdev_file_inode(file);
+       struct block_device *bdev = I_BDEV(inode);
+       struct blk_plug plug;
+       struct blkdev_dio *dio;
+       struct bio *bio;
+       bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
+       bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+       loff_t pos = iocb->ki_pos;
+       blk_qc_t qc = BLK_QC_T_NONE;
+       int ret = 0;
+
+       if ((pos | iov_iter_alignment(iter)) &
+           (bdev_logical_block_size(bdev) - 1))
+               return -EINVAL;
+
+       bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+
+       dio = container_of(bio, struct blkdev_dio, bio);
+       dio->is_sync = is_sync = is_sync_kiocb(iocb);
+       if (dio->is_sync) {
+               dio->waiter = current;
+               bio_get(bio);
+       } else {
+               dio->iocb = iocb;
+       }
+
+       dio->size = 0;
+       dio->multi_bio = false;
+       dio->should_dirty = is_read && iter_is_iovec(iter);
+
+       /*
+        * Don't plug for HIPRI/polled IO, as those should go straight
+        * to issue
+        */
+       if (!is_poll)
+               blk_start_plug(&plug);
+
+       for (;;) {
+               bio_set_dev(bio, bdev);
+               bio->bi_iter.bi_sector = pos >> 9;
+               bio->bi_write_hint = iocb->ki_hint;
+               bio->bi_private = dio;
+               bio->bi_end_io = blkdev_bio_end_io;
+               bio->bi_ioprio = iocb->ki_ioprio;
+
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (unlikely(ret)) {
+                       bio->bi_status = BLK_STS_IOERR;
+                       bio_endio(bio);
+                       break;
+               }
+
+               if (is_read) {
+                       bio->bi_opf = REQ_OP_READ;
+                       if (dio->should_dirty)
+                               bio_set_pages_dirty(bio);
+               } else {
+                       bio->bi_opf = dio_bio_write_op(iocb);
+                       task_io_account_write(bio->bi_iter.bi_size);
+               }
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       bio->bi_opf |= REQ_NOWAIT;
+
+               dio->size += bio->bi_iter.bi_size;
+               pos += bio->bi_iter.bi_size;
+
+               nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
+               if (!nr_pages) {
+                       bool polled = false;
+
+                       if (iocb->ki_flags & IOCB_HIPRI) {
+                               bio_set_polled(bio, iocb);
+                               polled = true;
+                       }
+
+                       qc = submit_bio(bio);
+
+                       if (polled)
+                               WRITE_ONCE(iocb->ki_cookie, qc);
+                       break;
+               }
+
+               if (!dio->multi_bio) {
+                       /*
+                        * AIO needs an extra reference to ensure the dio
+                        * structure which is embedded into the first bio
+                        * stays around.
+                        */
+                       if (!is_sync)
+                               bio_get(bio);
+                       dio->multi_bio = true;
+                       atomic_set(&dio->ref, 2);
+               } else {
+                       atomic_inc(&dio->ref);
+               }
+
+               submit_bio(bio);
+               bio = bio_alloc(GFP_KERNEL, nr_pages);
+       }
+
+       if (!is_poll)
+               blk_finish_plug(&plug);
+
+       if (!is_sync)
+               return -EIOCBQUEUED;
+
+       for (;;) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (!READ_ONCE(dio->waiter))
+                       break;
+
+               if (!(iocb->ki_flags & IOCB_HIPRI) ||
+                   !blk_poll(bdev_get_queue(bdev), qc, true))
+                       blk_io_schedule();
+       }
+       __set_current_state(TASK_RUNNING);
+
+       if (!ret)
+               ret = blk_status_to_errno(dio->bio.bi_status);
+       if (likely(!ret))
+               ret = dio->size;
+
+       bio_put(&dio->bio);
+       return ret;
+}
+
+static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+       unsigned int nr_pages;
+
+       if (!iov_iter_count(iter))
+               return 0;
+
+       nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
+       if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
+               return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+
+       return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+}
+
+static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+{
+       return block_write_full_page(page, blkdev_get_block, wbc);
+}
+
+static int blkdev_readpage(struct file *file, struct page *page)
+{
+       return block_read_full_page(page, blkdev_get_block);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+       mpage_readahead(rac, blkdev_get_block);
+}
+
+static int blkdev_write_begin(struct file *file, struct address_space *mapping,
+               loff_t pos, unsigned len, unsigned flags, struct page **pagep,
+               void **fsdata)
+{
+       return block_write_begin(mapping, pos, len, flags, pagep,
+                                blkdev_get_block);
+}
+
+static int blkdev_write_end(struct file *file, struct address_space *mapping,
+               loff_t pos, unsigned len, unsigned copied, struct page *page,
+               void *fsdata)
+{
+       int ret;
+       ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+       unlock_page(page);
+       put_page(page);
+
+       return ret;
+}
+
+static int blkdev_writepages(struct address_space *mapping,
+                            struct writeback_control *wbc)
+{
+       return generic_writepages(mapping, wbc);
+}
+
+const struct address_space_operations def_blk_aops = {
+       .set_page_dirty = __set_page_dirty_buffers,
+       .readpage       = blkdev_readpage,
+       .readahead      = blkdev_readahead,
+       .writepage      = blkdev_writepage,
+       .write_begin    = blkdev_write_begin,
+       .write_end      = blkdev_write_end,
+       .writepages     = blkdev_writepages,
+       .direct_IO      = blkdev_direct_IO,
+       .migratepage    = buffer_migrate_page_norefs,
+       .is_dirty_writeback = buffer_check_dirty_writeback,
+};
+
+/*
+ * for a block special file file_inode(file)->i_size is zero
+ * so we compute the size by hand (just as in block_read/write above)
+ */
+static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
+{
+       struct inode *bd_inode = bdev_file_inode(file);
+       loff_t retval;
+
+       inode_lock(bd_inode);
+       retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
+       inode_unlock(bd_inode);
+       return retval;
+}
+
+static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
+               int datasync)
+{
+       struct inode *bd_inode = bdev_file_inode(filp);
+       struct block_device *bdev = I_BDEV(bd_inode);
+       int error;
+
+       error = file_write_and_wait_range(filp, start, end);
+       if (error)
+               return error;
+
+       /*
+        * There is no need to serialise calls to blkdev_issue_flush with
+        * i_mutex and doing so causes performance issues with concurrent
+        * O_SYNC writers to a block device.
+        */
+       error = blkdev_issue_flush(bdev);
+       if (error == -EOPNOTSUPP)
+               error = 0;
+
+       return error;
+}
+
+static int blkdev_open(struct inode *inode, struct file *filp)
+{
+       struct block_device *bdev;
+
+       /*
+        * Preserve backwards compatibility and allow large file access
+        * even if userspace doesn't ask for it explicitly. Some mkfs
+        * binary needs it. We might want to drop this workaround
+        * during an unstable branch.
+        */
+       filp->f_flags |= O_LARGEFILE;
+       filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+       if (filp->f_flags & O_NDELAY)
+               filp->f_mode |= FMODE_NDELAY;
+       if (filp->f_flags & O_EXCL)
+               filp->f_mode |= FMODE_EXCL;
+       if ((filp->f_flags & O_ACCMODE) == 3)
+               filp->f_mode |= FMODE_WRITE_IOCTL;
+
+       bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
+       if (IS_ERR(bdev))
+               return PTR_ERR(bdev);
+       filp->f_mapping = bdev->bd_inode->i_mapping;
+       filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+       return 0;
+}
+
+static int blkdev_close(struct inode *inode, struct file *filp)
+{
+       struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
+
+       blkdev_put(bdev, filp->f_mode);
+       return 0;
+}
+
+static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+       fmode_t mode = file->f_mode;
+
+       /*
+        * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+        * to update it before every ioctl.
+        */
+       if (file->f_flags & O_NDELAY)
+               mode |= FMODE_NDELAY;
+       else
+               mode &= ~FMODE_NDELAY;
+
+       return blkdev_ioctl(bdev, mode, cmd, arg);
+}
+
+/*
+ * Write data to the block device.  Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *bd_inode = bdev_file_inode(file);
+       loff_t size = i_size_read(bd_inode);
+       struct blk_plug plug;
+       size_t shorted = 0;
+       ssize_t ret;
+
+       if (bdev_read_only(I_BDEV(bd_inode)))
+               return -EPERM;
+
+       if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
+               return -ETXTBSY;
+
+       if (!iov_iter_count(from))
+               return 0;
+
+       if (iocb->ki_pos >= size)
+               return -ENOSPC;
+
+       if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
+               return -EOPNOTSUPP;
+
+       size -= iocb->ki_pos;
+       if (iov_iter_count(from) > size) {
+               shorted = iov_iter_count(from) - size;
+               iov_iter_truncate(from, size);
+       }
+
+       blk_start_plug(&plug);
+       ret = __generic_file_write_iter(iocb, from);
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
+       iov_iter_reexpand(from, iov_iter_count(from) + shorted);
+       blk_finish_plug(&plug);
+       return ret;
+}
+
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *bd_inode = bdev_file_inode(file);
+       loff_t size = i_size_read(bd_inode);
+       loff_t pos = iocb->ki_pos;
+       size_t shorted = 0;
+       ssize_t ret;
+
+       if (pos >= size)
+               return 0;
+
+       size -= pos;
+       if (iov_iter_count(to) > size) {
+               shorted = iov_iter_count(to) - size;
+               iov_iter_truncate(to, size);
+       }
+
+       ret = generic_file_read_iter(iocb, to);
+       iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+       return ret;
+}
+
+#define        BLKDEV_FALLOC_FL_SUPPORTED                                      \
+               (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
+                FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+
+static long blkdev_fallocate(struct file *file, int mode, loff_t start,
+                            loff_t len)
+{
+       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+       loff_t end = start + len - 1;
+       loff_t isize;
+       int error;
+
+       /* Fail if we don't recognize the flags. */
+       if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
+               return -EOPNOTSUPP;
+
+       /* Don't go off the end of the device. */
+       isize = i_size_read(bdev->bd_inode);
+       if (start >= isize)
+               return -EINVAL;
+       if (end >= isize) {
+               if (mode & FALLOC_FL_KEEP_SIZE) {
+                       len = isize - start;
+                       end = start + len - 1;
+               } else
+                       return -EINVAL;
+       }
+
+       /*
+        * Don't allow IO that isn't aligned to logical block size.
+        */
+       if ((start | len) & (bdev_logical_block_size(bdev) - 1))
+               return -EINVAL;
+
+       /* Invalidate the page cache, including dirty pages. */
+       error = truncate_bdev_range(bdev, file->f_mode, start, end);
+       if (error)
+               return error;
+
+       switch (mode) {
+       case FALLOC_FL_ZERO_RANGE:
+       case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+                                           GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
+               break;
+       case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+                                            GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
+               break;
+       case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+               error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+                                            GFP_KERNEL, 0);
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+       if (error)
+               return error;
+
+       /*
+        * Invalidate the page cache again; if someone wandered in and dirtied
+        * a page, we just discard it - userspace has no way of knowing whether
+        * the write happened before or after discard completing...
+        */
+       return truncate_bdev_range(bdev, file->f_mode, start, end);
+}
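+
+From userspace the switch above is reached through fallocate(2) on an open
+block device.  A hedged example follows; /dev/sdX is a placeholder, and the
+offset and length must be aligned to the logical block size or the kernel
+returns EINVAL:
+
+    #define _GNU_SOURCE
+    #include <fcntl.h>
+    #include <stdio.h>
+    #include <unistd.h>
+
+    int main(void)
+    {
+            int fd = open("/dev/sdX", O_WRONLY);    /* placeholder device node */
+
+            if (fd < 0)
+                    return 1;
+            /* Punch (zero) the first 1 MiB; PUNCH_HOLE must be combined with KEEP_SIZE. */
+            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
+                    perror("fallocate");
+            close(fd);
+            return 0;
+    }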
+
+const struct file_operations def_blk_fops = {
+       .open           = blkdev_open,
+       .release        = blkdev_close,
+       .llseek         = blkdev_llseek,
+       .read_iter      = blkdev_read_iter,
+       .write_iter     = blkdev_write_iter,
+       .iopoll         = blkdev_iopoll,
+       .mmap           = generic_file_mmap,
+       .fsync          = blkdev_fsync,
+       .unlocked_ioctl = block_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = compat_blkdev_ioctl,
+#endif
+       .splice_read    = generic_file_splice_read,
+       .splice_write   = iter_file_splice_write,
+       .fallocate      = blkdev_fallocate,
+};
+
+static __init int blkdev_init(void)
+{
+       return bioset_init(&blkdev_dio_pool, 4,
+                               offsetof(struct blkdev_dio, bio),
+                               BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
+}
+module_init(blkdev_init);
index 567549a..7b6e5e1 100644 (file)
@@ -183,6 +183,7 @@ static struct blk_major_name {
        void (*probe)(dev_t devt);
 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
 static DEFINE_MUTEX(major_names_lock);
+static DEFINE_SPINLOCK(major_names_spinlock);
 
 /* index in the above - for now: assume no multimajor ranges */
 static inline int major_to_index(unsigned major)
@@ -195,11 +196,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 {
        struct blk_major_name *dp;
 
-       mutex_lock(&major_names_lock);
+       spin_lock(&major_names_spinlock);
        for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
                if (dp->major == offset)
                        seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
-       mutex_unlock(&major_names_lock);
+       spin_unlock(&major_names_spinlock);
 }
 #endif /* CONFIG_PROC_FS */
 
@@ -271,6 +272,7 @@ int __register_blkdev(unsigned int major, const char *name,
        p->next = NULL;
        index = major_to_index(major);
 
+       spin_lock(&major_names_spinlock);
        for (n = &major_names[index]; *n; n = &(*n)->next) {
                if ((*n)->major == major)
                        break;
@@ -279,6 +281,7 @@ int __register_blkdev(unsigned int major, const char *name,
                *n = p;
        else
                ret = -EBUSY;
+       spin_unlock(&major_names_spinlock);
 
        if (ret < 0) {
                printk("register_blkdev: cannot get major %u for %s\n",
@@ -298,6 +301,7 @@ void unregister_blkdev(unsigned int major, const char *name)
        int index = major_to_index(major);
 
        mutex_lock(&major_names_lock);
+       spin_lock(&major_names_spinlock);
        for (n = &major_names[index]; *n; n = &(*n)->next)
                if ((*n)->major == major)
                        break;
@@ -307,6 +311,7 @@ void unregister_blkdev(unsigned int major, const char *name)
                p = *n;
                *n = p->next;
        }
+       spin_unlock(&major_names_spinlock);
        mutex_unlock(&major_names_lock);
        kfree(p);
 }
index a4d4eeb..bd48210 100644 (file)
@@ -1008,23 +1008,14 @@ static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val)
        return ret_val;
 }
 
-/**
- * cppc_get_desired_perf - Get the value of desired performance register.
- * @cpunum: CPU from which to get desired performance.
- * @desired_perf: address of a variable to store the returned desired performance
- *
- * Return: 0 for success, -EIO otherwise.
- */
-int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
+static int cppc_get_perf(int cpunum, enum cppc_regs reg_idx, u64 *perf)
 {
        struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
-       int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
-       struct cpc_register_resource *desired_reg;
-       struct cppc_pcc_data *pcc_ss_data = NULL;
-
-       desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF];
+       struct cpc_register_resource *reg = &cpc_desc->cpc_regs[reg_idx];
 
-       if (CPC_IN_PCC(desired_reg)) {
+       if (CPC_IN_PCC(reg)) {
+               int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum);
+               struct cppc_pcc_data *pcc_ss_data = NULL;
                int ret = 0;
 
                if (pcc_ss_id < 0)
@@ -1035,7 +1026,7 @@ int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
                down_write(&pcc_ss_data->pcc_lock);
 
                if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0)
-                       cpc_read(cpunum, desired_reg, desired_perf);
+                       cpc_read(cpunum, reg, perf);
                else
                        ret = -EIO;
 
@@ -1044,12 +1035,36 @@ int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
                return ret;
        }
 
-       cpc_read(cpunum, desired_reg, desired_perf);
+       cpc_read(cpunum, reg, perf);
 
        return 0;
 }
+
+/**
+ * cppc_get_desired_perf - Get the desired performance register value.
+ * @cpunum: CPU from which to get desired performance.
+ * @desired_perf: Return address.
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
+{
+       return cppc_get_perf(cpunum, DESIRED_PERF, desired_perf);
+}
 EXPORT_SYMBOL_GPL(cppc_get_desired_perf);
 
+/**
+ * cppc_get_nominal_perf - Get the nominal performance register value.
+ * @cpunum: CPU from which to get nominal performance.
+ * @nominal_perf: Return address.
+ *
+ * Return: 0 for success, -EIO otherwise.
+ */
+int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
+{
+       return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf);
+}
+
 /**
  * cppc_get_perf_caps - Get a CPU's performance capabilities.
  * @cpunum: CPU from which to get capabilities info.
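 
 cppc_get_nominal_perf() mirrors cppc_get_desired_perf() and shares the same PCC
 read path.  A minimal sketch of a built-in caller; the helper name and fallback
 value are hypothetical, and the declaration is assumed to sit next to
 cppc_get_desired_perf() in <acpi/cppc_acpi.h>:
 
     #include <acpi/cppc_acpi.h>
 
     /* Use the CPPC nominal performance for @cpu, or @fallback if unavailable. */
     static u64 my_nominal_perf(int cpu, u64 fallback)
     {
             u64 nominal;
 
             if (cppc_get_nominal_perf(cpu, &nominal))
                     return fallback;        /* -EIO: no usable CPPC data */
             return nominal;
     }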
index 1f6007a..89c22bc 100644 (file)
@@ -288,10 +288,18 @@ invalid_guid:
 
 void __init init_prmt(void)
 {
+       struct acpi_table_header *tbl;
        acpi_status status;
-       int mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) +
+       int mc;
+
+       status = acpi_get_table(ACPI_SIG_PRMT, 0, &tbl);
+       if (ACPI_FAILURE(status))
+               return;
+
+       mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) +
                                          sizeof (struct acpi_table_prmt_header),
                                          0, acpi_parse_prmt, 0);
+       acpi_put_table(tbl);
        /*
         * Return immediately if PRMT table is not present or no PRM module found.
         */
index b24513e..5b54c80 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/signal.h>
 #include <linux/kthread.h>
 #include <linux/dmi.h>
-#include <linux/nls.h>
 #include <linux/dma-map-ops.h>
 #include <linux/platform_data/x86/apple.h>
 #include <linux/pgtable.h>
index d568772..cbea78e 100644 (file)
@@ -1642,7 +1642,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
        }
 
        dev->power.may_skip_resume = true;
-       dev->power.must_resume = false;
+       dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME);
 
        dpm_watchdog_set(&wd, dev);
        device_lock(dev);
index 3bad326..b91a3a9 100644 (file)
 /**
  * dev_pm_attach_wake_irq - Attach device interrupt as a wake IRQ
  * @dev: Device entry
- * @irq: Device wake-up capable interrupt
  * @wirq: Wake irq specific data
  *
- * Internal function to attach either a device IO interrupt or a
- * dedicated wake-up interrupt as a wake IRQ.
+ * Internal function to attach a dedicated wake-up interrupt as a wake IRQ.
  */
-static int dev_pm_attach_wake_irq(struct device *dev, int irq,
-                                 struct wake_irq *wirq)
+static int dev_pm_attach_wake_irq(struct device *dev, struct wake_irq *wirq)
 {
        unsigned long flags;
 
@@ -65,7 +62,7 @@ int dev_pm_set_wake_irq(struct device *dev, int irq)
        wirq->dev = dev;
        wirq->irq = irq;
 
-       err = dev_pm_attach_wake_irq(dev, irq, wirq);
+       err = dev_pm_attach_wake_irq(dev, wirq);
        if (err)
                kfree(wirq);
 
@@ -196,7 +193,7 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
        if (err)
                goto err_free_name;
 
-       err = dev_pm_attach_wake_irq(dev, irq, wirq);
+       err = dev_pm_attach_wake_irq(dev, wirq);
        if (err)
                goto err_free_irq;
 
index c84be00..26798da 100644 (file)
@@ -129,8 +129,8 @@ static int __init n64cart_probe(struct platform_device *pdev)
        }
 
        reg_base = devm_platform_ioremap_resource(pdev, 0);
-       if (!reg_base)
-               return -EINVAL;
+       if (IS_ERR(reg_base))
+               return PTR_ERR(reg_base);
 
        disk = blk_alloc_disk(NUMA_NO_NODE);
        if (!disk)
index 57c6ae7..9b3bd08 100644 (file)
@@ -762,7 +762,7 @@ static int virtblk_probe(struct virtio_device *vdev)
                goto out_free_vblk;
 
        /* Default queue sizing is to fill the ring. */
-       if (likely(!virtblk_queue_depth)) {
+       if (!virtblk_queue_depth) {
                queue_depth = vblk->vqs[0].vq->num_free;
                /* ... but without indirect descs, we use 2 descs per req */
                if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
@@ -836,7 +836,7 @@ static int virtblk_probe(struct virtio_device *vdev)
        else
                blk_size = queue_logical_block_size(q);
 
-       if (unlikely(blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)) {
+       if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) {
                dev_err(&vdev->dev,
                        "block size is changed unexpectedly, now is %u\n",
                        blk_size);
index bb46698..6f3272b 100644 (file)
@@ -591,7 +591,7 @@ static void handle_transaction_done(struct smi_info *smi_info)
                smi_info->handlers->get_result(smi_info->si_sm, msg, 3);
                if (msg[2] != 0) {
                        /* Error clearing flags */
-                       dev_warn(smi_info->io.dev,
+                       dev_warn_ratelimited(smi_info->io.dev,
                                 "Error clearing flags: %2.2x\n", msg[2]);
                }
                smi_info->si_state = SI_NORMAL;
@@ -683,10 +683,10 @@ static void handle_transaction_done(struct smi_info *smi_info)
                /* We got the flags from the SMI, now handle them. */
                smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
                if (msg[2] != 0) {
-                       dev_warn(smi_info->io.dev,
-                                "Couldn't get irq info: %x.\n", msg[2]);
-                       dev_warn(smi_info->io.dev,
-                                "Maybe ok, but ipmi might run very slowly.\n");
+                       dev_warn_ratelimited(smi_info->io.dev,
+                               "Couldn't get irq info: %x,\n"
+                               "Maybe ok, but ipmi might run very slowly.\n",
+                               msg[2]);
                        smi_info->si_state = SI_NORMAL;
                        break;
                }
@@ -721,7 +721,7 @@ static void handle_transaction_done(struct smi_info *smi_info)
 
                smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
                if (msg[2] != 0)
-                       dev_warn(smi_info->io.dev,
+                       dev_warn_ratelimited(smi_info->io.dev,
                                 "Could not set the global enables: 0x%x.\n",
                                 msg[2]);
 
@@ -1343,7 +1343,7 @@ retry:
 
                if (cc != IPMI_CC_NO_ERROR &&
                    ++retry_count <= GET_DEVICE_ID_MAX_RETRY) {
-                       dev_warn(smi_info->io.dev,
+                       dev_warn_ratelimited(smi_info->io.dev,
                            "BMC returned 0x%2.2x, retry get bmc device id\n",
                            cc);
                        goto retry;
@@ -1605,7 +1605,7 @@ static ssize_t name##_show(struct device *dev,                    \
                                                                        \
        return snprintf(buf, 10, "%u\n", smi_get_stat(smi_info, name)); \
 }                                                                      \
-static DEVICE_ATTR(name, 0444, name##_show, NULL)
+static DEVICE_ATTR_RO(name)
 
 static ssize_t type_show(struct device *dev,
                         struct device_attribute *attr,
@@ -1615,7 +1615,7 @@ static ssize_t type_show(struct device *dev,
 
        return snprintf(buf, 10, "%s\n", si_to_str[smi_info->io.si_type]);
 }
-static DEVICE_ATTR(type, 0444, type_show, NULL);
+static DEVICE_ATTR_RO(type);
 
 static ssize_t interrupts_enabled_show(struct device *dev,
                                       struct device_attribute *attr,
@@ -1626,8 +1626,7 @@ static ssize_t interrupts_enabled_show(struct device *dev,
 
        return snprintf(buf, 10, "%d\n", enabled);
 }
-static DEVICE_ATTR(interrupts_enabled, 0444,
-                  interrupts_enabled_show, NULL);
+static DEVICE_ATTR_RO(interrupts_enabled);
 
 IPMI_SI_ATTR(short_timeouts);
 IPMI_SI_ATTR(long_timeouts);
@@ -1658,7 +1657,7 @@ static ssize_t params_show(struct device *dev,
                        smi_info->io.irq,
                        smi_info->io.slave_addr);
 }
-static DEVICE_ATTR(params, 0444, params_show, NULL);
+static DEVICE_ATTR_RO(params);
 
 static struct attribute *ipmi_si_dev_attrs[] = {
        &dev_attr_type.attr,
index 053089f..3236706 100644 (file)
@@ -176,10 +176,6 @@ static const struct parent_map gcc_parent_map_2[] = {
        { P_GPLL0_OUT_ODD, 2 },
 };
 
-static const struct clk_parent_data gcc_parent_data_2[] = {
-       { .fw_name = "bi_tcxo" },
-       { .hw = &gpll0_out_odd.clkr.hw },
-};
 static const struct clk_parent_data gcc_parent_data_2_ao[] = {
        { .fw_name = "bi_tcxo_ao" },
        { .hw = &gpll0_out_odd.clkr.hw },
index 2d83a9f..1097f82 100644 (file)
@@ -268,6 +268,7 @@ static struct cpudata **all_cpu_data;
  * @get_min:           Callback to get minimum P state
  * @get_turbo:         Callback to get turbo P state
  * @get_scaling:       Callback to get frequency scaling factor
+ * @get_cpu_scaling:   Get frequency scaling factor for a given cpu
  * @get_aperf_mperf_shift: Callback to get the APERF vs MPERF frequency difference
  * @get_val:           Callback to convert P state to actual MSR write value
  * @get_vid:           Callback to get VID data for Atom platforms
@@ -281,6 +282,7 @@ struct pstate_funcs {
        int (*get_min)(void);
        int (*get_turbo)(void);
        int (*get_scaling)(void);
+       int (*get_cpu_scaling)(int cpu);
        int (*get_aperf_mperf_shift)(void);
        u64 (*get_val)(struct cpudata*, int pstate);
        void (*get_vid)(struct cpudata *);
@@ -384,6 +386,15 @@ static int intel_pstate_get_cppc_guaranteed(int cpu)
        return cppc_perf.nominal_perf;
 }
 
+static u32 intel_pstate_cppc_nominal(int cpu)
+{
+       u64 nominal_perf;
+
+       if (cppc_get_nominal_perf(cpu, &nominal_perf))
+               return 0;
+
+       return nominal_perf;
+}
 #else /* CONFIG_ACPI_CPPC_LIB */
 static inline void intel_pstate_set_itmt_prio(int cpu)
 {
@@ -470,20 +481,6 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 
        acpi_processor_unregister_performance(policy->cpu);
 }
-
-static bool intel_pstate_cppc_perf_valid(u32 perf, struct cppc_perf_caps *caps)
-{
-       return perf && perf <= caps->highest_perf && perf >= caps->lowest_perf;
-}
-
-static bool intel_pstate_cppc_perf_caps(struct cpudata *cpu,
-                                       struct cppc_perf_caps *caps)
-{
-       if (cppc_get_perf_caps(cpu->cpu, caps))
-               return false;
-
-       return caps->highest_perf && caps->lowest_perf <= caps->highest_perf;
-}
 #else /* CONFIG_ACPI */
 static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 {
@@ -506,15 +503,8 @@ static inline int intel_pstate_get_cppc_guaranteed(int cpu)
 }
 #endif /* CONFIG_ACPI_CPPC_LIB */
 
-static void intel_pstate_hybrid_hwp_perf_ctl_parity(struct cpudata *cpu)
-{
-       pr_debug("CPU%d: Using PERF_CTL scaling for HWP\n", cpu->cpu);
-
-       cpu->pstate.scaling = cpu->pstate.perf_ctl_scaling;
-}
-
 /**
- * intel_pstate_hybrid_hwp_calibrate - Calibrate HWP performance levels.
+ * intel_pstate_hybrid_hwp_adjust - Calibrate HWP performance levels.
  * @cpu: Target CPU.
  *
  * On hybrid processors, HWP may expose more performance levels than there are
@@ -522,115 +512,46 @@ static void intel_pstate_hybrid_hwp_perf_ctl_parity(struct cpudata *cpu)
  * scaling factor between HWP performance levels and CPU frequency will be less
  * than the scaling factor between P-state values and CPU frequency.
  *
- * In that case, the scaling factor between HWP performance levels and CPU
- * frequency needs to be determined which can be done with the help of the
- * observation that certain HWP performance levels should correspond to certain
- * P-states, like for example the HWP highest performance should correspond
- * to the maximum turbo P-state of the CPU.
+ * In that case, adjust the CPU parameters used in computations accordingly.
  */
-static void intel_pstate_hybrid_hwp_calibrate(struct cpudata *cpu)
+static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu)
 {
        int perf_ctl_max_phys = cpu->pstate.max_pstate_physical;
        int perf_ctl_scaling = cpu->pstate.perf_ctl_scaling;
        int perf_ctl_turbo = pstate_funcs.get_turbo();
        int turbo_freq = perf_ctl_turbo * perf_ctl_scaling;
-       int perf_ctl_max = pstate_funcs.get_max();
-       int max_freq = perf_ctl_max * perf_ctl_scaling;
-       int scaling = INT_MAX;
-       int freq;
+       int scaling = cpu->pstate.scaling;
 
        pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys);
-       pr_debug("CPU%d: perf_ctl_max = %d\n", cpu->cpu, perf_ctl_max);
+       pr_debug("CPU%d: perf_ctl_max = %d\n", cpu->cpu, pstate_funcs.get_max());
        pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo);
        pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling);
-
        pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate);
        pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate);
-
-#ifdef CONFIG_ACPI
-       if (IS_ENABLED(CONFIG_ACPI_CPPC_LIB)) {
-               struct cppc_perf_caps caps;
-
-               if (intel_pstate_cppc_perf_caps(cpu, &caps)) {
-                       if (intel_pstate_cppc_perf_valid(caps.nominal_perf, &caps)) {
-                               pr_debug("CPU%d: Using CPPC nominal\n", cpu->cpu);
-
-                               /*
-                                * If the CPPC nominal performance is valid, it
-                                * can be assumed to correspond to cpu_khz.
-                                */
-                               if (caps.nominal_perf == perf_ctl_max_phys) {
-                                       intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
-                                       return;
-                               }
-                               scaling = DIV_ROUND_UP(cpu_khz, caps.nominal_perf);
-                       } else if (intel_pstate_cppc_perf_valid(caps.guaranteed_perf, &caps)) {
-                               pr_debug("CPU%d: Using CPPC guaranteed\n", cpu->cpu);
-
-                               /*
-                                * If the CPPC guaranteed performance is valid,
-                                * it can be assumed to correspond to max_freq.
-                                */
-                               if (caps.guaranteed_perf == perf_ctl_max) {
-                                       intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
-                                       return;
-                               }
-                               scaling = DIV_ROUND_UP(max_freq, caps.guaranteed_perf);
-                       }
-               }
-       }
-#endif
-       /*
-        * If using the CPPC data to compute the HWP-to-frequency scaling factor
-        * doesn't work, use the HWP_CAP gauranteed perf for this purpose with
-        * the assumption that it corresponds to max_freq.
-        */
-       if (scaling > perf_ctl_scaling) {
-               pr_debug("CPU%d: Using HWP_CAP guaranteed\n", cpu->cpu);
-
-               if (cpu->pstate.max_pstate == perf_ctl_max) {
-                       intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
-                       return;
-               }
-               scaling = DIV_ROUND_UP(max_freq, cpu->pstate.max_pstate);
-               if (scaling > perf_ctl_scaling) {
-                       /*
-                        * This should not happen, because it would mean that
-                        * the number of HWP perf levels was less than the
-                        * number of P-states, so use the PERF_CTL scaling in
-                        * that case.
-                        */
-                       pr_debug("CPU%d: scaling (%d) out of range\n", cpu->cpu,
-                               scaling);
-
-                       intel_pstate_hybrid_hwp_perf_ctl_parity(cpu);
-                       return;
-               }
-       }
+       pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);
 
        /*
-        * If the product of the HWP performance scaling factor obtained above
-        * and the HWP_CAP highest performance is greater than the maximum turbo
-        * frequency corresponding to the pstate_funcs.get_turbo() return value,
-        * the scaling factor is too high, so recompute it so that the HWP_CAP
-        * highest performance corresponds to the maximum turbo frequency.
+        * If the product of the HWP performance scaling factor and the HWP_CAP
+        * highest performance is greater than the maximum turbo frequency
+        * corresponding to the pstate_funcs.get_turbo() return value, the
+        * scaling factor is too high, so recompute it to make the HWP_CAP
+        * highest performance correspond to the maximum turbo frequency.
         */
        if (turbo_freq < cpu->pstate.turbo_pstate * scaling) {
-               pr_debug("CPU%d: scaling too high (%d)\n", cpu->cpu, scaling);
-
                cpu->pstate.turbo_freq = turbo_freq;
                scaling = DIV_ROUND_UP(turbo_freq, cpu->pstate.turbo_pstate);
-       }
+               cpu->pstate.scaling = scaling;
 
-       cpu->pstate.scaling = scaling;
-
-       pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling);
+               pr_debug("CPU%d: refined HWP-to-frequency scaling factor: %d\n",
+                        cpu->cpu, scaling);
+       }
 
        cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling,
                                         perf_ctl_scaling);
 
-       freq = perf_ctl_max_phys * perf_ctl_scaling;
-       cpu->pstate.max_pstate_physical = DIV_ROUND_UP(freq, scaling);
+       cpu->pstate.max_pstate_physical =
+                       DIV_ROUND_UP(perf_ctl_max_phys * perf_ctl_scaling,
+                                    scaling);
 
        cpu->pstate.min_freq = cpu->pstate.min_pstate * perf_ctl_scaling;
        /*
@@ -1861,6 +1782,38 @@ static int knl_get_turbo_pstate(void)
        return ret;
 }
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+static u32 hybrid_ref_perf;
+
+static int hybrid_get_cpu_scaling(int cpu)
+{
+       return DIV_ROUND_UP(core_get_scaling() * hybrid_ref_perf,
+                           intel_pstate_cppc_nominal(cpu));
+}
+
+static void intel_pstate_cppc_set_cpu_scaling(void)
+{
+       u32 min_nominal_perf = U32_MAX;
+       int cpu;
+
+       for_each_present_cpu(cpu) {
+               u32 nominal_perf = intel_pstate_cppc_nominal(cpu);
+
+               if (nominal_perf && nominal_perf < min_nominal_perf)
+                       min_nominal_perf = nominal_perf;
+       }
+
+       if (min_nominal_perf < U32_MAX) {
+               hybrid_ref_perf = min_nominal_perf;
+               pstate_funcs.get_cpu_scaling = hybrid_get_cpu_scaling;
+       }
+}
+#else
+static inline void intel_pstate_cppc_set_cpu_scaling(void)
+{
+}
+#endif /* CONFIG_ACPI_CPPC_LIB */
+
 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
 {
        trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
@@ -1889,10 +1842,8 @@ static void intel_pstate_max_within_limits(struct cpudata *cpu)
 
 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 {
-       bool hybrid_cpu = boot_cpu_has(X86_FEATURE_HYBRID_CPU);
        int perf_ctl_max_phys = pstate_funcs.get_max_physical();
-       int perf_ctl_scaling = hybrid_cpu ? cpu_khz / perf_ctl_max_phys :
-                                           pstate_funcs.get_scaling();
+       int perf_ctl_scaling = pstate_funcs.get_scaling();
 
        cpu->pstate.min_pstate = pstate_funcs.get_min();
        cpu->pstate.max_pstate_physical = perf_ctl_max_phys;
@@ -1901,10 +1852,13 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
        if (hwp_active && !hwp_mode_bdw) {
                __intel_pstate_get_hwp_cap(cpu);
 
-               if (hybrid_cpu)
-                       intel_pstate_hybrid_hwp_calibrate(cpu);
-               else
+               if (pstate_funcs.get_cpu_scaling) {
+                       cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
+                       if (cpu->pstate.scaling != perf_ctl_scaling)
+                               intel_pstate_hybrid_hwp_adjust(cpu);
+               } else {
                        cpu->pstate.scaling = perf_ctl_scaling;
+               }
        } else {
                cpu->pstate.scaling = perf_ctl_scaling;
                cpu->pstate.max_pstate = pstate_funcs.get_max();
@@ -3276,6 +3230,9 @@ static int __init intel_pstate_init(void)
                        if (!default_driver)
                                default_driver = &intel_pstate;
 
+                       if (boot_cpu_has(X86_FEATURE_HYBRID_CPU))
+                               intel_pstate_cppc_set_cpu_scaling();
+
                        goto hwp_cpu_matched;
                }
        } else {
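For illustration, a small standalone C sketch (not part of the patch) of the two steps the hwp_active path above now performs: derive a per-CPU scaling factor from CPPC nominal performance via hybrid_get_cpu_scaling(), then cap it so the HWP_CAP turbo level maps to the real turbo frequency, as intel_pstate_hybrid_hwp_adjust() does. All numbers are hypothetical.

/* Standalone sketch with made-up numbers; mirrors hybrid_get_cpu_scaling()
 * and the turbo-frequency cap applied in intel_pstate_hybrid_hwp_adjust(). */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int core_scaling = 100000;	/* kHz per P-state step (hypothetical) */
	int hybrid_ref_perf = 16;	/* lowest CPPC nominal perf in the system */
	int cpu_nominal = 24;		/* this CPU's CPPC nominal perf */
	int turbo_pstate = 57;		/* HWP_CAP highest performance */
	int turbo_freq = 3700000;	/* kHz, from pstate_funcs.get_turbo() */

	int scaling = DIV_ROUND_UP(core_scaling * hybrid_ref_perf, cpu_nominal);
	printf("per-CPU scaling: %d kHz/perf\n", scaling);	/* 66667 */

	/* 57 * 66667 = 3800019 kHz exceeds turbo_freq, so refine the factor */
	if (turbo_freq < turbo_pstate * scaling)
		scaling = DIV_ROUND_UP(turbo_freq, turbo_pstate);
	printf("refined scaling: %d kHz/perf\n", scaling);	/* 64913 */

	return 0;
}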
index 9561e3d..541efe0 100644 (file)
@@ -42,6 +42,7 @@ config UDMABUF
 config DMABUF_MOVE_NOTIFY
        bool "Move notify between drivers (EXPERIMENTAL)"
        default n
+       depends on DMA_SHARED_BUFFER
        help
          Don't pin buffers if the dynamic DMA-buf interface is available on
          both the exporter as well as the importer. This fixes a security
@@ -52,6 +53,7 @@ config DMABUF_MOVE_NOTIFY
 
 config DMABUF_DEBUG
        bool "DMA-BUF debug checks"
+       depends on DMA_SHARED_BUFFER
        default y if DMA_API_DEBUG
        help
          This option enables additional checks for DMA-BUF importers and
@@ -74,7 +76,7 @@ menuconfig DMABUF_HEAPS
 
 menuconfig DMABUF_SYSFS_STATS
        bool "DMA-BUF sysfs statistics"
-       select DMA_SHARED_BUFFER
+       depends on DMA_SHARED_BUFFER
        help
           Choose this option to enable DMA-BUF sysfs statistics
           in location /sys/kernel/dmabuf/buffers.
index 715e491..4c3fd2e 100644 (file)
@@ -488,9 +488,7 @@ static int fwnet_finish_incoming_packet(struct net_device *net,
                                        struct sk_buff *skb, u16 source_node_id,
                                        bool is_broadcast, u16 ether_type)
 {
-       struct fwnet_device *dev;
        int status;
-       __be64 guid;
 
        switch (ether_type) {
        case ETH_P_ARP:
@@ -503,7 +501,6 @@ static int fwnet_finish_incoming_packet(struct net_device *net,
                goto err;
        }
 
-       dev = netdev_priv(net);
        /* Write metadata, and then pass to the receive level */
        skb->dev = net;
        skb->ip_summed = CHECKSUM_NONE;
@@ -512,7 +509,6 @@ static int fwnet_finish_incoming_packet(struct net_device *net,
         * Parse the encapsulation header. This actually does the job of
         * converting to an ethernet-like pseudo frame header.
         */
-       guid = cpu_to_be64(dev->card->guid);
        if (dev_hard_header(skb, net, ether_type,
                           is_broadcast ? net->broadcast : net->dev_addr,
                           NULL, skb->len) >= 0) {
index ced1964..2ee97ba 100644 (file)
@@ -1147,6 +1147,64 @@ int qcom_scm_qsmmu500_wait_safe_toggle(bool en)
 }
 EXPORT_SYMBOL(qcom_scm_qsmmu500_wait_safe_toggle);
 
+bool qcom_scm_lmh_dcvsh_available(void)
+{
+       return __qcom_scm_is_call_available(__scm->dev, QCOM_SCM_SVC_LMH, QCOM_SCM_LMH_LIMIT_DCVSH);
+}
+EXPORT_SYMBOL(qcom_scm_lmh_dcvsh_available);
+
+int qcom_scm_lmh_profile_change(u32 profile_id)
+{
+       struct qcom_scm_desc desc = {
+               .svc = QCOM_SCM_SVC_LMH,
+               .cmd = QCOM_SCM_LMH_LIMIT_PROFILE_CHANGE,
+               .arginfo = QCOM_SCM_ARGS(1, QCOM_SCM_VAL),
+               .args[0] = profile_id,
+               .owner = ARM_SMCCC_OWNER_SIP,
+       };
+
+       return qcom_scm_call(__scm->dev, &desc, NULL);
+}
+EXPORT_SYMBOL(qcom_scm_lmh_profile_change);
+
+int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
+                      u64 limit_node, u32 node_id, u64 version)
+{
+       dma_addr_t payload_phys;
+       u32 *payload_buf;
+       int ret, payload_size = 5 * sizeof(u32);
+
+       struct qcom_scm_desc desc = {
+               .svc = QCOM_SCM_SVC_LMH,
+               .cmd = QCOM_SCM_LMH_LIMIT_DCVSH,
+               .arginfo = QCOM_SCM_ARGS(5, QCOM_SCM_RO, QCOM_SCM_VAL, QCOM_SCM_VAL,
+                                       QCOM_SCM_VAL, QCOM_SCM_VAL),
+               .args[1] = payload_size,
+               .args[2] = limit_node,
+               .args[3] = node_id,
+               .args[4] = version,
+               .owner = ARM_SMCCC_OWNER_SIP,
+       };
+
+       payload_buf = dma_alloc_coherent(__scm->dev, payload_size, &payload_phys, GFP_KERNEL);
+       if (!payload_buf)
+               return -ENOMEM;
+
+       payload_buf[0] = payload_fn;
+       payload_buf[1] = 0;
+       payload_buf[2] = payload_reg;
+       payload_buf[3] = 1;
+       payload_buf[4] = payload_val;
+
+       desc.args[0] = payload_phys;
+
+       ret = qcom_scm_call(__scm->dev, &desc, NULL);
+
+       dma_free_coherent(__scm->dev, payload_size, payload_buf, payload_phys);
+       return ret;
+}
+EXPORT_SYMBOL(qcom_scm_lmh_dcvsh);
+
 static int qcom_scm_find_dload_address(struct device *dev, u64 *addr)
 {
        struct device_node *tcsr;
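A hypothetical caller of the three LMH helpers exported above might look like the sketch below (not part of the patch). It assumes the matching prototypes are declared in include/linux/qcom_scm.h, which this hunk does not show, and every profile/node/payload value is a placeholder.

/* Hypothetical consumer of the new LMH SCM helpers (e.g. a thermal driver);
 * every numeric argument here is a placeholder. */
#include <linux/errno.h>
#include <linux/qcom_scm.h>	/* assumed to declare qcom_scm_lmh_*() */

static int example_lmh_setup(void)
{
	int ret;

	if (!qcom_scm_lmh_dcvsh_available())
		return -ENODEV;

	ret = qcom_scm_lmh_profile_change(0x1);	/* placeholder profile id */
	if (ret)
		return ret;

	/* payload_fn, payload_reg, payload_val, limit_node, node_id, version */
	return qcom_scm_lmh_dcvsh(0x10, 0, 95000, 0x1, 0, 0x1);
}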
index 632fe31..d92156c 100644 (file)
@@ -114,6 +114,10 @@ extern int scm_legacy_call(struct device *dev, const struct qcom_scm_desc *desc,
 #define QCOM_SCM_SVC_HDCP              0x11
 #define QCOM_SCM_HDCP_INVOKE           0x01
 
+#define QCOM_SCM_SVC_LMH                       0x13
+#define QCOM_SCM_LMH_LIMIT_PROFILE_CHANGE      0x01
+#define QCOM_SCM_LMH_LIMIT_DCVSH               0x10
+
 #define QCOM_SCM_SVC_SMMU_PROGRAM              0x15
 #define QCOM_SCM_SMMU_CONFIG_ERRATA1           0x03
 #define QCOM_SCM_SMMU_CONFIG_ERRATA1_CLIENT_ALL        0x02
index 8f53837..97178b3 100644 (file)
@@ -468,14 +468,18 @@ bool amdgpu_atomfirmware_dynamic_boot_config_supported(struct amdgpu_device *ade
        return (fw_cap & ATOM_FIRMWARE_CAP_DYNAMIC_BOOT_CFG_ENABLE) ? true : false;
 }
 
-/*
- * Helper function to query RAS EEPROM address
- *
- * @adev: amdgpu_device pointer
+/**
+ * amdgpu_atomfirmware_ras_rom_addr -- Get the RAS EEPROM addr from VBIOS
+ * @adev: amdgpu_device pointer
+ * @i2c_address: pointer to u8; if not NULL, will contain
+ *    the RAS EEPROM address if the function returns true
  *
- * Return true if vbios supports ras rom address reporting
+ * Return true if VBIOS supports RAS EEPROM address reporting,
+ * else return false. If true and @i2c_address is not NULL,
+ * will contain the RAS ROM address.
  */
-bool amdgpu_atomfirmware_ras_rom_addr(struct amdgpu_device *adev, uint8_t* i2c_address)
+bool amdgpu_atomfirmware_ras_rom_addr(struct amdgpu_device *adev,
+                                     u8 *i2c_address)
 {
        struct amdgpu_mode_info *mode_info = &adev->mode_info;
        int index;
@@ -483,27 +487,39 @@ bool amdgpu_atomfirmware_ras_rom_addr(struct amdgpu_device *adev, uint8_t* i2c_a
        union firmware_info *firmware_info;
        u8 frev, crev;
 
-       if (i2c_address == NULL)
-               return false;
-
-       *i2c_address = 0;
-
        index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
-                       firmwareinfo);
+                                           firmwareinfo);
 
        if (amdgpu_atom_parse_data_header(adev->mode_info.atom_context,
-                               index, &size, &frev, &crev, &data_offset)) {
+                                         index, &size, &frev, &crev,
+                                         &data_offset)) {
                /* support firmware_info 3.4 + */
                if ((frev == 3 && crev >=4) || (frev > 3)) {
                        firmware_info = (union firmware_info *)
                                (mode_info->atom_context->bios + data_offset);
-                       *i2c_address = firmware_info->v34.ras_rom_i2c_slave_addr;
+                       /* The ras_rom_i2c_slave_addr should ideally
+                        * be a 19-bit EEPROM address, which would be
+                        * used as is by the driver; see top of
+                        * amdgpu_eeprom.c.
+                        *
+                        * When this is the case, 0 is a valid RAS
+                        * EEPROM address, and we'll drop the first
+                        * "if (firm...)" check and keep only the
+                        * pointer check.
+                        *
+                        * The reason this works right now is because
+                        * ras_rom_i2c_slave_addr contains the EEPROM
+                        * device type qualifier 1010b in the top 4
+                        * bits.
+                        */
+                       if (firmware_info->v34.ras_rom_i2c_slave_addr) {
+                               if (i2c_address)
+                                       *i2c_address = firmware_info->v34.ras_rom_i2c_slave_addr;
+                               return true;
+                       }
                }
        }
 
-       if (*i2c_address != 0)
-               return true;
-
        return false;
 }
 
index 8e5a7ac..7a73167 100644 (file)
@@ -522,6 +522,7 @@ uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev,
                        break;
                case CHIP_RENOIR:
                case CHIP_VANGOGH:
+               case CHIP_YELLOW_CARP:
                        domain |= AMDGPU_GEM_DOMAIN_GTT;
                        break;
 
index b664029..f18240f 100644 (file)
@@ -1181,7 +1181,12 @@ static const struct pci_device_id pciidlist[] = {
        {0x1002, 0x73A1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73A2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73A3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
+       {0x1002, 0x73A5, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
+       {0x1002, 0x73A8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
+       {0x1002, 0x73A9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73AB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
+       {0x1002, 0x73AC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
+       {0x1002, 0x73AD, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73AE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73AF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73BF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
@@ -1197,6 +1202,11 @@ static const struct pci_device_id pciidlist[] = {
        {0x1002, 0x73C0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
        {0x1002, 0x73C1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
        {0x1002, 0x73C3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
+       {0x1002, 0x73DA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
+       {0x1002, 0x73DB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
+       {0x1002, 0x73DC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
+       {0x1002, 0x73DD, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
+       {0x1002, 0x73DE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
        {0x1002, 0x73DF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVY_FLOUNDER},
 
        /* DIMGREY_CAVEFISH */
@@ -1204,6 +1214,13 @@ static const struct pci_device_id pciidlist[] = {
        {0x1002, 0x73E1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
        {0x1002, 0x73E2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
        {0x1002, 0x73E3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73E8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73E9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73EA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73EB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73EC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73ED, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
+       {0x1002, 0x73EF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
        {0x1002, 0x73FF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
 
        /* Aldebaran */
index d94c541..5a6857c 100644 (file)
@@ -59,6 +59,7 @@ void amdgpu_show_fdinfo(struct seq_file *m, struct file *f)
        uint64_t vram_mem = 0, gtt_mem = 0, cpu_mem = 0;
        struct drm_file *file = f->private_data;
        struct amdgpu_device *adev = drm_to_adev(file->minor->dev);
+       struct amdgpu_bo *root;
        int ret;
 
        ret = amdgpu_file_to_fpriv(f, &fpriv);
@@ -69,13 +70,19 @@ void amdgpu_show_fdinfo(struct seq_file *m, struct file *f)
        dev = PCI_SLOT(adev->pdev->devfn);
        fn = PCI_FUNC(adev->pdev->devfn);
 
-       ret = amdgpu_bo_reserve(fpriv->vm.root.bo, false);
+       root = amdgpu_bo_ref(fpriv->vm.root.bo);
+       if (!root)
+               return;
+
+       ret = amdgpu_bo_reserve(root, false);
        if (ret) {
                DRM_ERROR("Fail to reserve bo\n");
                return;
        }
        amdgpu_vm_get_memory(&fpriv->vm, &vram_mem, &gtt_mem, &cpu_mem);
-       amdgpu_bo_unreserve(fpriv->vm.root.bo);
+       amdgpu_bo_unreserve(root);
+       amdgpu_bo_unref(&root);
+
        seq_printf(m, "pdev:\t%04x:%02x:%02x.%d\npasid:\t%u\n", domain, bus,
                        dev, fn, fpriv->vm.pasid);
        seq_printf(m, "vram mem:\t%llu kB\n", vram_mem/1024UL);
index 14499f0..8d682be 100644 (file)
@@ -552,6 +552,9 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
                if (!ring || !ring->fence_drv.initialized)
                        continue;
 
+               if (!ring->no_scheduler)
+                       drm_sched_stop(&ring->sched, NULL);
+
                /* You can't wait for HW to signal if it's gone */
                if (!drm_dev_is_unplugged(&adev->ddev))
                        r = amdgpu_fence_wait_empty(ring);
@@ -611,6 +614,11 @@ void amdgpu_fence_driver_hw_init(struct amdgpu_device *adev)
                if (!ring || !ring->fence_drv.initialized)
                        continue;
 
+               if (!ring->no_scheduler) {
+                       drm_sched_resubmit_jobs(&ring->sched);
+                       drm_sched_start(&ring->sched, true);
+               }
+
                /* enable the interrupt */
                if (ring->fence_drv.irq_src)
                        amdgpu_irq_get(adev, ring->fence_drv.irq_src,
index cb07cc3..d6aa032 100644 (file)
@@ -341,21 +341,18 @@ retry:
        r = amdgpu_gem_object_create(adev, size, args->in.alignment,
                                     initial_domain,
                                     flags, ttm_bo_type_device, resv, &gobj);
-       if (r) {
-               if (r != -ERESTARTSYS) {
-                       if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) {
-                               flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
-                               goto retry;
-                       }
+       if (r && r != -ERESTARTSYS) {
+               if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) {
+                       flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+                       goto retry;
+               }
 
-                       if (initial_domain == AMDGPU_GEM_DOMAIN_VRAM) {
-                               initial_domain |= AMDGPU_GEM_DOMAIN_GTT;
-                               goto retry;
-                       }
-                       DRM_DEBUG("Failed to allocate GEM object (%llu, %d, %llu, %d)\n",
-                                 size, initial_domain, args->in.alignment, r);
+               if (initial_domain == AMDGPU_GEM_DOMAIN_VRAM) {
+                       initial_domain |= AMDGPU_GEM_DOMAIN_GTT;
+                       goto retry;
                }
-               return r;
+               DRM_DEBUG("Failed to allocate GEM object (%llu, %d, %llu, %d)\n",
+                               size, initial_domain, args->in.alignment, r);
        }
 
        if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
index 5430003..675a72e 100644 (file)
@@ -118,7 +118,7 @@ bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *res)
  * @man: TTM memory type manager
  * @tbo: TTM BO we need this range for
  * @place: placement flags and restrictions
- * @mem: the resulting mem object
+ * @res: the resulting mem object
  *
  * Dummy, allocate the node but no space for it yet.
  */
@@ -182,7 +182,7 @@ err_out:
  * amdgpu_gtt_mgr_del - free ranges
  *
  * @man: TTM memory type manager
- * @mem: TTM memory object
+ * @res: TTM memory object
  *
  * Free the allocated GTT again.
  */
index 23efdc6..9b41cb8 100644 (file)
@@ -469,10 +469,10 @@ psp_cmd_submit_buf(struct psp_context *psp,
         */
        if (!skip_unsupport && (psp->cmd_buf_mem->resp.status || !timeout) && !ras_intr) {
                if (ucode)
-                       DRM_WARN("failed to load ucode (%s) ",
-                                 amdgpu_ucode_name(ucode->ucode_id));
-               DRM_WARN("psp gfx command (%s) failed and response status is (0x%X)\n",
-                        psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id),
+                       DRM_WARN("failed to load ucode %s(0x%X) ",
+                                 amdgpu_ucode_name(ucode->ucode_id), ucode->ucode_id);
+               DRM_WARN("psp gfx command %s(0x%X) failed and response status is (0x%X)\n",
+                        psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), psp->cmd_buf_mem->cmd_id,
                         psp->cmd_buf_mem->resp.status);
                if (!timeout) {
                        ret = -EINVAL;
index 9dc3b2d..dc44c94 100644 (file)
@@ -114,27 +114,24 @@ static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
 static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
                                  struct amdgpu_ras_eeprom_control *control)
 {
-       uint8_t ras_rom_i2c_slave_addr;
+       u8 i2c_addr;
 
        if (!control)
                return false;
 
-       control->i2c_address = 0;
-
-       if (amdgpu_atomfirmware_ras_rom_addr(adev, &ras_rom_i2c_slave_addr))
-       {
-               switch (ras_rom_i2c_slave_addr) {
-               case 0xA0:
-                       control->i2c_address = 0;
-                       return true;
-               case 0xA8:
-                       control->i2c_address = 0x40000;
-                       return true;
-               default:
-                       dev_warn(adev->dev, "RAS EEPROM I2C slave address %02x not supported",
-                                ras_rom_i2c_slave_addr);
-                       return false;
-               }
+       if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
+               /* The address given by VBIOS is an 8-bit, wire-format
+                * address, i.e. the most significant byte.
+                *
+                * Normalize it to a 19-bit EEPROM address: remove the
+                * device type identifier, reduce it to a 7-bit address,
+                * then shift it into the 19-bit EEPROM address space.
+                * See top of amdgpu_eeprom.c.
+                */
+               i2c_addr = (i2c_addr & 0x0F) >> 1;
+               control->i2c_address = ((u32) i2c_addr) << 16;
+
+               return true;
        }
 
        switch (adev->asic_type) {
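The normalization above replaces the hard-coded 0xA0/0xA8 switch removed in this hunk, and relies on the wire address carrying the 1010b EEPROM device-type qualifier in its top nibble (see the atomfirmware comment earlier in this diff). A standalone sketch (not part of the patch) showing that the arithmetic reproduces the old mapping:

/* Standalone sketch: the new normalization reproduces the deleted switch
 * cases, 0xA0 -> 0x00000 and 0xA8 -> 0x40000. */
#include <stdio.h>
#include <stdint.h>

static uint32_t normalize_ras_eeprom_addr(uint8_t wire_addr)
{
	/* drop the 1010b type qualifier and the low bit (8-bit wire -> 7-bit) */
	uint8_t i2c_addr = (wire_addr & 0x0F) >> 1;

	return (uint32_t)i2c_addr << 16;	/* widen to a 19-bit EEPROM address */
}

int main(void)
{
	printf("0xA0 -> 0x%05X\n", normalize_ras_eeprom_addr(0xA0));	/* 0x00000 */
	printf("0xA8 -> 0x%05X\n", normalize_ras_eeprom_addr(0xA8));	/* 0x40000 */
	return 0;
}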
index 2fd77c3..7b2b098 100644 (file)
@@ -361,7 +361,7 @@ static void amdgpu_vram_mgr_virt_start(struct ttm_resource *mem,
  * @man: TTM memory type manager
  * @tbo: TTM BO we need this range for
  * @place: placement flags and restrictions
- * @mem: the resulting mem object
+ * @res: the resulting mem object
  *
  * Allocate VRAM for the given BO.
  */
@@ -487,7 +487,7 @@ error_sub:
  * amdgpu_vram_mgr_del - free ranges
  *
  * @man: TTM memory type manager
- * @mem: TTM memory object
+ * @res: TTM memory object
  *
  * Free the allocated VRAM again.
  */
@@ -522,7 +522,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
  * amdgpu_vram_mgr_alloc_sgt - allocate and fill a sg table
  *
  * @adev: amdgpu device pointer
- * @mem: TTM memory object
+ * @res: TTM memory object
  * @offset: byte offset from the base of VRAM BO
  * @length: number of bytes to export in sg_table
  * @dev: the other device
index ff2307d..23b066b 100644 (file)
@@ -258,6 +258,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
        amdgpu_virt_fini_data_exchange(adev);
        atomic_set(&adev->in_gpu_reset, 1);
 
+       xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
+
        do {
                if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
                        goto flr_done;
index 5057263..bd3b231 100644 (file)
@@ -37,6 +37,7 @@ enum idh_request {
        IDH_REQ_GPU_RESET_ACCESS,
 
        IDH_LOG_VF_ERROR       = 200,
+       IDH_READY_TO_RESET      = 201,
 };
 
 enum idh_event {
index ba1d3ab..f50045c 100644 (file)
 #define mmRCC_DEV0_EPF0_STRAP0_ALDE                    0x0015
 #define mmRCC_DEV0_EPF0_STRAP0_ALDE_BASE_IDX           2
 
-#define mmBIF_DOORBELL_INT_CNTL_ALDE                   0x3878
+#define mmBIF_DOORBELL_INT_CNTL_ALDE                   0x00fe
 #define mmBIF_DOORBELL_INT_CNTL_ALDE_BASE_IDX          2
 #define BIF_DOORBELL_INT_CNTL_ALDE__DOORBELL_INTERRUPT_DISABLE__SHIFT  0x18
 #define BIF_DOORBELL_INT_CNTL_ALDE__DOORBELL_INTERRUPT_DISABLE_MASK    0x01000000L
 
+#define mmBIF_INTR_CNTL_ALDE                           0x0101
+#define mmBIF_INTR_CNTL_ALDE_BASE_IDX                  2
+
 static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
                                        void *ras_error_status);
 
@@ -440,14 +443,23 @@ static int nbio_v7_4_set_ras_controller_irq_state(struct amdgpu_device *adev,
         */
        uint32_t bif_intr_cntl;
 
-       bif_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL);
+       if (adev->asic_type == CHIP_ALDEBARAN)
+               bif_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL_ALDE);
+       else
+               bif_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL);
+
        if (state == AMDGPU_IRQ_STATE_ENABLE) {
                /* set interrupt vector select bit to 0 to select
                 * vector 1 for bare metal case */
                bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
                                              BIF_INTR_CNTL,
                                              RAS_INTR_VEC_SEL, 0);
-               WREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL, bif_intr_cntl);
+
+               if (adev->asic_type == CHIP_ALDEBARAN)
+                       WREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL_ALDE, bif_intr_cntl);
+               else
+                       WREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL, bif_intr_cntl);
+
        }
 
        return 0;
@@ -476,14 +488,22 @@ static int nbio_v7_4_set_ras_err_event_athub_irq_state(struct amdgpu_device *ade
         */
        uint32_t bif_intr_cntl;
 
-       bif_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL);
+       if (adev->asic_type == CHIP_ALDEBARAN)
+               bif_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL_ALDE);
+       else
+               bif_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL);
+
        if (state == AMDGPU_IRQ_STATE_ENABLE) {
                /* set interrupt vector select bit to 0 to select
                 * vector 1 for bare metal case */
                bif_intr_cntl = REG_SET_FIELD(bif_intr_cntl,
                                              BIF_INTR_CNTL,
                                              RAS_INTR_VEC_SEL, 0);
-               WREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL, bif_intr_cntl);
+
+               if (adev->asic_type == CHIP_ALDEBARAN)
+                       WREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL_ALDE, bif_intr_cntl);
+               else
+                       WREG32_SOC15(NBIO, 0, mmBIF_INTR_CNTL, bif_intr_cntl);
        }
 
        return 0;
index 42a35d9..fe9a7cc 100644 (file)
@@ -904,14 +904,7 @@ static bool vi_asic_supports_baco(struct amdgpu_device *adev)
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_TOPAZ:
-               /* Disable BACO support for the specific polaris12 SKU temporarily */
-               if ((adev->pdev->device == 0x699F) &&
-                    (adev->pdev->revision == 0xC7) &&
-                    (adev->pdev->subsystem_vendor == 0x1028) &&
-                    (adev->pdev->subsystem_device == 0x0039))
-                       return false;
-               else
-                       return amdgpu_dpm_is_baco_supported(adev);
+               return amdgpu_dpm_is_baco_supported(adev);
        default:
                return false;
        }
index 491373f..9fc8021 100644 (file)
@@ -2484,7 +2484,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
        }
        if (!p->xnack_enabled) {
                pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
-               return -EFAULT;
+               r = -EFAULT;
+               goto out;
        }
        svms = &p->svms;
 
index 8167236..9b1fc54 100644 (file)
@@ -1200,7 +1200,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
        dc_hardware_init(adev->dm.dc);
 
 #if defined(CONFIG_DRM_AMD_DC_DCN)
-       if (adev->apu_flags) {
+       if ((adev->flags & AMD_IS_APU) && (adev->asic_type >= CHIP_CARRIZO)) {
                struct dc_phy_addr_space_config pa_config;
 
                mmhub_read_system_context(adev, &pa_config);
index cd025c1..330edd6 100644 (file)
@@ -1561,7 +1561,7 @@ bool dc_link_dp_perform_link_training_skip_aux(
        struct dc_link *link,
        const struct dc_link_settings *link_setting)
 {
-       struct link_training_settings lt_settings;
+       struct link_training_settings lt_settings = {0};
 
        dp_decide_training_settings(
                        link,
@@ -1707,7 +1707,7 @@ enum link_training_result dc_link_dp_perform_link_training(
        bool skip_video_pattern)
 {
        enum link_training_result status = LINK_TRAINING_SUCCESS;
-       struct link_training_settings lt_settings;
+       struct link_training_settings lt_settings = {0};
        enum dp_link_encoding encoding =
                        dp_get_link_encoding_format(link_settings);
 
@@ -1923,7 +1923,7 @@ enum link_training_result dc_link_dp_sync_lt_attempt(
     struct dc_link_settings *link_settings,
     struct dc_link_training_overrides *lt_overrides)
 {
-       struct link_training_settings lt_settings;
+       struct link_training_settings lt_settings = {0};
        enum link_training_result lt_status = LINK_TRAINING_SUCCESS;
        enum dp_panel_mode panel_mode = DP_PANEL_MODE_DEFAULT;
        enum clock_source_id dp_cs_id = CLOCK_SOURCE_ID_EXTERNAL;
index dc7823d..dd38796 100644 (file)
@@ -510,8 +510,12 @@ static struct stream_encoder *dcn303_stream_encoder_create(enum engine_id eng_id
        vpg = dcn303_vpg_create(ctx, vpg_inst);
        afmt = dcn303_afmt_create(ctx, afmt_inst);
 
-       if (!enc1 || !vpg || !afmt)
+       if (!enc1 || !vpg || !afmt) {
+               kfree(enc1);
+               kfree(vpg);
+               kfree(afmt);
                return NULL;
+       }
 
        dcn30_dio_stream_encoder_construct(enc1, ctx, ctx->dc_bios, eng_id, vpg, afmt, &stream_enc_regs[eng_id],
                        &se_shift, &se_mask);
index 7db268d..3b37213 100644 (file)
@@ -109,7 +109,7 @@ bool dcn31_is_panel_backlight_on(struct panel_cntl *panel_cntl)
        union dmub_rb_cmd cmd;
 
        if (!dcn31_query_backlight_info(panel_cntl, &cmd))
-               return 0;
+               return false;
 
        return cmd.panel_cntl.data.is_backlight_on;
 }
@@ -119,7 +119,7 @@ bool dcn31_is_panel_powered_on(struct panel_cntl *panel_cntl)
        union dmub_rb_cmd cmd;
 
        if (!dcn31_query_backlight_info(panel_cntl, &cmd))
-               return 0;
+               return false;
 
        return cmd.panel_cntl.data.is_powered_on;
 }
index fbed530..63bbdf8 100644 (file)
@@ -2641,7 +2641,7 @@ static void dml20v2_DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndP
                for (k = 0; k < mode_lib->vba.NumberOfActivePlanes; ++k) {
                        if (mode_lib->vba.PrefetchMode[mode_lib->vba.VoltageLevel][mode_lib->vba.maxMpcComb] == 0) {
                                if (mode_lib->vba.DRAMClockChangeWatermark >
-                               dml_max(mode_lib->vba.StutterEnterPlusExitWatermark, mode_lib->vba.UrgentWatermark))
+                                       dml_max(mode_lib->vba.StutterEnterPlusExitWatermark, mode_lib->vba.UrgentWatermark))
                                        mode_lib->vba.MinTTUVBlank[k] += 25;
                        }
                }
index 2d55627..249cb0a 100644 (file)
@@ -2005,10 +2005,10 @@ static int ss_bias_attr_update(struct amdgpu_device *adev, struct amdgpu_device_
 static struct amdgpu_device_attr amdgpu_device_attrs[] = {
        AMDGPU_DEVICE_ATTR_RW(power_dpm_state,                          ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RW(power_dpm_force_performance_level,        ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
-       AMDGPU_DEVICE_ATTR_RO(pp_num_states,                            ATTR_FLAG_BASIC),
-       AMDGPU_DEVICE_ATTR_RO(pp_cur_state,                             ATTR_FLAG_BASIC),
-       AMDGPU_DEVICE_ATTR_RW(pp_force_state,                           ATTR_FLAG_BASIC),
-       AMDGPU_DEVICE_ATTR_RW(pp_table,                                 ATTR_FLAG_BASIC),
+       AMDGPU_DEVICE_ATTR_RO(pp_num_states,                            ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
+       AMDGPU_DEVICE_ATTR_RO(pp_cur_state,                             ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
+       AMDGPU_DEVICE_ATTR_RW(pp_force_state,                           ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
+       AMDGPU_DEVICE_ATTR_RW(pp_table,                                 ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RW(pp_dpm_sclk,                              ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RW(pp_dpm_mclk,                              ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
        AMDGPU_DEVICE_ATTR_RW(pp_dpm_socclk,                            ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
index 715b422..8156729 100644 (file)
@@ -1335,6 +1335,30 @@ enum smu_cmn2asic_mapping_type {
 #define WORKLOAD_MAP(profile, workload) \
        [profile] = {1, (workload)}
 
+/**
+ * smu_memcpy_trailing - Copy the end of one structure into the middle of another
+ *
+ * @dst: Pointer to destination struct
+ * @first_dst_member: The member name in @dst where the overwrite begins
+ * @last_dst_member: The last member name in @dst included in the overwrite
+ * @src: Pointer to the source struct
+ * @first_src_member: The member name in @src where the copy begins
+ *
+ */
+#define smu_memcpy_trailing(dst, first_dst_member, last_dst_member,       \
+                           src, first_src_member)                         \
+({                                                                        \
+       size_t __src_offset = offsetof(typeof(*(src)), first_src_member);  \
+       size_t __src_size = sizeof(*(src)) - __src_offset;                 \
+       size_t __dst_offset = offsetof(typeof(*(dst)), first_dst_member);  \
+       size_t __dst_size = offsetofend(typeof(*(dst)), last_dst_member) - \
+                           __dst_offset;                                  \
+       BUILD_BUG_ON(__src_size != __dst_size);                            \
+       __builtin_memcpy((u8 *)(dst) + __dst_offset,                       \
+                        (u8 *)(src) + __src_offset,                       \
+                        __dst_size);                                      \
+})
+
 #if !defined(SWSMU_CODE_LAYER_L2) && !defined(SWSMU_CODE_LAYER_L3) && !defined(SWSMU_CODE_LAYER_L4)
 int smu_get_power_limit(void *handle,
                        uint32_t *limit,
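To make the new helper's semantics concrete, here is a standalone sketch (not part of the patch) with made-up struct layouts showing what smu_memcpy_trailing() expands to: copy from @first_src_member to the end of @src into @dst, starting at @first_dst_member and ending after @last_dst_member, with the BUILD_BUG_ON() enforcing that the two spans have the same size.

/* Standalone sketch; src_tbl and dst_tbl are made-up layouts. */
#include <stddef.h>
#include <string.h>

#define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

struct src_tbl { int hdr; int a; int b; int c; };
struct dst_tbl { int x; int a; int b; int c; int tail; };

static void copy_trailing(struct dst_tbl *dst, const struct src_tbl *src)
{
	size_t src_off = offsetof(struct src_tbl, a);		   /* first_src_member */
	size_t dst_off = offsetof(struct dst_tbl, a);		   /* first_dst_member */
	size_t dst_len = offsetofend(struct dst_tbl, c) - dst_off; /* ..last_dst_member */

	/* smu_memcpy_trailing() would BUILD_BUG_ON() if the spans differed */
	memcpy((char *)dst + dst_off, (const char *)src + src_off, dst_len);
}

int main(void)
{
	struct src_tbl s = { 0, 1, 2, 3 };
	struct dst_tbl d = { 9, 0, 0, 0, 9 };

	copy_trailing(&d, &s);	/* d becomes { 9, 1, 2, 3, 9 } */
	return 0;
}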
index 465ff8d..e7803ce 100644 (file)
@@ -27,6 +27,9 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <asm/div64.h>
+#if IS_ENABLED(CONFIG_X86_64)
+#include <asm/intel-family.h>
+#endif
 #include <drm/amdgpu_drm.h>
 #include "ppatomctrl.h"
 #include "atombios.h"
@@ -1733,6 +1736,17 @@ static int smu7_disable_dpm_tasks(struct pp_hwmgr *hwmgr)
        return result;
 }
 
+static bool intel_core_rkl_chk(void)
+{
+#if IS_ENABLED(CONFIG_X86_64)
+       struct cpuinfo_x86 *c = &cpu_data(0);
+
+       return (c->x86 == 6 && c->x86_model == INTEL_FAM6_ROCKETLAKE);
+#else
+       return false;
+#endif
+}
+
 static void smu7_init_dpm_defaults(struct pp_hwmgr *hwmgr)
 {
        struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend);
@@ -1758,7 +1772,8 @@ static void smu7_init_dpm_defaults(struct pp_hwmgr *hwmgr)
 
        data->mclk_dpm_key_disabled = hwmgr->feature_mask & PP_MCLK_DPM_MASK ? false : true;
        data->sclk_dpm_key_disabled = hwmgr->feature_mask & PP_SCLK_DPM_MASK ? false : true;
-       data->pcie_dpm_key_disabled = hwmgr->feature_mask & PP_PCIE_DPM_MASK ? false : true;
+       data->pcie_dpm_key_disabled =
+               intel_core_rkl_chk() || !(hwmgr->feature_mask & PP_PCIE_DPM_MASK);
        /* need to set voltage control types before EVV patching */
        data->voltage_control = SMU7_VOLTAGE_CONTROL_NONE;
        data->vddci_control = SMU7_VOLTAGE_CONTROL_NONE;
index 273df66..e343cc2 100644 (file)
@@ -483,10 +483,8 @@ static int arcturus_append_powerplay_table(struct smu_context *smu)
 
        if ((smc_dpm_table->table_header.format_revision == 4) &&
            (smc_dpm_table->table_header.content_revision == 6))
-               memcpy(&smc_pptable->MaxVoltageStepGfx,
-                      &smc_dpm_table->maxvoltagestepgfx,
-                      sizeof(*smc_dpm_table) - offsetof(struct atom_smc_dpm_info_v4_6, maxvoltagestepgfx));
-
+               smu_memcpy_trailing(smc_pptable, MaxVoltageStepGfx, BoardReserved,
+                                   smc_dpm_table, maxvoltagestepgfx);
        return 0;
 }
 
index f966817..a5fc5d7 100644 (file)
@@ -431,16 +431,16 @@ static int navi10_append_powerplay_table(struct smu_context *smu)
 
        switch (smc_dpm_table->table_header.content_revision) {
        case 5: /* nv10 and nv14 */
-               memcpy(smc_pptable->I2cControllers, smc_dpm_table->I2cControllers,
-                       sizeof(*smc_dpm_table) - sizeof(smc_dpm_table->table_header));
+               smu_memcpy_trailing(smc_pptable, I2cControllers, BoardReserved,
+                                   smc_dpm_table, I2cControllers);
                break;
        case 7: /* nv12 */
                ret = amdgpu_atombios_get_data_table(adev, index, NULL, NULL, NULL,
                                              (uint8_t **)&smc_dpm_table_v4_7);
                if (ret)
                        return ret;
-               memcpy(smc_pptable->I2cControllers, smc_dpm_table_v4_7->I2cControllers,
-                       sizeof(*smc_dpm_table_v4_7) - sizeof(smc_dpm_table_v4_7->table_header));
+               smu_memcpy_trailing(smc_pptable, I2cControllers, BoardReserved,
+                                   smc_dpm_table_v4_7, I2cControllers);
                break;
        default:
                dev_err(smu->adev->dev, "smc_dpm_info with unsupported content revision %d!\n",
index 6eb50b0..3a34214 100644 (file)
@@ -1869,7 +1869,7 @@ static int vangogh_od_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM_TAB
                } else {
                        if (smu->gfx_actual_hard_min_freq > smu->gfx_actual_soft_max_freq) {
                                dev_err(smu->adev->dev,
-                                       "The setting minimun sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n",
+                                       "The setting minimum sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n",
                                        smu->gfx_actual_hard_min_freq,
                                        smu->gfx_actual_soft_max_freq);
                                return -EINVAL;
index b391380..5aa175e 100644 (file)
@@ -426,7 +426,7 @@ static int renoir_od_edit_dpm_table(struct smu_context *smu,
                } else {
                        if (smu->gfx_actual_hard_min_freq > smu->gfx_actual_soft_max_freq) {
                                dev_err(smu->adev->dev,
-                                       "The setting minimun sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n",
+                                       "The setting minimum sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n",
                                        smu->gfx_actual_hard_min_freq,
                                        smu->gfx_actual_soft_max_freq);
                                return -EINVAL;
index ec8c30d..ab65202 100644 (file)
@@ -409,9 +409,8 @@ static int aldebaran_append_powerplay_table(struct smu_context *smu)
 
        if ((smc_dpm_table->table_header.format_revision == 4) &&
            (smc_dpm_table->table_header.content_revision == 10))
-               memcpy(&smc_pptable->GfxMaxCurrent,
-                      &smc_dpm_table->GfxMaxCurrent,
-                      sizeof(*smc_dpm_table) - offsetof(struct atom_smc_dpm_info_v4_10, GfxMaxCurrent));
+               smu_memcpy_trailing(smc_pptable, GfxMaxCurrent, reserved,
+                                   smc_dpm_table, GfxMaxCurrent);
        return 0;
 }
 
index 0f17c25..627ba2e 100644 (file)
@@ -731,7 +731,7 @@ static int yellow_carp_od_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM
                } else {
                        if (smu->gfx_actual_hard_min_freq > smu->gfx_actual_soft_max_freq) {
                                dev_err(smu->adev->dev,
-                                       "The setting minimun sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n",
+                                       "The setting minimum sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n",
                                        smu->gfx_actual_hard_min_freq,
                                        smu->gfx_actual_soft_max_freq);
                                return -EINVAL;
index 51dbe0e..d2969f6 100644 (file)
@@ -6,7 +6,7 @@
 #ifndef INTEL_GT_REQUESTS_H
 #define INTEL_GT_REQUESTS_H
 
-#include <stddef.h>
+#include <linux/stddef.h>
 
 struct intel_engine_cs;
 struct intel_gt;
index 7c903cf..e9ae22b 100644 (file)
@@ -124,6 +124,7 @@ static int mgag200_pixpll_compute_g200se_00(struct mgag200_pll *pixpll, long clo
        unsigned int computed;
 
        m = n = p = s = 0;
+       delta = 0xffffffff;
        permitteddelta = clock * 5 / 1000;
 
        for (testp = 8; testp > 0; testp /= 2) {
index 0da5b31..dfe5f1d 100644 (file)
@@ -58,25 +58,16 @@ static int write_cmd(struct panfrost_device *pfdev, u32 as_nr, u32 cmd)
 }
 
 static void lock_region(struct panfrost_device *pfdev, u32 as_nr,
-                       u64 iova, size_t size)
+                       u64 iova, u64 size)
 {
        u8 region_width;
        u64 region = iova & PAGE_MASK;
-       /*
-        * fls returns:
-        * 1 .. 32
-        *
-        * 10 + fls(num_pages)
-        * results in the range (11 .. 42)
-        */
-
-       size = round_up(size, PAGE_SIZE);
 
-       region_width = 10 + fls(size >> PAGE_SHIFT);
-       if ((size >> PAGE_SHIFT) != (1ul << (region_width - 11))) {
-               /* not pow2, so must go up to the next pow2 */
-               region_width += 1;
-       }
+       /* The size is encoded as ceil(log2(size)) minus 1, which can be
+        * calculated with fls64(). The size must be clamped to hardware
+        * bounds.
+        */
+       size = max_t(u64, size, AS_LOCK_REGION_MIN_SIZE);
+       region_width = fls64(size - 1) - 1;
        region |= region_width;
 
        /* Lock the region that needs to be updated */
@@ -87,7 +78,7 @@ static void lock_region(struct panfrost_device *pfdev, u32 as_nr,
 
 
 static int mmu_hw_do_operation_locked(struct panfrost_device *pfdev, int as_nr,
-                                     u64 iova, size_t size, u32 op)
+                                     u64 iova, u64 size, u32 op)
 {
        if (as_nr < 0)
                return 0;
@@ -104,7 +95,7 @@ static int mmu_hw_do_operation_locked(struct panfrost_device *pfdev, int as_nr,
 
 static int mmu_hw_do_operation(struct panfrost_device *pfdev,
                               struct panfrost_mmu *mmu,
-                              u64 iova, size_t size, u32 op)
+                              u64 iova, u64 size, u32 op)
 {
        int ret;
 
@@ -121,7 +112,7 @@ static void panfrost_mmu_enable(struct panfrost_device *pfdev, struct panfrost_m
        u64 transtab = cfg->arm_mali_lpae_cfg.transtab;
        u64 memattr = cfg->arm_mali_lpae_cfg.memattr;
 
-       mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0UL, AS_COMMAND_FLUSH_MEM);
+       mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
 
        mmu_write(pfdev, AS_TRANSTAB_LO(as_nr), transtab & 0xffffffffUL);
        mmu_write(pfdev, AS_TRANSTAB_HI(as_nr), transtab >> 32);
@@ -137,7 +128,7 @@ static void panfrost_mmu_enable(struct panfrost_device *pfdev, struct panfrost_m
 
 static void panfrost_mmu_disable(struct panfrost_device *pfdev, u32 as_nr)
 {
-       mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0UL, AS_COMMAND_FLUSH_MEM);
+       mmu_hw_do_operation_locked(pfdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM);
 
        mmu_write(pfdev, AS_TRANSTAB_LO(as_nr), 0);
        mmu_write(pfdev, AS_TRANSTAB_HI(as_nr), 0);
@@ -251,7 +242,7 @@ static size_t get_pgsize(u64 addr, size_t size)
 
 static void panfrost_mmu_flush_range(struct panfrost_device *pfdev,
                                     struct panfrost_mmu *mmu,
-                                    u64 iova, size_t size)
+                                    u64 iova, u64 size)
 {
        if (mmu->as < 0)
                return;
index 1940ff8..6c5a11e 100644 (file)
 #define AS_FAULTSTATUS_ACCESS_TYPE_READ                (0x2 << 8)
 #define AS_FAULTSTATUS_ACCESS_TYPE_WRITE       (0x3 << 8)
 
+#define AS_LOCK_REGION_MIN_SIZE                 (1ULL << 15)
+
 #define gpu_write(dev, reg, data) writel(data, dev->iomem + reg)
 #define gpu_read(dev, reg) readl(dev->iomem + reg)
 
index ea4add2..bb9e02c 100644 (file)
@@ -1160,9 +1160,9 @@ int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx,
        }
 
        if (bo->deleted) {
-               ttm_bo_cleanup_refs(bo, false, false, locked);
+               ret = ttm_bo_cleanup_refs(bo, false, false, locked);
                ttm_bo_put(bo);
-               return 0;
+               return ret == -EBUSY ? -ENOSPC : ret;
        }
 
        ttm_bo_del_from_lru(bo);
@@ -1216,7 +1216,7 @@ out:
        if (locked)
                dma_resv_unlock(bo->base.resv);
        ttm_bo_put(bo);
-       return ret;
+       return ret == -EBUSY ? -ENOSPC : ret;
 }
 
 void ttm_bo_tt_destroy(struct ttm_buffer_object *bo)
index 763fa6f..1c5ffe2 100644 (file)
@@ -143,7 +143,6 @@ int ttm_bo_move_memcpy(struct ttm_buffer_object *bo,
        struct ttm_resource *src_mem = bo->resource;
        struct ttm_resource_manager *src_man =
                ttm_manager_type(bdev, src_mem->mem_type);
-       struct ttm_resource src_copy = *src_mem;
        union {
                struct ttm_kmap_iter_tt tt;
                struct ttm_kmap_iter_linear_io io;
@@ -173,11 +172,11 @@ int ttm_bo_move_memcpy(struct ttm_buffer_object *bo,
        }
 
        ttm_move_memcpy(bo, dst_mem->num_pages, dst_iter, src_iter);
-       src_copy = *src_mem;
-       ttm_bo_move_sync_cleanup(bo, dst_mem);
 
        if (!src_iter->ops->maps_tt)
-               ttm_kmap_iter_linear_io_fini(&_src_iter.io, bdev, &src_copy);
+               ttm_kmap_iter_linear_io_fini(&_src_iter.io, bdev, src_mem);
+       ttm_bo_move_sync_cleanup(bo, dst_mem);
+
 out_src_iter:
        if (!dst_iter->ops->maps_tt)
                ttm_kmap_iter_linear_io_fini(&_dst_iter.io, bdev, dst_mem);
index 24031a8..d5cd8b5 100644 (file)
@@ -32,7 +32,6 @@
 #define pr_fmt(fmt) "[TTM] " fmt
 
 #include <linux/sched.h>
-#include <linux/pagemap.h>
 #include <linux/shmem_fs.h>
 #include <linux/file.h>
 #include <drm/drm_cache.h>
index b7dc32a..4a11150 100644 (file)
@@ -1462,7 +1462,7 @@ static const struct hdmi_codec_ops vc4_hdmi_codec_ops = {
        .audio_startup = vc4_hdmi_audio_startup,
 };
 
-struct hdmi_codec_pdata vc4_hdmi_codec_pdata = {
+static struct hdmi_codec_pdata vc4_hdmi_codec_pdata = {
        .ops = &vc4_hdmi_codec_ops,
        .max_i2s_channels = 8,
        .i2s = 1,
index f798922..882c3c8 100644 (file)
@@ -28,10 +28,6 @@ MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 
-static bool use_ktime = true;
-module_param(use_ktime, bool, 0400);
-MODULE_PARM_DESC(use_ktime, "Use ktime for measuring I/O speed");
-
 /*
  * Option parsing.
  */
@@ -110,7 +106,6 @@ struct analog_port {
        char cooked;
        int bads;
        int reads;
-       int speed;
        int loop;
        int fuzz;
        int axes[4];
@@ -119,66 +114,6 @@ struct analog_port {
        int axtime;
 };
 
-/*
- * Time macros.
- */
-
-#ifdef __i386__
-
-#include <linux/i8253.h>
-
-#define GET_TIME(x)    do { if (boot_cpu_has(X86_FEATURE_TSC)) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0)
-#define DELTA(x,y)     (boot_cpu_has(X86_FEATURE_TSC) ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
-#define TIME_NAME      (boot_cpu_has(X86_FEATURE_TSC)?"TSC":"PIT")
-static unsigned int get_time_pit(void)
-{
-        unsigned long flags;
-        unsigned int count;
-
-        raw_spin_lock_irqsave(&i8253_lock, flags);
-        outb_p(0x00, 0x43);
-        count = inb_p(0x40);
-        count |= inb_p(0x40) << 8;
-        raw_spin_unlock_irqrestore(&i8253_lock, flags);
-
-        return count;
-}
-#elif defined(__x86_64__)
-#define GET_TIME(x)    do { x = (unsigned int)rdtsc(); } while (0)
-#define DELTA(x,y)     ((y)-(x))
-#define TIME_NAME      "TSC"
-#elif defined(__alpha__) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_PPC) || defined(CONFIG_RISCV)
-#define GET_TIME(x)    do { x = get_cycles(); } while (0)
-#define DELTA(x,y)     ((y)-(x))
-#define TIME_NAME      "get_cycles"
-#else
-#define FAKE_TIME
-static unsigned long analog_faketime = 0;
-#define GET_TIME(x)     do { x = analog_faketime++; } while(0)
-#define DELTA(x,y)     ((y)-(x))
-#define TIME_NAME      "Unreliable"
-#warning Precise timer not defined for this architecture.
-#endif
-
-static inline u64 get_time(void)
-{
-       if (use_ktime) {
-               return ktime_get_ns();
-       } else {
-               unsigned int x;
-               GET_TIME(x);
-               return x;
-       }
-}
-
-static inline unsigned int delta(u64 x, u64 y)
-{
-       if (use_ktime)
-               return y - x;
-       else
-               return DELTA((unsigned int)x, (unsigned int)y);
-}
-
 /*
  * analog_decode() decodes analog joystick data and reports input events.
  */
@@ -234,18 +169,18 @@ static void analog_decode(struct analog *analog, int *axes, int *initial, int bu
 static int analog_cooked_read(struct analog_port *port)
 {
        struct gameport *gameport = port->gameport;
-       u64 time[4], start, loop, now;
+       ktime_t time[4], start, loop, now;
        unsigned int loopout, timeout;
        unsigned char data[4], this, last;
        unsigned long flags;
        int i, j;
 
        loopout = (ANALOG_LOOP_TIME * port->loop) / 1000;
-       timeout = ANALOG_MAX_TIME * port->speed;
+       timeout = ANALOG_MAX_TIME * NSEC_PER_MSEC;
 
        local_irq_save(flags);
        gameport_trigger(gameport);
-       now = get_time();
+       now = ktime_get();
        local_irq_restore(flags);
 
        start = now;
@@ -258,16 +193,16 @@ static int analog_cooked_read(struct analog_port *port)
 
                local_irq_disable();
                this = gameport_read(gameport) & port->mask;
-               now = get_time();
+               now = ktime_get();
                local_irq_restore(flags);
 
-               if ((last ^ this) && (delta(loop, now) < loopout)) {
+               if ((last ^ this) && (ktime_sub(now, loop) < loopout)) {
                        data[i] = last ^ this;
                        time[i] = now;
                        i++;
                }
 
-       } while (this && (i < 4) && (delta(start, now) < timeout));
+       } while (this && (i < 4) && (ktime_sub(now, start) < timeout));
 
        this <<= 4;
 
@@ -275,7 +210,7 @@ static int analog_cooked_read(struct analog_port *port)
                this |= data[i];
                for (j = 0; j < 4; j++)
                        if (data[i] & (1 << j))
-                               port->axes[j] = (delta(start, time[i]) << ANALOG_FUZZ_BITS) / port->loop;
+                               port->axes[j] = ((u32)ktime_sub(time[i], start) << ANALOG_FUZZ_BITS) / port->loop;
        }
 
        return -(this != port->mask);
@@ -375,38 +310,22 @@ static void analog_calibrate_timer(struct analog_port *port)
 {
        struct gameport *gameport = port->gameport;
        unsigned int i, t, tx;
-       u64 t1, t2, t3;
+       ktime_t t1, t2, t3;
        unsigned long flags;
 
-       if (use_ktime) {
-               port->speed = 1000000;
-       } else {
-               local_irq_save(flags);
-               t1 = get_time();
-#ifdef FAKE_TIME
-               analog_faketime += 830;
-#endif
-               mdelay(1);
-               t2 = get_time();
-               t3 = get_time();
-               local_irq_restore(flags);
-
-               port->speed = delta(t1, t2) - delta(t2, t3);
-       }
-
        tx = ~0;
 
        for (i = 0; i < 50; i++) {
                local_irq_save(flags);
-               t1 = get_time();
+               t1 = ktime_get();
                for (t = 0; t < 50; t++) {
                        gameport_read(gameport);
-                       t2 = get_time();
+                       t2 = ktime_get();
                }
-               t3 = get_time();
+               t3 = ktime_get();
                local_irq_restore(flags);
                udelay(i);
-               t = delta(t1, t2) - delta(t2, t3);
+               t = ktime_sub(t2, t1) - ktime_sub(t3, t2);
                if (t < tx) tx = t;
        }
 
@@ -611,7 +530,7 @@ static int analog_init_port(struct gameport *gameport, struct gameport_driver *d
                t = gameport_read(gameport);
                msleep(ANALOG_MAX_TIME);
                port->mask = (gameport_read(gameport) ^ t) & t & 0xf;
-               port->fuzz = (port->speed * ANALOG_FUZZ_MAGIC) / port->loop / 1000 + ANALOG_FUZZ_BITS;
+               port->fuzz = (NSEC_PER_MSEC * ANALOG_FUZZ_MAGIC) / port->loop / 1000 + ANALOG_FUZZ_BITS;
 
                for (i = 0; i < ANALOG_INIT_RETRIES; i++) {
                        if (!analog_cooked_read(port))
index 40a070a..e75650e 100644 (file)
@@ -210,7 +210,7 @@ config KEYBOARD_LKKBD
        select SERIO
        help
          Say Y here if you want to use a LK201 or LK401 style serial
-         keyboard. This keyboard is also useable on PCs if you attach
+         keyboard. This keyboard is also usable on PCs if you attach
          it with the inputattach program. The connector pinout is
          described within lkkbd.c.
 
index 6d5be48..bf72ab8 100644 (file)
@@ -193,7 +193,7 @@ static const struct of_device_id adc_keys_of_match[] = {
 MODULE_DEVICE_TABLE(of, adc_keys_of_match);
 #endif
 
-static struct platform_driver __refdata adc_keys_driver = {
+static struct platform_driver adc_keys_driver = {
        .driver = {
                .name = "adc_keys",
                .of_match_table = of_match_ptr(adc_keys_of_match),
index 90a59b9..1592da4 100644 (file)
@@ -17,7 +17,7 @@
 #include <linux/platform_device.h>
 #include <linux/input.h>
 #include <linux/i2c.h>
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
 #include <linux/slab.h>
 
 #include <linux/platform_data/adp5588.h>
index 654e047..bdd2644 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/platform_device.h>
 #include <linux/input.h>
 #include <linux/i2c.h>
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
 #include <linux/slab.h>
 
 #include <linux/input/adp5589.h>
index c819433..e0e931e 100644 (file)
@@ -157,7 +157,7 @@ static int ep93xx_keypad_open(struct input_dev *pdev)
 
        if (!keypad->enabled) {
                ep93xx_keypad_config(keypad);
-               clk_enable(keypad->clk);
+               clk_prepare_enable(keypad->clk);
                keypad->enabled = true;
        }
 
@@ -169,7 +169,7 @@ static void ep93xx_keypad_close(struct input_dev *pdev)
        struct ep93xx_keypad *keypad = input_get_drvdata(pdev);
 
        if (keypad->enabled) {
-               clk_disable(keypad->clk);
+               clk_disable_unprepare(keypad->clk);
                keypad->enabled = false;
        }
 }
index 498cde3..dd5227c 100644 (file)
@@ -309,18 +309,6 @@ config INPUT_GPIO_VIBRA
          To compile this driver as a module, choose M here: the module will be
          called gpio-vibra.
 
-config INPUT_IXP4XX_BEEPER
-       tristate "IXP4XX Beeper support"
-       depends on ARCH_IXP4XX
-       help
-         If you say yes here, you can connect a beeper to the
-         ixp4xx gpio pins. This is used by the LinkSys NSLU2.
-
-         If unsure, say Y.
-
-         To compile this driver as a module, choose M here: the
-         module will be called ixp4xx-beeper.
-
 config INPUT_COBALT_BTNS
        tristate "Cobalt button interface"
        depends on MIPS_COBALT
@@ -811,16 +799,6 @@ config INPUT_XEN_KBDDEV_FRONTEND
          To compile this driver as a module, choose M here: the
          module will be called xen-kbdfront.
 
-config INPUT_SIRFSOC_ONKEY
-       tristate "CSR SiRFSoC power on/off/suspend key support"
-       depends on ARCH_SIRF && OF
-       default y
-       help
-         Say Y here if you want to support for the SiRFSoC power on/off/suspend key
-         in Linux, after you press the onkey, system will suspend.
-
-         If unsure, say N.
-
 config INPUT_IDEAPAD_SLIDEBAR
        tristate "IdeaPad Laptop Slidebar"
        depends on INPUT
index f593bee..b92c53a 100644 (file)
@@ -44,7 +44,6 @@ obj-$(CONFIG_HP_SDC_RTC)              += hp_sdc_rtc.o
 obj-$(CONFIG_INPUT_IMS_PCU)            += ims-pcu.o
 obj-$(CONFIG_INPUT_IQS269A)            += iqs269a.o
 obj-$(CONFIG_INPUT_IQS626A)            += iqs626a.o
-obj-$(CONFIG_INPUT_IXP4XX_BEEPER)      += ixp4xx-beeper.o
 obj-$(CONFIG_INPUT_KEYSPAN_REMOTE)     += keyspan_remote.o
 obj-$(CONFIG_INPUT_KXTJ9)              += kxtj9.o
 obj-$(CONFIG_INPUT_M68K_BEEP)          += m68kspkr.o
@@ -74,7 +73,6 @@ obj-$(CONFIG_INPUT_GPIO_ROTARY_ENCODER)       += rotary_encoder.o
 obj-$(CONFIG_INPUT_RK805_PWRKEY)       += rk805-pwrkey.o
 obj-$(CONFIG_INPUT_SC27XX_VIBRA)       += sc27xx-vibra.o
 obj-$(CONFIG_INPUT_SGI_BTNS)           += sgi_btns.o
-obj-$(CONFIG_INPUT_SIRFSOC_ONKEY)      += sirfsoc-onkey.o
 obj-$(CONFIG_INPUT_SOC_BUTTON_ARRAY)   += soc_button_array.o
 obj-$(CONFIG_INPUT_SPARCSPKR)          += sparcspkr.o
 obj-$(CONFIG_INPUT_STPMIC1_ONKEY)      += stpmic1_onkey.o
diff --git a/drivers/input/misc/ixp4xx-beeper.c b/drivers/input/misc/ixp4xx-beeper.c
deleted file mode 100644 (file)
index 05018d0..0000000
+++ /dev/null
@@ -1,183 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Generic IXP4xx beeper driver
- *
- * Copyright (C) 2005 Tower Technologies
- *
- * based on nslu2-io.c
- *  Copyright (C) 2004 Karen Spearel
- *
- * Author: Alessandro Zummo <a.zummo@towertech.it>
- * Maintainers: http://www.nslu2-linux.org/
- */
-
-#include <linux/module.h>
-#include <linux/input.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/gpio.h>
-#include <mach/hardware.h>
-
-MODULE_AUTHOR("Alessandro Zummo <a.zummo@towertech.it>");
-MODULE_DESCRIPTION("ixp4xx beeper driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:ixp4xx-beeper");
-
-static DEFINE_SPINLOCK(beep_lock);
-
-static int ixp4xx_timer2_irq;
-
-static void ixp4xx_spkr_control(unsigned int pin, unsigned int count)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&beep_lock, flags);
-
-       if (count) {
-               gpio_direction_output(pin, 0);
-               *IXP4XX_OSRT2 = (count & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE;
-       } else {
-               gpio_direction_output(pin, 1);
-               gpio_direction_input(pin);
-               *IXP4XX_OSRT2 = 0;
-       }
-
-       spin_unlock_irqrestore(&beep_lock, flags);
-}
-
-static int ixp4xx_spkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
-{
-       unsigned int pin = (unsigned int) input_get_drvdata(dev);
-       unsigned int count = 0;
-
-       if (type != EV_SND)
-               return -1;
-
-       switch (code) {
-               case SND_BELL:
-                       if (value)
-                               value = 1000;
-               case SND_TONE:
-                       break;
-               default:
-                       return -1;
-       }
-
-       if (value > 20 && value < 32767)
-               count = (ixp4xx_timer_freq / (value * 4)) - 1;
-
-       ixp4xx_spkr_control(pin, count);
-
-       return 0;
-}
-
-static irqreturn_t ixp4xx_spkr_interrupt(int irq, void *dev_id)
-{
-       unsigned int pin = (unsigned int) dev_id;
-
-       /* clear interrupt */
-       *IXP4XX_OSST = IXP4XX_OSST_TIMER_2_PEND;
-
-       /* flip the beeper output */
-       gpio_set_value(pin, !gpio_get_value(pin));
-
-       return IRQ_HANDLED;
-}
-
-static int ixp4xx_spkr_probe(struct platform_device *dev)
-{
-       struct input_dev *input_dev;
-       int irq;
-       int err;
-
-       input_dev = input_allocate_device();
-       if (!input_dev)
-               return -ENOMEM;
-
-       input_set_drvdata(input_dev, (void *) dev->id);
-
-       input_dev->name = "ixp4xx beeper";
-       input_dev->phys = "ixp4xx/gpio";
-       input_dev->id.bustype = BUS_HOST;
-       input_dev->id.vendor  = 0x001f;
-       input_dev->id.product = 0x0001;
-       input_dev->id.version = 0x0100;
-       input_dev->dev.parent = &dev->dev;
-
-       input_dev->evbit[0] = BIT_MASK(EV_SND);
-       input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE);
-       input_dev->event = ixp4xx_spkr_event;
-
-       irq = platform_get_irq(dev, 0);
-       if (irq < 0) {
-               err = irq;
-               goto err_free_device;
-       }
-
-       err = gpio_request(dev->id, "ixp4-beeper");
-       if (err)
-               goto err_free_device;
-
-       err = request_irq(irq, &ixp4xx_spkr_interrupt,
-                         IRQF_NO_SUSPEND, "ixp4xx-beeper",
-                         (void *) dev->id);
-       if (err)
-               goto err_free_gpio;
-       ixp4xx_timer2_irq = irq;
-
-       err = input_register_device(input_dev);
-       if (err)
-               goto err_free_irq;
-
-       platform_set_drvdata(dev, input_dev);
-
-       return 0;
-
- err_free_irq:
-       free_irq(irq, (void *)dev->id);
- err_free_gpio:
-       gpio_free(dev->id);
- err_free_device:
-       input_free_device(input_dev);
-
-       return err;
-}
-
-static int ixp4xx_spkr_remove(struct platform_device *dev)
-{
-       struct input_dev *input_dev = platform_get_drvdata(dev);
-       unsigned int pin = (unsigned int) input_get_drvdata(input_dev);
-
-       input_unregister_device(input_dev);
-
-       /* turn the speaker off */
-       disable_irq(ixp4xx_timer2_irq);
-       ixp4xx_spkr_control(pin, 0);
-
-       free_irq(ixp4xx_timer2_irq, (void *)dev->id);
-       gpio_free(dev->id);
-
-       return 0;
-}
-
-static void ixp4xx_spkr_shutdown(struct platform_device *dev)
-{
-       struct input_dev *input_dev = platform_get_drvdata(dev);
-       unsigned int pin = (unsigned int) input_get_drvdata(input_dev);
-
-       /* turn off the speaker */
-       disable_irq(ixp4xx_timer2_irq);
-       ixp4xx_spkr_control(pin, 0);
-}
-
-static struct platform_driver ixp4xx_spkr_platform_driver = {
-       .driver         = {
-               .name   = "ixp4xx-beeper",
-       },
-       .probe          = ixp4xx_spkr_probe,
-       .remove         = ixp4xx_spkr_remove,
-       .shutdown       = ixp4xx_spkr_shutdown,
-};
-module_platform_driver(ixp4xx_spkr_platform_driver);
-
index 10e3fc0..3360960 100644 (file)
@@ -284,7 +284,7 @@ static int pm8941_pwrkey_probe(struct platform_device *pdev)
        }
 
        if (pwrkey->data->supports_ps_hold_poff_config) {
-               pwrkey->reboot_notifier.notifier_call = pm8941_reboot_notify,
+               pwrkey->reboot_notifier.notifier_call = pm8941_reboot_notify;
                error = register_reboot_notifier(&pwrkey->reboot_notifier);
                if (error) {
                        dev_err(&pdev->dev, "failed to register reboot notifier: %d\n",
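
The pm8941-pwrkey change above is a one-character fix: the assignment originally ended with a comma, so it and the register_reboot_notifier() call were fused into a single comma expression. It happened to behave the same, but the intent is two statements. A small stand-alone illustration of the hazard (plain C, hypothetical names):

#include <stdio.h>

static int register_handler(int value)
{
	printf("registered handler %d\n", value);
	return 0;
}

int main(void)
{
	int callback = 0;
	int error;

	/* A trailing comma instead of a semicolon fuses what looks like two
	 * statements into one comma expression.  It compiles and even runs
	 * the same here, which is why the slip is easy to miss, but wrapping
	 * the first line in an 'if' would silently make both halves
	 * conditional. */
	callback = 7,			/* should end with ';' */
	error = register_handler(callback);

	printf("callback=%d error=%d\n", callback, error);
	return 0;
}
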
diff --git a/drivers/input/misc/sirfsoc-onkey.c b/drivers/input/misc/sirfsoc-onkey.c
deleted file mode 100644 (file)
index 7982bf8..0000000
+++ /dev/null
@@ -1,207 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Power key driver for SiRF PrimaII
- *
- * Copyright (c) 2013 - 2014 Cambridge Silicon Radio Limited, a CSR plc group
- * company.
- */
-
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/input.h>
-#include <linux/rtc/sirfsoc_rtciobrg.h>
-#include <linux/of.h>
-#include <linux/workqueue.h>
-
-struct sirfsoc_pwrc_drvdata {
-       u32                     pwrc_base;
-       struct input_dev        *input;
-       struct delayed_work     work;
-};
-
-#define PWRC_ON_KEY_BIT                        (1 << 0)
-
-#define PWRC_INT_STATUS                        0xc
-#define PWRC_INT_MASK                  0x10
-#define PWRC_PIN_STATUS                        0x14
-#define PWRC_KEY_DETECT_UP_TIME                20      /* ms*/
-
-static int sirfsoc_pwrc_is_on_key_down(struct sirfsoc_pwrc_drvdata *pwrcdrv)
-{
-       u32 state = sirfsoc_rtc_iobrg_readl(pwrcdrv->pwrc_base +
-                                                       PWRC_PIN_STATUS);
-       return !(state & PWRC_ON_KEY_BIT); /* ON_KEY is active low */
-}
-
-static void sirfsoc_pwrc_report_event(struct work_struct *work)
-{
-       struct sirfsoc_pwrc_drvdata *pwrcdrv =
-               container_of(work, struct sirfsoc_pwrc_drvdata, work.work);
-
-       if (sirfsoc_pwrc_is_on_key_down(pwrcdrv)) {
-               schedule_delayed_work(&pwrcdrv->work,
-                       msecs_to_jiffies(PWRC_KEY_DETECT_UP_TIME));
-       } else {
-               input_event(pwrcdrv->input, EV_KEY, KEY_POWER, 0);
-               input_sync(pwrcdrv->input);
-       }
-}
-
-static irqreturn_t sirfsoc_pwrc_isr(int irq, void *dev_id)
-{
-       struct sirfsoc_pwrc_drvdata *pwrcdrv = dev_id;
-       u32 int_status;
-
-       int_status = sirfsoc_rtc_iobrg_readl(pwrcdrv->pwrc_base +
-                                                       PWRC_INT_STATUS);
-       sirfsoc_rtc_iobrg_writel(int_status & ~PWRC_ON_KEY_BIT,
-                                pwrcdrv->pwrc_base + PWRC_INT_STATUS);
-
-       input_event(pwrcdrv->input, EV_KEY, KEY_POWER, 1);
-       input_sync(pwrcdrv->input);
-       schedule_delayed_work(&pwrcdrv->work,
-                             msecs_to_jiffies(PWRC_KEY_DETECT_UP_TIME));
-
-       return IRQ_HANDLED;
-}
-
-static void sirfsoc_pwrc_toggle_interrupts(struct sirfsoc_pwrc_drvdata *pwrcdrv,
-                                          bool enable)
-{
-       u32 int_mask;
-
-       int_mask = sirfsoc_rtc_iobrg_readl(pwrcdrv->pwrc_base + PWRC_INT_MASK);
-       if (enable)
-               int_mask |= PWRC_ON_KEY_BIT;
-       else
-               int_mask &= ~PWRC_ON_KEY_BIT;
-       sirfsoc_rtc_iobrg_writel(int_mask, pwrcdrv->pwrc_base + PWRC_INT_MASK);
-}
-
-static int sirfsoc_pwrc_open(struct input_dev *input)
-{
-       struct sirfsoc_pwrc_drvdata *pwrcdrv = input_get_drvdata(input);
-
-       sirfsoc_pwrc_toggle_interrupts(pwrcdrv, true);
-
-       return 0;
-}
-
-static void sirfsoc_pwrc_close(struct input_dev *input)
-{
-       struct sirfsoc_pwrc_drvdata *pwrcdrv = input_get_drvdata(input);
-
-       sirfsoc_pwrc_toggle_interrupts(pwrcdrv, false);
-       cancel_delayed_work_sync(&pwrcdrv->work);
-}
-
-static const struct of_device_id sirfsoc_pwrc_of_match[] = {
-       { .compatible = "sirf,prima2-pwrc" },
-       {},
-};
-MODULE_DEVICE_TABLE(of, sirfsoc_pwrc_of_match);
-
-static int sirfsoc_pwrc_probe(struct platform_device *pdev)
-{
-       struct device_node *np = pdev->dev.of_node;
-       struct sirfsoc_pwrc_drvdata *pwrcdrv;
-       int irq;
-       int error;
-
-       pwrcdrv = devm_kzalloc(&pdev->dev, sizeof(struct sirfsoc_pwrc_drvdata),
-                              GFP_KERNEL);
-       if (!pwrcdrv) {
-               dev_info(&pdev->dev, "Not enough memory for the device data\n");
-               return -ENOMEM;
-       }
-
-       /*
-        * We can't use of_iomap because pwrc is not mapped in memory,
-        * the so-called base address is only offset in rtciobrg
-        */
-       error = of_property_read_u32(np, "reg", &pwrcdrv->pwrc_base);
-       if (error) {
-               dev_err(&pdev->dev,
-                       "unable to find base address of pwrc node in dtb\n");
-               return error;
-       }
-
-       pwrcdrv->input = devm_input_allocate_device(&pdev->dev);
-       if (!pwrcdrv->input)
-               return -ENOMEM;
-
-       pwrcdrv->input->name = "sirfsoc pwrckey";
-       pwrcdrv->input->phys = "pwrc/input0";
-       pwrcdrv->input->evbit[0] = BIT_MASK(EV_KEY);
-       input_set_capability(pwrcdrv->input, EV_KEY, KEY_POWER);
-
-       INIT_DELAYED_WORK(&pwrcdrv->work, sirfsoc_pwrc_report_event);
-
-       pwrcdrv->input->open = sirfsoc_pwrc_open;
-       pwrcdrv->input->close = sirfsoc_pwrc_close;
-
-       input_set_drvdata(pwrcdrv->input, pwrcdrv);
-
-       /* Make sure the device is quiesced */
-       sirfsoc_pwrc_toggle_interrupts(pwrcdrv, false);
-
-       irq = platform_get_irq(pdev, 0);
-       error = devm_request_irq(&pdev->dev, irq,
-                                sirfsoc_pwrc_isr, 0,
-                                "sirfsoc_pwrc_int", pwrcdrv);
-       if (error) {
-               dev_err(&pdev->dev, "unable to claim irq %d, error: %d\n",
-                       irq, error);
-               return error;
-       }
-
-       error = input_register_device(pwrcdrv->input);
-       if (error) {
-               dev_err(&pdev->dev,
-                       "unable to register input device, error: %d\n",
-                       error);
-               return error;
-       }
-
-       dev_set_drvdata(&pdev->dev, pwrcdrv);
-       device_init_wakeup(&pdev->dev, 1);
-
-       return 0;
-}
-
-static int __maybe_unused sirfsoc_pwrc_resume(struct device *dev)
-{
-       struct sirfsoc_pwrc_drvdata *pwrcdrv = dev_get_drvdata(dev);
-       struct input_dev *input = pwrcdrv->input;
-
-       /*
-        * Do not mask pwrc interrupt as we want pwrc work as a wakeup source
-        * if users touch X_ONKEY_B, see arch/arm/mach-prima2/pm.c
-        */
-       mutex_lock(&input->mutex);
-       if (input_device_enabled(input))
-               sirfsoc_pwrc_toggle_interrupts(pwrcdrv, true);
-       mutex_unlock(&input->mutex);
-
-       return 0;
-}
-
-static SIMPLE_DEV_PM_OPS(sirfsoc_pwrc_pm_ops, NULL, sirfsoc_pwrc_resume);
-
-static struct platform_driver sirfsoc_pwrc_driver = {
-       .probe          = sirfsoc_pwrc_probe,
-       .driver         = {
-               .name   = "sirfsoc-pwrc",
-               .pm     = &sirfsoc_pwrc_pm_ops,
-               .of_match_table = sirfsoc_pwrc_of_match,
-       }
-};
-
-module_platform_driver(sirfsoc_pwrc_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Binghua Duan <Binghua.Duan@csr.com>, Xianglong Du <Xianglong.Du@csr.com>");
-MODULE_DESCRIPTION("CSR Prima2 PWRC Driver");
-MODULE_ALIAS("platform:sirfsoc-pwrc");
index dc4a240..3c84dee 100644 (file)
@@ -55,8 +55,9 @@
 #define ETP_FW_PAGE_SIZE_512   512
 #define ETP_FW_SIGNATURE_SIZE  6
 
-#define ETP_PRODUCT_ID_DELBIN  0x00C2
+#define ETP_PRODUCT_ID_WHITEBOX        0x00B8
 #define ETP_PRODUCT_ID_VOXEL   0x00BF
+#define ETP_PRODUCT_ID_DELBIN  0x00C2
 #define ETP_PRODUCT_ID_MAGPIE  0x0120
 #define ETP_PRODUCT_ID_BOBBA   0x0121
 
index dad22c1..47af62c 100644 (file)
@@ -105,6 +105,7 @@ static u32 elan_i2c_lookup_quirks(u16 ic_type, u16 product_id)
                u32 quirks;
        } elan_i2c_quirks[] = {
                { 0x0D, ETP_PRODUCT_ID_DELBIN, ETP_QUIRK_QUICK_WAKEUP },
+               { 0x0D, ETP_PRODUCT_ID_WHITEBOX, ETP_QUIRK_QUICK_WAKEUP },
                { 0x10, ETP_PRODUCT_ID_VOXEL, ETP_QUIRK_QUICK_WAKEUP },
                { 0x14, ETP_PRODUCT_ID_MAGPIE, ETP_QUIRK_QUICK_WAKEUP },
                { 0x14, ETP_PRODUCT_ID_BOBBA, ETP_QUIRK_QUICK_WAKEUP },
index 3ac57a9..51b6850 100644 (file)
@@ -220,16 +220,4 @@ static struct parport_driver parkbd_parport_driver = {
        .detach = parkbd_detach,
        .devmodel = true,
 };
-
-static int __init parkbd_init(void)
-{
-       return parport_register_driver(&parkbd_parport_driver);
-}
-
-static void __exit parkbd_exit(void)
-{
-       parport_unregister_driver(&parkbd_parport_driver);
-}
-
-module_init(parkbd_init);
-module_exit(parkbd_exit);
+module_parport_driver(parkbd_parport_driver);
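
module_parport_driver() belongs to the module_*_driver() family of helpers and generates the init/exit boilerplate that the removed parkbd_init()/parkbd_exit() pair spelled out by hand. Roughly, the added line is equivalent to the following (an illustrative expansion, not the kernel's literal macro definition):

static int __init parkbd_parport_driver_init(void)
{
	return parport_register_driver(&parkbd_parport_driver);
}
module_init(parkbd_parport_driver_init);

static void __exit parkbd_parport_driver_exit(void)
{
	parport_unregister_driver(&parkbd_parport_driver);
}
module_exit(parkbd_parport_driver_exit);
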
index ad454cd..d4e7473 100644 (file)
@@ -932,7 +932,7 @@ config TOUCHSCREEN_USB_COMPOSITE
          - JASTEC USB Touch Controller/DigiTech DTR-02U
          - Zytronic controllers
          - Elo TouchSystems 2700 IntelliTouch
-         - EasyTouch USB Touch Controller from Data Modul
+         - EasyTouch USB Touch Controller from Data Module
          - e2i (Mimo monitors)
 
          Have a look at <http://linux.chapter7.ch/touchkit/> for
index 263de3b..bb2e1cb 100644 (file)
@@ -899,6 +899,7 @@ static int edt_ft5x06_ts_identify(struct i2c_client *client,
                 * the identification registers.
                 */
                switch (rdbuf[0]) {
+               case 0x11:   /* EDT EP0110M09 */
                case 0x35:   /* EDT EP0350M09 */
                case 0x43:   /* EDT EP0430M09 */
                case 0x50:   /* EDT EP0500M09 */
index 0efd1a1..9fa3b0e 100644 (file)
@@ -54,6 +54,7 @@
 
 enum mms_type {
        TYPE_MMS114     = 114,
+       TYPE_MMS134S    = 134,
        TYPE_MMS136     = 136,
        TYPE_MMS152     = 152,
        TYPE_MMS345L    = 345,
@@ -212,7 +213,7 @@ static irqreturn_t mms114_interrupt(int irq, void *dev_id)
                goto out;
 
        /* MMS136 has slightly different event size */
-       if (data->type == TYPE_MMS136)
+       if (data->type == TYPE_MMS134S || data->type == TYPE_MMS136)
                touch_size = packet_size / MMS136_EVENT_SIZE;
        else
                touch_size = packet_size / MMS114_EVENT_SIZE;
@@ -281,6 +282,7 @@ static int mms114_get_version(struct mms114_data *data)
                break;
 
        case TYPE_MMS114:
+       case TYPE_MMS134S:
        case TYPE_MMS136:
                error = __mms114_read_reg(data, MMS114_TSP_REV, 6, buf);
                if (error)
@@ -304,8 +306,9 @@ static int mms114_setup_regs(struct mms114_data *data)
        if (error < 0)
                return error;
 
-       /* Only MMS114 and MMS136 have configuration and power on registers */
-       if (data->type != TYPE_MMS114 && data->type != TYPE_MMS136)
+       /* MMS114, MMS134S and MMS136 have configuration and power on registers */
+       if (data->type != TYPE_MMS114 && data->type != TYPE_MMS134S &&
+           data->type != TYPE_MMS136)
                return 0;
 
        error = mms114_set_active(data, true);
@@ -487,7 +490,8 @@ static int mms114_probe(struct i2c_client *client,
                                     0, data->props.max_y, 0, 0);
        }
 
-       if (data->type == TYPE_MMS114 || data->type == TYPE_MMS136) {
+       if (data->type == TYPE_MMS114 || data->type == TYPE_MMS134S ||
+           data->type == TYPE_MMS136) {
                /*
                 * The firmware handles movement and pressure fuzz, so
                 * don't duplicate that in software.
@@ -611,6 +615,9 @@ static const struct of_device_id mms114_dt_match[] = {
        {
                .compatible = "melfas,mms114",
                .data = (void *)TYPE_MMS114,
+       }, {
+               .compatible = "melfas,mms134s",
+               .data = (void *)TYPE_MMS134S,
        }, {
                .compatible = "melfas,mms136",
                .data = (void *)TYPE_MMS136,
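
Each new compatible entry above carries the controller type in its .data field, so probe can recover the enum from the matched entry and branch on it, which is how the MMS134S paths added in the earlier hunks get selected. A hedged sketch of that lookup (helper and structure names are illustrative; the real driver differs in detail):

#include <linux/i2c.h>
#include <linux/of_device.h>
#include <linux/slab.h>

enum mms_type {			/* mirrors the driver's enum shown above */
	TYPE_MMS114  = 114,
	TYPE_MMS134S = 134,
	TYPE_MMS136  = 136,
};

struct demo_ts_data {
	enum mms_type type;
};

static int demo_ts_probe(struct i2c_client *client)
{
	struct demo_ts_data *data;

	data = devm_kzalloc(&client->dev, sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	/* Recover the enum stashed in of_device_id.data for the matched node. */
	data->type = (enum mms_type)(uintptr_t)
			of_device_get_match_data(&client->dev);

	/* MMS114, MMS134S and MMS136 share configuration/power-on registers. */
	if (data->type == TYPE_MMS114 || data->type == TYPE_MMS134S ||
	    data->type == TYPE_MMS136) {
		/* ... set up configuration registers here ... */
	}

	i2c_set_clientdata(client, data);
	return 0;
}
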
index 8ad8618..124c41a 100644 (file)
@@ -82,7 +82,7 @@ config IOMMU_DEBUGFS
 choice
        prompt "IOMMU default domain type"
        depends on IOMMU_API
-       default IOMMU_DEFAULT_DMA_LAZY if AMD_IOMMU || INTEL_IOMMU
+       default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
        default IOMMU_DEFAULT_DMA_STRICT
        help
          Choose the type of IOMMU domain used to manage DMA API usage by
index bdcf167..2a822b2 100644 (file)
@@ -297,6 +297,22 @@ int amd_iommu_get_num_iommus(void)
        return amd_iommus_present;
 }
 
+#ifdef CONFIG_IRQ_REMAP
+static bool check_feature_on_all_iommus(u64 mask)
+{
+       bool ret = false;
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu) {
+               ret = iommu_feature(iommu, mask);
+               if (!ret)
+                       return false;
+       }
+
+       return true;
+}
+#endif
+
 /*
  * For IVHD type 0x11/0x40, EFR is also available via IVHD.
  * Default to IVHD EFR since it is available sooner
@@ -813,9 +829,9 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu)
        return 0;
 }
 
-#ifdef CONFIG_IRQ_REMAP
 static int iommu_init_ga_log(struct amd_iommu *iommu)
 {
+#ifdef CONFIG_IRQ_REMAP
        u64 entry;
 
        if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
@@ -845,25 +861,9 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
 err_out:
        free_ga_log(iommu);
        return -EINVAL;
-}
-#endif /* CONFIG_IRQ_REMAP */
-
-static int iommu_init_ga(struct amd_iommu *iommu)
-{
-       int ret = 0;
-
-#ifdef CONFIG_IRQ_REMAP
-       /* Note: We have already checked GASup from IVRS table.
-        *       Now, we need to make sure that GAMSup is set.
-        */
-       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
-           !iommu_feature(iommu, FEATURE_GAM_VAPIC))
-               amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
-
-       ret = iommu_init_ga_log(iommu);
+#else
+       return 0;
 #endif /* CONFIG_IRQ_REMAP */
-
-       return ret;
 }
 
 static int __init alloc_cwwb_sem(struct amd_iommu *iommu)
@@ -1845,7 +1845,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
        if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
                return -ENOMEM;
 
-       ret = iommu_init_ga(iommu);
+       ret = iommu_init_ga_log(iommu);
        if (ret)
                return ret;
 
@@ -2479,6 +2479,14 @@ static void early_enable_iommus(void)
        }
 
 #ifdef CONFIG_IRQ_REMAP
+       /*
+        * Note: We have already checked GASup from IVRS table.
+        *       Now, we need to make sure that GAMSup is set.
+        */
+       if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+           !check_feature_on_all_iommus(FEATURE_GAM_VAPIC))
+               amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
        if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
                amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
 #endif
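
The AMD IOMMU rework above moves the GAMSup test out of per-IOMMU setup: guest vAPIC interrupt remapping is a system-wide mode, so it is only kept if every IOMMU advertises the feature, which is what the new check_feature_on_all_iommus() decides once in early_enable_iommus(). The shape of that all-instances check, reduced to a self-contained sketch with hypothetical types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct unit {
	uint64_t features;		/* capability bits one instance advertises */
};

/* A global mode may stay enabled only if every instance supports it. */
static bool all_units_have(const struct unit *units, int n, uint64_t mask)
{
	for (int i = 0; i < n; i++)
		if (!(units[i].features & mask))
			return false;	/* a single missing instance vetoes it */
	return true;
}

int main(void)
{
	const uint64_t GAM_VAPIC = 1u << 1;	/* illustrative feature bit */
	struct unit iommus[] = { { .features = 0x3 }, { .features = 0x1 } };

	if (!all_units_have(iommus, 2, GAM_VAPIC))
		puts("fall back to legacy guest interrupt mode");
	return 0;
}
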
index 2014fe8..0c22878 100644 (file)
@@ -514,9 +514,6 @@ static void load_pasid(struct mm_struct *mm, u32 pasid)
 {
        mutex_lock(&mm->context.lock);
 
-       /* Synchronize with READ_ONCE in update_pasid(). */
-       smp_store_release(&mm->pasid, pasid);
-
        /* Update PASID MSR on all CPUs running the mm's tasks. */
        on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);
 
@@ -792,7 +789,19 @@ prq_retry:
                goto prq_retry;
        }
 
+       /*
+        * A work in IO page fault workqueue may try to lock pasid_mutex now.
+        * Holding pasid_mutex while waiting in iopf_queue_flush_dev() for
+        * all works in the workqueue to finish may cause deadlock.
+        *
+        * It's unnecessary to hold pasid_mutex in iopf_queue_flush_dev().
+        * Unlock it to allow the works to be handled while waiting for
+        * them to finish.
+        */
+       lockdep_assert_held(&pasid_mutex);
+       mutex_unlock(&pasid_mutex);
        iopf_queue_flush_dev(dev);
+       mutex_lock(&pasid_mutex);
 
        /*
         * Perform steps described in VT-d spec CH7.10 to drain page
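
The SVM hunk above breaks a potential deadlock: iopf_queue_flush_dev() waits for queued page-fault work to finish, and that work may itself need pasid_mutex, so flushing with the mutex held can wedge both sides. The fix is the usual drop-the-lock-around-the-wait pattern; a kernel-style sketch with hypothetical names:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_mutex);
static struct workqueue_struct *demo_wq;	/* created with alloc_workqueue() at init */

/* Work item: needs demo_mutex to do its job. */
static void demo_work_fn(struct work_struct *work)
{
	mutex_lock(&demo_mutex);
	/* ... handle one queued fault ... */
	mutex_unlock(&demo_mutex);
}

/*
 * Caller path: already holds demo_mutex but must wait for all queued work.
 * Waiting with the mutex held would deadlock with demo_work_fn(), so drop
 * it around the flush and re-take it afterwards.
 */
static void demo_flush_locked(void)
{
	lockdep_assert_held(&demo_mutex);

	mutex_unlock(&demo_mutex);
	flush_workqueue(demo_wq);	/* waits for every pending demo_work_fn() */
	mutex_lock(&demo_mutex);

	/* re-validate any state captured before the unlock, if needed */
}
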
index 0af42fb..9e8bc80 100644 (file)
@@ -519,6 +519,7 @@ retry:
 
        return new_iova->pfn_lo;
 }
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
 
 /**
  * free_iova_fast - free iova pfn range into rcache
@@ -536,6 +537,7 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
 
        free_iova(iovad, pfn);
 }
+EXPORT_SYMBOL_GPL(free_iova_fast);
 
 #define fq_ring_for_each(i, fq) \
        for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE)
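
alloc_iova_fast() and free_iova_fast() are now exported for modular users of the IOVA rcache allocator. A hedged sketch of a caller, assuming an iova_domain that has already been initialised elsewhere; alloc_iova_fast() returns the starting PFN of the range, or 0 on failure, and the range must be freed with the same size:

#include <linux/iova.h>

/* Hypothetical helper: carve 'size' pages of IOVA space out of 'iovad'. */
static unsigned long demo_iova_alloc(struct iova_domain *iovad,
				     unsigned long size,
				     unsigned long limit_pfn)
{
	/* Fast path backed by the per-CPU rcache; 0 means allocation failed. */
	return alloc_iova_fast(iovad, size, limit_pfn, true);
}

static void demo_iova_free(struct iova_domain *iovad,
			   unsigned long pfn, unsigned long size)
{
	/* Must be released with the same size it was allocated with. */
	free_iova_fast(iovad, pfn, size);
}
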
index 5d8b482..6ebe3c7 100644 (file)
@@ -10,4 +10,5 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
                common/asid.o common/habanalabs_ioctl.o \
                common/command_buffer.o common/hw_queue.o common/irq.o \
                common/sysfs.o common/hwmon.o common/memory.o \
-               common/command_submission.o common/firmware_if.o
+               common/command_submission.o common/firmware_if.o \
+               common/state_dump.o
index 719168c..8132a84 100644 (file)
@@ -314,8 +314,6 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 
        spin_lock(&mgr->cb_lock);
        rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
-       if (rc < 0)
-               rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_KERNEL);
        spin_unlock(&mgr->cb_lock);
 
        if (rc < 0) {
@@ -552,7 +550,7 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 
        vma->vm_private_data = cb;
 
-       rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
+       rc = hdev->asic_funcs->mmap(hdev, vma, cb->kernel_address,
                                        cb->bus_address, cb->size);
        if (rc) {
                spin_lock(&cb->lock);
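
In the command-buffer hunk above, the dropped fallback re-invoked idr_alloc() with GFP_KERNEL while cb_lock was still held; GFP_KERNEL may sleep, and only non-sleeping allocations are safe inside a spinlock. A sketch of the usual pattern, with the optional preload step done before entering the atomic section (hypothetical names):

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(demo_idr);
static DEFINE_SPINLOCK(demo_lock);

/* Allocate a handle (>= 1) for 'object'; returns a negative errno on failure. */
static int demo_alloc_handle(void *object)
{
	int id;

	idr_preload(GFP_KERNEL);	/* may sleep: done before taking the lock */
	spin_lock(&demo_lock);
	/* Inside the spinlock only non-sleeping flags are allowed. */
	id = idr_alloc(&demo_idr, object, 1, 0, GFP_ATOMIC);
	spin_unlock(&demo_lock);
	idr_preload_end();

	return id;
}
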
index 80c60fb..7b0516c 100644 (file)
@@ -38,7 +38,11 @@ static void hl_sob_reset(struct kref *ref)
                                                        kref);
        struct hl_device *hdev = hw_sob->hdev;
 
+       dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);
+
        hdev->asic_funcs->reset_sob(hdev, hw_sob);
+
+       hw_sob->need_reset = false;
 }
 
 void hl_sob_reset_error(struct kref *ref)
@@ -52,6 +56,24 @@ void hl_sob_reset_error(struct kref *ref)
                hw_sob->q_idx, hw_sob->sob_id);
 }
 
+void hw_sob_put(struct hl_hw_sob *hw_sob)
+{
+       if (hw_sob)
+               kref_put(&hw_sob->kref, hl_sob_reset);
+}
+
+static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
+{
+       if (hw_sob)
+               kref_put(&hw_sob->kref, hl_sob_reset_error);
+}
+
+void hw_sob_get(struct hl_hw_sob *hw_sob)
+{
+       if (hw_sob)
+               kref_get(&hw_sob->kref);
+}
+
 /**
  * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
  * @sob_base: sob base id
@@ -84,76 +106,29 @@ int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
        return 0;
 }
 
-static void sob_reset_work(struct work_struct *work)
-{
-       struct hl_cs_compl *hl_cs_cmpl =
-               container_of(work, struct hl_cs_compl, sob_reset_work);
-       struct hl_device *hdev = hl_cs_cmpl->hdev;
-
-       /*
-        * A signal CS can get completion while the corresponding wait
-        * for signal CS is on its way to the PQ. The wait for signal CS
-        * will get stuck if the signal CS incremented the SOB to its
-        * max value and there are no pending (submitted) waits on this
-        * SOB.
-        * We do the following to void this situation:
-        * 1. The wait for signal CS must get a ref for the signal CS as
-        *    soon as possible in cs_ioctl_signal_wait() and put it
-        *    before being submitted to the PQ but after it incremented
-        *    the SOB refcnt in init_signal_wait_cs().
-        * 2. Signal/Wait for signal CS will decrement the SOB refcnt
-        *    here.
-        * These two measures guarantee that the wait for signal CS will
-        * reset the SOB upon completion rather than the signal CS and
-        * hence the above scenario is avoided.
-        */
-       kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
-
-       if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
-               hdev->asic_funcs->reset_sob_group(hdev,
-                               hl_cs_cmpl->sob_group);
-
-       kfree(hl_cs_cmpl);
-}
-
 static void hl_fence_release(struct kref *kref)
 {
        struct hl_fence *fence =
                container_of(kref, struct hl_fence, refcount);
        struct hl_cs_compl *hl_cs_cmpl =
                container_of(fence, struct hl_cs_compl, base_fence);
-       struct hl_device *hdev = hl_cs_cmpl->hdev;
-
-       /* EBUSY means the CS was never submitted and hence we don't have
-        * an attached hw_sob object that we should handle here
-        */
-       if (fence->error == -EBUSY)
-               goto free;
-
-       if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
-               (hl_cs_cmpl->type == CS_TYPE_WAIT) ||
-               (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {
-
-               dev_dbg(hdev->dev,
-                       "CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
-                       hl_cs_cmpl->cs_seq,
-                       hl_cs_cmpl->type,
-                       hl_cs_cmpl->hw_sob->sob_id,
-                       hl_cs_cmpl->sob_val);
-
-               queue_work(hdev->sob_reset_wq, &hl_cs_cmpl->sob_reset_work);
 
-               return;
-       }
-
-free:
        kfree(hl_cs_cmpl);
 }
 
 void hl_fence_put(struct hl_fence *fence)
 {
-       if (fence)
-               kref_put(&fence->refcount, hl_fence_release);
+       if (IS_ERR_OR_NULL(fence))
+               return;
+       kref_put(&fence->refcount, hl_fence_release);
+}
+
+void hl_fences_put(struct hl_fence **fence, int len)
+{
+       int i;
+
+       for (i = 0; i < len; i++, fence++)
+               hl_fence_put(*fence);
 }
 
 void hl_fence_get(struct hl_fence *fence)
@@ -473,11 +448,139 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
        spin_unlock(&hdev->cs_mirror_lock);
 }
 
+/*
+ * force_complete_multi_cs - complete all contexts that wait on multi-CS
+ *
+ * @hdev: pointer to habanalabs device structure
+ */
+static void force_complete_multi_cs(struct hl_device *hdev)
+{
+       int i;
+
+       for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+               struct multi_cs_completion *mcs_compl;
+
+               mcs_compl = &hdev->multi_cs_completion[i];
+
+               spin_lock(&mcs_compl->lock);
+
+               if (!mcs_compl->used) {
+                       spin_unlock(&mcs_compl->lock);
+                       continue;
+               }
+
+               /* when calling force complete no context should be waiting on
+                * multi-cS.
+                * multi-CS.
+                * to free any pending context and print error message
+                */
+               dev_err(hdev->dev,
+                               "multi-CS completion context %d still waiting when calling force completion\n",
+                               i);
+               complete_all(&mcs_compl->completion);
+               spin_unlock(&mcs_compl->lock);
+       }
+}
+
+/*
+ * complete_multi_cs - complete all waiting entities on multi-CS
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @cs: CS structure
+ * The function signals a waiting entity that has an overlapping stream masters
+ * with the completed CS.
+ * For example:
+ * - a completed CS worked on stream master QID 4, multi CS completion
+ *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
+ *   common stream master QID
+ * - a completed CS worked on stream master QID 4, multi CS completion
+ *   is actively waiting on stream master QIDs 3, 4. send signal as stream
+ *   master QID 4 is common
+ */
+static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
+{
+       struct hl_fence *fence = cs->fence;
+       int i;
+
+       /* in case of multi CS check for completion only for the first CS */
+       if (cs->staged_cs && !cs->staged_first)
+               return;
+
+       for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+               struct multi_cs_completion *mcs_compl;
+
+               mcs_compl = &hdev->multi_cs_completion[i];
+               if (!mcs_compl->used)
+                       continue;
+
+               spin_lock(&mcs_compl->lock);
+
+               /*
+                * complete if:
+                * 1. still waiting for completion
+                * 2. the completed CS has at least one overlapping stream
+                *    master with the stream masters in the completion
+                */
+               if (mcs_compl->used &&
+                               (fence->stream_master_qid_map &
+                                       mcs_compl->stream_master_qid_map)) {
+                       /* extract the timestamp only of first completed CS */
+                       if (!mcs_compl->timestamp)
+                               mcs_compl->timestamp =
+                                               ktime_to_ns(fence->timestamp);
+                       complete_all(&mcs_compl->completion);
+               }
+
+               spin_unlock(&mcs_compl->lock);
+       }
+}
+
+static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
+                                       struct hl_cs *cs,
+                                       struct hl_cs_compl *hl_cs_cmpl)
+{
+       /* Skip this handler if the cs wasn't submitted, to avoid putting
+        * the hw_sob twice, since this case already handled at this point,
+        * also skip if the hw_sob pointer wasn't set.
+        */
+       if (!hl_cs_cmpl->hw_sob || !cs->submitted)
+               return;
+
+       spin_lock(&hl_cs_cmpl->lock);
+
+       /*
+        * we get refcount upon reservation of signals or signal/wait cs for the
+        * hw_sob object, and need to put it when the first staged cs
+        * (which contains the encaps signals) or cs signal/wait is completed.
+        */
+       if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
+                       (hl_cs_cmpl->type == CS_TYPE_WAIT) ||
+                       (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
+                       (!!hl_cs_cmpl->encaps_signals)) {
+               dev_dbg(hdev->dev,
+                               "CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
+                               hl_cs_cmpl->cs_seq,
+                               hl_cs_cmpl->type,
+                               hl_cs_cmpl->hw_sob->sob_id,
+                               hl_cs_cmpl->sob_val);
+
+               hw_sob_put(hl_cs_cmpl->hw_sob);
+
+               if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
+                       hdev->asic_funcs->reset_sob_group(hdev,
+                                       hl_cs_cmpl->sob_group);
+       }
+
+       spin_unlock(&hl_cs_cmpl->lock);
+}
+
 static void cs_do_release(struct kref *ref)
 {
        struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
        struct hl_device *hdev = cs->ctx->hdev;
        struct hl_cs_job *job, *tmp;
+       struct hl_cs_compl *hl_cs_cmpl =
+                       container_of(cs->fence, struct hl_cs_compl, base_fence);
 
        cs->completed = true;
 
@@ -493,8 +596,9 @@ static void cs_do_release(struct kref *ref)
                complete_job(hdev, job);
 
        if (!cs->submitted) {
-               /* In case the wait for signal CS was submitted, the put occurs
-                * in init_signal_wait_cs() or collective_wait_init_cs()
+               /*
+                * In case the wait for signal CS was submitted, the fence put
+                * occurs in init_signal_wait_cs() or collective_wait_init_cs()
                 * right before hanging on the PQ.
                 */
                if (cs->type == CS_TYPE_WAIT ||
@@ -535,8 +639,20 @@ static void cs_do_release(struct kref *ref)
                        list_del(&cs->staged_cs_node);
                        spin_unlock(&hdev->cs_mirror_lock);
                }
+
+               /* decrement refcount to handle when first staged cs
+                * with encaps signals is completed.
+                */
+               if (hl_cs_cmpl->encaps_signals)
+                       kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
+                                               hl_encaps_handle_do_release);
        }
 
+       if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
+                       && cs->encaps_signals)
+               kref_put(&cs->encaps_sig_hdl->refcount,
+                                       hl_encaps_handle_do_release);
+
 out:
        /* Must be called before hl_ctx_put because inside we use ctx to get
         * the device
@@ -566,6 +682,10 @@ out:
        if (cs->timestamp)
                cs->fence->timestamp = ktime_get();
        complete_all(&cs->fence->completion);
+       complete_multi_cs(hdev, cs);
+
+       cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);
+
        hl_fence_put(cs->fence);
 
        kfree(cs->jobs_in_queue_cnt);
@@ -621,6 +741,10 @@ static void cs_timedout(struct work_struct *work)
                break;
        }
 
+       rc = hl_state_dump(hdev);
+       if (rc)
+               dev_err(hdev->dev, "Error during system state dump %d\n", rc);
+
        cs_put(cs);
 
        if (likely(!skip_reset_on_timeout)) {
@@ -661,6 +785,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
        cs->completed = false;
        cs->type = cs_type;
        cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
+       cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
        cs->timeout_jiffies = timeout;
        cs->skip_reset_on_timeout =
                hdev->skip_reset_on_timeout ||
@@ -671,9 +796,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
        kref_init(&cs->refcount);
        spin_lock_init(&cs->job_lock);
 
-       cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
+       cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
        if (!cs_cmpl)
-               cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_KERNEL);
+               cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);
 
        if (!cs_cmpl) {
                atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
@@ -698,7 +823,6 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
        cs_cmpl->hdev = hdev;
        cs_cmpl->type = cs->type;
        spin_lock_init(&cs_cmpl->lock);
-       INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work);
        cs->fence = &cs_cmpl->base_fence;
 
        spin_lock(&ctx->cs_lock);
@@ -791,31 +915,22 @@ void hl_cs_rollback_all(struct hl_device *hdev)
                cs_rollback(hdev, cs);
                cs_put(cs);
        }
-}
-
-void hl_pending_cb_list_flush(struct hl_ctx *ctx)
-{
-       struct hl_pending_cb *pending_cb, *tmp;
 
-       list_for_each_entry_safe(pending_cb, tmp,
-                       &ctx->pending_cb_list, cb_node) {
-               list_del(&pending_cb->cb_node);
-               hl_cb_put(pending_cb->cb);
-               kfree(pending_cb);
-       }
+       force_complete_multi_cs(hdev);
 }
 
 static void
 wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
 {
        struct hl_user_pending_interrupt *pend;
+       unsigned long flags;
 
-       spin_lock(&interrupt->wait_list_lock);
+       spin_lock_irqsave(&interrupt->wait_list_lock, flags);
        list_for_each_entry(pend, &interrupt->wait_list_head, wait_list_node) {
                pend->fence.error = -EIO;
                complete_all(&pend->fence.completion);
        }
-       spin_unlock(&interrupt->wait_list_lock);
+       spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
 }
 
 void hl_release_pending_user_interrupts(struct hl_device *hdev)
@@ -981,6 +1096,10 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
                return CS_TYPE_WAIT;
        else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
                return CS_TYPE_COLLECTIVE_WAIT;
+       else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
+               return CS_RESERVE_SIGNALS;
+       else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
+               return CS_UNRESERVE_SIGNALS;
        else
                return CS_TYPE_DEFAULT;
 }
@@ -1081,7 +1200,8 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev,
 }
 
 static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
-                               u64 sequence, u32 flags)
+                               u64 sequence, u32 flags,
+                               u32 encaps_signal_handle)
 {
        if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
                return 0;
@@ -1093,6 +1213,9 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
                /* Staged CS sequence is the first CS sequence */
                INIT_LIST_HEAD(&cs->staged_cs_node);
                cs->staged_sequence = cs->sequence;
+
+               if (cs->encaps_signals)
+                       cs->encaps_sig_hdl_id = encaps_signal_handle;
        } else {
                /* User sequence will be validated in 'hl_hw_queue_schedule_cs'
                 * under the cs_mirror_lock
@@ -1108,9 +1231,20 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
        return 0;
 }
 
+static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
+{
+       int i;
+
+       for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
+               if (qid == hdev->stream_master_qid_arr[i])
+                       return BIT(i);
+
+       return 0;
+}
+
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
                                u32 num_chunks, u64 *cs_seq, u32 flags,
-                               u32 timeout)
+                               u32 encaps_signals_handle, u32 timeout)
 {
        bool staged_mid, int_queues_only = true;
        struct hl_device *hdev = hpriv->hdev;
@@ -1121,6 +1255,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
        struct hl_cs *cs;
        struct hl_cb *cb;
        u64 user_sequence;
+       u8 stream_master_qid_map = 0;
        int rc, i;
 
        cntr = &hdev->aggregated_cs_counters;
@@ -1148,7 +1283,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 
        hl_debugfs_add_cs(cs);
 
-       rc = cs_staged_submission(hdev, cs, user_sequence, flags);
+       rc = cs_staged_submission(hdev, cs, user_sequence, flags,
+                                               encaps_signals_handle);
        if (rc)
                goto free_cs_object;
 
@@ -1179,9 +1315,20 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
                        cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
                }
 
-               if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+               if (queue_type == QUEUE_TYPE_EXT ||
+                                               queue_type == QUEUE_TYPE_HW) {
                        int_queues_only = false;
 
+                       /*
+                        * store which stream are being used for external/HW
+                        * queues of this CS
+                        */
+                       if (hdev->supports_wait_for_multi_cs)
+                               stream_master_qid_map |=
+                                       get_stream_master_qid_mask(hdev,
+                                                       chunk->queue_index);
+               }
+
                job = hl_cs_allocate_job(hdev, queue_type,
                                                is_kernel_allocated_cb);
                if (!job) {
@@ -1242,6 +1389,13 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
                goto free_cs_object;
        }
 
+       /*
+        * store the (external/HW queues) streams used by the CS in the
+        * fence object for multi-CS completion
+        */
+       if (hdev->supports_wait_for_multi_cs)
+               cs->fence->stream_master_qid_map = stream_master_qid_map;
+
        rc = hl_hw_queue_schedule_cs(cs);
        if (rc) {
                if (rc != -EAGAIN)
@@ -1270,130 +1424,6 @@ out:
        return rc;
 }
 
-static int pending_cb_create_job(struct hl_device *hdev, struct hl_ctx *ctx,
-               struct hl_cs *cs, struct hl_cb *cb, u32 size, u32 hw_queue_id)
-{
-       struct hw_queue_properties *hw_queue_prop;
-       struct hl_cs_counters_atomic *cntr;
-       struct hl_cs_job *job;
-
-       hw_queue_prop = &hdev->asic_prop.hw_queues_props[hw_queue_id];
-       cntr = &hdev->aggregated_cs_counters;
-
-       job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true);
-       if (!job) {
-               atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
-               atomic64_inc(&cntr->out_of_mem_drop_cnt);
-               dev_err(hdev->dev, "Failed to allocate a new job\n");
-               return -ENOMEM;
-       }
-
-       job->id = 0;
-       job->cs = cs;
-       job->user_cb = cb;
-       atomic_inc(&job->user_cb->cs_cnt);
-       job->user_cb_size = size;
-       job->hw_queue_id = hw_queue_id;
-       job->patched_cb = job->user_cb;
-       job->job_cb_size = job->user_cb_size;
-
-       /* increment refcount as for external queues we get completion */
-       cs_get(cs);
-
-       cs->jobs_in_queue_cnt[job->hw_queue_id]++;
-
-       list_add_tail(&job->cs_node, &cs->job_list);
-
-       hl_debugfs_add_job(hdev, job);
-
-       return 0;
-}
-
-static int hl_submit_pending_cb(struct hl_fpriv *hpriv)
-{
-       struct hl_device *hdev = hpriv->hdev;
-       struct hl_ctx *ctx = hpriv->ctx;
-       struct hl_pending_cb *pending_cb, *tmp;
-       struct list_head local_cb_list;
-       struct hl_cs *cs;
-       struct hl_cb *cb;
-       u32 hw_queue_id;
-       u32 cb_size;
-       int process_list, rc = 0;
-
-       if (list_empty(&ctx->pending_cb_list))
-               return 0;
-
-       process_list = atomic_cmpxchg(&ctx->thread_pending_cb_token, 1, 0);
-
-       /* Only a single thread is allowed to process the list */
-       if (!process_list)
-               return 0;
-
-       if (list_empty(&ctx->pending_cb_list))
-               goto free_pending_cb_token;
-
-       /* move all list elements to a local list */
-       INIT_LIST_HEAD(&local_cb_list);
-       spin_lock(&ctx->pending_cb_lock);
-       list_for_each_entry_safe(pending_cb, tmp, &ctx->pending_cb_list,
-                                                               cb_node)
-               list_move_tail(&pending_cb->cb_node, &local_cb_list);
-       spin_unlock(&ctx->pending_cb_lock);
-
-       rc = allocate_cs(hdev, ctx, CS_TYPE_DEFAULT, ULLONG_MAX, &cs, 0,
-                               hdev->timeout_jiffies);
-       if (rc)
-               goto add_list_elements;
-
-       hl_debugfs_add_cs(cs);
-
-       /* Iterate through pending cb list, create jobs and add to CS */
-       list_for_each_entry(pending_cb, &local_cb_list, cb_node) {
-               cb = pending_cb->cb;
-               cb_size = pending_cb->cb_size;
-               hw_queue_id = pending_cb->hw_queue_id;
-
-               rc = pending_cb_create_job(hdev, ctx, cs, cb, cb_size,
-                                                               hw_queue_id);
-               if (rc)
-                       goto free_cs_object;
-       }
-
-       rc = hl_hw_queue_schedule_cs(cs);
-       if (rc) {
-               if (rc != -EAGAIN)
-                       dev_err(hdev->dev,
-                               "Failed to submit CS %d.%llu (%d)\n",
-                               ctx->asid, cs->sequence, rc);
-               goto free_cs_object;
-       }
-
-       /* pending cb was scheduled successfully */
-       list_for_each_entry_safe(pending_cb, tmp, &local_cb_list, cb_node) {
-               list_del(&pending_cb->cb_node);
-               kfree(pending_cb);
-       }
-
-       cs_put(cs);
-
-       goto free_pending_cb_token;
-
-free_cs_object:
-       cs_rollback(hdev, cs);
-       cs_put(cs);
-add_list_elements:
-       spin_lock(&ctx->pending_cb_lock);
-       list_for_each_entry_safe_reverse(pending_cb, tmp, &local_cb_list,
-                                                               cb_node)
-               list_move(&pending_cb->cb_node, &ctx->pending_cb_list);
-       spin_unlock(&ctx->pending_cb_lock);
-free_pending_cb_token:
-       atomic_set(&ctx->thread_pending_cb_token, 1);
-
-       return rc;
-}
-
 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
                                u64 *cs_seq)
 {
@@ -1443,7 +1473,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
                        rc = 0;
                } else {
                        rc = cs_ioctl_default(hpriv, chunks, num_chunks,
-                                       cs_seq, 0, hdev->timeout_jiffies);
+                                       cs_seq, 0, 0, hdev->timeout_jiffies);
                }
 
                mutex_unlock(&hpriv->restore_phase_mutex);
@@ -1501,10 +1531,17 @@ out:
  * hl_cs_signal_sob_wraparound_handler: handle SOB value wraparound case.
  * if the SOB value reaches the max value move to the other SOB reserved
  * to the queue.
+ * @hdev: pointer to device structure
+ * @q_idx: stream queue index
+ * @hw_sob: the H/W SOB used in this signal CS.
+ * @count: signals count
+ * @encaps_sig: tells whether it's reservation for encaps signals or not.
+ *
  * Note that this function must be called while hw_queues_lock is taken.
  */
 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
-                       struct hl_hw_sob **hw_sob, u32 count)
+                       struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
+
 {
        struct hl_sync_stream_properties *prop;
        struct hl_hw_sob *sob = *hw_sob, *other_sob;
@@ -1512,7 +1549,7 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 
        prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
 
-       kref_get(&sob->kref);
+       hw_sob_get(sob);
 
        /* check for wraparound */
        if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
@@ -1522,7 +1559,7 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
                 * just incremented the refcount right before calling this
                 * function.
                 */
-               kref_put(&sob->kref, hl_sob_reset_error);
+               hw_sob_put_err(sob);
 
                /*
                 * check the other sob value, if it still in use then fail
@@ -1537,12 +1574,42 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
                        return -EINVAL;
                }
 
-               prop->next_sob_val = 1;
+               /*
+                * next_sob_val always points to the next available signal
+                * in the sob, so in encaps signals it will be the next one
+                * after reserving the required amount.
+                */
+               if (encaps_sig)
+                       prop->next_sob_val = count + 1;
+               else
+                       prop->next_sob_val = count;
 
                /* only two SOBs are currently in use */
                prop->curr_sob_offset = other_sob_offset;
                *hw_sob = other_sob;
 
+               /*
+                * check if other_sob needs reset, then do it before using it
+                * for the reservation or the next signal cs.
+                * we do it here, and for both encaps and regular signal cs
+                * cases in order to avoid possible races of two kref_put
+                * of the sob which can occur at the same time if we move the
+                * sob reset(kref_put) to cs_do_release function.
+                * in addition, if we have combination of cs signal and
+                * encaps, and at the point we need to reset the sob there was
+                * no more reservations and only signal cs keep coming,
+                * in such case we need signal_cs to put the refcount and
+                * reset the sob.
+                */
+               if (other_sob->need_reset)
+                       hw_sob_put(other_sob);
+
+               if (encaps_sig) {
+                       /* set reset indication for the sob */
+                       sob->need_reset = true;
+                       hw_sob_get(other_sob);
+               }
+
                dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
                                prop->curr_sob_offset, q_idx);
        } else {
@@ -1553,12 +1620,18 @@ int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
 }
 
 static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
-               struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx)
+               struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
+               bool encaps_signals)
 {
        u64 *signal_seq_arr = NULL;
        u32 size_to_copy, signal_seq_arr_len;
        int rc = 0;
 
+       if (encaps_signals) {
+               *signal_seq = chunk->encaps_signal_seq;
+               return 0;
+       }
+
        signal_seq_arr_len = chunk->num_signal_seq_arr;
 
        /* currently only one signal seq is supported */
@@ -1583,7 +1656,7 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
                return -ENOMEM;
        }
 
-       size_to_copy = chunk->num_signal_seq_arr * sizeof(*signal_seq_arr);
+       size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
        if (copy_from_user(signal_seq_arr,
                                u64_to_user_ptr(chunk->signal_seq_arr),
                                size_to_copy)) {
@@ -1605,8 +1678,8 @@ out:
 }
 
 static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
-               struct hl_ctx *ctx, struct hl_cs *cs, enum hl_queue_type q_type,
-               u32 q_idx)
+               struct hl_ctx *ctx, struct hl_cs *cs,
+               enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
 {
        struct hl_cs_counters_atomic *cntr;
        struct hl_cs_job *job;
@@ -1644,6 +1717,9 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
        job->user_cb_size = cb_size;
        job->hw_queue_id = q_idx;
 
+       if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
+                       && cs->encaps_signals)
+               job->encaps_sig_wait_offset = encaps_signal_offset;
        /*
         * No need in parsing, user CB is the patched CB.
         * We call hl_cb_destroy() out of two reasons - we don't need the CB in
@@ -1666,75 +1742,307 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
        return 0;
 }
 
-static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
-                               void __user *chunks, u32 num_chunks,
-                               u64 *cs_seq, u32 flags, u32 timeout)
+static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
+                               u32 q_idx, u32 count,
+                               u32 *handle_id, u32 *sob_addr,
+                               u32 *signals_count)
 {
-       struct hl_cs_chunk *cs_chunk_array, *chunk;
        struct hw_queue_properties *hw_queue_prop;
+       struct hl_sync_stream_properties *prop;
        struct hl_device *hdev = hpriv->hdev;
-       struct hl_cs_compl *sig_waitcs_cmpl;
-       u32 q_idx, collective_engine_id = 0;
-       struct hl_cs_counters_atomic *cntr;
-       struct hl_fence *sig_fence = NULL;
-       struct hl_ctx *ctx = hpriv->ctx;
-       enum hl_queue_type q_type;
-       struct hl_cs *cs;
-       u64 signal_seq;
-       int rc;
-
-       cntr = &hdev->aggregated_cs_counters;
-       *cs_seq = ULLONG_MAX;
+       struct hl_cs_encaps_sig_handle *handle;
+       struct hl_encaps_signals_mgr *mgr;
+       struct hl_hw_sob *hw_sob;
+       int hdl_id;
+       int rc = 0;
 
-       rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
-                       ctx);
-       if (rc)
+       if (count >= HL_MAX_SOB_VAL) {
+               dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
+                                               count);
+               rc = -EINVAL;
                goto out;
+       }
 
-       /* currently it is guaranteed to have only one chunk */
-       chunk = &cs_chunk_array[0];
-
-       if (chunk->queue_index >= hdev->asic_prop.max_queues) {
-               atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
-               atomic64_inc(&cntr->validation_drop_cnt);
+       if (q_idx >= hdev->asic_prop.max_queues) {
                dev_err(hdev->dev, "Queue index %d is invalid\n",
-                       chunk->queue_index);
+                       q_idx);
                rc = -EINVAL;
-               goto free_cs_chunk_array;
+               goto out;
        }
 
-       q_idx = chunk->queue_index;
        hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
-       q_type = hw_queue_prop->type;
 
        if (!hw_queue_prop->supports_sync_stream) {
-               atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
-               atomic64_inc(&cntr->validation_drop_cnt);
                dev_err(hdev->dev,
                        "Queue index %d does not support sync stream operations\n",
-                       q_idx);
+                                                                       q_idx);
                rc = -EINVAL;
-               goto free_cs_chunk_array;
+               goto out;
        }
 
-       if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
-               if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
-                       atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
-                       atomic64_inc(&cntr->validation_drop_cnt);
-                       dev_err(hdev->dev,
-                               "Queue index %d is invalid\n", q_idx);
-                       rc = -EINVAL;
-                       goto free_cs_chunk_array;
-               }
+       prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
 
-               collective_engine_id = chunk->collective_engine_id;
-       }
+       handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+       if (!handle) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       handle->count = count;
+       mgr = &hpriv->ctx->sig_mgr;
+
+       spin_lock(&mgr->lock);
+       hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
+       spin_unlock(&mgr->lock);
+
+       if (hdl_id < 0) {
+               dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
+               rc = -EINVAL;
+               goto out;
+       }
+
+       handle->id = hdl_id;
+       handle->q_idx = q_idx;
+       handle->hdev = hdev;
+       kref_init(&handle->refcount);
+
+       hdev->asic_funcs->hw_queues_lock(hdev);
+
+       hw_sob = &prop->hw_sob[prop->curr_sob_offset];
+
+       /*
+        * Increment the SOB value by count by user request
+        * to reserve those signals
+        * check if the signals amount to reserve is not exceeding the max sob
+        * value, if yes then switch sob.
+        */
+       rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
+                                                               true);
+       if (rc) {
+               dev_err(hdev->dev, "Failed to switch SOB\n");
+               hdev->asic_funcs->hw_queues_unlock(hdev);
+               rc = -EINVAL;
+               goto remove_idr;
+       }
+       /* Set the hw_sob in the handle only after calling the SOB wraparound
+        * handler, since the SOB could have changed.
+        */
+       handle->hw_sob = hw_sob;
+
+       /* Store the current SOB value for the unreserve validity check and
+        * for signal offset support.
+        */
+       handle->pre_sob_val = prop->next_sob_val - handle->count;
+
+       *signals_count = prop->next_sob_val;
+       hdev->asic_funcs->hw_queues_unlock(hdev);
+
+       *sob_addr = handle->hw_sob->sob_addr;
+       *handle_id = hdl_id;
+
+       dev_dbg(hdev->dev,
+               "Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
+                       hw_sob->sob_id, handle->hw_sob->sob_addr,
+                       prop->next_sob_val - 1, q_idx, hdl_id);
+       goto out;
+
+remove_idr:
+       spin_lock(&mgr->lock);
+       idr_remove(&mgr->handles, hdl_id);
+       spin_unlock(&mgr->lock);
+
+       kfree(handle);
+out:
+       return rc;
+}
+
+static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
+{
+       struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
+       struct hl_sync_stream_properties *prop;
+       struct hl_device *hdev = hpriv->hdev;
+       struct hl_encaps_signals_mgr *mgr;
+       struct hl_hw_sob *hw_sob;
+       u32 q_idx, sob_addr;
+       int rc = 0;
+
+       mgr = &hpriv->ctx->sig_mgr;
+
+       spin_lock(&mgr->lock);
+       encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
+       if (encaps_sig_hdl) {
+               dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
+                               handle_id, encaps_sig_hdl->hw_sob->sob_addr,
+                                       encaps_sig_hdl->count);
+
+               hdev->asic_funcs->hw_queues_lock(hdev);
+
+               q_idx = encaps_sig_hdl->q_idx;
+               prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+               hw_sob = &prop->hw_sob[prop->curr_sob_offset];
+               sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
+
+               /* Check if sob_val got out of sync because of other signal
+                * submission requests that were handled between the reserve and
+                * unreserve calls, or because of a SOB switch upon reaching the
+                * max SOB value.
+                */
+               if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
+                               != prop->next_sob_val ||
+                               sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
+                       dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
+                               encaps_sig_hdl->pre_sob_val,
+                               (prop->next_sob_val - encaps_sig_hdl->count));
+
+                       hdev->asic_funcs->hw_queues_unlock(hdev);
+                       rc = -EINVAL;
+                       goto out;
+               }
+
+               /*
+                * Decrement the SOB value by the user-requested count in order
+                * to unreserve those signals.
+                */
+               prop->next_sob_val -= encaps_sig_hdl->count;
 
-       if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) {
-               rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, ctx);
+               hdev->asic_funcs->hw_queues_unlock(hdev);
+
+               hw_sob_put(hw_sob);
+
+               /* Release the id and free allocated memory of the handle */
+               idr_remove(&mgr->handles, handle_id);
+               kfree(encaps_sig_hdl);
+       } else {
+               rc = -EINVAL;
+               dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
+       }
+out:
+       spin_unlock(&mgr->lock);
+
+       return rc;
+}
+
+static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
+                               void __user *chunks, u32 num_chunks,
+                               u64 *cs_seq, u32 flags, u32 timeout)
+{
+       struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
+       bool handle_found = false, is_wait_cs = false,
+                       wait_cs_submitted = false,
+                       cs_encaps_signals = false;
+       struct hl_cs_chunk *cs_chunk_array, *chunk;
+       bool staged_cs_with_encaps_signals = false;
+       struct hw_queue_properties *hw_queue_prop;
+       struct hl_device *hdev = hpriv->hdev;
+       struct hl_cs_compl *sig_waitcs_cmpl;
+       u32 q_idx, collective_engine_id = 0;
+       struct hl_cs_counters_atomic *cntr;
+       struct hl_fence *sig_fence = NULL;
+       struct hl_ctx *ctx = hpriv->ctx;
+       enum hl_queue_type q_type;
+       struct hl_cs *cs;
+       u64 signal_seq;
+       int rc;
+
+       cntr = &hdev->aggregated_cs_counters;
+       *cs_seq = ULLONG_MAX;
+
+       rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
+                       ctx);
+       if (rc)
+               goto out;
+
+       /* currently it is guaranteed to have only one chunk */
+       chunk = &cs_chunk_array[0];
+
+       if (chunk->queue_index >= hdev->asic_prop.max_queues) {
+               atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
+               atomic64_inc(&cntr->validation_drop_cnt);
+               dev_err(hdev->dev, "Queue index %d is invalid\n",
+                       chunk->queue_index);
+               rc = -EINVAL;
+               goto free_cs_chunk_array;
+       }
+
+       q_idx = chunk->queue_index;
+       hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
+       q_type = hw_queue_prop->type;
+
+       if (!hw_queue_prop->supports_sync_stream) {
+               atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
+               atomic64_inc(&cntr->validation_drop_cnt);
+               dev_err(hdev->dev,
+                       "Queue index %d does not support sync stream operations\n",
+                       q_idx);
+               rc = -EINVAL;
+               goto free_cs_chunk_array;
+       }
+
+       if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
+               if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
+                       atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
+                       atomic64_inc(&cntr->validation_drop_cnt);
+                       dev_err(hdev->dev,
+                               "Queue index %d is invalid\n", q_idx);
+                       rc = -EINVAL;
+                       goto free_cs_chunk_array;
+               }
+
+               collective_engine_id = chunk->collective_engine_id;
+       }
+
+       is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
+                       cs_type == CS_TYPE_COLLECTIVE_WAIT);
+
+       cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
+
+       if (is_wait_cs) {
+               rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
+                               ctx, cs_encaps_signals);
                if (rc)
                        goto free_cs_chunk_array;
 
+               if (cs_encaps_signals) {
+                       /* Check whether the CS sequence has an encapsulated
+                        * signals handle
+                        */
+                       struct idr *idp;
+                       u32 id;
+
+                       spin_lock(&ctx->sig_mgr.lock);
+                       idp = &ctx->sig_mgr.handles;
+                       idr_for_each_entry(idp, encaps_sig_hdl, id) {
+                               if (encaps_sig_hdl->cs_seq == signal_seq) {
+                                       handle_found = true;
+                                       /* Take a refcount to protect the handle
+                                        * from being removed from the IDR, needed
+                                        * when multiple wait CSs use an offset to
+                                        * wait on the reserved encaps signals.
+                                        */
+                                       kref_get(&encaps_sig_hdl->refcount);
+                                       break;
+                               }
+                       }
+                       spin_unlock(&ctx->sig_mgr.lock);
+
+                       if (!handle_found) {
+                               dev_err(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
+                                               signal_seq);
+                               rc = -EINVAL;
+                               goto free_cs_chunk_array;
+                       }
+
+                       /* Also validate the signal offset value */
+                       if (chunk->encaps_signal_offset >
+                                       encaps_sig_hdl->count) {
+                               dev_err(hdev->dev, "offset(%u) value exceed max reserved signals count(%u)!\n",
+                                               chunk->encaps_signal_offset,
+                                               encaps_sig_hdl->count);
+                               rc = -EINVAL;
+                               goto free_cs_chunk_array;
+                       }
+               }
+
                sig_fence = hl_ctx_get_fence(ctx, signal_seq);
                if (IS_ERR(sig_fence)) {
                        atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
@@ -1755,11 +2063,16 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
                sig_waitcs_cmpl =
                        container_of(sig_fence, struct hl_cs_compl, base_fence);
 
-               if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) {
+               staged_cs_with_encaps_signals = !!
+                               (sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
+                               (flags & HL_CS_FLAGS_ENCAP_SIGNALS));
+
+               if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
+                               !staged_cs_with_encaps_signals) {
                        atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
                        atomic64_inc(&cntr->validation_drop_cnt);
                        dev_err(hdev->dev,
-                               "CS seq 0x%llx is not of a signal CS\n",
+                               "CS seq 0x%llx is not of a signal/encaps-signal CS\n",
                                signal_seq);
                        hl_fence_put(sig_fence);
                        rc = -EINVAL;
@@ -1776,18 +2089,27 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
        rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
        if (rc) {
-               if (cs_type == CS_TYPE_WAIT ||
-                       cs_type == CS_TYPE_COLLECTIVE_WAIT)
+               if (is_wait_cs)
                        hl_fence_put(sig_fence);
+
                goto free_cs_chunk_array;
        }
 
        /*
         * Save the signal CS fence for later initialization right before
         * hanging the wait CS on the queue.
+        * For the encaps signals case, we save the CS sequence and the handle
+        * pointer for later initialization.
         */
-       if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT)
+       if (is_wait_cs) {
                cs->signal_fence = sig_fence;
+               /* Store the handle pointer so we don't have to look for it
+                * again later in the flow, when we need to set the SOB info
+                * in the HW queue.
+                */
+               if (cs->encaps_signals)
+                       cs->encaps_sig_hdl = encaps_sig_hdl;
+       }
 
        hl_debugfs_add_cs(cs);
 
@@ -1795,10 +2117,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
        if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
                rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
-                               q_idx);
+                               q_idx, chunk->encaps_signal_offset);
        else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
                rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
-                               cs, q_idx, collective_engine_id);
+                               cs, q_idx, collective_engine_id,
+                               chunk->encaps_signal_offset);
        else {
                atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
                atomic64_inc(&cntr->validation_drop_cnt);
@@ -1810,7 +2133,13 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
        rc = hl_hw_queue_schedule_cs(cs);
        if (rc) {
-               if (rc != -EAGAIN)
+               /* In case a wait CS failed here, it means the signal CS has
+                * already completed. We want to free all of its related objects,
+                * but we don't want to fail the ioctl.
+                */
+               if (is_wait_cs)
+                       rc = 0;
+               else if (rc != -EAGAIN)
                        dev_err(hdev->dev,
                                "Failed to submit CS %d.%llu to H/W queues, error %d\n",
                                ctx->asid, cs->sequence, rc);
@@ -1818,6 +2147,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
        }
 
        rc = HL_CS_STATUS_SUCCESS;
+       if (is_wait_cs)
+               wait_cs_submitted = true;
        goto put_cs;
 
 free_cs_object:
@@ -1828,6 +2159,10 @@ put_cs:
        /* We finished with the CS in this function, so put the ref */
        cs_put(cs);
 free_cs_chunk_array:
+       if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
+                                                       is_wait_cs)
+               kref_put(&encaps_sig_hdl->refcount,
+                               hl_encaps_handle_do_release);
        kfree(cs_chunk_array);
 out:
        return rc;
@@ -1836,10 +2171,11 @@ out:
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
        union hl_cs_args *args = data;
-       enum hl_cs_type cs_type;
+       enum hl_cs_type cs_type = 0;
        u64 cs_seq = ULONG_MAX;
        void __user *chunks;
-       u32 num_chunks, flags, timeout;
+       u32 num_chunks, flags, timeout,
+               signals_count = 0, sob_addr = 0, handle_id = 0;
        int rc;
 
        rc = hl_cs_sanity_checks(hpriv, args);
@@ -1850,10 +2186,6 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
        if (rc)
                goto out;
 
-       rc = hl_submit_pending_cb(hpriv);
-       if (rc)
-               goto out;
-
        cs_type = hl_cs_get_cs_type(args->in.cs_flags &
                                        ~HL_CS_FLAGS_FORCE_RESTORE);
        chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
@@ -1876,80 +2208,448 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
                rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
                                        &cs_seq, args->in.cs_flags, timeout);
                break;
+       case CS_RESERVE_SIGNALS:
+               rc = cs_ioctl_reserve_signals(hpriv,
+                                       args->in.encaps_signals_q_idx,
+                                       args->in.encaps_signals_count,
+                                       &handle_id, &sob_addr, &signals_count);
+               break;
+       case CS_UNRESERVE_SIGNALS:
+               rc = cs_ioctl_unreserve_signals(hpriv,
+                                       args->in.encaps_sig_handle_id);
+               break;
        default:
                rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
-                                               args->in.cs_flags, timeout);
+                                               args->in.cs_flags,
+                                               args->in.encaps_sig_handle_id,
+                                               timeout);
                break;
        }
-
 out:
        if (rc != -EAGAIN) {
                memset(args, 0, sizeof(*args));
+
+               if (cs_type == CS_RESERVE_SIGNALS) {
+                       args->out.handle_id = handle_id;
+                       args->out.sob_base_addr_offset = sob_addr;
+                       args->out.count = signals_count;
+               } else {
+                       args->out.seq = cs_seq;
+               }
                args->out.status = rc;
-               args->out.seq = cs_seq;
        }
 
        return rc;
 }
 
+static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
+                               enum hl_cs_wait_status *status, u64 timeout_us,
+                               s64 *timestamp)
+{
+       struct hl_device *hdev = ctx->hdev;
+       long completion_rc;
+       int rc = 0;
+
+       if (IS_ERR(fence)) {
+               rc = PTR_ERR(fence);
+               if (rc == -EINVAL)
+                       dev_notice_ratelimited(hdev->dev,
+                               "Can't wait on CS %llu because current CS is at seq %llu\n",
+                               seq, ctx->cs_sequence);
+               return rc;
+       }
+
+       if (!fence) {
+               dev_dbg(hdev->dev,
+                       "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+                               seq, ctx->cs_sequence);
+
+               *status = CS_WAIT_STATUS_GONE;
+               return 0;
+       }
+
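+       /* A zero timeout means poll-only: completion_done() just checks whether
+        * the fence was already signaled, without sleeping.
+        */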
+       if (!timeout_us) {
+               completion_rc = completion_done(&fence->completion);
+       } else {
+               unsigned long timeout;
+
+               timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
+                               timeout_us : usecs_to_jiffies(timeout_us);
+               completion_rc =
+                       wait_for_completion_interruptible_timeout(
+                               &fence->completion, timeout);
+       }
+
+       if (completion_rc > 0) {
+               *status = CS_WAIT_STATUS_COMPLETED;
+               if (timestamp)
+                       *timestamp = ktime_to_ns(fence->timestamp);
+       } else {
+               *status = CS_WAIT_STATUS_BUSY;
+       }
+
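+       /* Propagate an error that was recorded on the fence (e.g. CS timeout
+        * or device error) to the caller.
+        */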
+       if (fence->error == -ETIMEDOUT)
+               rc = -ETIMEDOUT;
+       else if (fence->error == -EIO)
+               rc = -EIO;
+
+       return rc;
+}
+
+/*
+ * hl_cs_poll_fences - iterate CS fences to check for CS completion
+ *
+ * @mcs_data: multi-CS internal data
+ *
+ * @return 0 on success, otherwise non 0 error code
+ *
+ * The function iterates over all CS sequences in the list and sets a bit in
+ * the completion_bitmap for each completed CS.
+ * While iterating, the function also extracts the stream master QID map to be
+ * used later by the waiting function.
+ * This function shall be called after taking the context ref.
+ */
+static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
+{
+       struct hl_fence **fence_ptr = mcs_data->fence_arr;
+       struct hl_device *hdev = mcs_data->ctx->hdev;
+       int i, rc, arr_len = mcs_data->arr_len;
+       u64 *seq_arr = mcs_data->seq_arr;
+       ktime_t max_ktime, first_cs_time;
+       enum hl_cs_wait_status status;
+
+       memset(fence_ptr, 0, arr_len * sizeof(*fence_ptr));
+
+       /* get all fences under the same lock */
+       rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
+       if (rc)
+               return rc;
+
+       /*
+        * Set to the maximum time to verify the timestamp is valid: if this
+        * value is still there at the end, no timestamp was updated.
+        */
+       max_ktime = ktime_set(KTIME_SEC_MAX, 0);
+       first_cs_time = max_ktime;
+
+       for (i = 0; i < arr_len; i++, fence_ptr++) {
+               struct hl_fence *fence = *fence_ptr;
+
+               /*
+                * The function won't sleep since it is called with timeout 0
+                * (i.e. it just polls the fence)
+                */
+               rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence,
+                                               &status, 0, NULL);
+               if (rc) {
+                       dev_err(hdev->dev,
+                               "wait_for_fence error :%d for CS seq %llu\n",
+                                                               rc, seq_arr[i]);
+                       break;
+               }
+
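+               /* Accumulate the stream master QIDs of all polled CSs; this map
+                * is later passed to the multi-CS completion structure used
+                * while waiting.
+                */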
+               mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
+
+               if (status == CS_WAIT_STATUS_BUSY)
+                       continue;
+
+               mcs_data->completion_bitmap |= BIT(i);
+
+               /*
+                * Best effort to extract the timestamp. A few notes:
+                * - if even a single fence is gone we cannot extract a
+                *   timestamp (as that fence no longer exists)
+                * - for all completed CSs we take the earliest timestamp.
+                *   For this we have to validate that:
+                *       1. the given timestamp was indeed set
+                *       2. the timestamp is the earliest of all timestamps so far
+                */
+
+               if (status == CS_WAIT_STATUS_GONE) {
+                       mcs_data->update_ts = false;
+                       mcs_data->gone_cs = true;
+               } else if (mcs_data->update_ts &&
+                       (ktime_compare(fence->timestamp,
+                                               ktime_set(0, 0)) > 0) &&
+                       (ktime_compare(fence->timestamp, first_cs_time) < 0)) {
+                       first_cs_time = fence->timestamp;
+               }
+       }
+
+       hl_fences_put(mcs_data->fence_arr, arr_len);
+
+       if (mcs_data->update_ts &&
+                       (ktime_compare(first_cs_time, max_ktime) != 0))
+               mcs_data->timestamp = ktime_to_ns(first_cs_time);
+
+       return rc;
+}
+
 static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
                                u64 timeout_us, u64 seq,
                                enum hl_cs_wait_status *status, s64 *timestamp)
 {
        struct hl_fence *fence;
-       unsigned long timeout;
        int rc = 0;
-       long completion_rc;
 
        if (timestamp)
                *timestamp = 0;
 
-       if (timeout_us == MAX_SCHEDULE_TIMEOUT)
-               timeout = timeout_us;
-       else
-               timeout = usecs_to_jiffies(timeout_us);
-
        hl_ctx_get(hdev, ctx);
 
        fence = hl_ctx_get_fence(ctx, seq);
-       if (IS_ERR(fence)) {
-               rc = PTR_ERR(fence);
-               if (rc == -EINVAL)
-                       dev_notice_ratelimited(hdev->dev,
-                               "Can't wait on CS %llu because current CS is at seq %llu\n",
-                               seq, ctx->cs_sequence);
-       } else if (fence) {
-               if (!timeout_us)
-                       completion_rc = completion_done(&fence->completion);
-               else
-                       completion_rc =
-                               wait_for_completion_interruptible_timeout(
-                                       &fence->completion, timeout);
 
-               if (completion_rc > 0) {
-                       *status = CS_WAIT_STATUS_COMPLETED;
-                       if (timestamp)
-                               *timestamp = ktime_to_ns(fence->timestamp);
-               } else {
-                       *status = CS_WAIT_STATUS_BUSY;
+       rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
+       hl_fence_put(fence);
+       hl_ctx_put(ctx);
+
+       return rc;
+}
+
+/*
+ * hl_wait_multi_cs_completion_init - init completion structure
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @stream_master_bitmap: stream master QIDs map, set bit indicates stream
+ *                        master QID to wait on
+ *
+ * @return valid completion struct pointer on success, otherwise error pointer
+ *
+ * Up to MULTI_CS_MAX_USER_CTX calls can be made to the driver concurrently.
+ * The function grabs the first available completion (by marking it "used")
+ * and initializes its values.
+ */
+static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
+                                                       struct hl_device *hdev,
+                                                       u8 stream_master_bitmap)
+{
+       struct multi_cs_completion *mcs_compl;
+       int i;
+
+       /* find free multi_cs completion structure */
+       for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+               mcs_compl = &hdev->multi_cs_completion[i];
+               spin_lock(&mcs_compl->lock);
+               if (!mcs_compl->used) {
+                       mcs_compl->used = 1;
+                       mcs_compl->timestamp = 0;
+                       mcs_compl->stream_master_qid_map = stream_master_bitmap;
+                       reinit_completion(&mcs_compl->completion);
+                       spin_unlock(&mcs_compl->lock);
+                       break;
                }
+               spin_unlock(&mcs_compl->lock);
+       }
 
-               if (fence->error == -ETIMEDOUT)
-                       rc = -ETIMEDOUT;
-               else if (fence->error == -EIO)
-                       rc = -EIO;
+       if (i == MULTI_CS_MAX_USER_CTX) {
+               dev_err(hdev->dev,
+                               "no available multi-CS completion structure\n");
+               return ERR_PTR(-ENOMEM);
+       }
+       return mcs_compl;
+}
 
-               hl_fence_put(fence);
-       } else {
-               dev_dbg(hdev->dev,
-                       "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
-                       seq, ctx->cs_sequence);
-               *status = CS_WAIT_STATUS_GONE;
+/*
+ * hl_wait_multi_cs_completion_fini - return completion structure and set as
+ *                                    unused
+ *
+ * @mcs_compl: pointer to the completion structure
+ */
+static void hl_wait_multi_cs_completion_fini(
+                                       struct multi_cs_completion *mcs_compl)
+{
+       /*
+        * Free the completion structure; do it under the lock to stay in sync
+        * with the thread that signals completion.
+        */
+       spin_lock(&mcs_compl->lock);
+       mcs_compl->used = 0;
+       spin_unlock(&mcs_compl->lock);
+}
+
+/*
+ * hl_wait_multi_cs_completion - wait for first CS to complete
+ *
+ * @mcs_data: multi-CS internal data
+ *
+ * @return 0 on success, otherwise non 0 error code
+ */
+static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
+{
+       struct hl_device *hdev = mcs_data->ctx->hdev;
+       struct multi_cs_completion *mcs_compl;
+       long completion_rc;
+
+       mcs_compl = hl_wait_multi_cs_completion_init(hdev,
+                                       mcs_data->stream_master_qid_map);
+       if (IS_ERR(mcs_compl))
+               return PTR_ERR(mcs_compl);
+
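+       /* Wait for the first relevant CS completion to be signaled, or for the
+        * user timeout to expire.
+        */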
+       completion_rc = wait_for_completion_interruptible_timeout(
+                                       &mcs_compl->completion,
+                                       usecs_to_jiffies(mcs_data->timeout_us));
+
+       /* update timestamp */
+       if (completion_rc > 0)
+               mcs_data->timestamp = mcs_compl->timestamp;
+
+       hl_wait_multi_cs_completion_fini(mcs_compl);
+
+       mcs_data->wait_status = completion_rc;
+
+       return 0;
+}
+
+/*
+ * hl_multi_cs_completion_init - init array of multi-CS completion structures
+ *
+ * @hdev: pointer to habanalabs device structure
+ */
+void hl_multi_cs_completion_init(struct hl_device *hdev)
+{
+       struct multi_cs_completion *mcs_cmpl;
+       int i;
+
+       for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
+               mcs_cmpl = &hdev->multi_cs_completion[i];
+               mcs_cmpl->used = 0;
+               spin_lock_init(&mcs_cmpl->lock);
+               init_completion(&mcs_cmpl->completion);
+       }
+}
+
+/*
+ * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
+ *
+ * @hpriv: pointer to the private data of the fd
+ * @data: pointer to multi-CS wait ioctl in/out args
+ *
+ */
+static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+       struct hl_device *hdev = hpriv->hdev;
+       struct multi_cs_data mcs_data = {0};
+       union hl_wait_cs_args *args = data;
+       struct hl_ctx *ctx = hpriv->ctx;
+       struct hl_fence **fence_arr;
+       void __user *seq_arr;
+       u32 size_to_copy;
+       u64 *cs_seq_arr;
+       u8 seq_arr_len;
+       int rc;
+
+       if (!hdev->supports_wait_for_multi_cs) {
+               dev_err(hdev->dev, "Wait for multi CS is not supported\n");
+               return -EPERM;
+       }
+
+       seq_arr_len = args->in.seq_arr_len;
+
+       if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
+               dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
+                               HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
+               return -EINVAL;
+       }
+
+       /* allocate memory for sequence array */
+       cs_seq_arr =
+               kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
+       if (!cs_seq_arr)
+               return -ENOMEM;
+
+       /* copy CS sequence array from user */
+       seq_arr = (void __user *) (uintptr_t) args->in.seq;
+       size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
+       if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
+               dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
+               rc = -EFAULT;
+               goto free_seq_arr;
+       }
+
+       /* allocate array for the fences */
+       fence_arr = kmalloc_array(seq_arr_len, sizeof(*fence_arr), GFP_KERNEL);
+       if (!fence_arr) {
+               rc = -ENOMEM;
+               goto free_seq_arr;
+       }
+
+       /* initialize the multi-CS internal data */
+       mcs_data.ctx = ctx;
+       mcs_data.seq_arr = cs_seq_arr;
+       mcs_data.fence_arr = fence_arr;
+       mcs_data.arr_len = seq_arr_len;
+
+       hl_ctx_get(hdev, ctx);
+
+       /* poll all CS fences, extract timestamp */
+       mcs_data.update_ts = true;
+       rc = hl_cs_poll_fences(&mcs_data);
+       /*
+        * Skip waiting for CS completion when any of the below is true:
+        * - the poll function returned an error
+        * - one or more CSs in the list have already completed
+        * - the user called the ioctl with timeout 0
+        */
+       if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
+               goto put_ctx;
+
+       /* wait (with timeout) for the first CS to be completed */
+       mcs_data.timeout_us = args->in.timeout_us;
+       rc = hl_wait_multi_cs_completion(&mcs_data);
+       if (rc)
+               goto put_ctx;
+
+       if (mcs_data.wait_status > 0) {
+               /*
+                * Poll the fences once again to update the CS completion map.
+                * No timestamp should be updated this time.
+                */
+               mcs_data.update_ts = false;
+               rc = hl_cs_poll_fences(&mcs_data);
+
+               /*
+                * if hl_wait_multi_cs_completion returned before timeout (i.e.
+                * it got a completion) we expect to see at least one CS
+                * completed after the poll function.
+                */
+               if (!mcs_data.completion_bitmap) {
+                       dev_err(hdev->dev, "Multi-CS got completion on wait but no CS completed\n");
+                       rc = -EFAULT;
+               }
        }
 
+put_ctx:
        hl_ctx_put(ctx);
+       kfree(fence_arr);
 
-       return rc;
+free_seq_arr:
+       kfree(cs_seq_arr);
+
+       /* update output args */
+       memset(args, 0, sizeof(*args));
+       if (rc)
+               return rc;
+
+       if (mcs_data.completion_bitmap) {
+               args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
+               args->out.cs_completion_map = mcs_data.completion_bitmap;
+
+               /* if the timestamp is not 0, it's valid */
+               if (mcs_data.timestamp) {
+                       args->out.timestamp_nsec = mcs_data.timestamp;
+                       args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
+               }
+
+               /* update if some CS was gone */
+               if (mcs_data.gone_cs)
+                       args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
+       } else if (mcs_data.wait_status == -ERESTARTSYS) {
+               args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
+       } else {
+               args->out.status = HL_WAIT_CS_STATUS_BUSY;
+       }
+
+       return 0;
 }
 
 static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
@@ -2015,9 +2715,9 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 {
        struct hl_user_pending_interrupt *pend;
        struct hl_user_interrupt *interrupt;
-       unsigned long timeout;
-       long completion_rc;
+       unsigned long timeout, flags;
        u32 completion_value;
+       long completion_rc;
        int rc = 0;
 
        if (timeout_us == U32_MAX)
@@ -2040,17 +2740,10 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
        else
                interrupt = &hdev->user_interrupt[interrupt_offset];
 
-       spin_lock(&interrupt->wait_list_lock);
-       if (!hl_device_operational(hdev, NULL)) {
-               rc = -EPERM;
-               goto unlock_and_free_fence;
-       }
-
        if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) {
-               dev_err(hdev->dev,
-                       "Failed to copy completion value from user\n");
+               dev_err(hdev->dev, "Failed to copy completion value from user\n");
                rc = -EFAULT;
-               goto unlock_and_free_fence;
+               goto free_fence;
        }
 
        if (completion_value >= target_value)
@@ -2059,48 +2752,57 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
                *status = CS_WAIT_STATUS_BUSY;
 
        if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED))
-               goto unlock_and_free_fence;
+               goto free_fence;
 
        /* Add pending user interrupt to relevant list for the interrupt
         * handler to monitor
         */
+       spin_lock_irqsave(&interrupt->wait_list_lock, flags);
        list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
-       spin_unlock(&interrupt->wait_list_lock);
+       spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
 
 wait_again:
        /* Wait for interrupt handler to signal completion */
-       completion_rc =
-               wait_for_completion_interruptible_timeout(
-                               &pend->fence.completion, timeout);
+       completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
+                                                                               timeout);
 
        /* If timeout did not expire we need to perform the comparison.
         * If comparison fails, keep waiting until timeout expires
         */
        if (completion_rc > 0) {
-               if (copy_from_user(&completion_value,
-                               u64_to_user_ptr(user_address), 4)) {
-                       dev_err(hdev->dev,
-                               "Failed to copy completion value from user\n");
+               if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) {
+                       dev_err(hdev->dev, "Failed to copy completion value from user\n");
                        rc = -EFAULT;
+
                        goto remove_pending_user_interrupt;
                }
 
                if (completion_value >= target_value) {
                        *status = CS_WAIT_STATUS_COMPLETED;
                } else {
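+                       /* Target value not reached yet - re-arm the completion
+                        * under the wait-list lock and keep waiting with the
+                        * remaining timeout.
+                        */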
+                       spin_lock_irqsave(&interrupt->wait_list_lock, flags);
+                       reinit_completion(&pend->fence.completion);
                        timeout = completion_rc;
+
+                       spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
                        goto wait_again;
                }
+       } else if (completion_rc == -ERESTARTSYS) {
+               dev_err_ratelimited(hdev->dev,
+                       "user process got signal while waiting for interrupt ID %d\n",
+                       interrupt->interrupt_id);
+               *status = HL_WAIT_CS_STATUS_INTERRUPTED;
+               rc = -EINTR;
        } else {
                *status = CS_WAIT_STATUS_BUSY;
        }
 
 remove_pending_user_interrupt:
-       spin_lock(&interrupt->wait_list_lock);
+       spin_lock_irqsave(&interrupt->wait_list_lock, flags);
        list_del(&pend->wait_list_node);
+       spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
 
-unlock_and_free_fence:
-       spin_unlock(&interrupt->wait_list_lock);
+free_fence:
        kfree(pend);
        hl_ctx_put(ctx);
 
@@ -2148,8 +2850,9 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
        memset(args, 0, sizeof(*args));
 
        if (rc) {
-               dev_err_ratelimited(hdev->dev,
-                       "interrupt_wait_ioctl failed (%d)\n", rc);
+               if (rc != -EINTR)
+                       dev_err_ratelimited(hdev->dev,
+                               "interrupt_wait_ioctl failed (%d)\n", rc);
 
                return rc;
        }
@@ -2173,8 +2876,16 @@ int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
        u32 flags = args->in.flags;
        int rc;
 
+       /* If the device is not operational, no point in waiting for any command submission or
+        * user interrupt
+        */
+       if (!hl_device_operational(hpriv->hdev, NULL))
+               return -EPERM;
+
        if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
                rc = hl_interrupt_wait_ioctl(hpriv, data);
+       else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
+               rc = hl_multi_cs_wait_ioctl(hpriv, data);
        else
                rc = hl_cs_wait_ioctl(hpriv, data);
 
index 19b6b04..2297830 100644 (file)
@@ -9,16 +9,70 @@
 
 #include <linux/slab.h>
 
+void hl_encaps_handle_do_release(struct kref *ref)
+{
+       struct hl_cs_encaps_sig_handle *handle =
+               container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
+       struct hl_ctx *ctx = handle->hdev->compute_ctx;
+       struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
+
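+       /* Last reference to the handle is gone - remove it from the signals
+        * manager IDR under the lock and free it.
+        */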
+       spin_lock(&mgr->lock);
+       idr_remove(&mgr->handles, handle->id);
+       spin_unlock(&mgr->lock);
+
+       kfree(handle);
+}
+
+static void hl_encaps_handle_do_release_sob(struct kref *ref)
+{
+       struct hl_cs_encaps_sig_handle *handle =
+               container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
+       struct hl_ctx *ctx = handle->hdev->compute_ctx;
+       struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
+
+       /* If we're here, then there was a signals reservation but a CS with
+        * encaps signals was never submitted, so we need to put the refcount
+        * on the hw_sob that was taken at reservation time.
+        */
+       hw_sob_put(handle->hw_sob);
+
+       spin_lock(&mgr->lock);
+       idr_remove(&mgr->handles, handle->id);
+       spin_unlock(&mgr->lock);
+
+       kfree(handle);
+}
+
+static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
+{
+       spin_lock_init(&mgr->lock);
+       idr_init(&mgr->handles);
+}
+
+static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
+                       struct hl_encaps_signals_mgr *mgr)
+{
+       struct hl_cs_encaps_sig_handle *handle;
+       struct idr *idp;
+       u32 id;
+
+       idp = &mgr->handles;
+
+       if (!idr_is_empty(idp)) {
+               dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
+               idr_for_each_entry(idp, handle, id)
+                       kref_put(&handle->refcount,
+                                       hl_encaps_handle_do_release_sob);
+       }
+
+       idr_destroy(&mgr->handles);
+}
+
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
        struct hl_device *hdev = ctx->hdev;
        int i;
 
-       /* Release all allocated pending cb's, those cb's were never
-        * scheduled so it is safe to release them here
-        */
-       hl_pending_cb_list_flush(ctx);
-
        /* Release all allocated HW block mapped list entries and destroy
         * the mutex.
         */
@@ -53,6 +107,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
                hl_cb_va_pool_fini(ctx);
                hl_vm_ctx_fini(ctx);
                hl_asid_free(hdev, ctx->asid);
+               hl_encaps_sig_mgr_fini(hdev, &ctx->sig_mgr);
 
                /* Scrub both SRAM and DRAM */
                hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
@@ -130,9 +185,6 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx)
 {
        if (kref_put(&ctx->refcount, hl_ctx_do_release) == 1)
                return;
-
-       dev_warn(hdev->dev,
-               "user process released device but its command submissions are still executing\n");
 }
 
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
@@ -144,11 +196,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
        kref_init(&ctx->refcount);
 
        ctx->cs_sequence = 1;
-       INIT_LIST_HEAD(&ctx->pending_cb_list);
-       spin_lock_init(&ctx->pending_cb_lock);
        spin_lock_init(&ctx->cs_lock);
        atomic_set(&ctx->thread_ctx_switch_token, 1);
-       atomic_set(&ctx->thread_pending_cb_token, 1);
        ctx->thread_ctx_switch_wait_token = 0;
        ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
                                sizeof(struct hl_fence *),
@@ -200,6 +249,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
                        goto err_cb_va_pool_fini;
                }
 
+               hl_encaps_sig_mgr_init(&ctx->sig_mgr);
+
                dev_dbg(hdev->dev, "create user context %d\n", ctx->asid);
        }
 
@@ -229,31 +280,86 @@ int hl_ctx_put(struct hl_ctx *ctx)
        return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
-struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+/*
+ * hl_ctx_get_fence_locked - get CS fence under CS lock
+ *
+ * @ctx: pointer to the context structure.
+ * @seq: CS sequence number
+ *
+ * @return valid fence pointer on success, NULL if fence is gone, otherwise
+ *         error pointer.
+ *
+ * NOTE: this function shall be called with cs_lock locked
+ */
+static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq)
 {
        struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
        struct hl_fence *fence;
 
-       spin_lock(&ctx->cs_lock);
-
-       if (seq >= ctx->cs_sequence) {
-               spin_unlock(&ctx->cs_lock);
+       if (seq >= ctx->cs_sequence)
                return ERR_PTR(-EINVAL);
-       }
 
-       if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
-               spin_unlock(&ctx->cs_lock);
+       if (seq + asic_prop->max_pending_cs < ctx->cs_sequence)
                return NULL;
-       }
 
        fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
        hl_fence_get(fence);
+       return fence;
+}
+
+struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+{
+       struct hl_fence *fence;
+
+       spin_lock(&ctx->cs_lock);
+
+       fence = hl_ctx_get_fence_locked(ctx, seq);
 
        spin_unlock(&ctx->cs_lock);
 
        return fence;
 }
 
+/*
+ * hl_ctx_get_fences - get multiple CS fences under the same CS lock
+ *
+ * @ctx: pointer to the context structure.
+ * @seq_arr: array of CS sequences to wait for
+ * @fence: fence array to store the CS fences
+ * @arr_len: length of seq_arr and fence_arr
+ *
+ * @return 0 on success, otherwise non 0 error code
+ */
+int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
+                               struct hl_fence **fence, u32 arr_len)
+{
+       struct hl_fence **fence_arr_base = fence;
+       int i, rc = 0;
+
+       spin_lock(&ctx->cs_lock);
+
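+       /* Fetch all fences under a single cs_lock acquisition; on failure, the
+        * references that were already taken are dropped below.
+        */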
+       for (i = 0; i < arr_len; i++, fence++) {
+               u64 seq = seq_arr[i];
+
+               *fence = hl_ctx_get_fence_locked(ctx, seq);
+
+               if (IS_ERR(*fence)) {
+                       dev_err(ctx->hdev->dev,
+                               "Failed to get fence for CS with seq 0x%llx\n",
+                                       seq);
+                       rc = PTR_ERR(*fence);
+                       break;
+               }
+       }
+
+       spin_unlock(&ctx->cs_lock);
+
+       if (rc)
+               hl_fences_put(fence_arr_base, i);
+
+       return rc;
+}
+
 /*
  * hl_ctx_mgr_init - initialize the context manager
  *
index 703d79f..985f1f3 100644 (file)
@@ -209,12 +209,12 @@ static int userptr_show(struct seq_file *s, void *data)
                if (first) {
                        first = false;
                        seq_puts(s, "\n");
-                       seq_puts(s, " user virtual address     size             dma dir\n");
+                       seq_puts(s, " pid      user virtual address     size             dma dir\n");
                        seq_puts(s, "----------------------------------------------------------\n");
                }
-               seq_printf(s,
-                       "    0x%-14llx      %-10u    %-30s\n",
-                       userptr->addr, userptr->size, dma_dir[userptr->dir]);
+               seq_printf(s, " %-7d  0x%-14llx      %-10llu    %-30s\n",
+                               userptr->pid, userptr->addr, userptr->size,
+                               dma_dir[userptr->dir]);
        }
 
        spin_unlock(&dev_entry->userptr_spinlock);
@@ -235,7 +235,7 @@ static int vm_show(struct seq_file *s, void *data)
        struct hl_vm_hash_node *hnode;
        struct hl_userptr *userptr;
        struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
-       enum vm_type_t *vm_type;
+       enum vm_type *vm_type;
        bool once = true;
        u64 j;
        int i;
@@ -261,7 +261,7 @@ static int vm_show(struct seq_file *s, void *data)
                        if (*vm_type == VM_TYPE_USERPTR) {
                                userptr = hnode->ptr;
                                seq_printf(s,
-                                       "    0x%-14llx      %-10u\n",
+                                       "    0x%-14llx      %-10llu\n",
                                        hnode->vaddr, userptr->size);
                        } else {
                                phys_pg_pack = hnode->ptr;
@@ -320,6 +320,77 @@ static int vm_show(struct seq_file *s, void *data)
        return 0;
 }
 
+static int userptr_lookup_show(struct seq_file *s, void *data)
+{
+       struct hl_debugfs_entry *entry = s->private;
+       struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+       struct scatterlist *sg;
+       struct hl_userptr *userptr;
+       bool first = true;
+       u64 total_npages, npages, sg_start, sg_end;
+       dma_addr_t dma_addr;
+       int i;
+
+       spin_lock(&dev_entry->userptr_spinlock);
+
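+       /* Find the pinned userptr region that contains the looked-up address
+        * and translate it to a DMA address by walking its SG list.
+        */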
+       list_for_each_entry(userptr, &dev_entry->userptr_list, debugfs_list) {
+               if (dev_entry->userptr_lookup >= userptr->addr &&
+               dev_entry->userptr_lookup < userptr->addr + userptr->size) {
+                       total_npages = 0;
+                       for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents,
+                                       i) {
+                               npages = hl_get_sg_info(sg, &dma_addr);
+                               sg_start = userptr->addr +
+                                       total_npages * PAGE_SIZE;
+                               sg_end = userptr->addr +
+                                       (total_npages + npages) * PAGE_SIZE;
+
+                               if (dev_entry->userptr_lookup >= sg_start &&
+                                   dev_entry->userptr_lookup < sg_end) {
+                                       dma_addr += (dev_entry->userptr_lookup -
+                                                       sg_start);
+                                       if (first) {
+                                               first = false;
+                                               seq_puts(s, "\n");
+                                               seq_puts(s, " user virtual address         dma address       pid        region start     region size\n");
+                                               seq_puts(s, "---------------------------------------------------------------------------------------\n");
+                                       }
+                                       seq_printf(s, " 0x%-18llx  0x%-16llx  %-8u  0x%-16llx %-12llu\n",
+                                               dev_entry->userptr_lookup,
+                                               (u64)dma_addr, userptr->pid,
+                                               userptr->addr, userptr->size);
+                               }
+                               total_npages += npages;
+                       }
+               }
+       }
+
+       spin_unlock(&dev_entry->userptr_spinlock);
+
+       if (!first)
+               seq_puts(s, "\n");
+
+       return 0;
+}
+
+static ssize_t userptr_lookup_write(struct file *file, const char __user *buf,
+               size_t count, loff_t *f_pos)
+{
+       struct seq_file *s = file->private_data;
+       struct hl_debugfs_entry *entry = s->private;
+       struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+       ssize_t rc;
+       u64 value;
+
+       rc = kstrtoull_from_user(buf, count, 16, &value);
+       if (rc)
+               return rc;
+
+       dev_entry->userptr_lookup = value;
+
+       return count;
+}
+
 static int mmu_show(struct seq_file *s, void *data)
 {
        struct hl_debugfs_entry *entry = s->private;
@@ -349,7 +420,7 @@ static int mmu_show(struct seq_file *s, void *data)
                return 0;
        }
 
-       phys_addr = hops_info.hop_info[hops_info.used_hops - 1].hop_pte_val;
+       hl_mmu_va_to_pa(ctx, virt_addr, &phys_addr);
 
        if (hops_info.scrambled_vaddr &&
                (dev_entry->mmu_addr != hops_info.scrambled_vaddr))
@@ -491,11 +562,10 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size,
        struct hl_vm_phys_pg_pack *phys_pg_pack;
        struct hl_ctx *ctx = hdev->compute_ctx;
        struct hl_vm_hash_node *hnode;
+       u64 end_address, range_size;
        struct hl_userptr *userptr;
-       enum vm_type_t *vm_type;
+       enum vm_type *vm_type;
        bool valid = false;
-       u64 end_address;
-       u32 range_size;
        int i, rc = 0;
 
        if (!ctx) {
@@ -1043,6 +1113,60 @@ static ssize_t hl_security_violations_read(struct file *f, char __user *buf,
        return 0;
 }
 
+static ssize_t hl_state_dump_read(struct file *f, char __user *buf,
+                                       size_t count, loff_t *ppos)
+{
+       struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+       ssize_t rc;
+
+       down_read(&entry->state_dump_sem);
+       if (!entry->state_dump[entry->state_dump_head])
+               rc = 0;
+       else
+               rc = simple_read_from_buffer(
+                       buf, count, ppos,
+                       entry->state_dump[entry->state_dump_head],
+                       strlen(entry->state_dump[entry->state_dump_head]));
+       up_read(&entry->state_dump_sem);
+
+       return rc;
+}
+
+static ssize_t hl_state_dump_write(struct file *f, const char __user *buf,
+                                       size_t count, loff_t *ppos)
+{
+       struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+       struct hl_device *hdev = entry->hdev;
+       ssize_t rc;
+       u32 size;
+       int i;
+
+       rc = kstrtouint_from_user(buf, count, 10, &size);
+       if (rc)
+               return rc;
+
+       if (size <= 0 || size >= ARRAY_SIZE(entry->state_dump)) {
+               dev_err(hdev->dev, "Invalid number of dumps to skip\n");
+               return -EINVAL;
+       }
+
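+       /* The written value is the number of dumps to discard, starting from
+        * the most recent; free them and move the head backwards accordingly.
+        */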
+       if (entry->state_dump[entry->state_dump_head]) {
+               down_write(&entry->state_dump_sem);
+               for (i = 0; i < size; ++i) {
+                       vfree(entry->state_dump[entry->state_dump_head]);
+                       entry->state_dump[entry->state_dump_head] = NULL;
+                       if (entry->state_dump_head > 0)
+                               entry->state_dump_head--;
+                       else
+                               entry->state_dump_head =
+                                       ARRAY_SIZE(entry->state_dump) - 1;
+               }
+               up_write(&entry->state_dump_sem);
+       }
+
+       return count;
+}
+
 static const struct file_operations hl_data32b_fops = {
        .owner = THIS_MODULE,
        .read = hl_data_read32,
@@ -1110,12 +1234,19 @@ static const struct file_operations hl_security_violations_fops = {
        .read = hl_security_violations_read
 };
 
+static const struct file_operations hl_state_dump_fops = {
+       .owner = THIS_MODULE,
+       .read = hl_state_dump_read,
+       .write = hl_state_dump_write
+};
+
 static const struct hl_info_list hl_debugfs_list[] = {
        {"command_buffers", command_buffers_show, NULL},
        {"command_submission", command_submission_show, NULL},
        {"command_submission_jobs", command_submission_jobs_show, NULL},
        {"userptr", userptr_show, NULL},
        {"vm", vm_show, NULL},
+       {"userptr_lookup", userptr_lookup_show, userptr_lookup_write},
        {"mmu", mmu_show, mmu_asid_va_write},
        {"engines", engines_show, NULL}
 };
@@ -1172,6 +1303,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
        INIT_LIST_HEAD(&dev_entry->userptr_list);
        INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list);
        mutex_init(&dev_entry->file_mutex);
+       init_rwsem(&dev_entry->state_dump_sem);
        spin_lock_init(&dev_entry->cb_spinlock);
        spin_lock_init(&dev_entry->cs_spinlock);
        spin_lock_init(&dev_entry->cs_job_spinlock);
@@ -1283,6 +1415,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
                                dev_entry->root,
                                &hdev->skip_reset_on_timeout);
 
+       debugfs_create_file("state_dump",
+                               0600,
+                               dev_entry->root,
+                               dev_entry,
+                               &hl_state_dump_fops);
+
        for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
                debugfs_create_file(hl_debugfs_list[i].name,
                                        0444,
@@ -1297,6 +1435,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 void hl_debugfs_remove_device(struct hl_device *hdev)
 {
        struct hl_dbg_device_entry *entry = &hdev->hl_debugfs;
+       int i;
 
        debugfs_remove_recursive(entry->root);
 
@@ -1304,6 +1443,9 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
 
        vfree(entry->blob_desc.data);
 
+       for (i = 0; i < ARRAY_SIZE(entry->state_dump); ++i)
+               vfree(entry->state_dump[i]);
+
        kfree(entry->entry_arr);
 }
 
@@ -1416,6 +1558,28 @@ void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx)
        spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
 }
 
+/**
+ * hl_debugfs_set_state_dump - register state dump making it accessible via
+ *                             debugfs
+ * @hdev: pointer to the device structure
+ * @data: the actual dump data
+ * @length: the length of the data
+ */
+void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
+                                       unsigned long length)
+{
+       struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs;
+
+       down_write(&dev_entry->state_dump_sem);
+
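+       /* Advance the circular buffer head and store the new dump there,
+        * freeing whatever older dump occupied that slot.
+        */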
+       dev_entry->state_dump_head = (dev_entry->state_dump_head + 1) %
+                                       ARRAY_SIZE(dev_entry->state_dump);
+       vfree(dev_entry->state_dump[dev_entry->state_dump_head]);
+       dev_entry->state_dump[dev_entry->state_dump_head] = data;
+
+       up_write(&dev_entry->state_dump_sem);
+}
+
 void __init hl_debugfs_init(void)
 {
        hl_debug_root = debugfs_create_dir("habanalabs", NULL);
index ff4cbde..97c7c86 100644 (file)
@@ -7,11 +7,11 @@
 
 #define pr_fmt(fmt)                    "habanalabs: " fmt
 
+#include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
 
 #include <linux/pci.h>
 #include <linux/hwmon.h>
-#include <uapi/misc/habanalabs.h>
 
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
@@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
                status = HL_DEVICE_STATUS_NEEDS_RESET;
        else if (hdev->disabled)
                status = HL_DEVICE_STATUS_MALFUNCTION;
+       else if (!hdev->init_done)
+               status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
        else
                status = HL_DEVICE_STATUS_OPERATIONAL;
 
@@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev,
        case HL_DEVICE_STATUS_NEEDS_RESET:
                return false;
        case HL_DEVICE_STATUS_OPERATIONAL:
+       case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
        default:
                return true;
        }
@@ -129,8 +132,8 @@ static int hl_device_release(struct inode *inode, struct file *filp)
        hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
 
        if (!hl_hpriv_put(hpriv))
-               dev_warn(hdev->dev,
-                       "Device is still in use because there are live CS and/or memory mappings\n");
+               dev_notice(hdev->dev,
+                       "User process closed FD but device still in use\n");
 
        hdev->last_open_session_duration_jif =
                jiffies - hdev->last_successful_open_jif;
@@ -308,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work)
                container_of(work, struct hl_device_reset_work,
                                reset_work.work);
        struct hl_device *hdev = device_reset_work->hdev;
+       u32 flags;
        int rc;
 
-       rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD);
+       flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
+
+       if (device_reset_work->fw_reset)
+               flags |= HL_RESET_FW;
+
+       rc = hl_device_reset(hdev, flags);
        if ((rc == -EBUSY) && !hdev->device_fini_pending) {
                dev_info(hdev->dev,
                        "Could not reset device. will try again in %u seconds",
@@ -682,6 +691,44 @@ out:
        return rc;
 }
 
+static void take_release_locks(struct hl_device *hdev)
+{
+       /* Flush anyone that is inside the critical section of enqueue
+        * jobs to the H/W
+        */
+       hdev->asic_funcs->hw_queues_lock(hdev);
+       hdev->asic_funcs->hw_queues_unlock(hdev);
+
+       /* Flush processes that are sending message to CPU */
+       mutex_lock(&hdev->send_cpu_message_lock);
+       mutex_unlock(&hdev->send_cpu_message_lock);
+
+       /* Flush anyone that is inside device open */
+       mutex_lock(&hdev->fpriv_list_lock);
+       mutex_unlock(&hdev->fpriv_list_lock);
+}
+
+static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+{
+       if (hard_reset)
+               device_late_fini(hdev);
+
+       /*
+        * Halt the engines and disable interrupts so we won't get any more
+        * completions from H/W and we won't have any accesses from the
+        * H/W to the host machine
+        */
+       hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
+
+       /* Go over all the queues, release all CS and their jobs */
+       hl_cs_rollback_all(hdev);
+
+       /* Release all pending user interrupts, each pending user interrupt
+        * holds a reference to user context
+        */
+       hl_release_pending_user_interrupts(hdev);
+}
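
The empty lock/unlock pairs in take_release_locks() act as a flush barrier only because the flushed paths test the disable flag while holding the same lock; a simplified sketch of the matching submit-side pattern (not the exact driver code):

	/* submit path, simplified */
	hdev->asic_funcs->hw_queues_lock(hdev);

	if (hdev->disabled) {
		/* hdev->disabled was set before take_release_locks() ran,
		 * so nothing new is queued once the flush returns
		 */
		hdev->asic_funcs->hw_queues_unlock(hdev);
		return -EBUSY;
	}

	/* ... write the job's buffer descriptor to the H/W queue ... */

	hdev->asic_funcs->hw_queues_unlock(hdev);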
+
 /*
  * hl_device_suspend - initiate device suspend
  *
@@ -707,16 +754,7 @@ int hl_device_suspend(struct hl_device *hdev)
        /* This blocks all other stuff that is not blocked by in_reset */
        hdev->disabled = true;
 
-       /*
-        * Flush anyone that is inside the critical section of enqueue
-        * jobs to the H/W
-        */
-       hdev->asic_funcs->hw_queues_lock(hdev);
-       hdev->asic_funcs->hw_queues_unlock(hdev);
-
-       /* Flush processes that are sending message to CPU */
-       mutex_lock(&hdev->send_cpu_message_lock);
-       mutex_unlock(&hdev->send_cpu_message_lock);
+       take_release_locks(hdev);
 
        rc = hdev->asic_funcs->suspend(hdev);
        if (rc)
@@ -819,6 +857,11 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
                        usleep_range(1000, 10000);
 
                        put_task_struct(task);
+               } else {
+                       dev_warn(hdev->dev,
+                               "Can't get task struct for PID so giving up on killing process\n");
+                       mutex_unlock(&hdev->fpriv_list_lock);
+                       return -ETIME;
                }
        }
 
@@ -885,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
        u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-       bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
+       bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
        int i, rc;
 
        if (!hdev->init_done) {
@@ -894,8 +937,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
                return 0;
        }
 
-       hard_reset = (flags & HL_RESET_HARD) != 0;
-       from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
+       hard_reset = !!(flags & HL_RESET_HARD);
+       from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
+       fw_reset = !!(flags & HL_RESET_FW);
 
        if (!hard_reset && !hdev->supports_soft_reset) {
                hard_instead_soft = true;
@@ -947,11 +991,13 @@ do_reset:
                else
                        hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
 
-               /*
-                * if reset is due to heartbeat, device CPU is no responsive in
-                * which case no point sending PCI disable message to it
+               /* If reset is due to heartbeat, the device CPU is not responsive,
+                * in which case there is no point sending it a PCI disable message.
+                *
+                * If F/W is performing the reset, no need to send it a message to disable
+                * PCI access
                 */
-               if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) {
+               if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
                        /* Disable PCI access from device F/W so he won't send
                         * us additional interrupts. We disable MSI/MSI-X at
                         * the halt_engines function and we can't have the F/W
@@ -970,15 +1016,7 @@ do_reset:
                /* This also blocks future CS/VM/JOB completion operations */
                hdev->disabled = true;
 
-               /* Flush anyone that is inside the critical section of enqueue
-                * jobs to the H/W
-                */
-               hdev->asic_funcs->hw_queues_lock(hdev);
-               hdev->asic_funcs->hw_queues_unlock(hdev);
-
-               /* Flush anyone that is inside device open */
-               mutex_lock(&hdev->fpriv_list_lock);
-               mutex_unlock(&hdev->fpriv_list_lock);
+               take_release_locks(hdev);
 
                dev_err(hdev->dev, "Going to RESET device!\n");
        }
@@ -989,6 +1027,8 @@ again:
 
                hdev->process_kill_trial_cnt = 0;
 
+               hdev->device_reset_work.fw_reset = fw_reset;
+
                /*
                 * Because the reset function can't run from heartbeat work,
                 * we need to call the reset function from a dedicated work.
@@ -999,31 +1039,7 @@ again:
                return 0;
        }
 
-       if (hard_reset) {
-               device_late_fini(hdev);
-
-               /*
-                * Now that the heartbeat thread is closed, flush processes
-                * which are sending messages to CPU
-                */
-               mutex_lock(&hdev->send_cpu_message_lock);
-               mutex_unlock(&hdev->send_cpu_message_lock);
-       }
-
-       /*
-        * Halt the engines and disable interrupts so we won't get any more
-        * completions from H/W and we won't have any accesses from the
-        * H/W to the host machine
-        */
-       hdev->asic_funcs->halt_engines(hdev, hard_reset);
-
-       /* Go over all the queues, release all CS and their jobs */
-       hl_cs_rollback_all(hdev);
-
-       /* Release all pending user interrupts, each pending user interrupt
-        * holds a reference to user context
-        */
-       hl_release_pending_user_interrupts(hdev);
+       cleanup_resources(hdev, hard_reset, fw_reset);
 
 kill_processes:
        if (hard_reset) {
@@ -1057,12 +1073,15 @@ kill_processes:
        }
 
        /* Reset the H/W. It will be in idle state after this returns */
-       hdev->asic_funcs->hw_fini(hdev, hard_reset);
+       hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
 
        if (hard_reset) {
+               hdev->fw_loader.linux_loaded = false;
+
                /* Release kernel context */
                if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
                        hdev->kernel_ctx = NULL;
+
                hl_vm_fini(hdev);
                hl_mmu_fini(hdev);
                hl_eq_reset(hdev, &hdev->event_queue);
@@ -1292,6 +1311,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
        if (rc)
                goto user_interrupts_fini;
 
+
+       /* initialize completion structure for multi CS wait */
+       hl_multi_cs_completion_init(hdev);
+
        /*
         * Initialize the H/W queues. Must be done before hw_init, because
         * there the addresses of the kernel queue are being written to the
@@ -1361,6 +1384,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
        hdev->compute_ctx = NULL;
 
+       hdev->asic_funcs->state_dump_init(hdev);
+
        hl_debugfs_add_device(hdev);
 
        /* debugfs nodes are created in hl_ctx_init so it must be called after
@@ -1567,31 +1592,13 @@ void hl_device_fini(struct hl_device *hdev)
        /* Mark device as disabled */
        hdev->disabled = true;
 
-       /* Flush anyone that is inside the critical section of enqueue
-        * jobs to the H/W
-        */
-       hdev->asic_funcs->hw_queues_lock(hdev);
-       hdev->asic_funcs->hw_queues_unlock(hdev);
-
-       /* Flush anyone that is inside device open */
-       mutex_lock(&hdev->fpriv_list_lock);
-       mutex_unlock(&hdev->fpriv_list_lock);
+       take_release_locks(hdev);
 
        hdev->hard_reset_pending = true;
 
        hl_hwmon_fini(hdev);
 
-       device_late_fini(hdev);
-
-       /*
-        * Halt the engines and disable interrupts so we won't get any more
-        * completions from H/W and we won't have any accesses from the
-        * H/W to the host machine
-        */
-       hdev->asic_funcs->halt_engines(hdev, true);
-
-       /* Go over all the queues, release all CS and their jobs */
-       hl_cs_rollback_all(hdev);
+       cleanup_resources(hdev, true, false);
 
        /* Kill processes here after CS rollback. This is because the process
         * can't really exit until all its CSs are done, which is what we
@@ -1610,7 +1617,9 @@ void hl_device_fini(struct hl_device *hdev)
        hl_cb_pool_fini(hdev);
 
        /* Reset the H/W. It will be in idle state after this returns */
-       hdev->asic_funcs->hw_fini(hdev, true);
+       hdev->asic_funcs->hw_fini(hdev, true, false);
+
+       hdev->fw_loader.linux_loaded = false;
 
        /* Release kernel context */
        if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
index 2e4d04e..8d2568c 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -240,11 +240,15 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
        /* set fence to a non valid value */
        pkt->fence = cpu_to_le32(UINT_MAX);
 
-       rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
-       if (rc) {
-               dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
-               goto out;
-       }
+       /*
+        * The CPU queue is a synchronous queue with an effective depth of
+        * a single entry (although it is allocated with room for multiple
+        * entries). We lock on it using 'send_cpu_message_lock' which
+        * serializes accesses to the CPU queue. This means we don't need to
+        * lock the entire H/W queues module when submitting a JOB to the CPU
+        * queue.
+        */
+       hl_hw_queue_submit_bd(hdev, queue, 0, len, pkt_dma_addr);
 
        if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
                expected_ack_val = queue->pi;
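
From the caller's side, the serialization described above boils down to the following shape (a sketch; hl_fw_send_cpu_message's allocation, fence polling and error handling are omitted):

	mutex_lock(&hdev->send_cpu_message_lock);

	/* only one message is ever in flight, so the single-entry CPU queue
	 * can be written directly, without the H/W queues module lock
	 */
	hl_hw_queue_submit_bd(hdev, queue, 0, len, pkt_dma_addr);

	/* ... poll pkt->fence until the F/W writes the expected ack ... */

	mutex_unlock(&hdev->send_cpu_message_lock);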
@@ -663,17 +667,15 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev,
        hdev->event_queue.check_eqe_index = false;
 
        /* Read FW application security bits again */
-       if (hdev->asic_prop.fw_cpu_boot_dev_sts0_valid) {
-               hdev->asic_prop.fw_app_cpu_boot_dev_sts0 =
-                                               RREG32(sts_boot_dev_sts0_reg);
-               if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
+       if (prop->fw_cpu_boot_dev_sts0_valid) {
+               prop->fw_app_cpu_boot_dev_sts0 = RREG32(sts_boot_dev_sts0_reg);
+               if (prop->fw_app_cpu_boot_dev_sts0 &
                                CPU_BOOT_DEV_STS0_EQ_INDEX_EN)
                        hdev->event_queue.check_eqe_index = true;
        }
 
-       if (hdev->asic_prop.fw_cpu_boot_dev_sts1_valid)
-               hdev->asic_prop.fw_app_cpu_boot_dev_sts1 =
-                                               RREG32(sts_boot_dev_sts1_reg);
+       if (prop->fw_cpu_boot_dev_sts1_valid)
+               prop->fw_app_cpu_boot_dev_sts1 = RREG32(sts_boot_dev_sts1_reg);
 
 out:
        hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
@@ -1008,6 +1010,11 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev)
        } else {
                WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE);
                msleep(static_loader->cpu_reset_wait_msec);
+
+               /* Must clear this register in order to prevent preboot
+                * from reading WFE after reboot
+                */
+               WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_NA);
        }
 
        hdev->device_cpu_is_halted = true;
@@ -1055,6 +1062,10 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
                dev_err(hdev->dev,
                        "Device boot progress - Thermal Sensor initialization failed\n");
                break;
+       case CPU_BOOT_STATUS_SECURITY_READY:
+               dev_err(hdev->dev,
+                       "Device boot progress - Stuck in preboot after security initialization\n");
+               break;
        default:
                dev_err(hdev->dev,
                        "Device boot progress - Invalid status code %d\n",
@@ -1238,11 +1249,6 @@ static void hl_fw_preboot_update_state(struct hl_device *hdev)
         *               b. Check whether hard reset is done by boot cpu
         * 3. FW application - a. Fetch fw application security status
         *                     b. Check whether hard reset is done by fw app
-        *
-        * Preboot:
-        * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED). If set, then-
-        * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN)
-        * If set, then mark GIC controller to be disabled.
         */
        prop->hard_reset_done_by_fw =
                !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
@@ -1953,8 +1959,8 @@ static void hl_fw_dynamic_update_linux_interrupt_if(struct hl_device *hdev)
        if (!hdev->asic_prop.gic_interrupts_enable &&
                        !(hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
                                CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN)) {
-               dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_irq_ctrl;
-               dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_irq_ctrl;
+               dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_pi_upd_irq;
+               dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_pi_upd_irq;
 
                dev_warn(hdev->dev,
                        "Using a single interrupt interface towards cpucp");
@@ -2122,8 +2128,7 @@ static void hl_fw_linux_update_state(struct hl_device *hdev,
 
        /* Read FW application security bits */
        if (prop->fw_cpu_boot_dev_sts0_valid) {
-               prop->fw_app_cpu_boot_dev_sts0 =
-                               RREG32(cpu_boot_dev_sts0_reg);
+               prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg);
 
                if (prop->fw_app_cpu_boot_dev_sts0 &
                                CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
@@ -2143,8 +2148,7 @@ static void hl_fw_linux_update_state(struct hl_device *hdev,
        }
 
        if (prop->fw_cpu_boot_dev_sts1_valid) {
-               prop->fw_app_cpu_boot_dev_sts1 =
-                               RREG32(cpu_boot_dev_sts1_reg);
+               prop->fw_app_cpu_boot_dev_sts1 = RREG32(cpu_boot_dev_sts1_reg);
 
                dev_dbg(hdev->dev,
                        "Firmware application CPU status1 %#x\n",
@@ -2235,6 +2239,10 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
        dev_info(hdev->dev,
                "Loading firmware to device, may take some time...\n");
 
+       /*
+        * At this stage, "cpu_dyn_regs" contains only LKD's hard-coded values!
+        * It will be updated from FW after hl_fw_dynamic_request_descriptor().
+        */
        dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;
 
        rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_RST_STATE,
index 6b3cdd7..bebebcb 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/scatterlist.h>
 #include <linux/hashtable.h>
 #include <linux/debugfs.h>
+#include <linux/rwsem.h>
 #include <linux/bitfield.h>
 #include <linux/genalloc.h>
 #include <linux/sched/signal.h>
 
 #define HL_COMMON_USER_INTERRUPT_ID    0xFFF
 
+#define HL_STATE_DUMP_HIST_LEN         5
+
+#define OBJ_NAMES_HASH_TABLE_BITS      7 /* 1 << 7 buckets */
+#define SYNC_TO_ENGINE_HASH_TABLE_BITS 7 /* 1 << 7 buckets */
+
 /* Memory */
 #define MEM_HASH_TABLE_BITS            7 /* 1 << 7 buckets */
 
@@ -122,12 +128,17 @@ enum hl_mmu_page_table_location {
  *
  * - HL_RESET_DEVICE_RELEASE
  *       Set if reset is due to device release
+ *
+ * - HL_RESET_FW
+ *       F/W will perform the reset. No need to ask it to reset the device. This is relevant
+ *       only when running with secured f/w
  */
 #define HL_RESET_HARD                  (1 << 0)
 #define HL_RESET_FROM_RESET_THREAD     (1 << 1)
 #define HL_RESET_HEARTBEAT             (1 << 2)
 #define HL_RESET_TDR                   (1 << 3)
 #define HL_RESET_DEVICE_RELEASE                (1 << 4)
+#define HL_RESET_FW                    (1 << 5)
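
As an illustration of how these bits are meant to be combined by callers (hypothetical calls, mirroring the reset paths touched by this patch rather than quoting them):

	/* hard reset requested by a F/W event; the F/W itself drives the reset,
	 * so no PCI-disable message is sent to it
	 */
	hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);

	/* hard reset after a missed heartbeat; the device CPU may be unresponsive,
	 * so the PCI-disable message is skipped as well
	 */
	hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_HEARTBEAT);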
 
 #define HL_MAX_SOBS_PER_MONITOR        8
 
@@ -236,7 +247,9 @@ enum hl_cs_type {
        CS_TYPE_DEFAULT,
        CS_TYPE_SIGNAL,
        CS_TYPE_WAIT,
-       CS_TYPE_COLLECTIVE_WAIT
+       CS_TYPE_COLLECTIVE_WAIT,
+       CS_RESERVE_SIGNALS,
+       CS_UNRESERVE_SIGNALS
 };
 
 /*
@@ -281,13 +294,17 @@ enum queue_cb_alloc_flags {
  * @hdev: habanalabs device structure.
  * @kref: refcount of this SOB. The SOB will reset once the refcount is zero.
  * @sob_id: id of this SOB.
+ * @sob_addr: the sob offset from the base address.
  * @q_idx: the H/W queue that uses this SOB.
+ * @need_reset: reset indication set when switching to the other sob.
  */
 struct hl_hw_sob {
        struct hl_device        *hdev;
        struct kref             kref;
        u32                     sob_id;
+       u32                     sob_addr;
        u32                     q_idx;
+       bool                    need_reset;
 };
 
 enum hl_collective_mode {
@@ -317,11 +334,11 @@ struct hw_queue_properties {
 };
 
 /**
- * enum vm_type_t - virtual memory mapping request information.
+ * enum vm_type - virtual memory mapping request information.
  * @VM_TYPE_USERPTR: mapping of user memory to device virtual address.
  * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address.
  */
-enum vm_type_t {
+enum vm_type {
        VM_TYPE_USERPTR = 0x1,
        VM_TYPE_PHYS_PACK = 0x2
 };
@@ -381,6 +398,16 @@ struct hl_mmu_properties {
        u8      host_resident;
 };
 
+/**
+ * struct hl_hints_range - hint addresses reserved va range.
+ * @start_addr: start address of the va range.
+ * @end_addr: end address of the va range.
+ */
+struct hl_hints_range {
+       u64 start_addr;
+       u64 end_addr;
+};
+
 /**
  * struct asic_fixed_properties - ASIC specific immutable properties.
  * @hw_queues_props: H/W queues properties.
@@ -392,6 +419,10 @@ struct hl_mmu_properties {
  * @pmmu: PCI (host) MMU address translation properties.
  * @pmmu_huge: PCI (host) MMU address translation properties for memory
  *              allocated with huge pages.
+ * @hints_dram_reserved_va_range: dram hint addresses reserved range.
+ * @hints_host_reserved_va_range: host hint addresses reserved range.
+ * @hints_host_hpage_reserved_va_range: host huge page hint addresses reserved
+ *                                      range.
  * @sram_base_address: SRAM physical start address.
  * @sram_end_address: SRAM physical end address.
  * @sram_user_base_address - SRAM physical start address for user access.
@@ -412,6 +443,10 @@ struct hl_mmu_properties {
  *                    to the device's MMU.
  * @cb_va_end_addr: virtual end address of command buffers which are mapped to
  *                  the device's MMU.
+ * @dram_hints_align_mask: dram va hint addresses alignment mask which is used
+ *                  for hints validity check.
  * @device_dma_offset_for_host_access: the offset to add to host DMA addresses
+ *                                    to enable the device to access them.
  * @mmu_pgt_size: MMU page tables total size.
  * @mmu_pte_size: PTE size in MMU page tables.
  * @mmu_hop_table_size: MMU hop table size.
@@ -459,6 +494,8 @@ struct hl_mmu_properties {
  *                                       reserved for the user
  * @first_available_cq: first available CQ for the user.
  * @user_interrupt_count: number of user interrupts.
+ * @server_type: Server type that the ASIC is currently installed in.
+ *               The value is according to enum hl_server_type in uapi file.
  * @tpc_enabled_mask: which TPCs are enabled.
  * @completion_queues_count: number of completion queues.
  * @fw_security_enabled: true if security measures are enabled in firmware,
@@ -470,6 +507,7 @@ struct hl_mmu_properties {
  * @dram_supports_virtual_memory: is there an MMU towards the DRAM
  * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
  * @num_functional_hbms: number of functional HBMs in each DCORE.
  * @hints_range_reservation: device supports hint addresses range reservation.
  * @iatu_done_by_fw: true if iATU configuration is being done by FW.
  * @dynamic_fw_load: is dynamic FW load is supported.
  * @gic_interrupts_enable: true if FW is not blocking GIC controller,
@@ -483,6 +521,9 @@ struct asic_fixed_properties {
        struct hl_mmu_properties        dmmu;
        struct hl_mmu_properties        pmmu;
        struct hl_mmu_properties        pmmu_huge;
+       struct hl_hints_range           hints_dram_reserved_va_range;
+       struct hl_hints_range           hints_host_reserved_va_range;
+       struct hl_hints_range           hints_host_hpage_reserved_va_range;
        u64                             sram_base_address;
        u64                             sram_end_address;
        u64                             sram_user_base_address;
@@ -500,6 +541,8 @@ struct asic_fixed_properties {
        u64                             mmu_dram_default_page_addr;
        u64                             cb_va_start_addr;
        u64                             cb_va_end_addr;
+       u64                             dram_hints_align_mask;
+       u64                             device_dma_offset_for_host_access;
        u32                             mmu_pgt_size;
        u32                             mmu_pte_size;
        u32                             mmu_hop_table_size;
@@ -534,6 +577,7 @@ struct asic_fixed_properties {
        u16                             first_available_user_msix_interrupt;
        u16                             first_available_cq[HL_MAX_DCORES];
        u16                             user_interrupt_count;
+       u16                             server_type;
        u8                              tpc_enabled_mask;
        u8                              completion_queues_count;
        u8                              fw_security_enabled;
@@ -542,6 +586,7 @@ struct asic_fixed_properties {
        u8                              dram_supports_virtual_memory;
        u8                              hard_reset_done_by_fw;
        u8                              num_functional_hbms;
+       u8                              hints_range_reservation;
        u8                              iatu_done_by_fw;
        u8                              dynamic_fw_load;
        u8                              gic_interrupts_enable;
@@ -552,40 +597,45 @@ struct asic_fixed_properties {
  * @completion: fence is implemented using completion
  * @refcount: refcount for this fence
  * @cs_sequence: sequence of the corresponding command submission
+ * @stream_master_qid_map: bitmap of all stream master QIDs that the multi-CS
+ *                         is waiting on
  * @error: mark this fence with error
  * @timestamp: timestamp upon completion
- *
  */
 struct hl_fence {
        struct completion       completion;
        struct kref             refcount;
        u64                     cs_sequence;
+       u32                     stream_master_qid_map;
        int                     error;
        ktime_t                 timestamp;
 };
 
 /**
  * struct hl_cs_compl - command submission completion object.
- * @sob_reset_work: workqueue object to run SOB reset flow.
  * @base_fence: hl fence object.
  * @lock: spinlock to protect fence.
  * @hdev: habanalabs device structure.
  * @hw_sob: the H/W SOB used in this signal/wait CS.
+ * @encaps_sig_hdl: encaps signals handler.
  * @cs_seq: command submission sequence number.
  * @type: type of the CS - signal/wait.
  * @sob_val: the SOB value that is used in this signal/wait CS.
  * @sob_group: the SOB group that is used in this collective wait CS.
+ * @encaps_signals: indication whether this is a completion object of a CS
+ *                  with encapsulated signals.
  */
 struct hl_cs_compl {
-       struct work_struct      sob_reset_work;
        struct hl_fence         base_fence;
        spinlock_t              lock;
        struct hl_device        *hdev;
        struct hl_hw_sob        *hw_sob;
+       struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
        u64                     cs_seq;
        enum hl_cs_type         type;
        u16                     sob_val;
        u16                     sob_group;
+       bool                    encaps_signals;
 };
 
 /*
@@ -697,6 +747,17 @@ struct hl_sync_stream_properties {
        u8              curr_sob_offset;
 };
 
+/**
+ * struct hl_encaps_signals_mgr - describes sync stream encapsulated signals
+ * handlers manager
+ * @lock: protects handles.
+ * @handles: an idr to hold all encapsulated signals handles.
+ */
+struct hl_encaps_signals_mgr {
+       spinlock_t              lock;
+       struct idr              handles;
+};
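
A minimal sketch of how such a manager would typically be set up and torn down with the standard idr helpers (illustrative; the actual init/fini call sites are not part of this hunk, and the helper names are made up):

	static void encaps_sig_mgr_init_sketch(struct hl_encaps_signals_mgr *mgr)
	{
		spin_lock_init(&mgr->lock);
		idr_init(&mgr->handles);
	}

	static void encaps_sig_mgr_fini_sketch(struct hl_encaps_signals_mgr *mgr)
	{
		/* every reserved handle is expected to have been released by now */
		idr_destroy(&mgr->handles);
	}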
+
 /**
  * struct hl_hw_queue - describes a H/W transport queue.
  * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
@@ -875,7 +936,7 @@ struct pci_mem_region {
        u64 region_base;
        u64 region_size;
        u64 bar_size;
-       u32 offset_in_bar;
+       u64 offset_in_bar;
        u8 bar_id;
        u8 used;
 };
@@ -996,7 +1057,7 @@ struct fw_load_mgr {
  *                hw_fini and before CS rollback.
  * @suspend: handles IP specific H/W or SW changes for suspend.
  * @resume: handles IP specific H/W or SW changes for resume.
- * @cb_mmap: maps a CB.
+ * @mmap: maps a memory region.
  * @ring_doorbell: increment PI on a given QMAN.
  * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific
  *             function because the PQs are located in different memory areas
@@ -1101,6 +1162,10 @@ struct fw_load_mgr {
  *                         generic f/w compatible PLL Indexes
  * @init_firmware_loader: initialize data for FW loader.
  * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
+ * @state_dump_init: initialize constants required for state dump
+ * @get_sob_addr: get SOB base address offset.
+ * @set_pci_memory_regions: sets properties of PCI memory regions
+ * @get_stream_master_qid_arr: get pointer to stream masters QID array
  */
 struct hl_asic_funcs {
        int (*early_init)(struct hl_device *hdev);
@@ -1110,11 +1175,11 @@ struct hl_asic_funcs {
        int (*sw_init)(struct hl_device *hdev);
        int (*sw_fini)(struct hl_device *hdev);
        int (*hw_init)(struct hl_device *hdev);
-       void (*hw_fini)(struct hl_device *hdev, bool hard_reset);
-       void (*halt_engines)(struct hl_device *hdev, bool hard_reset);
+       void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
+       void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
        int (*suspend)(struct hl_device *hdev);
        int (*resume)(struct hl_device *hdev);
-       int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
+       int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
                        void *cpu_addr, dma_addr_t dma_addr, size_t size);
        void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
        void (*pqe_write)(struct hl_device *hdev, __le64 *pqe,
@@ -1210,10 +1275,11 @@ struct hl_asic_funcs {
        void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group);
        void (*set_dma_mask_from_fw)(struct hl_device *hdev);
        u64 (*get_device_time)(struct hl_device *hdev);
-       void (*collective_wait_init_cs)(struct hl_cs *cs);
+       int (*collective_wait_init_cs)(struct hl_cs *cs);
        int (*collective_wait_create_jobs)(struct hl_device *hdev,
-                       struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-                       u32 collective_engine_id);
+                       struct hl_ctx *ctx, struct hl_cs *cs,
+                       u32 wait_queue_id, u32 collective_engine_id,
+                       u32 encaps_signal_offset);
        u64 (*scramble_addr)(struct hl_device *hdev, u64 addr);
        u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
        void (*ack_protection_bits_errors)(struct hl_device *hdev);
@@ -1226,6 +1292,10 @@ struct hl_asic_funcs {
        int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
        void (*init_firmware_loader)(struct hl_device *hdev);
        void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
+       void (*state_dump_init)(struct hl_device *hdev);
+       u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
+       void (*set_pci_memory_regions)(struct hl_device *hdev);
+       u32* (*get_stream_master_qid_arr)(void);
 };
 
 
@@ -1282,20 +1352,6 @@ struct hl_cs_counters_atomic {
        atomic64_t validation_drop_cnt;
 };
 
-/**
- * struct hl_pending_cb - pending command buffer structure
- * @cb_node: cb node in pending cb list
- * @cb: command buffer to send in next submission
- * @cb_size: command buffer size
- * @hw_queue_id: destination queue id
- */
-struct hl_pending_cb {
-       struct list_head        cb_node;
-       struct hl_cb            *cb;
-       u32                     cb_size;
-       u32                     hw_queue_id;
-};
-
 /**
  * struct hl_ctx - user/kernel context.
  * @mem_hash: holds mapping from virtual address to virtual memory area
@@ -1312,28 +1368,21 @@ struct hl_pending_cb {
  *            MMU hash or walking the PGT requires talking this lock.
  * @hw_block_list_lock: protects the HW block memory list.
  * @debugfs_list: node in debugfs list of contexts.
- * pending_cb_list: list of pending command buffers waiting to be sent upon
- *                  next user command submission context.
  * @hw_block_mem_list: list of HW block virtual mapped addresses.
  * @cs_counters: context command submission counters.
  * @cb_va_pool: device VA pool for command buffers which are mapped to the
  *              device's MMU.
+ * @sig_mgr: encaps signals handle manager.
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
  *                     to user so user could inquire about CS. It is used as
  *                     index to cs_pending array.
  * @dram_default_hops: array that holds all hops addresses needed for default
  *                     DRAM mapping.
- * @pending_cb_lock: spinlock to protect pending cb list
  * @cs_lock: spinlock to protect cs_sequence.
  * @dram_phys_mem: amount of used physical DRAM memory by this context.
  * @thread_ctx_switch_token: token to prevent multiple threads of the same
  *                             context from running the context switch phase.
  *                             Only a single thread should run it.
- * @thread_pending_cb_token: token to prevent multiple threads from processing
- *                             the pending CB list. Only a single thread should
- *                             process the list since it is protected by a
- *                             spinlock and we don't want to halt the entire
- *                             command submission sequence.
  * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
  *                             the context switch phase from moving to their
  *                             execution phase before the context switch phase
@@ -1353,17 +1402,15 @@ struct hl_ctx {
        struct mutex                    mmu_lock;
        struct mutex                    hw_block_list_lock;
        struct list_head                debugfs_list;
-       struct list_head                pending_cb_list;
        struct list_head                hw_block_mem_list;
        struct hl_cs_counters_atomic    cs_counters;
        struct gen_pool                 *cb_va_pool;
+       struct hl_encaps_signals_mgr    sig_mgr;
        u64                             cs_sequence;
        u64                             *dram_default_hops;
-       spinlock_t                      pending_cb_lock;
        spinlock_t                      cs_lock;
        atomic64_t                      dram_phys_mem;
        atomic_t                        thread_ctx_switch_token;
-       atomic_t                        thread_pending_cb_token;
        u32                             thread_ctx_switch_wait_token;
        u32                             asid;
        u32                             handle;
@@ -1394,20 +1441,22 @@ struct hl_ctx_mgr {
  * @sgt: pointer to the scatter-gather table that holds the pages.
  * @dir: for DMA unmapping, the direction must be supplied, so save it.
  * @debugfs_list: node in debugfs list of command submissions.
+ * @pid: the pid of the user process owning the memory
  * @addr: user-space virtual address of the start of the memory area.
  * @size: size of the memory area to pin & map.
  * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
  */
 struct hl_userptr {
-       enum vm_type_t          vm_type; /* must be first */
+       enum vm_type            vm_type; /* must be first */
        struct list_head        job_node;
        struct page             **pages;
        unsigned int            npages;
        struct sg_table         *sgt;
        enum dma_data_direction dir;
        struct list_head        debugfs_list;
+       pid_t                   pid;
        u64                     addr;
-       u32                     size;
+       u64                     size;
        u8                      dma_mapped;
 };
 
@@ -1426,12 +1475,14 @@ struct hl_userptr {
  * @mirror_node : node in device mirror list of command submissions.
  * @staged_cs_node: node in the staged cs list.
  * @debugfs_list: node in debugfs list of command submissions.
+ * @encaps_sig_hdl: holds the encaps signals handle.
  * @sequence: the sequence number of this CS.
  * @staged_sequence: the sequence of the staged submission this CS is part of,
  *                   relevant only if staged_cs is set.
  * @timeout_jiffies: cs timeout in jiffies.
  * @submission_time_jiffies: submission time of the cs
  * @type: CS_TYPE_*.
+ * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
  * @timedout : true if CS was timedout.
@@ -1445,6 +1496,7 @@ struct hl_userptr {
  * @staged_cs: true if this CS is part of a staged submission.
  * @skip_reset_on_timeout: true if we shall not reset the device in case
  *                         timeout occurs (debug scenario).
+ * @encaps_signals: true if this CS has encaps reserved signals.
  */
 struct hl_cs {
        u16                     *jobs_in_queue_cnt;
@@ -1459,11 +1511,13 @@ struct hl_cs {
        struct list_head        mirror_node;
        struct list_head        staged_cs_node;
        struct list_head        debugfs_list;
+       struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
        u64                     sequence;
        u64                     staged_sequence;
        u64                     timeout_jiffies;
        u64                     submission_time_jiffies;
        enum hl_cs_type         type;
+       u32                     encaps_sig_hdl_id;
        u8                      submitted;
        u8                      completed;
        u8                      timedout;
@@ -1474,6 +1528,7 @@ struct hl_cs {
        u8                      staged_first;
        u8                      staged_cs;
        u8                      skip_reset_on_timeout;
+       u8                      encaps_signals;
 };
 
 /**
@@ -1493,6 +1548,8 @@ struct hl_cs {
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @job_cb_size: the actual size of the CB that we put on the queue.
+ * @encaps_sig_wait_offset: encapsulated signals offset, which allows the user
+ *                          to wait on part of the reserved signals.
  * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
  *                          handle to a kernel-allocated CB object, false
  *                          otherwise (SRAM/DRAM/host address).
@@ -1517,6 +1574,7 @@ struct hl_cs_job {
        u32                     hw_queue_id;
        u32                     user_cb_size;
        u32                     job_cb_size;
+       u32                     encaps_sig_wait_offset;
        u8                      is_kernel_allocated_cb;
        u8                      contains_dma_pkt;
 };
@@ -1613,7 +1671,7 @@ struct hl_vm_hw_block_list_node {
  * @created_from_userptr: is product of host virtual address.
  */
 struct hl_vm_phys_pg_pack {
-       enum vm_type_t          vm_type; /* must be first */
+       enum vm_type            vm_type; /* must be first */
        u64                     *pages;
        u64                     npages;
        u64                     total_size;
@@ -1759,9 +1817,13 @@ struct hl_debugfs_entry {
  * @ctx_mem_hash_list: list of available contexts with MMU mappings.
  * @ctx_mem_hash_spinlock: protects cb_list.
  * @blob_desc: descriptor of blob
+ * @state_dump: data of the system states in case of a bad cs.
+ * @state_dump_sem: protects state_dump.
  * @addr: next address to read/write from/to in read/write32.
  * @mmu_addr: next virtual address to translate to physical address in mmu_show.
  * @userptr_lookup: the target user ptr to look up on demand.
  * @mmu_asid: ASID to use while translating in mmu_show.
+ * @state_dump_head: index of the latest state dump
  * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
  * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read.
  * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read.
@@ -1783,14 +1845,149 @@ struct hl_dbg_device_entry {
        struct list_head                ctx_mem_hash_list;
        spinlock_t                      ctx_mem_hash_spinlock;
        struct debugfs_blob_wrapper     blob_desc;
+       char                            *state_dump[HL_STATE_DUMP_HIST_LEN];
+       struct rw_semaphore             state_dump_sem;
        u64                             addr;
        u64                             mmu_addr;
+       u64                             userptr_lookup;
        u32                             mmu_asid;
+       u32                             state_dump_head;
        u8                              i2c_bus;
        u8                              i2c_addr;
        u8                              i2c_reg;
 };
 
+/**
+ * struct hl_hw_obj_name_entry - single hw object name, member of
+ * hl_state_dump_specs
+ * @node: link to the containing hash table
+ * @name: hw object name
+ * @id: object identifier
+ */
+struct hl_hw_obj_name_entry {
+       struct hlist_node       node;
+       const char              *name;
+       u32                     id;
+};
+
+enum hl_state_dump_specs_props {
+       SP_SYNC_OBJ_BASE_ADDR,
+       SP_NEXT_SYNC_OBJ_ADDR,
+       SP_SYNC_OBJ_AMOUNT,
+       SP_MON_OBJ_WR_ADDR_LOW,
+       SP_MON_OBJ_WR_ADDR_HIGH,
+       SP_MON_OBJ_WR_DATA,
+       SP_MON_OBJ_ARM_DATA,
+       SP_MON_OBJ_STATUS,
+       SP_MONITORS_AMOUNT,
+       SP_TPC0_CMDQ,
+       SP_TPC0_CFG_SO,
+       SP_NEXT_TPC,
+       SP_MME_CMDQ,
+       SP_MME_CFG_SO,
+       SP_NEXT_MME,
+       SP_DMA_CMDQ,
+       SP_DMA_CFG_SO,
+       SP_DMA_QUEUES_OFFSET,
+       SP_NUM_OF_MME_ENGINES,
+       SP_SUB_MME_ENG_NUM,
+       SP_NUM_OF_DMA_ENGINES,
+       SP_NUM_OF_TPC_ENGINES,
+       SP_ENGINE_NUM_OF_QUEUES,
+       SP_ENGINE_NUM_OF_STREAMS,
+       SP_ENGINE_NUM_OF_FENCES,
+       SP_FENCE0_CNT_OFFSET,
+       SP_FENCE0_RDATA_OFFSET,
+       SP_CP_STS_OFFSET,
+       SP_NUM_CORES,
+
+       SP_MAX
+};
+
+enum hl_sync_engine_type {
+       ENGINE_TPC,
+       ENGINE_DMA,
+       ENGINE_MME,
+};
+
+/**
+ * struct hl_mon_state_dump - represents a state dump of a single monitor
+ * @id: monitor id
+ * @wr_addr_low: address monitor will write to, low bits
+ * @wr_addr_high: address monitor will write to, high bits
+ * @wr_data: data monitor will write
+ * @arm_data: register value containing monitor configuration
+ * @status: monitor status
+ */
+struct hl_mon_state_dump {
+       u32             id;
+       u32             wr_addr_low;
+       u32             wr_addr_high;
+       u32             wr_data;
+       u32             arm_data;
+       u32             status;
+};
+
+/**
+ * struct hl_sync_to_engine_map_entry - sync object id to engine mapping entry
+ * @engine_type: type of the engine
+ * @engine_id: id of the engine
+ * @sync_id: id of the sync object
+ */
+struct hl_sync_to_engine_map_entry {
+       struct hlist_node               node;
+       enum hl_sync_engine_type        engine_type;
+       u32                             engine_id;
+       u32                             sync_id;
+};
+
+/**
+ * struct hl_sync_to_engine_map - maps sync object id to associated engine id
+ * @tb: hash table containing the mapping, each element is of type
+ *      struct hl_sync_to_engine_map_entry
+ */
+struct hl_sync_to_engine_map {
+       DECLARE_HASHTABLE(tb, SYNC_TO_ENGINE_HASH_TABLE_BITS);
+};
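
A sketch of how an ASIC-specific gen_sync_to_engine_map implementation could populate this table with the generic hashtable helpers (error handling trimmed; the function name is illustrative):

	static int map_add_entry_sketch(struct hl_sync_to_engine_map *map, u32 sync_id,
				u32 engine_id, enum hl_sync_engine_type engine_type)
	{
		struct hl_sync_to_engine_map_entry *entry;

		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			return -ENOMEM;

		entry->sync_id = sync_id;
		entry->engine_id = engine_id;
		entry->engine_type = engine_type;

		/* keyed by the sync object id */
		hash_add(map->tb, &entry->node, sync_id);

		return 0;
	}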
+
+/**
+ * struct hl_state_dump_specs_funcs - virtual functions used by the state dump
+ * @gen_sync_to_engine_map: generate a hash map from sync obj id to its engine
+ * @print_single_monitor: format monitor data as string
+ * @monitor_valid: return true if given monitor dump is valid
+ * @print_fences_single_engine: format fences data as string
+ */
+struct hl_state_dump_specs_funcs {
+       int (*gen_sync_to_engine_map)(struct hl_device *hdev,
+                               struct hl_sync_to_engine_map *map);
+       int (*print_single_monitor)(char **buf, size_t *size, size_t *offset,
+                                   struct hl_device *hdev,
+                                   struct hl_mon_state_dump *mon);
+       int (*monitor_valid)(struct hl_mon_state_dump *mon);
+       int (*print_fences_single_engine)(struct hl_device *hdev,
+                                       u64 base_offset,
+                                       u64 status_base_offset,
+                                       enum hl_sync_engine_type engine_type,
+                                       u32 engine_id, char **buf,
+                                       size_t *size, size_t *offset);
+};
+
+/**
+ * struct hl_state_dump_specs - defines ASIC known hw objects names
+ * @so_id_to_str_tb: sync objects names index table
+ * @monitor_id_to_str_tb: monitors names index table
+ * @funcs: virtual functions used for state dump
+ * @sync_namager_names: readable names for sync manager if available (ex: N_E)
+ * @props: pointer to a per asic const props array required for state dump
+ */
+struct hl_state_dump_specs {
+       DECLARE_HASHTABLE(so_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS);
+       DECLARE_HASHTABLE(monitor_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS);
+       struct hl_state_dump_specs_funcs        funcs;
+       const char * const                      *sync_namager_names;
+       s64                                     *props;
+};
+
 
 /*
  * DEVICES
@@ -1798,7 +1995,7 @@ struct hl_dbg_device_entry {
 
 #define HL_STR_MAX     32
 
-#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1)
+#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1)
 
 /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
  * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
@@ -1946,11 +2143,13 @@ struct hwmon_chip_info;
  * @wq: work queue for device reset procedure.
  * @reset_work: reset work to be done.
  * @hdev: habanalabs device structure.
+ * @fw_reset: whether f/w will do the reset without us sending them a message to do it.
  */
 struct hl_device_reset_work {
        struct workqueue_struct         *wq;
        struct delayed_work             reset_work;
        struct hl_device                *hdev;
+       bool                            fw_reset;
 };
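
Connecting this to the device_hard_reset_pending() hunk earlier in the patch: the reset path records fw_reset in this structure before scheduling the work, roughly as follows (condensed; the surrounding logic is elided):

	hdev->device_reset_work.fw_reset = fw_reset;

	/* the heartbeat/TDR context cannot reset the device itself,
	 * so the actual reset runs from the dedicated work queue
	 */
	queue_delayed_work(hdev->device_reset_work.wq,
			&hdev->device_reset_work.reset_work, 0);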
 
 /**
@@ -2064,6 +2263,58 @@ struct hl_mmu_funcs {
                        u64 virt_addr, struct hl_mmu_hop_info *hops);
 };
 
+/**
+ * number of user contexts allowed to call wait_for_multi_cs ioctl in
+ * parallel
+ */
+#define MULTI_CS_MAX_USER_CTX  2
+
+/**
+ * struct multi_cs_completion - multi CS wait completion.
+ * @completion: completion of any of the CS in the list
+ * @lock: spinlock for the completion structure
+ * @timestamp: timestamp for the multi-CS completion
+ * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS
+ *                        is waiting
+ * @used: 1 if in use, otherwise 0
+ */
+struct multi_cs_completion {
+       struct completion       completion;
+       spinlock_t              lock;
+       s64                     timestamp;
+       u32                     stream_master_qid_map;
+       u8                      used;
+};
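
A plausible shape for hl_multi_cs_completion_init(), which is declared later in this header; a sketch of what preparing these slots would look like, not necessarily the patch's exact body:

	void hl_multi_cs_completion_init(struct hl_device *hdev)
	{
		struct multi_cs_completion *mcs_compl;
		int i;

		for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
			mcs_compl = &hdev->multi_cs_completion[i];
			mcs_compl->used = 0;
			spin_lock_init(&mcs_compl->lock);
			init_completion(&mcs_compl->completion);
		}
	}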
+
+/**
+ * struct multi_cs_data - internal data for multi CS call
+ * @ctx: pointer to the context structure
+ * @fence_arr: array of fences of all CSs
+ * @seq_arr: array of CS sequence numbers
+ * @timeout_us: timeout in usec for waiting for CS to complete
+ * @timestamp: timestamp of first completed CS
+ * @wait_status: wait for CS status
+ * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
+ * @stream_master_qid_map: bitmap of all stream master QIDs on which the
+ *                         multi-CS is waiting
+ * @arr_len: fence_arr and seq_arr array length
+ * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
+ * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
+ */
+struct multi_cs_data {
+       struct hl_ctx   *ctx;
+       struct hl_fence **fence_arr;
+       u64             *seq_arr;
+       s64             timeout_us;
+       s64             timestamp;
+       long            wait_status;
+       u32             completion_bitmap;
+       u32             stream_master_qid_map;
+       u8              arr_len;
+       u8              gone_cs;
+       u8              update_ts;
+};
+
 /**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -2129,6 +2380,8 @@ struct hl_mmu_funcs {
  * @mmu_func: device-related MMU functions.
  * @fw_loader: FW loader manager.
  * @pci_mem_region: array of memory regions in the PCI
+ * @state_dump_specs: constants and dictionaries needed to dump system state.
+ * @multi_cs_completion: array of multi-CS completion.
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2205,6 +2458,7 @@ struct hl_mmu_funcs {
  *                        halted. We can't halt it again because the COMMS
  *                        protocol will throw an error. Relevant only for
  *                        cases where Linux was not loaded to device CPU
+ * @supports_wait_for_multi_cs: true if wait for multi CS is supported
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -2273,6 +2527,11 @@ struct hl_device {
 
        struct pci_mem_region           pci_mem_region[PCI_REGION_NUMBER];
 
+       struct hl_state_dump_specs      state_dump_specs;
+
+       struct multi_cs_completion      multi_cs_completion[
+                                                       MULTI_CS_MAX_USER_CTX];
+       u32                             *stream_master_qid_arr;
        atomic64_t                      dram_used_mem;
        u64                             timeout_jiffies;
        u64                             max_power;
@@ -2322,6 +2581,8 @@ struct hl_device {
        u8                              curr_reset_cause;
        u8                              skip_reset_on_timeout;
        u8                              device_cpu_is_halted;
+       u8                              supports_wait_for_multi_cs;
+       u8                              stream_master_qid_arr_size;
 
        /* Parameters for bring-up */
        u64                             nic_ports_mask;
@@ -2343,6 +2604,29 @@ struct hl_device {
 };
 
 
+/**
+ * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure
+ * @refcount: refcount used to protect removing this id when several
+ *            wait CSs are used to wait on the reserved encaps signals.
+ * @hdev: pointer to habanalabs device structure.
+ * @hw_sob: pointer to H/W SOB used in the reservation.
+ * @cs_seq: staged cs sequence which contains encapsulated signals
+ * @id: idr handler id to be used to fetch the handler info
+ * @q_idx: stream queue index
+ * @pre_sob_val: current SOB value before reservation
+ * @count: signals number
+ */
+struct hl_cs_encaps_sig_handle {
+       struct kref refcount;
+       struct hl_device *hdev;
+       struct hl_hw_sob *hw_sob;
+       u64  cs_seq;
+       u32  id;
+       u32  q_idx;
+       u32  pre_sob_val;
+       u32  count;
+};
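
The refcount is intended to be driven through the usual kref helpers, with hl_encaps_handle_do_release() (declared further below) as the release callback; a small illustrative usage, not code taken from this patch:

	/* each wait CS that consumes part of the reservation takes a reference */
	kref_get(&handle->refcount);

	/* ... */

	/* dropping the last reference frees the handle and its idr id */
	kref_put(&handle->refcount, hl_encaps_handle_do_release);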
+
 /*
  * IOCTLs
  */
@@ -2372,6 +2656,23 @@ struct hl_ioctl_desc {
  * Kernel module functions that can be accessed by entire module
  */
 
+/**
+ * hl_get_sg_info() - get number of pages and the DMA address from SG list.
+ * @sg: the SG list.
+ * @dma_addr: pointer to DMA address to return.
+ *
+ * Calculate the number of consecutive pages described by the SG list. Take the
+ * offset of the address within the first page, add the length to it, and round
+ * up to the number of needed pages.
+ */
+static inline u32 hl_get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
+{
+       *dma_addr = sg_dma_address(sg);
+
+       return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) +
+                       (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+}
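
For example, with 4 KiB pages, an SG element whose DMA address has an in-page offset of 0x800 and whose length is 0x1800 spans bytes 0x800..0x1fff of the mapping, so the expression above evaluates to (0x800 + 0x1800 + 0xfff) >> 12 = 2 pages.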
+
 /**
  * hl_mem_area_inside_range() - Checks whether address+size are inside a range.
  * @address: The start address of the area we want to validate.
@@ -2436,7 +2737,9 @@ void destroy_hdev(struct hl_device *hdev);
 int hl_hw_queues_create(struct hl_device *hdev);
 void hl_hw_queues_destroy(struct hl_device *hdev);
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
-                               u32 cb_size, u64 cb_ptr);
+               u32 cb_size, u64 cb_ptr);
+void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
+               u32 ctl, u32 len, u64 ptr);
 int hl_hw_queue_schedule_cs(struct hl_cs *cs);
 u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
 void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
@@ -2470,6 +2773,8 @@ void hl_ctx_do_release(struct kref *ref);
 void hl_ctx_get(struct hl_device *hdev,        struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
 struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
+int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
+                               struct hl_fence **fence, u32 arr_len);
 void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
 void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
 
@@ -2511,18 +2816,19 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx);
 void hl_cb_va_pool_fini(struct hl_ctx *ctx);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
-void hl_pending_cb_list_flush(struct hl_ctx *ctx);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
                enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 void hl_sob_reset_error(struct kref *ref);
 int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
 void hl_fence_put(struct hl_fence *fence);
+void hl_fences_put(struct hl_fence **fence, int len);
 void hl_fence_get(struct hl_fence *fence);
 void cs_get(struct hl_cs *cs);
 bool cs_needs_completion(struct hl_cs *cs);
 bool cs_needs_timeout(struct hl_cs *cs);
 bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
 struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
+void hl_multi_cs_completion_init(struct hl_device *hdev);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 void gaudi_set_asic_funcs(struct hl_device *hdev);
@@ -2650,9 +2956,25 @@ int hl_set_voltage(struct hl_device *hdev,
                        int sensor_index, u32 attr, long value);
 int hl_set_current(struct hl_device *hdev,
                        int sensor_index, u32 attr, long value);
+void hw_sob_get(struct hl_hw_sob *hw_sob);
+void hw_sob_put(struct hl_hw_sob *hw_sob);
+void hl_encaps_handle_do_release(struct kref *ref);
+void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
+                       struct hl_cs *cs, struct hl_cs_job *job,
+                       struct hl_cs_compl *cs_cmpl);
 void hl_release_pending_user_interrupts(struct hl_device *hdev);
 int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
-                       struct hl_hw_sob **hw_sob, u32 count);
+                       struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig);
+
+int hl_state_dump(struct hl_device *hdev);
+const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id);
+const char *hl_state_dump_get_monitor_name(struct hl_device *hdev,
+                                       struct hl_mon_state_dump *mon);
+void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map);
+__printf(4, 5) int hl_snprintf_resize(char **buf, size_t *size, size_t *offset,
+                                       const char *format, ...);
+char *hl_format_as_binary(char *buf, size_t buf_len, u32 n);
+const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type);
 
 #ifdef CONFIG_DEBUG_FS
 
@@ -2673,6 +2995,8 @@ void hl_debugfs_remove_userptr(struct hl_device *hdev,
                                struct hl_userptr *userptr);
 void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
 void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
+void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
+                                       unsigned long length);
 
 #else
 
@@ -2746,6 +3070,11 @@ static inline void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev,
 {
 }
 
+static inline void hl_debugfs_set_state_dump(struct hl_device *hdev,
+                                       char *data, unsigned long length)
+{
+}
+
 #endif
 
 /* IOCTLs */
index 4194cda..a75e4fc 100644 (file)
@@ -141,7 +141,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
        hl_cb_mgr_init(&hpriv->cb_mgr);
        hl_ctx_mgr_init(&hpriv->ctx_mgr);
 
-       hpriv->taskpid = find_get_pid(current->pid);
+       hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
 
        mutex_lock(&hdev->fpriv_list_lock);
 
@@ -194,7 +194,6 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 out_err:
        mutex_unlock(&hdev->fpriv_list_lock);
-
        hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
        hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
        filp->private_data = NULL;
@@ -318,12 +317,16 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
                hdev->asic_prop.fw_security_enabled = false;
 
        /* Assign status description string */
-       strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
-                                       "disabled", HL_STR_MAX);
+       strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
+                                       "operational", HL_STR_MAX);
        strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
                                        "in reset", HL_STR_MAX);
+       strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
+                                       "disabled", HL_STR_MAX);
        strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
                                        "needs reset", HL_STR_MAX);
+       strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
+                                       "in device creation", HL_STR_MAX);
 
        hdev->major = hl_major;
        hdev->reset_on_lockup = reset_on_lockup;
@@ -532,7 +535,7 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
                result = PCI_ERS_RESULT_NONE;
        }
 
-       hdev->asic_funcs->halt_engines(hdev, true);
+       hdev->asic_funcs->halt_engines(hdev, true, false);
 
        return result;
 }
index f4dda7b..86c3257 100644 (file)
@@ -94,6 +94,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 
        hw_ip.first_available_interrupt_id =
                        prop->first_available_user_msix_interrupt;
+       hw_ip.server_type = prop->server_type;
+
        return copy_to_user(out, &hw_ip,
                min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
 }
index bcabfdb..76b7de8 100644 (file)
@@ -65,7 +65,7 @@ void hl_hw_queue_update_ci(struct hl_cs *cs)
 }
 
 /*
- * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
+ * hl_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
  *                                H/W queue.
  * @hdev: pointer to habanalabs device structure
  * @q: pointer to habanalabs queue structure
@@ -80,8 +80,8 @@ void hl_hw_queue_update_ci(struct hl_cs *cs)
  * This function must be called when the scheduler mutex is taken
  *
  */
-static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
-                       struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
+void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
+               u32 ctl, u32 len, u64 ptr)
 {
        struct hl_bd *bd;
 
@@ -222,8 +222,8 @@ static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
  * @cb_size: size of CB
  * @cb_ptr: pointer to CB location
  *
- * This function sends a single CB, that must NOT generate a completion entry
- *
+ * This function sends a single CB, which must NOT generate a completion entry.
+ * Sending CPU messages can instead be done via 'hl_hw_queue_submit_bd()'.
  */
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
                                u32 cb_size, u64 cb_ptr)
@@ -231,16 +231,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
        struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
        int rc = 0;
 
-       /*
-        * The CPU queue is a synchronous queue with an effective depth of
-        * a single entry (although it is allocated with room for multiple
-        * entries). Therefore, there is a different lock, called
-        * send_cpu_message_lock, that serializes accesses to the CPU queue.
-        * As a result, we don't need to lock the access to the entire H/W
-        * queues module when submitting a JOB to the CPU queue
-        */
-       if (q->queue_type != QUEUE_TYPE_CPU)
-               hdev->asic_funcs->hw_queues_lock(hdev);
+       hdev->asic_funcs->hw_queues_lock(hdev);
 
        if (hdev->disabled) {
                rc = -EPERM;
@@ -258,11 +249,10 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
                        goto out;
        }
 
-       ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+       hl_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
 
 out:
-       if (q->queue_type != QUEUE_TYPE_CPU)
-               hdev->asic_funcs->hw_queues_unlock(hdev);
+       hdev->asic_funcs->hw_queues_unlock(hdev);
 
        return rc;
 }
@@ -328,7 +318,7 @@ static void ext_queue_schedule_job(struct hl_cs_job *job)
        cq->pi = hl_cq_inc_ptr(cq->pi);
 
 submit_bd:
-       ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+       hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 /*
@@ -407,7 +397,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
        else
                ptr = (u64) (uintptr_t) job->user_cb;
 
-       ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+       hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 static int init_signal_cs(struct hl_device *hdev,
@@ -426,8 +416,9 @@ static int init_signal_cs(struct hl_device *hdev,
        cs_cmpl->sob_val = prop->next_sob_val;
 
        dev_dbg(hdev->dev,
-               "generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
-               cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx);
+               "generate signal CB, sob_id: %d, sob val: %u, q_idx: %d, seq: %llu\n",
+               cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx,
+               cs_cmpl->cs_seq);
 
        /* we set an EB since we must make sure all operations are done
         * when sending the signal
@@ -435,17 +426,37 @@ static int init_signal_cs(struct hl_device *hdev,
        hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
                                cs_cmpl->hw_sob->sob_id, 0, true);
 
-       rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1);
+       rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
+                                                               false);
 
        return rc;
 }
 
-static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
+void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
+                       struct hl_cs *cs, struct hl_cs_job *job,
+                       struct hl_cs_compl *cs_cmpl)
+{
+       struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
+
+       cs_cmpl->hw_sob = handle->hw_sob;
+
+       /* Note that encaps_sig_wait_offset was validated earlier in the flow
+        * against the max reserved signal count.
+        * Always decrement the offset by 1: when the user sets offset 1,
+        * they mean to wait only for the first signal, whose value is
+        * pre_sob_val; offset 2 maps to (pre_sob_val + 1), and so on.
+        */
+       cs_cmpl->sob_val = handle->pre_sob_val +
+                       (job->encaps_sig_wait_offset - 1);
+}
+
+static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
                struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
 {
-       struct hl_cs_compl *signal_cs_cmpl;
-       struct hl_sync_stream_properties *prop;
        struct hl_gen_wait_properties wait_prop;
+       struct hl_sync_stream_properties *prop;
+       struct hl_cs_compl *signal_cs_cmpl;
        u32 q_idx;
 
        q_idx = job->hw_queue_id;
@@ -455,14 +466,51 @@ static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
                                        struct hl_cs_compl,
                                        base_fence);
 
-       /* copy the SOB id and value of the signal CS */
-       cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
-       cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+       if (cs->encaps_signals) {
+               /* use the encaps signal handle stored earlier in the flow
+                * and set the SOB information from the encaps
+                * signals handle
+                */
+               hl_hw_queue_encaps_sig_set_sob_info(hdev, cs, job, cs_cmpl);
+
+               dev_dbg(hdev->dev, "Wait for encaps signals handle, qidx(%u), CS sequence(%llu), sob val: 0x%x, offset: %u\n",
+                               cs->encaps_sig_hdl->q_idx,
+                               cs->encaps_sig_hdl->cs_seq,
+                               cs_cmpl->sob_val,
+                               job->encaps_sig_wait_offset);
+       } else {
+               /* Copy the SOB id and value of the signal CS */
+               cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+               cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+       }
+
+       /* Check again whether the signal CS has already completed.
+        * If it has, don't send any wait CS, since the hw_sob could
+        * already be in reset. If the signal has not completed, take a
+        * refcount on the hw_sob to prevent resetting the SOB while the
+        * wait CS is not yet submitted.
+        * Note that this check is protected by two locks: the hw queue
+        * lock and the completion object lock. The same completion object
+        * lock also protects the hw_sob reset handler, and the hw queue
+        * lock keeps the hw_sob refcount, which is changed by the
+        * signal/wait flows, from going out of sync.
+        */
+       spin_lock(&signal_cs_cmpl->lock);
+
+       if (completion_done(&cs->signal_fence->completion)) {
+               spin_unlock(&signal_cs_cmpl->lock);
+               return -EINVAL;
+       }
+
+       kref_get(&cs_cmpl->hw_sob->kref);
+
+       spin_unlock(&signal_cs_cmpl->lock);
 
        dev_dbg(hdev->dev,
-               "generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n",
+               "generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d, seq: %llu\n",
                cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
-               prop->base_mon_id, q_idx);
+               prop->base_mon_id, q_idx, cs->sequence);
 
        wait_prop.data = (void *) job->patched_cb;
        wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
@@ -471,17 +519,14 @@ static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
        wait_prop.mon_id = prop->base_mon_id;
        wait_prop.q_idx = q_idx;
        wait_prop.size = 0;
+
        hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop);
 
-       kref_get(&cs_cmpl->hw_sob->kref);
-       /*
-        * Must put the signal fence after the SOB refcnt increment so
-        * the SOB refcnt won't turn 0 and reset the SOB before the
-        * wait CS was submitted.
-        */
        mb();
        hl_fence_put(cs->signal_fence);
        cs->signal_fence = NULL;
+
+       return 0;
 }
 
 /*
@@ -506,7 +551,60 @@ static int init_signal_wait_cs(struct hl_cs *cs)
        if (cs->type & CS_TYPE_SIGNAL)
                rc = init_signal_cs(hdev, job, cs_cmpl);
        else if (cs->type & CS_TYPE_WAIT)
-               init_wait_cs(hdev, cs, job, cs_cmpl);
+               rc = init_wait_cs(hdev, cs, job, cs_cmpl);
+
+       return rc;
+}
+
+static int encaps_sig_first_staged_cs_handler
+                       (struct hl_device *hdev, struct hl_cs *cs)
+{
+       struct hl_cs_compl *cs_cmpl =
+                       container_of(cs->fence,
+                                       struct hl_cs_compl, base_fence);
+       struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
+       struct hl_encaps_signals_mgr *mgr;
+       int rc = 0;
+
+       mgr = &hdev->compute_ctx->sig_mgr;
+
+       spin_lock(&mgr->lock);
+       encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id);
+       if (encaps_sig_hdl) {
+               /*
+                * Set the handler CS sequence, i.e. the CS which contains
+                * the encapsulated signals.
+                */
+               encaps_sig_hdl->cs_seq = cs->sequence;
+               /* Store the handle and set the encaps signal indication,
+                * to be used later in cs_do_release to put the last
+                * reference to the encaps signals handler.
+                */
+               cs_cmpl->encaps_signals = true;
+               cs_cmpl->encaps_sig_hdl = encaps_sig_hdl;
+
+               /* Set the hw_sob pointer in the completion object,
+                * since it is used in the cs_do_release flow to put
+                * the refcount on the SOB.
+                */
+               cs_cmpl->hw_sob = encaps_sig_hdl->hw_sob;
+               cs_cmpl->sob_val = encaps_sig_hdl->pre_sob_val +
+                                               encaps_sig_hdl->count;
+
+               dev_dbg(hdev->dev, "CS seq (%llu) added to encaps signal handler id (%u), count(%u), qidx(%u), sob(%u), val(%u)\n",
+                               cs->sequence, encaps_sig_hdl->id,
+                               encaps_sig_hdl->count,
+                               encaps_sig_hdl->q_idx,
+                               cs_cmpl->hw_sob->sob_id,
+                               cs_cmpl->sob_val);
+
+       } else {
+               dev_err(hdev->dev, "encaps handle id(%u) wasn't found!\n",
+                               cs->encaps_sig_hdl_id);
+               rc = -EINVAL;
+       }
+
+       spin_unlock(&mgr->lock);
 
        return rc;
 }
@@ -581,14 +679,21 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
        if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) {
                rc = init_signal_wait_cs(cs);
-               if (rc) {
-                       dev_err(hdev->dev, "Failed to submit signal cs\n");
+               if (rc)
                        goto unroll_cq_resv;
-               }
-       } else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
-               hdev->asic_funcs->collective_wait_init_cs(cs);
+       } else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) {
+               rc = hdev->asic_funcs->collective_wait_init_cs(cs);
+               if (rc)
+                       goto unroll_cq_resv;
+       }
 
 
+       if (cs->encaps_signals && cs->staged_first) {
+               rc = encaps_sig_first_staged_cs_handler(hdev, cs);
+               if (rc)
+                       goto unroll_cq_resv;
+       }
+
        spin_lock(&hdev->cs_mirror_lock);
 
        /* Verify staged CS exists and add to the staged list */
@@ -613,6 +718,11 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
                }
 
                list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);
+
+               /* update stream map of the first CS */
+               if (hdev->supports_wait_for_multi_cs)
+                       staged_cs->fence->stream_master_qid_map |=
+                                       cs->fence->stream_master_qid_map;
        }
 
        list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
@@ -834,6 +944,8 @@ static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
                hw_sob = &sync_stream_prop->hw_sob[sob];
                hw_sob->hdev = hdev;
                hw_sob->sob_id = sync_stream_prop->base_sob_id + sob;
+               hw_sob->sob_addr =
+                       hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
                hw_sob->q_idx = q_idx;
                kref_init(&hw_sob->kref);
        }
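
To make the encapsulated-signals wait-offset arithmetic described in the comments above concrete, here is a minimal stand-alone C sketch (illustration only, not part of the patch; the base value is hypothetical):

#include <stdio.h>

/* Offset 1 waits on pre_sob_val, offset 2 on pre_sob_val + 1, and so on,
 * mirroring the mapping done in hl_hw_queue_encaps_sig_set_sob_info().
 */
int main(void)
{
	unsigned int pre_sob_val = 100;	/* hypothetical reserved base value */
	unsigned int offset;

	for (offset = 1; offset <= 3; offset++)
		printf("wait offset %u -> sob_val %u\n",
		       offset, pre_sob_val + (offset - 1));

	return 0;
}
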
index af339ce..3398693 100644 (file)
@@ -124,7 +124,7 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 
        spin_lock(&vm->idr_lock);
        handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
-                               GFP_KERNEL);
+                               GFP_ATOMIC);
        spin_unlock(&vm->idr_lock);
 
        if (handle < 0) {
@@ -528,6 +528,33 @@ static inline int add_va_block(struct hl_device *hdev,
        return rc;
 }
 
+/**
+ * is_hint_crossing_range() - check if a hint address crosses the specified
+ * reserved range.
+ */
+static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
+               u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
+       bool range_cross;
+
+       if (range_type == HL_VA_RANGE_TYPE_DRAM)
+               range_cross =
+                       hl_mem_area_crosses_range(start_addr, size,
+                       prop->hints_dram_reserved_va_range.start_addr,
+                       prop->hints_dram_reserved_va_range.end_addr);
+       else if (range_type == HL_VA_RANGE_TYPE_HOST)
+               range_cross =
+                       hl_mem_area_crosses_range(start_addr, size,
+                       prop->hints_host_reserved_va_range.start_addr,
+                       prop->hints_host_reserved_va_range.end_addr);
+       else
+               range_cross =
+                       hl_mem_area_crosses_range(start_addr, size,
+                       prop->hints_host_hpage_reserved_va_range.start_addr,
+                       prop->hints_host_hpage_reserved_va_range.end_addr);
+
+       return range_cross;
+}
+
 /**
  * get_va_block() - get a virtual block for the given size and alignment.
  *
@@ -536,6 +563,8 @@ static inline int add_va_block(struct hl_device *hdev,
  * @size: requested block size.
  * @hint_addr: hint for requested address by the user.
  * @va_block_align: required alignment of the virtual block start address.
+ * @range_type: va range type (host, dram)
+ * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
  *
  * This function does the following:
  * - Iterate on the virtual block list to find a suitable virtual block for the
@@ -545,13 +574,19 @@ static inline int add_va_block(struct hl_device *hdev,
  */
 static u64 get_va_block(struct hl_device *hdev,
                                struct hl_va_range *va_range,
-                               u64 size, u64 hint_addr, u32 va_block_align)
+                               u64 size, u64 hint_addr, u32 va_block_align,
+                               enum hl_va_range_type range_type,
+                               u32 flags)
 {
        struct hl_vm_va_block *va_block, *new_va_block = NULL;
+       struct asic_fixed_properties *prop = &hdev->asic_prop;
        u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
-               align_mask, reserved_valid_start = 0, reserved_valid_size = 0;
+               align_mask, reserved_valid_start = 0, reserved_valid_size = 0,
+               dram_hint_mask = prop->dram_hints_align_mask;
        bool add_prev = false;
        bool is_align_pow_2  = is_power_of_2(va_range->page_size);
+       bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
+       bool force_hint = flags & HL_MEM_FORCE_HINT;
 
        if (is_align_pow_2)
                align_mask = ~((u64)va_block_align - 1);
@@ -564,12 +599,20 @@ static u64 get_va_block(struct hl_device *hdev,
                size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
                                                        va_range->page_size;
 
-       tmp_hint_addr = hint_addr;
+       tmp_hint_addr = hint_addr & ~dram_hint_mask;
 
        /* Check if we need to ignore hint address */
        if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||
-                       (!is_align_pow_2 &&
-                               do_div(tmp_hint_addr, va_range->page_size))) {
+               (!is_align_pow_2 && is_hint_dram_addr &&
+                       do_div(tmp_hint_addr, va_range->page_size))) {
+
+               if (force_hint) {
+                       /* Hint must be respected, so here we just fail */
+                       dev_err(hdev->dev,
+                               "Hint address 0x%llx is not page aligned - cannot be respected\n",
+                               hint_addr);
+                       return 0;
+               }
 
                dev_dbg(hdev->dev,
                        "Hint address 0x%llx will be ignored because it is not aligned\n",
@@ -596,6 +639,16 @@ static u64 get_va_block(struct hl_device *hdev,
                if (valid_size < size)
                        continue;
 
+               /*
+                * In case the hint address is 0 and the hints_range_reservation
+                * property is enabled, avoid allocating VA blocks from the
+                * range reserved for hint addresses.
+                */
+               if (prop->hints_range_reservation && !hint_addr)
+                       if (is_hint_crossing_range(range_type, valid_start,
+                                       size, prop))
+                               continue;
+
                /* Pick the minimal length block which has the required size */
                if (!new_va_block || (valid_size < reserved_valid_size)) {
                        new_va_block = va_block;
@@ -618,6 +671,17 @@ static u64 get_va_block(struct hl_device *hdev,
                goto out;
        }
 
+       if (force_hint && reserved_valid_start != hint_addr) {
+               /* Hint address must be respected. If we are here - this means
+                * we could not respect it.
+                */
+               dev_err(hdev->dev,
+                       "Hint address 0x%llx could not be respected\n",
+                       hint_addr);
+               reserved_valid_start = 0;
+               goto out;
+       }
+
        /*
         * Check if there is some leftover range due to reserving the new
         * va block, then return it to the main virtual addresses list.
@@ -670,7 +734,8 @@ u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
                enum hl_va_range_type type, u32 size, u32 alignment)
 {
        return get_va_block(hdev, ctx->va_range[type], size, 0,
-                       max(alignment, ctx->va_range[type]->page_size));
+                       max(alignment, ctx->va_range[type]->page_size),
+                       type, 0);
 }
 
 /**
@@ -731,29 +796,16 @@ int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
        return rc;
 }
 
-/**
- * get_sg_info() - get number of pages and the DMA address from SG list.
- * @sg: the SG list.
- * @dma_addr: pointer to DMA address to return.
- *
- * Calculate the number of consecutive pages described by the SG list. Take the
- * offset of the address in the first page, add to it the length and round it up
- * to the number of needed pages.
- */
-static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
-{
-       *dma_addr = sg_dma_address(sg);
-
-       return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) +
-                       (PAGE_SIZE - 1)) >> PAGE_SHIFT;
-}
-
 /**
  * init_phys_pg_pack_from_userptr() - initialize physical page pack from host
  *                                    memory
  * @ctx: pointer to the context structure.
  * @userptr: userptr to initialize from.
  * @pphys_pg_pack: result pointer.
+ * @force_regular_page: tell the function to ignore huge page optimization,
+ *                      even if possible. Needed for cases where the device VA
+ *                      is allocated before we know the composition of the
+ *                      physical pages
  *
  * This function does the following:
  * - Pin the physical pages related to the given virtual block.
@@ -762,17 +814,18 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
  */
 static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
                                struct hl_userptr *userptr,
-                               struct hl_vm_phys_pg_pack **pphys_pg_pack)
+                               struct hl_vm_phys_pg_pack **pphys_pg_pack,
+                               bool force_regular_page)
 {
+       u32 npages, page_size = PAGE_SIZE,
+               huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
+       u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
        struct hl_vm_phys_pg_pack *phys_pg_pack;
+       bool first = true, is_huge_page_opt;
+       u64 page_mask, total_npages;
        struct scatterlist *sg;
        dma_addr_t dma_addr;
-       u64 page_mask, total_npages;
-       u32 npages, page_size = PAGE_SIZE,
-               huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
-       bool first = true, is_huge_page_opt = true;
        int rc, i, j;
-       u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
 
        phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
        if (!phys_pg_pack)
@@ -783,6 +836,8 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
        phys_pg_pack->asid = ctx->asid;
        atomic_set(&phys_pg_pack->mapping_cnt, 1);
 
+       is_huge_page_opt = !force_regular_page;
+
        /* Only if all dma_addrs are aligned to 2MB and their
         * sizes are at least 2MB, we can use huge page mapping.
         * We limit the 2MB optimization to this condition,
@@ -791,7 +846,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
         */
        total_npages = 0;
        for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
-               npages = get_sg_info(sg, &dma_addr);
+               npages = hl_get_sg_info(sg, &dma_addr);
 
                total_npages += npages;
 
@@ -820,7 +875,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 
        j = 0;
        for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
-               npages = get_sg_info(sg, &dma_addr);
+               npages = hl_get_sg_info(sg, &dma_addr);
 
                /* align down to physical page size and save the offset */
                if (first) {
@@ -1001,11 +1056,12 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
        struct hl_userptr *userptr = NULL;
        struct hl_vm_hash_node *hnode;
        struct hl_va_range *va_range;
-       enum vm_type_t *vm_type;
+       enum vm_type *vm_type;
        u64 ret_vaddr, hint_addr;
        u32 handle = 0, va_block_align;
        int rc;
        bool is_userptr = args->flags & HL_MEM_USERPTR;
+       enum hl_va_range_type va_range_type = 0;
 
        /* Assume failure */
        *device_addr = 0;
@@ -1023,7 +1079,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
                }
 
                rc = init_phys_pg_pack_from_userptr(ctx, userptr,
-                               &phys_pg_pack);
+                               &phys_pg_pack, false);
                if (rc) {
                        dev_err(hdev->dev,
                                "unable to init page pack for vaddr 0x%llx\n",
@@ -1031,14 +1087,14 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
                        goto init_page_pack_err;
                }
 
-               vm_type = (enum vm_type_t *) userptr;
+               vm_type = (enum vm_type *) userptr;
                hint_addr = args->map_host.hint_addr;
                handle = phys_pg_pack->handle;
 
                /* get required alignment */
                if (phys_pg_pack->page_size == page_size) {
                        va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
-
+                       va_range_type = HL_VA_RANGE_TYPE_HOST;
                        /*
                         * huge page alignment may be needed in case of regular
                         * page mapping, depending on the host VA alignment
@@ -1053,6 +1109,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
                         * mapping
                         */
                        va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
+                       va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
                        va_block_align = huge_page_size;
                }
        } else {
@@ -1072,12 +1129,13 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
                spin_unlock(&vm->idr_lock);
 
-               vm_type = (enum vm_type_t *) phys_pg_pack;
+               vm_type = (enum vm_type *) phys_pg_pack;
 
                hint_addr = args->map_device.hint_addr;
 
                /* DRAM VA alignment is the same as the MMU page size */
                va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
+               va_range_type = HL_VA_RANGE_TYPE_DRAM;
                va_block_align = hdev->asic_prop.dmmu.page_size;
        }
 
@@ -1100,8 +1158,23 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
                goto hnode_err;
        }
 
+       if (hint_addr && phys_pg_pack->offset) {
+               if (args->flags & HL_MEM_FORCE_HINT) {
+                       /* Fail if hint must be respected but it can't be */
+                       dev_err(hdev->dev,
+                               "Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
+                               hint_addr, phys_pg_pack->offset);
+                       rc = -EINVAL;
+                       goto va_block_err;
+               }
+               dev_dbg(hdev->dev,
+                       "Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
+                       hint_addr, phys_pg_pack->offset);
+       }
+
        ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
-                                       hint_addr, va_block_align);
+                                       hint_addr, va_block_align,
+                                       va_range_type, args->flags);
        if (!ret_vaddr) {
                dev_err(hdev->dev, "no available va block for handle %u\n",
                                handle);
@@ -1181,17 +1254,19 @@ init_page_pack_err:
 static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
                                bool ctx_free)
 {
-       struct hl_device *hdev = ctx->hdev;
-       struct asic_fixed_properties *prop = &hdev->asic_prop;
        struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
+       u64 vaddr = args->unmap.device_virt_addr;
        struct hl_vm_hash_node *hnode = NULL;
+       struct asic_fixed_properties *prop;
+       struct hl_device *hdev = ctx->hdev;
        struct hl_userptr *userptr = NULL;
        struct hl_va_range *va_range;
-       u64 vaddr = args->unmap.device_virt_addr;
-       enum vm_type_t *vm_type;
+       enum vm_type *vm_type;
        bool is_userptr;
        int rc = 0;
 
+       prop = &hdev->asic_prop;
+
        /* protect from double entrance */
        mutex_lock(&ctx->mem_hash_lock);
        hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
@@ -1214,8 +1289,9 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
        if (*vm_type == VM_TYPE_USERPTR) {
                is_userptr = true;
                userptr = hnode->ptr;
-               rc = init_phys_pg_pack_from_userptr(ctx, userptr,
-                                                       &phys_pg_pack);
+
+               rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,
+                                                       false);
                if (rc) {
                        dev_err(hdev->dev,
                                "unable to init page pack for vaddr 0x%llx\n",
@@ -1299,7 +1375,7 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
        kfree(hnode);
 
        if (is_userptr) {
-               rc = free_phys_pg_pack(hdev, phys_pg_pack);
+               free_phys_pg_pack(hdev, phys_pg_pack);
                dma_unmap_host_va(hdev, userptr);
        }
 
@@ -1669,6 +1745,7 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
                return -EINVAL;
        }
 
+       userptr->pid = current->pid;
        userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);
        if (!userptr->sgt)
                return -ENOMEM;
@@ -2033,7 +2110,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
         * another side effect error
         */
        if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
-               dev_notice(hdev->dev,
+               dev_dbg(hdev->dev,
                        "user released device without removing its memory mappings\n");
 
        hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
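
The hint-reservation logic added above rests on a simple interval-overlap test. A stand-alone sketch of such a test follows (illustration only, not part of the patch; the driver's hl_mem_area_crosses_range() helper is assumed to behave similarly, and area_crosses_range() is a hypothetical name):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Returns true when [addr, addr + size - 1] overlaps [range_start, range_end]. */
static bool area_crosses_range(uint64_t addr, uint32_t size,
			       uint64_t range_start, uint64_t range_end)
{
	uint64_t end = addr + size - 1;

	return addr <= range_end && end >= range_start;
}

int main(void)
{
	/* hypothetical reserved hint range: [0x1000, 0x1fff] */
	printf("%d\n", area_crosses_range(0x0800, 0x1000, 0x1000, 0x1fff)); /* 1 */
	printf("%d\n", area_crosses_range(0x2000, 0x0100, 0x1000, 0x1fff)); /* 0 */

	return 0;
}
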
index c5e93ff..0f536f7 100644 (file)
@@ -470,13 +470,13 @@ static void hl_mmu_v1_fini(struct hl_device *hdev)
        if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.hr.mmu_shadow_hop0)) {
                kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
                gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
-       }
 
-       /* Make sure that if we arrive here again without init was called we
-        * won't cause kernel panic. This can happen for example if we fail
-        * during hard reset code at certain points
-        */
-       hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
+               /* Make sure that if we arrive here again without init having
+                * been called, we won't cause a kernel panic. This can happen,
+                * for example, if we fail at certain points of the hard reset
+                * code.
+                */
+               hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL;
+       }
 }
 
 /**
index d5bedf5..0b5366c 100644 (file)
@@ -436,6 +436,8 @@ int hl_pci_init(struct hl_device *hdev)
                goto unmap_pci_bars;
        }
 
+       dma_set_max_seg_size(&pdev->dev, U32_MAX);
+
        return 0;
 
 unmap_pci_bars:
diff --git a/drivers/misc/habanalabs/common/state_dump.c b/drivers/misc/habanalabs/common/state_dump.c
new file mode 100644 (file)
index 0000000..7472690
--- /dev/null
@@ -0,0 +1,718 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2021 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include <linux/vmalloc.h>
+#include <uapi/misc/habanalabs.h>
+#include "habanalabs.h"
+
+/**
+ * hl_format_as_binary - helper function, format an integer as binary
+ *                       using supplied scratch buffer
+ * @buf: the buffer to use
+ * @buf_len: buffer capacity
+ * @n: number to format
+ *
+ * Returns pointer to buffer
+ */
+char *hl_format_as_binary(char *buf, size_t buf_len, u32 n)
+{
+       int i;
+       u32 bit;
+       bool leading0 = true;
+       char *wrptr = buf;
+
+       if (buf_len > 0 && buf_len < 3) {
+               *wrptr = '\0';
+               return buf;
+       }
+
+       wrptr[0] = '0';
+       wrptr[1] = 'b';
+       wrptr += 2;
+       /* Remove 3 characters from length for '0b' and '\0' termination */
+       buf_len -= 3;
+
+       for (i = 0; i < sizeof(n) * BITS_PER_BYTE && buf_len; ++i, n <<= 1) {
+               /* Writing bit calculation in one line would cause a false
+                * positive static code analysis error, so splitting.
+                */
+               bit = n & (1 << (sizeof(n) * BITS_PER_BYTE - 1));
+               bit = !!bit;
+               leading0 &= !bit;
+               if (!leading0) {
+                       *wrptr = '0' + bit;
+                       ++wrptr;
+               }
+       }
+
+       *wrptr = '\0';
+
+       return buf;
+}
+
+/**
+ * resize_to_fit - helper function, resize buffer to fit given amount of data
+ * @buf: destination buffer double pointer
+ * @size: pointer to the size container
+ * @desired_size: size the buffer must contain
+ *
+ * Returns 0 if no resize was needed, 1 if the buffer was resized, or a
+ * negative error code on failure.
+ * On success, the size of the buffer is at least desired_size. The buffer is
+ * allocated via vmalloc and must be freed with vfree.
+ */
+static int resize_to_fit(char **buf, size_t *size, size_t desired_size)
+{
+       char *resized_buf;
+       size_t new_size;
+
+       if (*size >= desired_size)
+               return 0;
+
+       /* Not enough space to print all, have to resize */
+       new_size = max_t(size_t, PAGE_SIZE, round_up(desired_size, PAGE_SIZE));
+       resized_buf = vmalloc(new_size);
+       if (!resized_buf)
+               return -ENOMEM;
+       memcpy(resized_buf, *buf, *size);
+       vfree(*buf);
+       *buf = resized_buf;
+       *size = new_size;
+
+       return 1;
+}
+
+/**
+ * hl_snprintf_resize() - print formatted data to buffer, resize as needed
+ * @buf: buffer double pointer, to be written to and resized, must be either
+ *       NULL or allocated with vmalloc.
+ * @size: current size of the buffer
+ * @offset: current offset to write to
+ * @format: format of the data
+ *
+ * This function will write formatted data into the buffer. If buffer is not
+ * large enough, it will be resized using vmalloc. Size may be modified if the
+ * buffer was resized, offset will be advanced by the number of bytes written
+ * not including the terminating character
+ *
+ * Returns 0 on success or error code on failure
+ *
+ * Note that the buffer has to be manually released using vfree.
+ */
+int hl_snprintf_resize(char **buf, size_t *size, size_t *offset,
+                          const char *format, ...)
+{
+       va_list args;
+       size_t length;
+       int rc;
+
+       if (*buf == NULL && (*size != 0 || *offset != 0))
+               return -EINVAL;
+
+       va_start(args, format);
+       length = vsnprintf(*buf + *offset, *size - *offset, format, args);
+       va_end(args);
+
+       rc = resize_to_fit(buf, size, *offset + length + 1);
+       if (rc < 0)
+               return rc;
+       else if (rc > 0) {
+               /* Resize was needed, write again */
+               va_start(args, format);
+               length = vsnprintf(*buf + *offset, *size - *offset, format,
+                                  args);
+               va_end(args);
+       }
+
+       *offset += length;
+
+       return 0;
+}
+
+/**
+ * hl_sync_engine_to_string - convert engine type enum to string literal
+ * @engine_type: engine type (TPC/MME/DMA)
+ *
+ * Return the resolved string literal
+ */
+const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type)
+{
+       switch (engine_type) {
+       case ENGINE_DMA:
+               return "DMA";
+       case ENGINE_MME:
+               return "MME";
+       case ENGINE_TPC:
+               return "TPC";
+       }
+       return "Invalid Engine Type";
+}
+
+/**
+ * hl_print_resize_sync_engine - helper function, format engine name and ID
+ * using hl_snprintf_resize
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ * @engine_type: engine type (TPC/MME/DMA)
+ * @engine_id: engine numerical id
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_print_resize_sync_engine(char **buf, size_t *size, size_t *offset,
+                               enum hl_sync_engine_type engine_type,
+                               u32 engine_id)
+{
+       return hl_snprintf_resize(buf, size, offset, "%s%u",
+                       hl_sync_engine_to_string(engine_type), engine_id);
+}
+
+/**
+ * hl_state_dump_get_sync_name - transform sync object id to name if available
+ * @hdev: pointer to the device
+ * @sync_id: sync object id
+ *
+ * Returns a name literal or NULL if not resolved.
+ * Note: returning NULL shall not be considered as a failure, as not all
+ * sync objects are named.
+ */
+const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct hl_hw_obj_name_entry *entry;
+
+       hash_for_each_possible(sds->so_id_to_str_tb, entry,
+                               node, sync_id)
+               if (sync_id == entry->id)
+                       return entry->name;
+
+       return NULL;
+}
+
+/**
+ * hl_state_dump_get_monitor_name - transform monitor object dump to monitor
+ * name if available
+ * @hdev: pointer to the device
+ * @mon: monitor state dump
+ *
+ * Returns a name literal or NULL if not resolved.
+ * Note: returning NULL shall not be considered as a failure, as not all
+ * monitors are named.
+ */
+const char *hl_state_dump_get_monitor_name(struct hl_device *hdev,
+                                       struct hl_mon_state_dump *mon)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct hl_hw_obj_name_entry *entry;
+
+       hash_for_each_possible(sds->monitor_id_to_str_tb,
+                               entry, node, mon->id)
+               if (mon->id == entry->id)
+                       return entry->name;
+
+       return NULL;
+}
+
+/**
+ * hl_state_dump_free_sync_to_engine_map - free sync object to engine map
+ * @map: sync object to engine map
+ *
+ * Note: generic free implementation, the allocation is implemented per ASIC.
+ */
+void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map)
+{
+       struct hl_sync_to_engine_map_entry *entry;
+       struct hlist_node *tmp_node;
+       int i;
+
+       hash_for_each_safe(map->tb, i, tmp_node, entry, node) {
+               hash_del(&entry->node);
+               kfree(entry);
+       }
+}
+
+/**
+ * hl_state_dump_get_sync_to_engine - transform sync_id to
+ * hl_sync_to_engine_map_entry if available for current id
+ * @map: sync object to engine map
+ * @sync_id: sync object id
+ *
+ * Returns the translation entry if found or NULL if not.
+ * Note: a returned NULL shall not be considered a failure, as the map does
+ * not cover all possible sync ids; it is best effort.
+ */
+static struct hl_sync_to_engine_map_entry *
+hl_state_dump_get_sync_to_engine(struct hl_sync_to_engine_map *map, u32 sync_id)
+{
+       struct hl_sync_to_engine_map_entry *entry;
+
+       hash_for_each_possible(map->tb, entry, node, sync_id)
+               if (entry->sync_id == sync_id)
+                       return entry;
+       return NULL;
+}
+
+/**
+ * hl_state_dump_read_sync_objects - read sync objects array
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ *
+ * Returns array of size SP_SYNC_OBJ_AMOUNT on success or NULL on failure
+ */
+static u32 *hl_state_dump_read_sync_objects(struct hl_device *hdev, u32 index)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       u32 *sync_objects;
+       s64 base_addr; /* Base addr can be negative */
+       int i;
+
+       base_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] +
+                       sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index;
+
+       sync_objects = vmalloc(sds->props[SP_SYNC_OBJ_AMOUNT] * sizeof(u32));
+       if (!sync_objects)
+               return NULL;
+
+       for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i)
+               sync_objects[i] = RREG32(base_addr + i * sizeof(u32));
+
+       return sync_objects;
+}
+
+/**
+ * hl_state_dump_free_sync_objects - free sync objects array allocated by
+ * hl_state_dump_read_sync_objects
+ * @sync_objects: sync objects array
+ */
+static void hl_state_dump_free_sync_objects(u32 *sync_objects)
+{
+       vfree(sync_objects);
+}
+
+
+/**
+ * hl_state_dump_print_syncs_single_block - print active sync objects on a
+ * single block
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ * @map: sync engines names map
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int
+hl_state_dump_print_syncs_single_block(struct hl_device *hdev, u32 index,
+                               char **buf, size_t *size, size_t *offset,
+                               struct hl_sync_to_engine_map *map)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       const char *sync_name;
+       u32 *sync_objects = NULL;
+       int rc = 0, i;
+
+       if (sds->sync_namager_names) {
+               rc = hl_snprintf_resize(
+                       buf, size, offset, "%s\n",
+                       sds->sync_namager_names[index]);
+               if (rc)
+                       goto out;
+       }
+
+       sync_objects = hl_state_dump_read_sync_objects(hdev, index);
+       if (!sync_objects) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i) {
+               struct hl_sync_to_engine_map_entry *entry;
+               u64 sync_object_addr;
+
+               if (!sync_objects[i])
+                       continue;
+
+               sync_object_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] +
+                               sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index +
+                               i * sizeof(u32);
+
+               rc = hl_snprintf_resize(buf, size, offset, "sync id: %u", i);
+               if (rc)
+                       goto free_sync_objects;
+               sync_name = hl_state_dump_get_sync_name(hdev, i);
+               if (sync_name) {
+                       rc = hl_snprintf_resize(buf, size, offset, " %s",
+                                               sync_name);
+                       if (rc)
+                               goto free_sync_objects;
+               }
+               rc = hl_snprintf_resize(buf, size, offset, ", value: %u",
+                                       sync_objects[i]);
+               if (rc)
+                       goto free_sync_objects;
+
+               /* Append engine string */
+               entry = hl_state_dump_get_sync_to_engine(map,
+                       (u32)sync_object_addr);
+               if (entry) {
+                       rc = hl_snprintf_resize(buf, size, offset,
+                                               ", Engine: ");
+                       if (rc)
+                               goto free_sync_objects;
+                       rc = hl_print_resize_sync_engine(buf, size, offset,
+                                               entry->engine_type,
+                                               entry->engine_id);
+                       if (rc)
+                               goto free_sync_objects;
+               }
+
+               rc = hl_snprintf_resize(buf, size, offset, "\n");
+               if (rc)
+                       goto free_sync_objects;
+       }
+
+free_sync_objects:
+       hl_state_dump_free_sync_objects(sync_objects);
+out:
+       return rc;
+}
+
+/**
+ * hl_state_dump_print_syncs - print active sync objects
+ * @hdev: pointer to the device
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_state_dump_print_syncs(struct hl_device *hdev,
+                                       char **buf, size_t *size,
+                                       size_t *offset)
+
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct hl_sync_to_engine_map *map;
+       u32 index;
+       int rc = 0;
+
+       map = kzalloc(sizeof(*map), GFP_KERNEL);
+       if (!map)
+               return -ENOMEM;
+
+       rc = sds->funcs.gen_sync_to_engine_map(hdev, map);
+       if (rc)
+               goto free_map_mem;
+
+       rc = hl_snprintf_resize(buf, size, offset, "Non zero sync objects:\n");
+       if (rc)
+               goto out;
+
+       if (sds->sync_namager_names) {
+               for (index = 0; sds->sync_namager_names[index]; ++index) {
+                       rc = hl_state_dump_print_syncs_single_block(
+                               hdev, index, buf, size, offset, map);
+                       if (rc)
+                               goto out;
+               }
+       } else {
+               for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) {
+                       rc = hl_state_dump_print_syncs_single_block(
+                               hdev, index, buf, size, offset, map);
+                       if (rc)
+                               goto out;
+               }
+       }
+
+out:
+       hl_state_dump_free_sync_to_engine_map(map);
+free_map_mem:
+       kfree(map);
+
+       return rc;
+}
+
+/**
+ * hl_state_dump_alloc_read_sm_block_monitors - read monitors for a specific
+ * block
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ *
+ * Returns an array of monitor data of size SP_MONITORS_AMOUNT or NULL
+ * on error
+ */
+static struct hl_mon_state_dump *
+hl_state_dump_alloc_read_sm_block_monitors(struct hl_device *hdev, u32 index)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct hl_mon_state_dump *monitors;
+       s64 base_addr; /* Base addr can be negative */
+       int i;
+
+       monitors = vmalloc(sds->props[SP_MONITORS_AMOUNT] *
+                          sizeof(struct hl_mon_state_dump));
+       if (!monitors)
+               return NULL;
+
+       base_addr = sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index;
+
+       for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) {
+               monitors[i].id = i;
+               monitors[i].wr_addr_low =
+                       RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_LOW] +
+                               i * sizeof(u32));
+
+               monitors[i].wr_addr_high =
+                       RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_HIGH] +
+                               i * sizeof(u32));
+
+               monitors[i].wr_data =
+                       RREG32(base_addr + sds->props[SP_MON_OBJ_WR_DATA] +
+                               i * sizeof(u32));
+
+               monitors[i].arm_data =
+                       RREG32(base_addr + sds->props[SP_MON_OBJ_ARM_DATA] +
+                               i * sizeof(u32));
+
+               monitors[i].status =
+                       RREG32(base_addr + sds->props[SP_MON_OBJ_STATUS] +
+                               i * sizeof(u32));
+       }
+
+       return monitors;
+}
+
+/**
+ * hl_state_dump_free_monitors - free the monitors structure
+ * @monitors: monitors array created with
+ *            hl_state_dump_alloc_read_sm_block_monitors
+ */
+static void hl_state_dump_free_monitors(struct hl_mon_state_dump *monitors)
+{
+       vfree(monitors);
+}
+
+/**
+ * hl_state_dump_print_monitors_single_block - print active monitors on a
+ * single block
+ * @hdev: pointer to the device
+ * @index: sync manager block index starting with E_N
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_state_dump_print_monitors_single_block(struct hl_device *hdev,
+                                               u32 index,
+                                               char **buf, size_t *size,
+                                               size_t *offset)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct hl_mon_state_dump *monitors = NULL;
+       int rc = 0, i;
+
+       if (sds->sync_namager_names) {
+               rc = hl_snprintf_resize(
+                       buf, size, offset, "%s\n",
+                       sds->sync_namager_names[index]);
+               if (rc)
+                       goto out;
+       }
+
+       monitors = hl_state_dump_alloc_read_sm_block_monitors(hdev, index);
+       if (!monitors) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) {
+               if (!(sds->funcs.monitor_valid(&monitors[i])))
+                       continue;
+
+               /* Monitor is valid, dump it */
+               rc = sds->funcs.print_single_monitor(buf, size, offset, hdev,
+                                                       &monitors[i]);
+               if (rc)
+                       goto free_monitors;
+
+               hl_snprintf_resize(buf, size, offset, "\n");
+       }
+
+free_monitors:
+       hl_state_dump_free_monitors(monitors);
+out:
+       return rc;
+}
+
+/**
+ * hl_state_dump_print_monitors - print active monitors
+ * @hdev: pointer to the device
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ *
+ * Returns 0 on success or error code on failure
+ */
+static int hl_state_dump_print_monitors(struct hl_device *hdev,
+                                       char **buf, size_t *size,
+                                       size_t *offset)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       u32 index;
+       int rc = 0;
+
+       rc = hl_snprintf_resize(buf, size, offset,
+               "Valid (armed) monitor objects:\n");
+       if (rc)
+               goto out;
+
+       if (sds->sync_namager_names) {
+               for (index = 0; sds->sync_namager_names[index]; ++index) {
+                       rc = hl_state_dump_print_monitors_single_block(
+                               hdev, index, buf, size, offset);
+                       if (rc)
+                               goto out;
+               }
+       } else {
+               for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) {
+                       rc = hl_state_dump_print_monitors_single_block(
+                               hdev, index, buf, size, offset);
+                       if (rc)
+                               goto out;
+               }
+       }
+
+out:
+       return rc;
+}
+
+/**
+ * hl_state_dump_print_engine_fences - print active fences for a specific
+ * engine
+ * @hdev: pointer to the device
+ * @engine_type: engine type to use
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ */
+static int
+hl_state_dump_print_engine_fences(struct hl_device *hdev,
+                                 enum hl_sync_engine_type engine_type,
+                                 char **buf, size_t *size, size_t *offset)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       int rc = 0, i, n_fences;
+       u64 base_addr, next_fence;
+
+       switch (engine_type) {
+       case ENGINE_TPC:
+               n_fences = sds->props[SP_NUM_OF_TPC_ENGINES];
+               base_addr = sds->props[SP_TPC0_CMDQ];
+               next_fence = sds->props[SP_NEXT_TPC];
+               break;
+       case ENGINE_MME:
+               n_fences = sds->props[SP_NUM_OF_MME_ENGINES];
+               base_addr = sds->props[SP_MME_CMDQ];
+               next_fence = sds->props[SP_NEXT_MME];
+               break;
+       case ENGINE_DMA:
+               n_fences = sds->props[SP_NUM_OF_DMA_ENGINES];
+               base_addr = sds->props[SP_DMA_CMDQ];
+               next_fence = sds->props[SP_DMA_QUEUES_OFFSET];
+               break;
+       default:
+               return -EINVAL;
+       }
+       for (i = 0; i < n_fences; ++i) {
+               rc = sds->funcs.print_fences_single_engine(
+                       hdev,
+                       base_addr + next_fence * i +
+                               sds->props[SP_FENCE0_CNT_OFFSET],
+                       base_addr + next_fence * i +
+                               sds->props[SP_CP_STS_OFFSET],
+                       engine_type, i, buf, size, offset);
+               if (rc)
+                       goto out;
+       }
+out:
+       return rc;
+}
+
+/**
+ * hl_state_dump_print_fences - print active fences
+ * @hdev: pointer to the device
+ * @buf: destination buffer double pointer to be used with hl_snprintf_resize
+ * @size: pointer to the size container
+ * @offset: pointer to the offset container
+ */
+static int hl_state_dump_print_fences(struct hl_device *hdev, char **buf,
+                                     size_t *size, size_t *offset)
+{
+       int rc = 0;
+
+       rc = hl_snprintf_resize(buf, size, offset, "Valid (armed) fences:\n");
+       if (rc)
+               goto out;
+
+       rc = hl_state_dump_print_engine_fences(hdev, ENGINE_TPC, buf, size, offset);
+       if (rc)
+               goto out;
+
+       rc = hl_state_dump_print_engine_fences(hdev, ENGINE_MME, buf, size, offset);
+       if (rc)
+               goto out;
+
+       rc = hl_state_dump_print_engine_fences(hdev, ENGINE_DMA, buf, size, offset);
+       if (rc)
+               goto out;
+
+out:
+       return rc;
+}
+
+/**
+ * hl_state_dump() - dump system state
+ * @hdev: pointer to device structure
+ */
+int hl_state_dump(struct hl_device *hdev)
+{
+       char *buf = NULL;
+       size_t offset = 0, size = 0;
+       int rc;
+
+       rc = hl_snprintf_resize(&buf, &size, &offset,
+                               "Timestamp taken on: %llu\n\n",
+                               ktime_to_ns(ktime_get()));
+       if (rc)
+               goto err;
+
+       rc = hl_state_dump_print_syncs(hdev, &buf, &size, &offset);
+       if (rc)
+               goto err;
+
+       hl_snprintf_resize(&buf, &size, &offset, "\n");
+
+       rc = hl_state_dump_print_monitors(hdev, &buf, &size, &offset);
+       if (rc)
+               goto err;
+
+       hl_snprintf_resize(&buf, &size, &offset, "\n");
+
+       rc = hl_state_dump_print_fences(hdev, &buf, &size, &offset);
+       if (rc)
+               goto err;
+
+       hl_snprintf_resize(&buf, &size, &offset, "\n");
+
+       hl_debugfs_set_state_dump(hdev, buf, size);
+
+       return 0;
+err:
+       vfree(buf);
+       return rc;
+}
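hl_state_dump() above builds the whole report through hl_snprintf_resize(), which appends formatted text to a buffer that is grown on demand and is finally handed to debugfs via hl_debugfs_set_state_dump(). Below is a minimal userspace sketch of that grow-on-demand pattern, assuming the (char **buf, size_t *size, size_t *offset, fmt, ...) calling convention seen in the calls above; the in-kernel helper is not part of this hunk and may differ in details.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch only: the kernel helper uses vmalloc-backed buffers and kernel
 * error codes; this illustration uses realloc() and returns -1 on failure.
 */
static int sketch_snprintf_resize(char **buf, size_t *size, size_t *offset,
				  const char *fmt, ...)
{
	va_list args;
	int needed;

	/* First pass: measure the formatted length. */
	va_start(args, fmt);
	needed = vsnprintf(NULL, 0, fmt, args);
	va_end(args);
	if (needed < 0)
		return -1;

	/* Grow the buffer until the new text (plus NUL) fits. */
	while (*offset + (size_t)needed + 1 > *size) {
		size_t new_size = *size ? *size * 2 : 128;
		char *tmp = realloc(*buf, new_size);

		if (!tmp)
			return -1;
		*buf = tmp;
		*size = new_size;
	}

	/* Second pass: append at the current offset. */
	va_start(args, fmt);
	vsnprintf(*buf + *offset, *size - *offset, fmt, args);
	va_end(args);
	*offset += (size_t)needed;
	return 0;
}

int main(void)
{
	char *buf = NULL;
	size_t size = 0, offset = 0;

	sketch_snprintf_resize(&buf, &size, &offset, "Timestamp taken on: %llu\n\n",
			       (unsigned long long)12345);
	sketch_snprintf_resize(&buf, &size, &offset, "Valid (armed) fences:\n");
	printf("%s", buf);
	free(buf);
	return 0;
}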
index db72df2..34f9f27 100644 (file)
@@ -9,8 +9,7 @@
 
 #include <linux/pci.h>
 
-long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
-                                                               bool curr)
+long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 {
        struct cpucp_packet pkt;
        u32 used_pll_idx;
@@ -44,8 +43,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
        return (long) result;
 }
 
-void hl_set_frequency(struct hl_device *hdev, u32 pll_index,
-                                                               u64 freq)
+void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
 {
        struct cpucp_packet pkt;
        u32 used_pll_idx;
@@ -285,16 +283,12 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr,
                                char *buf)
 {
        struct hl_device *hdev = dev_get_drvdata(dev);
-       char *str;
+       char str[HL_STR_MAX];
 
-       if (atomic_read(&hdev->in_reset))
-               str = "In reset";
-       else if (hdev->disabled)
-               str = "Malfunction";
-       else if (hdev->needs_reset)
-               str = "Needs Reset";
-       else
-               str = "Operational";
+       strscpy(str, hdev->status[hl_device_status(hdev)], HL_STR_MAX);
+
+       /* use uppercase for backward compatibility */
+       str[0] = 'A' + (str[0] - 'a');
 
        return sprintf(buf, "%s\n", str);
 }
index aa8a0ca..383865b 100644 (file)
@@ -76,7 +76,7 @@
 #define GAUDI_PLDM_MMU_TIMEOUT_USEC    (MMU_CONFIG_TIMEOUT_USEC * 100)
 #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC  (HL_DEVICE_TIMEOUT_USEC * 30)
 #define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC        (HL_DEVICE_TIMEOUT_USEC * 30)
-#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC        1000000         /* 1s */
+#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC        4000000         /* 4s */
 #define GAUDI_MSG_TO_CPU_TIMEOUT_USEC  4000000         /* 4s */
 #define GAUDI_WAIT_FOR_BL_TIMEOUT_USEC 15000000        /* 15s */
 
 
 #define GAUDI_PLL_MAX 10
 
+#define BIN_REG_STRING_SIZE    sizeof("0b10101010101010101010101010101010")
+
+#define MONITOR_SOB_STRING_SIZE                256
+
+static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = {
+       GAUDI_QUEUE_ID_DMA_0_0,
+       GAUDI_QUEUE_ID_DMA_0_1,
+       GAUDI_QUEUE_ID_DMA_0_2,
+       GAUDI_QUEUE_ID_DMA_0_3,
+       GAUDI_QUEUE_ID_DMA_1_0,
+       GAUDI_QUEUE_ID_DMA_1_1,
+       GAUDI_QUEUE_ID_DMA_1_2,
+       GAUDI_QUEUE_ID_DMA_1_3
+};
+
 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
                "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
                "gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -348,6 +363,97 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
        QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */
 };
 
+static struct hl_hw_obj_name_entry gaudi_so_id_to_str[] = {
+       { .id = 0,  .name = "SYNC_OBJ_DMA_DOWN_FEEDBACK" },
+       { .id = 1,  .name = "SYNC_OBJ_DMA_UP_FEEDBACK" },
+       { .id = 2,  .name = "SYNC_OBJ_DMA_STATIC_DRAM_SRAM_FEEDBACK" },
+       { .id = 3,  .name = "SYNC_OBJ_DMA_SRAM_DRAM_FEEDBACK" },
+       { .id = 4,  .name = "SYNC_OBJ_FIRST_COMPUTE_FINISH" },
+       { .id = 5,  .name = "SYNC_OBJ_HOST_DRAM_DONE" },
+       { .id = 6,  .name = "SYNC_OBJ_DBG_CTR_DEPRECATED" },
+       { .id = 7,  .name = "SYNC_OBJ_DMA_ACTIVATIONS_DRAM_SRAM_FEEDBACK" },
+       { .id = 8,  .name = "SYNC_OBJ_ENGINE_SEM_MME_0" },
+       { .id = 9,  .name = "SYNC_OBJ_ENGINE_SEM_MME_1" },
+       { .id = 10, .name = "SYNC_OBJ_ENGINE_SEM_TPC_0" },
+       { .id = 11, .name = "SYNC_OBJ_ENGINE_SEM_TPC_1" },
+       { .id = 12, .name = "SYNC_OBJ_ENGINE_SEM_TPC_2" },
+       { .id = 13, .name = "SYNC_OBJ_ENGINE_SEM_TPC_3" },
+       { .id = 14, .name = "SYNC_OBJ_ENGINE_SEM_TPC_4" },
+       { .id = 15, .name = "SYNC_OBJ_ENGINE_SEM_TPC_5" },
+       { .id = 16, .name = "SYNC_OBJ_ENGINE_SEM_TPC_6" },
+       { .id = 17, .name = "SYNC_OBJ_ENGINE_SEM_TPC_7" },
+       { .id = 18, .name = "SYNC_OBJ_ENGINE_SEM_DMA_1" },
+       { .id = 19, .name = "SYNC_OBJ_ENGINE_SEM_DMA_2" },
+       { .id = 20, .name = "SYNC_OBJ_ENGINE_SEM_DMA_3" },
+       { .id = 21, .name = "SYNC_OBJ_ENGINE_SEM_DMA_4" },
+       { .id = 22, .name = "SYNC_OBJ_ENGINE_SEM_DMA_5" },
+       { .id = 23, .name = "SYNC_OBJ_ENGINE_SEM_DMA_6" },
+       { .id = 24, .name = "SYNC_OBJ_ENGINE_SEM_DMA_7" },
+       { .id = 25, .name = "SYNC_OBJ_DBG_CTR_0" },
+       { .id = 26, .name = "SYNC_OBJ_DBG_CTR_1" },
+};
+
+static struct hl_hw_obj_name_entry gaudi_monitor_id_to_str[] = {
+       { .id = 200, .name = "MON_OBJ_DMA_DOWN_FEEDBACK_RESET" },
+       { .id = 201, .name = "MON_OBJ_DMA_UP_FEADBACK_RESET" },
+       { .id = 203, .name = "MON_OBJ_DRAM_TO_SRAM_QUEUE_FENCE" },
+       { .id = 204, .name = "MON_OBJ_TPC_0_CLK_GATE" },
+       { .id = 205, .name = "MON_OBJ_TPC_1_CLK_GATE" },
+       { .id = 206, .name = "MON_OBJ_TPC_2_CLK_GATE" },
+       { .id = 207, .name = "MON_OBJ_TPC_3_CLK_GATE" },
+       { .id = 208, .name = "MON_OBJ_TPC_4_CLK_GATE" },
+       { .id = 209, .name = "MON_OBJ_TPC_5_CLK_GATE" },
+       { .id = 210, .name = "MON_OBJ_TPC_6_CLK_GATE" },
+       { .id = 211, .name = "MON_OBJ_TPC_7_CLK_GATE" },
+};
+
+static s64 gaudi_state_dump_specs_props[] = {
+       [SP_SYNC_OBJ_BASE_ADDR] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0,
+       [SP_NEXT_SYNC_OBJ_ADDR] = NEXT_SYNC_OBJ_ADDR_INTERVAL,
+       [SP_SYNC_OBJ_AMOUNT] = NUM_OF_SOB_IN_BLOCK,
+       [SP_MON_OBJ_WR_ADDR_LOW] =
+               mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0,
+       [SP_MON_OBJ_WR_ADDR_HIGH] =
+               mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0,
+       [SP_MON_OBJ_WR_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0,
+       [SP_MON_OBJ_ARM_DATA] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0,
+       [SP_MON_OBJ_STATUS] = mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0,
+       [SP_MONITORS_AMOUNT] = NUM_OF_MONITORS_IN_BLOCK,
+       [SP_TPC0_CMDQ] = mmTPC0_QM_GLBL_CFG0,
+       [SP_TPC0_CFG_SO] = mmTPC0_CFG_QM_SYNC_OBJECT_ADDR,
+       [SP_NEXT_TPC] = mmTPC1_QM_GLBL_CFG0 - mmTPC0_QM_GLBL_CFG0,
+       [SP_MME_CMDQ] = mmMME0_QM_GLBL_CFG0,
+       [SP_MME_CFG_SO] = mmMME0_CTRL_ARCH_DESC_SYNC_OBJECT_ADDR_LOW_LOCAL,
+       [SP_NEXT_MME] = mmMME2_QM_GLBL_CFG0 - mmMME0_QM_GLBL_CFG0,
+       [SP_DMA_CMDQ] = mmDMA0_QM_GLBL_CFG0,
+       [SP_DMA_CFG_SO] = mmDMA0_CORE_WR_COMP_ADDR_LO,
+       [SP_DMA_QUEUES_OFFSET] = mmDMA1_QM_GLBL_CFG0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_NUM_OF_MME_ENGINES] = NUM_OF_MME_ENGINES,
+       [SP_SUB_MME_ENG_NUM] = NUM_OF_MME_SUB_ENGINES,
+       [SP_NUM_OF_DMA_ENGINES] = NUM_OF_DMA_ENGINES,
+       [SP_NUM_OF_TPC_ENGINES] = NUM_OF_TPC_ENGINES,
+       [SP_ENGINE_NUM_OF_QUEUES] = NUM_OF_QUEUES,
+       [SP_ENGINE_NUM_OF_STREAMS] = NUM_OF_STREAMS,
+       [SP_ENGINE_NUM_OF_FENCES] = NUM_OF_FENCES,
+       [SP_FENCE0_CNT_OFFSET] =
+               mmDMA0_QM_CP_FENCE0_CNT_0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_FENCE0_RDATA_OFFSET] =
+               mmDMA0_QM_CP_FENCE0_RDATA_0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_CP_STS_OFFSET] = mmDMA0_QM_CP_STS_0 - mmDMA0_QM_GLBL_CFG0,
+       [SP_NUM_CORES] = 1,
+};
+
+/* The order here is opposite to the order of the indexing in the h/w.
+ * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc.
+ */
+static const char * const gaudi_sync_manager_names[] = {
+       "SYNC_MGR_E_N",
+       "SYNC_MGR_W_N",
+       "SYNC_MGR_E_S",
+       "SYNC_MGR_W_S",
+       NULL
+};
+
 struct ecc_info_extract_params {
        u64 block_address;
        u32 num_memories;
@@ -363,8 +469,6 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
                                        u32 size, u64 val);
 static int gaudi_memset_registers(struct hl_device *hdev, u64 reg_base,
                                        u32 num_regs, u32 val);
-static int gaudi_schedule_register_memset(struct hl_device *hdev,
-               u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val);
 static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
                                u32 tpc_id);
 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev);
@@ -375,7 +479,6 @@ static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
                                u32 size, bool eb);
 static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
                                struct hl_gen_wait_properties *prop);
-
 static inline enum hl_collective_mode
 get_collective_mode(struct hl_device *hdev, u32 queue_id)
 {
@@ -403,7 +506,11 @@ static inline void set_default_power_values(struct hl_device *hdev)
 
        if (hdev->card_type == cpucp_card_type_pmc) {
                prop->max_power_default = MAX_POWER_DEFAULT_PMC;
-               prop->dc_power_default = DC_POWER_DEFAULT_PMC;
+
+               if (prop->fw_security_enabled)
+                       prop->dc_power_default = DC_POWER_DEFAULT_PMC_SEC;
+               else
+                       prop->dc_power_default = DC_POWER_DEFAULT_PMC;
        } else {
                prop->max_power_default = MAX_POWER_DEFAULT_PCI;
                prop->dc_power_default = DC_POWER_DEFAULT_PCI;
@@ -450,6 +557,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
                                                get_collective_mode(hdev, i);
        }
 
+       prop->device_dma_offset_for_host_access = HOST_PHYS_BASE;
        prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
        prop->collective_first_sob = 0;
        prop->collective_first_mon = 0;
@@ -551,6 +659,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
        prop->hard_reset_done_by_fw = false;
        prop->gic_interrupts_enable = true;
 
+       prop->server_type = HL_SERVER_TYPE_UNKNOWN;
+
        return 0;
 }
 
@@ -723,14 +833,14 @@ pci_init:
                                        GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
        if (rc) {
                if (hdev->reset_on_preboot_fail)
-                       hdev->asic_funcs->hw_fini(hdev, true);
+                       hdev->asic_funcs->hw_fini(hdev, true, false);
                goto pci_fini;
        }
 
        if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
                dev_info(hdev->dev,
                        "H/W state is dirty, must reset before initializing\n");
-               hdev->asic_funcs->hw_fini(hdev, true);
+               hdev->asic_funcs->hw_fini(hdev, true, false);
        }
 
        return 0;
@@ -974,17 +1084,11 @@ static void gaudi_sob_group_hw_reset(struct kref *ref)
        struct gaudi_hw_sob_group *hw_sob_group =
                container_of(ref, struct gaudi_hw_sob_group, kref);
        struct hl_device *hdev = hw_sob_group->hdev;
-       u64 base_addr;
-       int rc;
+       int i;
 
-       base_addr = CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
-                       hw_sob_group->base_sob_id * 4;
-       rc = gaudi_schedule_register_memset(hdev, hw_sob_group->queue_id,
-                       base_addr, NUMBER_OF_SOBS_IN_GRP, 0);
-       if (rc)
-               dev_err(hdev->dev,
-                       "failed resetting sob group - sob base %u, count %u",
-                       hw_sob_group->base_sob_id, NUMBER_OF_SOBS_IN_GRP);
+       for (i = 0 ; i < NUMBER_OF_SOBS_IN_GRP ; i++)
+               WREG32((mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+                       (hw_sob_group->base_sob_id * 4) + (i * 4)), 0);
 
        kref_init(&hw_sob_group->kref);
 }
@@ -1121,6 +1225,20 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
        queue_id = job->hw_queue_id;
        prop = &hdev->kernel_queues[queue_id].sync_stream_prop;
 
+       if (job->cs->encaps_signals) {
+               /* use the encaps signal handle stored earlier in the flow
+                * and set the SOB information from the encaps
+                * signals handle
+                */
+               hl_hw_queue_encaps_sig_set_sob_info(hdev, job->cs, job,
+                                               cs_cmpl);
+
+               dev_dbg(hdev->dev, "collective wait: Sequence %llu found, sob_id: %u,  wait for sob_val: %u\n",
+                               job->cs->sequence,
+                               cs_cmpl->hw_sob->sob_id,
+                               cs_cmpl->sob_val);
+       }
+
        /* Add to wait CBs using slave monitor */
        wait_prop.data = (void *) job->user_cb;
        wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
@@ -1131,7 +1249,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
        wait_prop.size = cb_size;
 
        dev_dbg(hdev->dev,
-               "Generate slave wait CB, sob %d, val:0x%x, mon %d, q %d\n",
+               "Generate slave wait CB, sob %d, val:%x, mon %d, q %d\n",
                cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
                prop->collective_slave_mon_id, queue_id);
 
@@ -1145,7 +1263,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev,
                        prop->collective_sob_id, cb_size, false);
 }
 
-static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
+static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 {
        struct hl_cs_compl *signal_cs_cmpl =
                container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
@@ -1163,9 +1281,37 @@ static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
        gaudi = hdev->asic_specific;
        cprop = &gaudi->collective_props;
 
-       /* copy the SOB id and value of the signal CS */
-       cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
-       cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+       /* In the encaps signals case, the SOB info will be retrieved from
+        * the handle in gaudi_collective_slave_init_job.
+        */
+       if (!cs->encaps_signals) {
+               /* copy the SOB id and value of the signal CS */
+               cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+               cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+       }
+
+       /* Check again whether the signal CS has already completed.
+        * If it has, don't send any wait CS, since the hw_sob could
+        * already be in reset. If the signal has not completed, take
+        * a refcount on the hw_sob to prevent the SOB from being
+        * reset while the wait CS is not yet submitted.
+        * Note that this check is protected by two locks:
+        * the hw queue lock and the completion object lock.
+        * The same completion object lock also protects the hw_sob
+        * reset handler function, and the hw queue lock prevents the
+        * hw_sob refcount value, which is changed by the signal/wait
+        * flows, from going out of sync.
+        */
+       spin_lock(&signal_cs_cmpl->lock);
+
+       if (completion_done(&cs->signal_fence->completion)) {
+               spin_unlock(&signal_cs_cmpl->lock);
+               return -EINVAL;
+       }
+       /* Increment kref since all slave queues are now waiting on it */
+       kref_get(&cs_cmpl->hw_sob->kref);
+
+       spin_unlock(&signal_cs_cmpl->lock);
 
        /* Calculate the stream from collective master queue (1st job) */
        job = list_first_entry(&cs->job_list, struct hl_cs_job, cs_node);
@@ -1210,21 +1356,17 @@ static void gaudi_collective_wait_init_cs(struct hl_cs *cs)
                                cprop->curr_sob_group_idx[stream], stream);
        }
 
-       /* Increment kref since all slave queues are now waiting on it */
-       kref_get(&cs_cmpl->hw_sob->kref);
-       /*
-        * Must put the signal fence after the SOB refcnt increment so
-        * the SOB refcnt won't turn 0 and reset the SOB before the
-        * wait CS was submitted.
-        */
        mb();
        hl_fence_put(cs->signal_fence);
        cs->signal_fence = NULL;
+
+       return 0;
 }
 
 static int gaudi_collective_wait_create_job(struct hl_device *hdev,
                struct hl_ctx *ctx, struct hl_cs *cs,
-               enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id)
+               enum hl_collective_mode mode, u32 queue_id, u32 wait_queue_id,
+               u32 encaps_signal_offset)
 {
        struct hw_queue_properties *hw_queue_prop;
        struct hl_cs_counters_atomic *cntr;
@@ -1284,6 +1426,13 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
        job->user_cb_size = cb_size;
        job->hw_queue_id = queue_id;
 
+       /* Since the collective wait CS is guaranteed to have only one
+        * chunk, we can use this chunk to set the encapsulated signal
+        * offset in the jobs.
+        */
+       if (cs->encaps_signals)
+               job->encaps_sig_wait_offset = encaps_signal_offset;
+
        /*
         * No need in parsing, user CB is the patched CB.
         * We call hl_cb_destroy() out of two reasons - we don't need
@@ -1312,8 +1461,9 @@ static int gaudi_collective_wait_create_job(struct hl_device *hdev,
 }
 
 static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
-               struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-               u32 collective_engine_id)
+               struct hl_ctx *ctx, struct hl_cs *cs,
+               u32 wait_queue_id, u32 collective_engine_id,
+               u32 encaps_signal_offset)
 {
        struct gaudi_device *gaudi = hdev->asic_specific;
        struct hw_queue_properties *hw_queue_prop;
@@ -1363,7 +1513,8 @@ static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
                if (i == 0) {
                        queue_id = wait_queue_id;
                        rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
-                               HL_COLLECTIVE_MASTER, queue_id, wait_queue_id);
+                               HL_COLLECTIVE_MASTER, queue_id,
+                               wait_queue_id, encaps_signal_offset);
                } else {
                        if (nic_idx < NIC_NUMBER_OF_ENGINES) {
                                if (gaudi->hw_cap_initialized &
@@ -1383,7 +1534,8 @@ static int gaudi_collective_wait_create_jobs(struct hl_device *hdev,
                        }
 
                        rc = gaudi_collective_wait_create_job(hdev, ctx, cs,
-                               HL_COLLECTIVE_SLAVE, queue_id, wait_queue_id);
+                               HL_COLLECTIVE_SLAVE, queue_id,
+                               wait_queue_id, encaps_signal_offset);
                }
 
                if (rc)
@@ -1431,6 +1583,11 @@ static int gaudi_late_init(struct hl_device *hdev)
                return rc;
        }
 
+       /* Scrub both SRAM and DRAM */
+       rc = hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
+       if (rc)
+               goto disable_pci_access;
+
        rc = gaudi_fetch_psoc_frequency(hdev);
        if (rc) {
                dev_err(hdev->dev, "Failed to fetch psoc frequency\n");
@@ -1455,6 +1612,11 @@ static int gaudi_late_init(struct hl_device *hdev)
                goto disable_pci_access;
        }
 
+       /* We only support a single ASID for the user, so for the sake of optimization, just
+        * initialize the ASID one time during device initialization with the fixed value of 1
+        */
+       gaudi_mmu_prepare(hdev, 1);
+
        return 0;
 
 disable_pci_access:
@@ -1720,8 +1882,12 @@ static int gaudi_sw_init(struct hl_device *hdev)
        hdev->supports_sync_stream = true;
        hdev->supports_coresight = true;
        hdev->supports_staged_submission = true;
+       hdev->supports_wait_for_multi_cs = true;
 
-       gaudi_set_pci_memory_regions(hdev);
+       hdev->asic_funcs->set_pci_memory_regions(hdev);
+       hdev->stream_master_qid_arr =
+                               hdev->asic_funcs->get_stream_master_qid_arr();
+       hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;
 
        return 0;
 
@@ -2523,7 +2689,7 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)
                                tpc_id < TPC_NUMBER_OF_ENGINES;
                                tpc_id++, tpc_offset += TPC_CFG_OFFSET) {
                /* Mask all arithmetic interrupts from TPC */
-               WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFF);
+               WREG32(mmTPC0_CFG_TPC_INTR_MASK + tpc_offset, 0x8FFE);
                /* Set 16 cache lines */
                WREG32_FIELD(TPC0_CFG_MSS_CONFIG, tpc_offset,
                                ICACHE_FETCH_LINE_NUM, 2);
@@ -3670,7 +3836,7 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
        WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
 }
 
-static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
+static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
        u32 wait_timeout_ms;
 
@@ -3682,6 +3848,9 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
        else
                wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;
 
+       if (fw_reset)
+               goto skip_engines;
+
        gaudi_stop_nic_qmans(hdev);
        gaudi_stop_mme_qmans(hdev);
        gaudi_stop_tpc_qmans(hdev);
@@ -3707,6 +3876,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
 
        gaudi_disable_timestamp(hdev);
 
+skip_engines:
        gaudi_disable_msi(hdev);
 }
 
@@ -3739,6 +3909,9 @@ static int gaudi_mmu_init(struct hl_device *hdev)
        WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8);
        WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40);
 
+       /* mem cache invalidation */
+       WREG32(mmSTLB_MEM_CACHE_INVALIDATION, 1);
+
        hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0);
 
        WREG32(mmMMU_UP_MMU_ENABLE, 1);
@@ -4071,7 +4244,7 @@ disable_queues:
        return rc;
 }
 
-static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
+static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
        struct cpu_dyn_regs *dyn_regs =
                        &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@@ -4092,6 +4265,14 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
                cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
        }
 
+       if (fw_reset) {
+               dev_info(hdev->dev,
+                       "Firmware performs HARD reset, going to wait %dms\n",
+                       reset_timeout_ms);
+
+               goto skip_reset;
+       }
+
        driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled &&
                                        !hdev->asic_prop.hard_reset_done_by_fw);
 
@@ -4168,6 +4349,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
                        reset_timeout_ms);
        }
 
+skip_reset:
        /*
         * After hard reset, we can't poll the BTM_FSM register because the PSOC
         * itself is in reset. Need to wait until the reset is deasserted
@@ -4212,7 +4394,7 @@ static int gaudi_resume(struct hl_device *hdev)
        return gaudi_init_iatu(hdev);
 }
 
-static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
                        void *cpu_addr, dma_addr_t dma_addr, size_t size)
 {
        int rc;
@@ -4621,8 +4803,8 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
                                "Doing HBM scrubbing for 0x%09llx - 0x%09llx\n",
                                cur_addr, cur_addr + chunk_size);
 
-                       WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0);
-                       WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0);
+                       WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0xdeadbeaf);
+                       WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0xdeadbeaf);
                        WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset,
                                                lower_32_bits(cur_addr));
                        WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset,
@@ -5796,78 +5978,6 @@ release_cb:
        return rc;
 }
 
-static int gaudi_schedule_register_memset(struct hl_device *hdev,
-               u32 hw_queue_id, u64 reg_base, u32 num_regs, u32 val)
-{
-       struct hl_ctx *ctx;
-       struct hl_pending_cb *pending_cb;
-       struct packet_msg_long *pkt;
-       u32 cb_size, ctl;
-       struct hl_cb *cb;
-       int i, rc;
-
-       mutex_lock(&hdev->fpriv_list_lock);
-       ctx = hdev->compute_ctx;
-
-       /* If no compute context available or context is going down
-        * memset registers directly
-        */
-       if (!ctx || kref_read(&ctx->refcount) == 0) {
-               rc = gaudi_memset_registers(hdev, reg_base, num_regs, val);
-               mutex_unlock(&hdev->fpriv_list_lock);
-               return rc;
-       }
-
-       mutex_unlock(&hdev->fpriv_list_lock);
-
-       cb_size = (sizeof(*pkt) * num_regs) +
-                       sizeof(struct packet_msg_prot) * 2;
-
-       if (cb_size > SZ_2M) {
-               dev_err(hdev->dev, "CB size must be smaller than %uMB", SZ_2M);
-               return -ENOMEM;
-       }
-
-       pending_cb = kzalloc(sizeof(*pending_cb), GFP_KERNEL);
-       if (!pending_cb)
-               return -ENOMEM;
-
-       cb = hl_cb_kernel_create(hdev, cb_size, false);
-       if (!cb) {
-               kfree(pending_cb);
-               return -EFAULT;
-       }
-
-       pkt = cb->kernel_address;
-
-       ctl = FIELD_PREP(GAUDI_PKT_LONG_CTL_OP_MASK, 0); /* write the value */
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_OPCODE_MASK, PACKET_MSG_LONG);
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_EB_MASK, 1);
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_RB_MASK, 1);
-       ctl |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1);
-
-       for (i = 0; i < num_regs ; i++, pkt++) {
-               pkt->ctl = cpu_to_le32(ctl);
-               pkt->value = cpu_to_le32(val);
-               pkt->addr = cpu_to_le64(reg_base + (i * 4));
-       }
-
-       hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
-
-       pending_cb->cb = cb;
-       pending_cb->cb_size = cb_size;
-       /* The queue ID MUST be an external queue ID. Otherwise, we will
-        * have undefined behavior
-        */
-       pending_cb->hw_queue_id = hw_queue_id;
-
-       spin_lock(&ctx->pending_cb_lock);
-       list_add_tail(&pending_cb->cb_node, &ctx->pending_cb_list);
-       spin_unlock(&ctx->pending_cb_lock);
-
-       return 0;
-}
-
 static int gaudi_restore_sm_registers(struct hl_device *hdev)
 {
        u64 base_addr;
@@ -6013,7 +6123,7 @@ static int gaudi_restore_user_registers(struct hl_device *hdev)
 
 static int gaudi_context_switch(struct hl_device *hdev, u32 asid)
 {
-       return gaudi_restore_user_registers(hdev);
+       return 0;
 }
 
 static int gaudi_mmu_clear_pgt_range(struct hl_device *hdev)
@@ -6723,6 +6833,9 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
                                asid);
        }
 
+       gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid);
+       gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid);
+
        hdev->asic_funcs->set_clock_gating(hdev);
 
        mutex_unlock(&gaudi->clk_gate_mutex);
@@ -6772,7 +6885,8 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
 
        dma_offset = gaudi_dma_assignment[GAUDI_PCI_DMA_1] * DMA_CORE_OFFSET;
 
-       WREG32_OR(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_VAL_SHIFT));
+       WREG32(mmDMA0_CORE_PROT + dma_offset,
+                       BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT) | BIT(DMA0_CORE_PROT_VAL_SHIFT));
 
        rc = hl_hw_queue_send_cb_no_cmpl(hdev, GAUDI_QUEUE_ID_DMA_0_0,
                                        job->job_cb_size, cb->bus_address);
@@ -6793,8 +6907,7 @@ static int gaudi_send_job_on_qman0(struct hl_device *hdev,
        }
 
 free_fence_ptr:
-       WREG32_AND(mmDMA0_CORE_PROT + dma_offset,
-                       ~BIT(DMA0_CORE_PROT_VAL_SHIFT));
+       WREG32(mmDMA0_CORE_PROT + dma_offset, BIT(DMA0_CORE_PROT_ERR_VAL_SHIFT));
 
        hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr,
                                        fence_dma_addr);
@@ -7168,7 +7281,7 @@ static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream
 
        cq_ptr = (((u64) RREG32(cq_ptr_hi)) << 32) | RREG32(cq_ptr_lo);
        size = RREG32(cq_tsize);
-       dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %x\n",
+       dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n",
                                                        stream, cq_ptr, size);
 }
 
@@ -7224,7 +7337,7 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base,
 
                addr = le64_to_cpu(bd->ptr);
 
-               dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n",
+               dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n",
                                                        stream, ci, addr, len);
 
                /* get previous ci, wrap if needed */
@@ -7326,24 +7439,30 @@ static void gaudi_print_sm_sei_info(struct hl_device *hdev, u16 event_type,
 {
        u32 index = event_type - GAUDI_EVENT_DMA_IF_SEI_0;
 
+       /* Flip the bits as the enum is ordered in the opposite way */
+       index = (index ^ 0x3) & 0x3;
+
        switch (sei_data->sei_cause) {
        case SM_SEI_SO_OVERFLOW:
-               dev_err(hdev->dev,
-                       "SM %u SEI Error: SO %u overflow/underflow",
-                       index, le32_to_cpu(sei_data->sei_log));
+               dev_err_ratelimited(hdev->dev,
+                       "%s SEI Error: SOB Group %u overflow/underflow",
+                       gaudi_sync_manager_names[index],
+                       le32_to_cpu(sei_data->sei_log));
                break;
        case SM_SEI_LBW_4B_UNALIGNED:
-               dev_err(hdev->dev,
-                       "SM %u SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
-                       index, le32_to_cpu(sei_data->sei_log));
+               dev_err_ratelimited(hdev->dev,
+                       "%s SEI Error: Unaligned 4B LBW access, monitor agent address low - %#x",
+                       gaudi_sync_manager_names[index],
+                       le32_to_cpu(sei_data->sei_log));
                break;
        case SM_SEI_AXI_RESPONSE_ERR:
-               dev_err(hdev->dev,
-                       "SM %u SEI Error: AXI ID %u response error",
-                       index, le32_to_cpu(sei_data->sei_log));
+               dev_err_ratelimited(hdev->dev,
+                       "%s SEI Error: AXI ID %u response error",
+                       gaudi_sync_manager_names[index],
+                       le32_to_cpu(sei_data->sei_log));
                break;
        default:
-               dev_err(hdev->dev, "Unknown SM SEI cause %u",
+               dev_err_ratelimited(hdev->dev, "Unknown SM SEI cause %u",
                                le32_to_cpu(sei_data->sei_log));
                break;
        }
@@ -7358,6 +7477,11 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
        bool extract_info_from_fw;
        int rc;
 
+       if (hdev->asic_prop.fw_security_enabled) {
+               extract_info_from_fw = true;
+               goto extract_ecc_info;
+       }
+
        switch (event_type) {
        case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR:
        case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR:
@@ -7430,6 +7554,7 @@ static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
                return;
        }
 
+extract_ecc_info:
        if (extract_info_from_fw) {
                ecc_address = le64_to_cpu(ecc_data->ecc_address);
                ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
@@ -7806,8 +7931,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
        u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
                        >> EQ_CTL_EVENT_TYPE_SHIFT);
-       u8 cause;
        bool reset_required;
+       u8 cause;
+       int rc;
+
+       if (event_type >= GAUDI_EVENT_SIZE) {
+               dev_err(hdev->dev, "Event type %u exceeds maximum of %u",
+                               event_type, GAUDI_EVENT_SIZE - 1);
+               return;
+       }
 
        gaudi->events_stat[event_type]++;
        gaudi->events_stat_aggregate[event_type]++;
@@ -7880,10 +8012,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                                        tpc_dec_event_to_tpc_id(event_type),
                                        "AXI_SLV_DEC_Error");
                if (reset_required) {
-                       dev_err(hdev->dev, "hard reset required due to %s\n",
+                       dev_err(hdev->dev, "reset required due to %s\n",
                                gaudi_irq_map_table[event_type].name);
 
-                       goto reset_device;
+                       hl_device_reset(hdev, 0);
                } else {
                        hl_fw_unmask_irq(hdev, event_type);
                }
@@ -7902,10 +8034,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                                        tpc_krn_event_to_tpc_id(event_type),
                                        "KRN_ERR");
                if (reset_required) {
-                       dev_err(hdev->dev, "hard reset required due to %s\n",
+                       dev_err(hdev->dev, "reset required due to %s\n",
                                gaudi_irq_map_table[event_type].name);
 
-                       goto reset_device;
+                       hl_device_reset(hdev, 0);
                } else {
                        hl_fw_unmask_irq(hdev, event_type);
                }
@@ -7993,6 +8125,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                gaudi_print_irq_info(hdev, event_type, false);
                gaudi_print_sm_sei_info(hdev, event_type,
                                        &eq_entry->sm_sei_data);
+               rc = hl_state_dump(hdev);
+               if (rc)
+                       dev_err(hdev->dev,
+                               "Error during system state dump %d\n", rc);
                hl_fw_unmask_irq(hdev, event_type);
                break;
 
@@ -8031,7 +8167,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        return;
 
 reset_device:
-       if (hdev->hard_reset_on_fw_events)
+       if (hdev->asic_prop.fw_security_enabled)
+               hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
+       else if (hdev->hard_reset_on_fw_events)
                hl_device_reset(hdev, HL_RESET_HARD);
        else
                hl_fw_unmask_irq(hdev, event_type);
@@ -8563,11 +8701,20 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev,
 
 static int gaudi_ctx_init(struct hl_ctx *ctx)
 {
+       int rc;
+
        if (ctx->asid == HL_KERNEL_ASID_ID)
                return 0;
 
-       gaudi_mmu_prepare(ctx->hdev, ctx->asid);
-       return gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+       rc = gaudi_internal_cb_pool_init(ctx->hdev, ctx);
+       if (rc)
+               return rc;
+
+       rc = gaudi_restore_user_registers(ctx->hdev);
+       if (rc)
+               gaudi_internal_cb_pool_fini(ctx->hdev, ctx);
+
+       return rc;
 }
 
 static void gaudi_ctx_fini(struct hl_ctx *ctx)
@@ -8596,6 +8743,11 @@ static u32 gaudi_get_wait_cb_size(struct hl_device *hdev)
                        sizeof(struct packet_msg_prot) * 2;
 }
 
+static u32 gaudi_get_sob_addr(struct hl_device *hdev, u32 sob_id)
+{
+       return mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 + (sob_id * 4);
+}
+
 static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id,
                                u32 size, bool eb)
 {
@@ -8902,16 +9054,12 @@ static u32 gaudi_gen_wait_cb(struct hl_device *hdev,
 static void gaudi_reset_sob(struct hl_device *hdev, void *data)
 {
        struct hl_hw_sob *hw_sob = (struct hl_hw_sob *) data;
-       int rc;
 
        dev_dbg(hdev->dev, "reset SOB, q_idx: %d, sob_id: %d\n", hw_sob->q_idx,
                hw_sob->sob_id);
 
-       rc = gaudi_schedule_register_memset(hdev, hw_sob->q_idx,
-                       CFG_BASE + mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
-                       hw_sob->sob_id * 4, 1, 0);
-       if (rc)
-               dev_err(hdev->dev, "failed resetting sob %u", hw_sob->sob_id);
+       WREG32(mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 +
+                       hw_sob->sob_id * 4, 0);
 
        kref_init(&hw_sob->kref);
 }
@@ -8977,6 +9125,280 @@ static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
        }
 }
 
+static int gaudi_add_sync_to_engine_map_entry(
+       struct hl_sync_to_engine_map *map, u32 reg_value,
+       enum hl_sync_engine_type engine_type, u32 engine_id)
+{
+       struct hl_sync_to_engine_map_entry *entry;
+
+       /* The register value represents a partial address of the sync
+        * object and is used as a unique identifier, so the CFG base
+        * bits must be stripped from the value first.
+        */
+       if (reg_value == 0 || reg_value == 0xffffffff)
+               return 0;
+       reg_value -= (u32)CFG_BASE;
+
+       /* create a new hash entry */
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return -ENOMEM;
+       entry->engine_type = engine_type;
+       entry->engine_id = engine_id;
+       entry->sync_id = reg_value;
+       hash_add(map->tb, &entry->node, reg_value);
+
+       return 0;
+}
+
+static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev,
+                               struct hl_sync_to_engine_map *map)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       struct gaudi_device *gaudi = hdev->asic_specific;
+       int i, j, rc;
+       u32 reg_value;
+
+       /* Iterate over TPC engines */
+       for (i = 0; i < sds->props[SP_NUM_OF_TPC_ENGINES]; ++i) {
+               /* TPC registers must be accessed with clock gating disabled */
+               mutex_lock(&gaudi->clk_gate_mutex);
+               hdev->asic_funcs->disable_clock_gating(hdev);
+
+               reg_value = RREG32(sds->props[SP_TPC0_CFG_SO] +
+                                       sds->props[SP_NEXT_TPC] * i);
+
+               /* We can reenable clock_gating */
+               hdev->asic_funcs->set_clock_gating(hdev);
+               mutex_unlock(&gaudi->clk_gate_mutex);
+
+               rc = gaudi_add_sync_to_engine_map_entry(map, reg_value,
+                                                       ENGINE_TPC, i);
+               if (rc)
+                       goto free_sync_to_engine_map;
+       }
+
+       /* Iterate over MME engines */
+       for (i = 0; i < sds->props[SP_NUM_OF_MME_ENGINES]; ++i) {
+               for (j = 0; j < sds->props[SP_SUB_MME_ENG_NUM]; ++j) {
+                       /* MME registers must be accessed with clock gating
+                        * disabled
+                        */
+                       mutex_lock(&gaudi->clk_gate_mutex);
+                       hdev->asic_funcs->disable_clock_gating(hdev);
+
+                       reg_value = RREG32(sds->props[SP_MME_CFG_SO] +
+                                               sds->props[SP_NEXT_MME] * i +
+                                               j * sizeof(u32));
+
+                       /* We can reenable clock_gating */
+                       hdev->asic_funcs->set_clock_gating(hdev);
+                       mutex_unlock(&gaudi->clk_gate_mutex);
+
+                       rc = gaudi_add_sync_to_engine_map_entry(
+                               map, reg_value, ENGINE_MME,
+                               i * sds->props[SP_SUB_MME_ENG_NUM] + j);
+                       if (rc)
+                               goto free_sync_to_engine_map;
+               }
+       }
+
+       /* Iterate over DMA engines */
+       for (i = 0; i < sds->props[SP_NUM_OF_DMA_ENGINES]; ++i) {
+               reg_value = RREG32(sds->props[SP_DMA_CFG_SO] +
+                                       sds->props[SP_DMA_QUEUES_OFFSET] * i);
+               rc = gaudi_add_sync_to_engine_map_entry(map, reg_value,
+                                                       ENGINE_DMA, i);
+               if (rc)
+                       goto free_sync_to_engine_map;
+       }
+
+       return 0;
+
+free_sync_to_engine_map:
+       hl_state_dump_free_sync_to_engine_map(map);
+
+       return rc;
+}
+
+static int gaudi_monitor_valid(struct hl_mon_state_dump *mon)
+{
+       return FIELD_GET(
+               SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK,
+               mon->status);
+}
+
+static void gaudi_fill_sobs_from_mon(char *sobs, struct hl_mon_state_dump *mon)
+{
+       const size_t max_write = 10;
+       u32 gid, mask, sob;
+       int i, offset;
+
+       /* Each monitored sync object ID is calculated as
+        * (8 * group_id + index of a cleared bit in the mask)
+        */
+       gid = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
+                       mon->arm_data);
+       mask = FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK,
+                       mon->arm_data);
+
+       for (i = 0, offset = 0; mask && offset < MONITOR_SOB_STRING_SIZE -
+               max_write; mask >>= 1, i++) {
+               if (!(mask & 1)) {
+                       sob = gid * MONITOR_MAX_SOBS + i;
+
+                       if (offset > 0)
+                               offset += snprintf(sobs + offset, max_write,
+                                                       ", ");
+
+                       offset += snprintf(sobs + offset, max_write, "%u", sob);
+               }
+       }
+}
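As a worked example of the formula in the comment above (illustration only, not part of the patch): with group id 2 and an arm mask whose bits 0 and 2 are cleared, gaudi_fill_sobs_from_mon() reports sync objects 8*2+0 = 16 and 8*2+2 = 18:

#include <stdio.h>

#define MONITOR_MAX_SOBS 8	/* same value as defined in gaudiP.h below */

int main(void)
{
	unsigned int gid = 2;		/* example group id */
	unsigned int mask = 0xfa;	/* bits 0 and 2 cleared */
	unsigned int i;

	/* Every cleared mask bit names one monitored sync object. */
	for (i = 0; i < MONITOR_MAX_SOBS; i++)
		if (!(mask & (1u << i)))
			printf("monitored SOB %u\n",
			       gid * MONITOR_MAX_SOBS + i);
	return 0;
}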
+
+static int gaudi_print_single_monitor(char **buf, size_t *size, size_t *offset,
+                               struct hl_device *hdev,
+                               struct hl_mon_state_dump *mon)
+{
+       const char *name;
+       char scratch_buf1[BIN_REG_STRING_SIZE],
+               scratch_buf2[BIN_REG_STRING_SIZE];
+       char monitored_sobs[MONITOR_SOB_STRING_SIZE] = {0};
+
+       name = hl_state_dump_get_monitor_name(hdev, mon);
+       if (!name)
+               name = "";
+
+       gaudi_fill_sobs_from_mon(monitored_sobs, mon);
+
+       return hl_snprintf_resize(
+               buf, size, offset,
+               "Mon id: %u%s, wait for group id: %u mask %s to reach val: %u and write %u to address 0x%llx. Pending: %s. Means sync objects [%s] are being monitored.",
+               mon->id, name,
+               FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK,
+                               mon->arm_data),
+               hl_format_as_binary(
+                       scratch_buf1, sizeof(scratch_buf1),
+                       FIELD_GET(
+                               SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK,
+                               mon->arm_data)),
+               FIELD_GET(SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK,
+                               mon->arm_data),
+               mon->wr_data,
+               (((u64)mon->wr_addr_high) << 32) | mon->wr_addr_low,
+               hl_format_as_binary(
+                       scratch_buf2, sizeof(scratch_buf2),
+                       FIELD_GET(
+                               SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK,
+                               mon->status)),
+               monitored_sobs);
+}
+
+
+static int gaudi_print_fences_single_engine(
+       struct hl_device *hdev, u64 base_offset, u64 status_base_offset,
+       enum hl_sync_engine_type engine_type, u32 engine_id, char **buf,
+       size_t *size, size_t *offset)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       int rc = -ENOMEM, i;
+       u32 *statuses, *fences;
+
+       statuses = kcalloc(sds->props[SP_ENGINE_NUM_OF_QUEUES],
+                       sizeof(*statuses), GFP_KERNEL);
+       if (!statuses)
+               goto out;
+
+       fences = kcalloc(sds->props[SP_ENGINE_NUM_OF_FENCES] *
+                               sds->props[SP_ENGINE_NUM_OF_QUEUES],
+                        sizeof(*fences), GFP_KERNEL);
+       if (!fences)
+               goto free_status;
+
+       for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES]; ++i)
+               statuses[i] = RREG32(status_base_offset + i * sizeof(u32));
+
+       for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_FENCES] *
+                               sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i)
+               fences[i] = RREG32(base_offset + i * sizeof(u32));
+
+       /* The actual print */
+       for (i = 0; i < sds->props[SP_ENGINE_NUM_OF_QUEUES]; ++i) {
+               u32 fence_id;
+               u64 fence_cnt, fence_rdata;
+               const char *engine_name;
+
+               if (!FIELD_GET(TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK,
+                       statuses[i]))
+                       continue;
+
+               fence_id =
+                       FIELD_GET(TPC0_QM_CP_STS_0_FENCE_ID_MASK, statuses[i]);
+               fence_cnt = base_offset + CFG_BASE +
+                       sizeof(u32) *
+                       (i + fence_id * sds->props[SP_ENGINE_NUM_OF_QUEUES]);
+               fence_rdata = fence_cnt - sds->props[SP_FENCE0_CNT_OFFSET] +
+                               sds->props[SP_FENCE0_RDATA_OFFSET];
+               engine_name = hl_sync_engine_to_string(engine_type);
+
+               rc = hl_snprintf_resize(
+                       buf, size, offset,
+                       "%s%u, stream %u: fence id %u cnt = 0x%llx (%s%u_QM.CP_FENCE%u_CNT_%u) rdata = 0x%llx (%s%u_QM.CP_FENCE%u_RDATA_%u) value = %u, cp_status = %u\n",
+                       engine_name, engine_id,
+                       i, fence_id,
+                       fence_cnt, engine_name, engine_id, fence_id, i,
+                       fence_rdata, engine_name, engine_id, fence_id, i,
+                       fences[fence_id],
+                       statuses[i]);
+               if (rc)
+                       goto free_fences;
+       }
+
+       rc = 0;
+
+free_fences:
+       kfree(fences);
+free_status:
+       kfree(statuses);
+out:
+       return rc;
+}
+
+
+static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = {
+       .monitor_valid = gaudi_monitor_valid,
+       .print_single_monitor = gaudi_print_single_monitor,
+       .gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map,
+       .print_fences_single_engine = gaudi_print_fences_single_engine,
+};
+
+static void gaudi_state_dump_init(struct hl_device *hdev)
+{
+       struct hl_state_dump_specs *sds = &hdev->state_dump_specs;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(gaudi_so_id_to_str); ++i)
+               hash_add(sds->so_id_to_str_tb,
+                       &gaudi_so_id_to_str[i].node,
+                       gaudi_so_id_to_str[i].id);
+
+       for (i = 0; i < ARRAY_SIZE(gaudi_monitor_id_to_str); ++i)
+               hash_add(sds->monitor_id_to_str_tb,
+                       &gaudi_monitor_id_to_str[i].node,
+                       gaudi_monitor_id_to_str[i].id);
+
+       sds->props = gaudi_state_dump_specs_props;
+
+       sds->sync_namager_names = gaudi_sync_manager_names;
+
+       sds->funcs = gaudi_state_dump_funcs;
+}
+
+static u32 *gaudi_get_stream_master_qid_arr(void)
+{
+       return gaudi_stream_master;
+}
+
 static const struct hl_asic_funcs gaudi_funcs = {
        .early_init = gaudi_early_init,
        .early_fini = gaudi_early_fini,
@@ -8989,7 +9411,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .halt_engines = gaudi_halt_engines,
        .suspend = gaudi_suspend,
        .resume = gaudi_resume,
-       .cb_mmap = gaudi_cb_mmap,
+       .mmap = gaudi_mmap,
        .ring_doorbell = gaudi_ring_doorbell,
        .pqe_write = gaudi_pqe_write,
        .asic_dma_alloc_coherent = gaudi_dma_alloc_coherent,
@@ -9062,7 +9484,11 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .enable_events_from_fw = gaudi_enable_events_from_fw,
        .map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
        .init_firmware_loader = gaudi_init_firmware_loader,
-       .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm
+       .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
+       .state_dump_init = gaudi_state_dump_init,
+       .get_sob_addr = gaudi_get_sob_addr,
+       .set_pci_memory_regions = gaudi_set_pci_memory_regions,
+       .get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr
 };
 
 /**
index 957bf37..bbbf1c3 100644 (file)
@@ -36,6 +36,8 @@
 #define NUMBER_OF_INTERRUPTS           (NUMBER_OF_CMPLT_QUEUES + \
                                                NUMBER_OF_CPU_HW_QUEUES)
 
+#define GAUDI_STREAM_MASTER_ARR_SIZE   8
+
 #if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES)
 #error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
 #endif
@@ -50,6 +52,8 @@
 #define DC_POWER_DEFAULT_PCI           60000           /* 60W */
 #define DC_POWER_DEFAULT_PMC           60000           /* 60W */
 
+#define DC_POWER_DEFAULT_PMC_SEC       97000           /* 97W */
+
 #define GAUDI_CPU_TIMEOUT_USEC         30000000        /* 30s */
 
 #define TPC_ENABLED_MASK               0xFF
@@ -62,7 +66,7 @@
 
 #define DMA_MAX_TRANSFER_SIZE          U32_MAX
 
-#define GAUDI_DEFAULT_CARD_NAME                "HL2000"
+#define GAUDI_DEFAULT_CARD_NAME                "HL205"
 
 #define GAUDI_MAX_PENDING_CS           SZ_16K
 
        (((mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_511 - \
        mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0) + 4) >> 2)
 
+#define MONITOR_MAX_SOBS       8
 
 /* DRAM Memory Map */
 
 #define HW_CAP_TPC_MASK                GENMASK(31, 24)
 #define HW_CAP_TPC_SHIFT       24
 
+#define NEXT_SYNC_OBJ_ADDR_INTERVAL \
+       (mmSYNC_MNGR_W_N_SYNC_MNGR_OBJS_SOB_OBJ_0 - \
+        mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0)
+#define NUM_OF_MME_ENGINES                     2
+#define NUM_OF_MME_SUB_ENGINES         2
+#define NUM_OF_TPC_ENGINES                     8
+#define NUM_OF_DMA_ENGINES                     8
+#define NUM_OF_QUEUES                          5
+#define NUM_OF_STREAMS                         4
+#define NUM_OF_FENCES                          4
+
+
 #define GAUDI_CPU_PCI_MSB_ADDR(addr)   (((addr) & GENMASK_ULL(49, 39)) >> 39)
 #define GAUDI_PCI_TO_CPU_ADDR(addr)                    \
        do {                                            \
index c2a27ed..5349c1b 100644 (file)
@@ -622,11 +622,6 @@ static int gaudi_config_etr(struct hl_device *hdev,
                        return -EINVAL;
                }
 
-               gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER,
-                                               hdev->compute_ctx->asid);
-               gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER,
-                                               hdev->compute_ctx->asid);
-
                msb = upper_32_bits(input->buffer_address) >> 8;
                msb &= PSOC_GLOBAL_CONF_TRACE_ADDR_MSB_MASK;
                WREG32(mmPSOC_GLOBAL_CONF_TRACE_ADDR, msb);
index 0d3240f..cb265c0 100644 (file)
@@ -9559,6 +9559,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC0_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC0_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC0_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC0_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC0_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC0_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC0_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -10013,6 +10014,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC1_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC1_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC1_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC1_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC1_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC1_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC1_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -10466,6 +10468,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC2_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC2_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC2_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC2_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC2_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC2_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC2_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -10919,6 +10922,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC3_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC3_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC3_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC3_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC3_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC3_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC3_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -11372,6 +11376,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC4_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC4_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC4_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC4_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC4_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC4_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC4_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -11825,6 +11830,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC5_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC5_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC5_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC5_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC5_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC5_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC5_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -12280,6 +12286,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC6_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC6_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC6_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC6_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC6_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC6_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC6_CFG_MSS_CONFIG & 0x7F) >> 2);
@@ -12735,6 +12742,7 @@ static void gaudi_init_tpc_protection_bits(struct hl_device *hdev)
        mask |= 1U << ((mmTPC7_CFG_CFG_BASE_ADDRESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC7_CFG_CFG_SUBTRACT_VALUE & 0x7F) >> 2);
        mask |= 1U << ((mmTPC7_CFG_TPC_STALL & 0x7F) >> 2);
+       mask |= 1U << ((mmTPC7_CFG_ICACHE_BASE_ADDERESS_HIGH & 0x7F) >> 2);
        mask |= 1U << ((mmTPC7_CFG_RD_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC7_CFG_WR_RATE_LIMIT & 0x7F) >> 2);
        mask |= 1U << ((mmTPC7_CFG_MSS_CONFIG & 0x7F) >> 2);
index 755e08c..031c184 100644 (file)
@@ -350,6 +350,8 @@ static u32 goya_all_events[] = {
        GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E
 };
 
+static s64 goya_state_dump_specs_props[SP_MAX] = {0};
+
 static int goya_mmu_clear_pgt_range(struct hl_device *hdev);
 static int goya_mmu_set_dram_default_page(struct hl_device *hdev);
 static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev);
@@ -387,6 +389,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
                prop->hw_queues_props[i].cb_alloc_flags = CB_ALLOC_USER;
        }
 
+       prop->device_dma_offset_for_host_access = HOST_PHYS_BASE;
        prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
 
        prop->dram_base_address = DRAM_PHYS_BASE;
@@ -466,6 +469,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
        prop->hard_reset_done_by_fw = false;
        prop->gic_interrupts_enable = true;
 
+       prop->server_type = HL_SERVER_TYPE_UNKNOWN;
+
        return 0;
 }
 
@@ -649,14 +654,14 @@ pci_init:
                                        GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
        if (rc) {
                if (hdev->reset_on_preboot_fail)
-                       hdev->asic_funcs->hw_fini(hdev, true);
+                       hdev->asic_funcs->hw_fini(hdev, true, false);
                goto pci_fini;
        }
 
        if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
                dev_info(hdev->dev,
                        "H/W state is dirty, must reset before initializing\n");
-               hdev->asic_funcs->hw_fini(hdev, true);
+               hdev->asic_funcs->hw_fini(hdev, true, false);
        }
 
        if (!hdev->pldm) {
@@ -955,8 +960,9 @@ static int goya_sw_init(struct hl_device *hdev)
        hdev->supports_coresight = true;
        hdev->supports_soft_reset = true;
        hdev->allow_external_soft_reset = true;
+       hdev->supports_wait_for_multi_cs = false;
 
-       goya_set_pci_memory_regions(hdev);
+       hdev->asic_funcs->set_pci_memory_regions(hdev);
 
        return 0;
 
@@ -2374,7 +2380,7 @@ static void goya_disable_timestamp(struct hl_device *hdev)
        WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
 }
 
-static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
+static void goya_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
        u32 wait_timeout_ms;
 
@@ -2493,6 +2499,7 @@ static void goya_init_firmware_loader(struct hl_device *hdev)
        struct fw_load_mgr *fw_loader = &hdev->fw_loader;
 
        /* fill common fields */
+       fw_loader->linux_loaded = false;
        fw_loader->boot_fit_img.image_name = GOYA_BOOT_FIT_FILE;
        fw_loader->linux_img.image_name = GOYA_LINUX_FW_FILE;
        fw_loader->cpu_timeout = GOYA_CPU_TIMEOUT_USEC;
@@ -2696,14 +2703,7 @@ disable_queues:
        return rc;
 }
 
-/*
- * goya_hw_fini - Goya hardware tear-down code
- *
- * @hdev: pointer to hl_device structure
- * @hard_reset: should we do hard reset to all engines or just reset the
- *              compute/dma engines
- */
-static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
+static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
        struct goya_device *goya = hdev->asic_specific;
        u32 reset_timeout_ms, cpu_timeout_ms, status;
@@ -2796,7 +2796,7 @@ int goya_resume(struct hl_device *hdev)
        return goya_init_iatu(hdev);
 }
 
-static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
+static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
                        void *cpu_addr, dma_addr_t dma_addr, size_t size)
 {
        int rc;
@@ -4797,6 +4797,12 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
                                >> EQ_CTL_EVENT_TYPE_SHIFT);
        struct goya_device *goya = hdev->asic_specific;
 
+       if (event_type >= GOYA_ASYNC_EVENT_ID_SIZE) {
+               dev_err(hdev->dev, "Event type %u exceeds maximum of %u",
+                               event_type, GOYA_ASYNC_EVENT_ID_SIZE - 1);
+               return;
+       }
+
        goya->events_stat[event_type]++;
        goya->events_stat_aggregate[event_type]++;
 
@@ -5475,14 +5481,14 @@ u64 goya_get_device_time(struct hl_device *hdev)
        return device_time | RREG32(mmPSOC_TIMESTAMP_CNTCVL);
 }
 
-static void goya_collective_wait_init_cs(struct hl_cs *cs)
+static int goya_collective_wait_init_cs(struct hl_cs *cs)
 {
-
+       return 0;
 }
 
 static int goya_collective_wait_create_jobs(struct hl_device *hdev,
                struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
-               u32 collective_engine_id)
+               u32 collective_engine_id, u32 encaps_signal_offset)
 {
        return -EINVAL;
 }
@@ -5524,6 +5530,62 @@ static int goya_map_pll_idx_to_fw_idx(u32 pll_idx)
        }
 }
 
+static int goya_gen_sync_to_engine_map(struct hl_device *hdev,
+                               struct hl_sync_to_engine_map *map)
+{
+       /* Not implemented */
+       return 0;
+}
+
+static int goya_monitor_valid(struct hl_mon_state_dump *mon)
+{
+       /* Not implemented */
+       return 0;
+}
+
+static int goya_print_single_monitor(char **buf, size_t *size, size_t *offset,
+                               struct hl_device *hdev,
+                               struct hl_mon_state_dump *mon)
+{
+       /* Not implemented */
+       return 0;
+}
+
+
+static int goya_print_fences_single_engine(
+       struct hl_device *hdev, u64 base_offset, u64 status_base_offset,
+       enum hl_sync_engine_type engine_type, u32 engine_id, char **buf,
+       size_t *size, size_t *offset)
+{
+       /* Not implemented */
+       return 0;
+}
+
+
+static struct hl_state_dump_specs_funcs goya_state_dump_funcs = {
+       .monitor_valid = goya_monitor_valid,
+       .print_single_monitor = goya_print_single_monitor,
+       .gen_sync_to_engine_map = goya_gen_sync_to_engine_map,
+       .print_fences_single_engine = goya_print_fences_single_engine,
+};
+
+static void goya_state_dump_init(struct hl_device *hdev)
+{
+       /* Not implemented */
+       hdev->state_dump_specs.props = goya_state_dump_specs_props;
+       hdev->state_dump_specs.funcs = goya_state_dump_funcs;
+}
+
+static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id)
+{
+       return 0;
+}
+
+static u32 *goya_get_stream_master_qid_arr(void)
+{
+       return NULL;
+}
+
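Goya fills the new hooks with stubs so the common habanalabs code can dispatch through hl_asic_funcs without NULL checks. A sketch of how common code is expected to call through the table (call sites assumed for illustration):

	/* Sketch only: common code dispatches through the ops table, e.g. */
	hdev->asic_funcs->state_dump_init(hdev);
	sob_addr = hdev->asic_funcs->get_sob_addr(hdev, sob_id);	/* returns 0 on Goya */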
 static const struct hl_asic_funcs goya_funcs = {
        .early_init = goya_early_init,
        .early_fini = goya_early_fini,
@@ -5536,7 +5598,7 @@ static const struct hl_asic_funcs goya_funcs = {
        .halt_engines = goya_halt_engines,
        .suspend = goya_suspend,
        .resume = goya_resume,
-       .cb_mmap = goya_cb_mmap,
+       .mmap = goya_mmap,
        .ring_doorbell = goya_ring_doorbell,
        .pqe_write = goya_pqe_write,
        .asic_dma_alloc_coherent = goya_dma_alloc_coherent,
@@ -5609,7 +5671,11 @@ static const struct hl_asic_funcs goya_funcs = {
        .enable_events_from_fw = goya_enable_events_from_fw,
        .map_pll_idx_to_fw_idx = goya_map_pll_idx_to_fw_idx,
        .init_firmware_loader = goya_init_firmware_loader,
-       .init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram
+       .init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram,
+       .state_dump_init = goya_state_dump_init,
+       .get_sob_addr = &goya_get_sob_addr,
+       .set_pci_memory_regions = goya_set_pci_memory_regions,
+       .get_stream_master_qid_arr = goya_get_stream_master_qid_arr,
 };
 
 /*
index 80b1d5a..9ff6a44 100644 (file)
@@ -98,6 +98,18 @@ struct hl_eq_fw_alive {
        __u8 pad[7];
 };
 
+enum hl_pcie_addr_dec_cause {
+       PCIE_ADDR_DEC_HBW_ERR_RESP,
+       PCIE_ADDR_DEC_LBW_ERR_RESP,
+       PCIE_ADDR_DEC_TLP_BLOCKED_BY_RR
+};
+
+struct hl_eq_pcie_addr_dec_data {
+       /* enum hl_pcie_addr_dec_cause */
+       __u8 addr_dec_cause;
+       __u8 pad[7];
+};
+
 struct hl_eq_entry {
        struct hl_eq_header hdr;
        union {
@@ -106,6 +118,7 @@ struct hl_eq_entry {
                struct hl_eq_sm_sei_data sm_sei_data;
                struct cpucp_pkt_sync_err pkt_sync_err;
                struct hl_eq_fw_alive fw_alive;
+               struct hl_eq_pcie_addr_dec_data pcie_addr_dec_data;
                __le64 data[7];
        };
 };
@@ -116,7 +129,7 @@ struct hl_eq_entry {
 #define EQ_CTL_READY_MASK              0x80000000
 
 #define EQ_CTL_EVENT_TYPE_SHIFT                16
-#define EQ_CTL_EVENT_TYPE_MASK         0x03FF0000
+#define EQ_CTL_EVENT_TYPE_MASK         0x0FFF0000
 
 #define EQ_CTL_INDEX_SHIFT             0
 #define EQ_CTL_INDEX_MASK              0x0000FFFF
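Widening EQ_CTL_EVENT_TYPE_MASK from 0x03FF0000 to 0x0FFF0000 grows the event-type field from 10 to 12 bits; the extraction remains the same shift-and-mask, which is why the bounds check added to goya_handle_eqe() above matters. A condensed sketch of the consumer side:

	u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
	u32 event_type = (ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT;

	if (event_type >= GOYA_ASYNC_EVENT_ID_SIZE)
		return;		/* reject out-of-range IDs, as in goya_handle_eqe() */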
@@ -300,7 +313,7 @@ enum pq_init_status {
  *       The packet's arguments specify the desired sensor and the field to
  *       set.
  *
- * CPUCP_PACKET_PCIE_THROUGHPUT_GET
+ * CPUCP_PACKET_PCIE_THROUGHPUT_GET -
  *       Get throughput of PCIe.
  *       The packet's arguments specify the transaction direction (TX/RX).
  *       The window measurement is 10[msec], and the return value is in KB/sec.
@@ -309,19 +322,19 @@ enum pq_init_status {
 *       Replay count measures the number of "replay" events, which is
 *       basically the number of retries done by PCIe.
  *
- * CPUCP_PACKET_TOTAL_ENERGY_GET
+ * CPUCP_PACKET_TOTAL_ENERGY_GET -
 *       Total Energy is a measurement of energy from the time the FW Linux
 *       is loaded. It is calculated by multiplying the average power
 *       by the time passed since armcp start. The units are millijoules.
  *
- * CPUCP_PACKET_PLL_INFO_GET
+ * CPUCP_PACKET_PLL_INFO_GET -
  *       Fetch frequencies of PLL from the required PLL IP.
 *       The packet's arguments specify the device PLL type.
 *       The PLL type is taken from the device's pll_index enum.
  *       The result is composed of 4 outputs, each is 16-bit
  *       frequency in MHz.
  *
- * CPUCP_PACKET_POWER_GET
+ * CPUCP_PACKET_POWER_GET -
  *       Fetch the present power consumption of the device (Current * Voltage).
  *
  * CPUCP_PACKET_NIC_PFC_SET -
@@ -345,6 +358,24 @@ enum pq_init_status {
  * CPUCP_PACKET_MSI_INFO_SET -
  *       set the index number for each supported msi type going from
  *       host to device
+ *
+ * CPUCP_PACKET_NIC_XPCS91_REGS_GET -
+ *       Fetch the un/correctable counter values from the NIC MAC.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_GET -
+ *       Fetch various NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_CLR -
+ *       Clear the various NIC MAC counters in the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_ALL_GET -
+ *       Fetch all NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_IS_IDLE_CHECK -
+ *       Check if the device is idle with regard to the DMA/compute engines
+ *       and QMANs. The f/w will return a bitmask where each bit represents
+ *       a different engine or QMAN according to enum cpucp_idle_mask.
+ *       The bit will be 1 if the engine is NOT idle.
  */
 
 enum cpucp_packet_id {
@@ -385,6 +416,11 @@ enum cpucp_packet_id {
        CPUCP_PACKET_NIC_LPBK_SET,              /* internal */
        CPUCP_PACKET_NIC_MAC_CFG,               /* internal */
        CPUCP_PACKET_MSI_INFO_SET,              /* internal */
+       CPUCP_PACKET_NIC_XPCS91_REGS_GET,       /* internal */
+       CPUCP_PACKET_NIC_STAT_REGS_GET,         /* internal */
+       CPUCP_PACKET_NIC_STAT_REGS_CLR,         /* internal */
+       CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,     /* internal */
+       CPUCP_PACKET_IS_IDLE_CHECK,             /* internal */
 };
 
 #define CPUCP_PACKET_FENCE_VAL 0xFE8CE7A5
@@ -414,6 +450,11 @@ enum cpucp_packet_id {
 #define CPUCP_PKT_VAL_LPBK_IN2_SHIFT   1
 #define CPUCP_PKT_VAL_LPBK_IN2_MASK    0x000000000000001Eull
 
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_SHIFT        0
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_MASK 0x0000000000000001ull
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_SHIFT        1
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_MASK 0x00000000FFFFFFFEull
+
 /* heartbeat status bits */
 #define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT             0
 #define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK              0x00000001
@@ -467,7 +508,8 @@ struct cpucp_packet {
                __le32 status_mask;
        };
 
-       __le32 reserved;
+       /* For NIC requests */
+       __le32 port_index;
 };
 
 struct cpucp_unmask_irq_arr_packet {
@@ -476,6 +518,12 @@ struct cpucp_unmask_irq_arr_packet {
        __le32 irqs[0];
 };
 
+struct cpucp_nic_status_packet {
+       struct cpucp_packet cpucp_pkt;
+       __le32 length;
+       __le32 data[0];
+};
+
 struct cpucp_array_data_packet {
        struct cpucp_packet cpucp_pkt;
        __le32 length;
@@ -595,6 +643,18 @@ enum pll_index {
        PLL_MAX
 };
 
+enum rl_index {
+       TPC_RL = 0,
+       MME_RL,
+};
+
+enum pvt_index {
+       PVT_SW,
+       PVT_SE,
+       PVT_NW,
+       PVT_NE
+};
+
 /* Event Queue Packets */
 
 struct eq_generic_event {
@@ -700,6 +760,15 @@ struct cpucp_mac_addr {
        __u8 mac_addr[ETH_ALEN];
 };
 
+enum cpucp_serdes_type {
+       TYPE_1_SERDES_TYPE,
+       TYPE_2_SERDES_TYPE,
+       HLS1_SERDES_TYPE,
+       HLS1H_SERDES_TYPE,
+       UNKNOWN_SERDES_TYPE,
+       MAX_NUM_SERDES_TYPE = UNKNOWN_SERDES_TYPE
+};
+
 struct cpucp_nic_info {
        struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS];
        __le64 link_mask[CPUCP_NIC_MASK_ARR_LEN];
@@ -708,6 +777,40 @@ struct cpucp_nic_info {
        __le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN];
        __u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN];
        __le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN];
+       __le16 serdes_type; /* enum cpucp_serdes_type */
+       __u8 reserved[6];
+};
+
+/*
+ * struct cpucp_nic_status - describes the status of a NIC port.
+ * @port: NIC port index.
+ * @bad_format_cnt: e.g. CRC.
+ * @responder_out_of_sequence_psn_cnt: e.g. NAK.
+ * @high_ber_reinit: link reinit due to high BER.
+ * @correctable_err_cnt: e.g. bit-flip.
+ * @uncorrectable_err_cnt: e.g. MAC errors.
+ * @retraining_cnt: re-training counter.
+ * @up: is port up.
+ * @pcs_link: has PCS link.
+ * @phy_ready: is PHY ready.
+ * @auto_neg: is Autoneg enabled.
+ * @timeout_retransmission_cnt: timeout retransmission events.
+ * @high_ber_cnt: high BER events.
+ */
+struct cpucp_nic_status {
+       __le32 port;
+       __le32 bad_format_cnt;
+       __le32 responder_out_of_sequence_psn_cnt;
+       __le32 high_ber_reinit;
+       __le32 correctable_err_cnt;
+       __le32 uncorrectable_err_cnt;
+       __le32 retraining_cnt;
+       __u8 up;
+       __u8 pcs_link;
+       __u8 phy_ready;
+       __u8 auto_neg;
+       __le32 timeout_retransmission_cnt;
+       __le32 high_ber_cnt;
 };
 
 #endif /* CPUCP_IF_H */
index fa8a5ad..3099653 100644 (file)
  * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL  Device is unusable and customer support
  *                                     should be contacted.
  *
+ * CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD        HALT ACK from ARC0 is not received
+ *                                     within specified retries after issuing
+ *                                     HALT request. ARC0 appears to be in bad
+ *                                     reset.
+ *
+ * CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD        HALT ACK from ARC1 is not received
+ *                                     within specified retries after issuing
+ *                                     HALT request. ARC1 appears to be in bad
+ *                                     reset.
+ *
+ * CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD RUN ACK from ARC0 is not received
+ *                                     within specified timeout after issuing
+ *                                     RUN request. ARC0 appears to be in bad
+ *                                     reset.
+ *
+ * CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD RUN ACK from ARC1 is not received
+ *                                     within specified timeout after issuing
+ *                                     RUN request. ARC1 appears to be in bad
+ *                                     reset.
+ *
  * CPU_BOOT_ERR0_ENABLED               Error registers enabled.
  *                                     This is a main indication that the
  *                                     running FW populates the error
 #define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL         (1 << 11)
 #define CPU_BOOT_ERR0_PLL_FAIL                 (1 << 12)
 #define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL     (1 << 13)
+#define CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD   (1 << 14)
+#define CPU_BOOT_ERR0_ARC1_HALT_ACK_NOT_RCVD   (1 << 15)
+#define CPU_BOOT_ERR0_ARC0_RUN_ACK_NOT_RCVD    (1 << 16)
+#define CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD    (1 << 17)
 #define CPU_BOOT_ERR0_ENABLED                  (1 << 31)
 #define CPU_BOOT_ERR1_ENABLED                  (1 << 31)
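The four new ARC HALT/RUN ACK bits extend the err0 boot-error register described above. A hedged sketch of how a driver might decode them when reporting boot errors (the read-out of err0 itself is not part of this hunk):

	if (err0 & CPU_BOOT_ERR0_ARC0_HALT_ACK_NOT_RCVD)
		dev_err(hdev->dev, "ARC0 did not ack the HALT request\n");
	if (err0 & CPU_BOOT_ERR0_ARC1_RUN_ACK_NOT_RCVD)
		dev_err(hdev->dev, "ARC1 did not ack the RUN request\n");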
 
  *                                     configured and is ready for use.
  *                                     Initialized in: ppboot
  *
+ * CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN     NIC MAC channels init is done by FW and
+ *                                     any access to them is done via the FW.
+ *                                     Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_DYN_PLL_EN                Dynamic PLL configuration is enabled.
  *                                     FW sends to host a bitmap of supported
  *                                     PLLs.
  *                                     prevent IRQs overriding each other.
  *                                     Initialized in: linux
  *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN
+ *                                     NIC STAT and XPCS91 access is restricted
+ *                                     and is done via FW only.
+ *                                     Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN
+ *                                     NIC STAT get all is supported.
+ *                                     Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN
+ *                                     F/W checks if the device is idle by reading a defined set
+ *                                     of registers. It returns a bitmask of all the engines,
+ *                                     where a bit is set if the engine is not idle.
+ *                                     Initialized in: linux
+ *
  * CPU_BOOT_DEV_STS0_ENABLED           Device status register enabled.
  *                                     This is a main indication that the
  *                                     running FW populates the device status
 #define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN                        (1 << 15)
 #define CPU_BOOT_DEV_STS0_FW_LD_COM_EN                 (1 << 16)
 #define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN              (1 << 17)
+#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN                        (1 << 18)
 #define CPU_BOOT_DEV_STS0_DYN_PLL_EN                   (1 << 19)
 #define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN            (1 << 20)
 #define CPU_BOOT_DEV_STS0_EQ_INDEX_EN                  (1 << 21)
 #define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN            (1 << 22)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN                (1 << 23)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN           (1 << 24)
+#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN             (1 << 25)
 #define CPU_BOOT_DEV_STS0_ENABLED                      (1 << 31)
 #define CPU_BOOT_DEV_STS1_ENABLED                      (1 << 31)
 
@@ -313,10 +360,7 @@ struct cpu_dyn_regs {
        __le32 hw_state;
        __le32 kmd_msg_to_cpu;
        __le32 cpu_cmd_status_to_host;
-       union {
-               __le32 gic_host_irq_ctrl;
-               __le32 gic_host_pi_upd_irq;
-       };
+       __le32 gic_host_pi_upd_irq;
        __le32 gic_tpc_qm_irq_ctrl;
        __le32 gic_mme_qm_irq_ctrl;
        __le32 gic_dma_qm_irq_ctrl;
@@ -324,7 +368,9 @@ struct cpu_dyn_regs {
        __le32 gic_dma_core_irq_ctrl;
        __le32 gic_host_halt_irq;
        __le32 gic_host_ints_irq;
-       __le32 reserved1[24];           /* reserve for future use */
+       __le32 gic_host_soft_rst_irq;
+       __le32 gic_rot_qm_irq_ctrl;
+       __le32 reserved1[22];           /* reserve for future use */
 };
 
 /* TODO: remove the desc magic after the code is updated to use message */
@@ -462,6 +508,11 @@ struct lkd_fw_comms_msg {
  *                             Do not wait for BMC response.
  *
  * COMMS_LOW_PLL_OPP           Initialize PLLs for low OPP.
+ *
+ * COMMS_PREP_DESC_ELBI                Same as COMMS_PREP_DESC, except that the memory
+ *                             space is allocated in an ELBI-access-only
+ *                             address range.
+ *
  */
 enum comms_cmd {
        COMMS_NOOP = 0,
@@ -474,6 +525,7 @@ enum comms_cmd {
        COMMS_GOTO_WFE = 7,
        COMMS_SKIP_BMC = 8,
        COMMS_LOW_PLL_OPP = 9,
+       COMMS_PREP_DESC_ELBI = 10,
        COMMS_INVLD_LAST
 };
 
index 5bb54b3..ffdfbd9 100644 (file)
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_1                     0x4F2004
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_2047                  0x4F3FFC
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0               0x4F4000
+#define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_ADDRH_0               0x4F4800
+#define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_PAY_DATA_0                0x4F5000
+#define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_ARM_0                     0x4F5800
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_0                  0x4F6000
 #define mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_MON_STATUS_511                0x4F67FC
 
index 9aea7e9..acc85d3 100644 (file)
@@ -449,4 +449,21 @@ enum axi_id {
 #define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK                               0x1
 #define PCIE_AUX_FLR_CTRL_INT_MASK_MASK                              0x2
 
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_SHIFT        0
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_VALID_MASK         0x1
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_SHIFT      1
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_STATUS_0_PENDING_MASK       0x1FE
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_SHIFT             0
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK              0xFF
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_SHIFT            8
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK             0xFF00
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_SHIFT             16
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_MASK              0x10000
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_SHIFT             17
+#define SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK              0xFFFE0000
+#define TPC0_QM_CP_STS_0_FENCE_ID_SHIFT                              20
+#define TPC0_QM_CP_STS_0_FENCE_ID_MASK                               0x300000
+#define TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_SHIFT                     22
+#define TPC0_QM_CP_STS_0_FENCE_IN_PROGRESS_MASK                      0x400000
+
 #endif /* GAUDI_MASKS_H_ */
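The new MON_ARM field masks describe a single register whose fields select the sync object id (SID), a compare mask, the operation (SOP) and the compare data (SOD); the field meanings are inferred from the names. A sketch of composing the register value (the helper itself is hypothetical):

	static u32 mon_arm_value(u32 sid, u32 mask, u32 sop, u32 sod)
	{
		return ((sid << SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_SHIFT) &
				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SID_MASK) |
		       ((mask << SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_SHIFT) &
				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_MASK_MASK) |
		       ((sop << SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_SHIFT) &
				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOP_MASK) |
		       ((sod << SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_SHIFT) &
				SYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_ARM_0_SOD_MASK);
	}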
index d95d416..b9bd5a7 100644 (file)
@@ -12,8 +12,6 @@
  * PSOC scratch-pad registers
  */
 #define mmHW_STATE                     mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
-/* TODO: remove mmGIC_HOST_IRQ_CTRL_POLL_REG */
-#define mmGIC_HOST_IRQ_CTRL_POLL_REG   mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
 #define mmGIC_HOST_PI_UPD_IRQ_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
 #define mmGIC_TPC_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_2
 #define mmGIC_MME_QM_IRQ_CTRL_POLL_REG mmPSOC_GLOBAL_CONF_SCRATCHPAD_3
index 8679a10..7efb31b 100644 (file)
@@ -116,6 +116,8 @@ static struct class *nvme_ns_chr_class;
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
                                           unsigned nsid);
+static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
+                                  struct nvme_command *cmd);
 
 /*
  * Prepare a queue for teardown.
@@ -1152,7 +1154,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
        return effects;
 }
 
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
+                             struct nvme_command *cmd, int status)
 {
        if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
                nvme_unfreeze(ctrl);
@@ -1167,6 +1170,26 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
                nvme_queue_scan(ctrl);
                flush_work(&ctrl->scan_work);
        }
+
+       switch (cmd->common.opcode) {
+       case nvme_admin_set_features:
+               switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
+               case NVME_FEAT_KATO:
+                       /*
+                        * The keep-alive command interval on the host should
+                        * be updated when KATO is modified by a Set Features
+                        * command.
+                        */
+                       if (!status)
+                               nvme_update_keep_alive(ctrl, cmd);
+                       break;
+               default:
+                       break;
+               }
+               break;
+       default:
+               break;
+       }
 }
 
 int nvme_execute_passthru_rq(struct request *rq)
@@ -1181,7 +1204,7 @@ int nvme_execute_passthru_rq(struct request *rq)
        effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
        ret = nvme_execute_rq(disk, rq, false);
        if (effects) /* nothing to be done for zero cmd effects */
-               nvme_passthru_end(ctrl, effects);
+               nvme_passthru_end(ctrl, effects, cmd, ret);
 
        return ret;
 }
@@ -1269,6 +1292,21 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
 
+static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
+                                  struct nvme_command *cmd)
+{
+       unsigned int new_kato =
+               DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
+
+       dev_info(ctrl->device,
+                "keep alive interval updated from %u ms to %u ms\n",
+                ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
+
+       nvme_stop_keep_alive(ctrl);
+       ctrl->kato = new_kato;
+       nvme_start_keep_alive(ctrl);
+}
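For reference, the KATO value in cdw11 of a Set Features command is in milliseconds while ctrl->kato is kept in seconds, hence the DIV_ROUND_UP by 1000; the log line above reports kato * 1000 / 2, i.e. the effective keep-alive send interval. A worked example with assumed values:

	unsigned int kato_ms = 5000;				/* from cmd->common.cdw11 */
	unsigned int new_kato = DIV_ROUND_UP(kato_ms, 1000);	/* 5 seconds */
	unsigned int interval_ms = new_kato * 1000 / 2;		/* logged: 2500 ms */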
+
 /*
  * In NVMe 1.0 the CNS field was just a binary controller or namespace
  * flag, thus sending any new CNS opcodes has a big chance of not working.
@@ -1302,11 +1340,6 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
        return error;
 }
 
-static bool nvme_multi_css(struct nvme_ctrl *ctrl)
-{
-       return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
-}
-
 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
                struct nvme_ns_id_desc *cur, bool *csi_seen)
 {
@@ -1874,6 +1907,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
                        goto out_unfreeze;
        }
 
+       set_bit(NVME_NS_READY, &ns->flags);
        blk_mq_unfreeze_queue(ns->disk->queue);
 
        if (blk_queue_is_zoned(ns->queue)) {
@@ -1885,6 +1919,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
        if (nvme_ns_head_multipath(ns->head)) {
                blk_mq_freeze_queue(ns->head->disk->queue);
                nvme_update_disk_info(ns->head->disk, ns, id);
+               nvme_mpath_revalidate_paths(ns);
                blk_stack_limits(&ns->head->disk->queue->limits,
                                 &ns->queue->limits, 0);
                disk_update_readahead(ns->head->disk);
@@ -3763,7 +3798,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 
        nvme_get_ctrl(ctrl);
 
-       device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
+       if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
+               goto out_cleanup_ns_from_list;
+
        if (!nvme_ns_head_multipath(ns->head))
                nvme_add_ns_cdev(ns);
 
@@ -3773,6 +3810,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 
        return;
 
+ out_cleanup_ns_from_list:
+       nvme_put_ctrl(ctrl);
+       down_write(&ctrl->namespaces_rwsem);
+       list_del_init(&ns->list);
+       up_write(&ctrl->namespaces_rwsem);
  out_unlink_ns:
        mutex_lock(&ctrl->subsys->lock);
        list_del_rcu(&ns->siblings);
@@ -3795,6 +3837,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
        if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
                return;
 
+       clear_bit(NVME_NS_READY, &ns->flags);
        set_capacity(ns->disk, 0);
        nvme_fault_inject_fini(&ns->fault_inject);
 
@@ -3802,9 +3845,12 @@ static void nvme_ns_remove(struct nvme_ns *ns)
        list_del_rcu(&ns->siblings);
        mutex_unlock(&ns->ctrl->subsys->lock);
 
-       synchronize_rcu(); /* guarantee not available in head->list */
-       nvme_mpath_clear_current_path(ns);
-       synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
+       /* guarantee not available in head->list */
+       synchronize_rcu();
+
+       /* wait for concurrent submissions */
+       if (nvme_mpath_clear_current_path(ns))
+               synchronize_srcu(&ns->head->srcu);
 
        if (!nvme_ns_head_multipath(ns->head))
                nvme_cdev_del(&ns->cdev, &ns->cdev_device);
index 37ce3e8..5d7bc58 100644 (file)
@@ -147,6 +147,21 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
        mutex_unlock(&ctrl->scan_lock);
 }
 
+void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
+{
+       struct nvme_ns_head *head = ns->head;
+       sector_t capacity = get_capacity(head->disk);
+       int node;
+
+       list_for_each_entry_rcu(ns, &head->list, siblings) {
+               if (capacity != get_capacity(ns->disk))
+                       clear_bit(NVME_NS_READY, &ns->flags);
+       }
+
+       for_each_node(node)
+               rcu_assign_pointer(head->current_path[node], NULL);
+}
+
 static bool nvme_path_is_disabled(struct nvme_ns *ns)
 {
        /*
@@ -158,7 +173,7 @@ static bool nvme_path_is_disabled(struct nvme_ns *ns)
            ns->ctrl->state != NVME_CTRL_DELETING)
                return true;
        if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
-           test_bit(NVME_NS_REMOVING, &ns->flags))
+           !test_bit(NVME_NS_READY, &ns->flags))
                return true;
        return false;
 }
@@ -465,6 +480,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
                        ctrl->subsys->instance, head->instance);
 
        blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
+       blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
+
        /* set to a default value of 512 until the disk is validated */
        blk_queue_logical_block_size(head->disk->queue, 512);
        blk_set_stacking_limits(&head->disk->queue->limits);
index a2e1f29..9871c0c 100644 (file)
@@ -456,6 +456,7 @@ struct nvme_ns {
 #define NVME_NS_DEAD           1
 #define NVME_NS_ANA_PENDING    2
 #define NVME_NS_FORCE_RO       3
+#define NVME_NS_READY          4
 
        struct cdev             cdev;
        struct device           cdev_device;
@@ -748,6 +749,7 @@ void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
+void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
 void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
 
@@ -795,6 +797,9 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
        return false;
 }
+static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
+{
+}
 static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 }
@@ -887,4 +892,9 @@ struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
 void nvme_put_ns(struct nvme_ns *ns);
 
+static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+       return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
 #endif /* _NVME_H */
index 6450256..e2ab12f 100644 (file)
@@ -45,6 +45,7 @@ struct nvme_tcp_request {
        u32                     pdu_len;
        u32                     pdu_sent;
        u16                     ttag;
+       __le16                  status;
        struct list_head        entry;
        struct llist_node       lentry;
        __le32                  ddgst;
@@ -485,6 +486,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
                struct nvme_completion *cqe)
 {
+       struct nvme_tcp_request *req;
        struct request *rq;
 
        rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
@@ -496,7 +498,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
                return -EINVAL;
        }
 
-       if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
+       req = blk_mq_rq_to_pdu(rq);
+       if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
+               req->status = cqe->status;
+
+       if (!nvme_try_complete_req(rq, req->status, cqe->result))
                nvme_complete_rq(rq);
        queue->nr_cqe++;
 
@@ -758,7 +764,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
                        queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
                } else {
                        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
-                               nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+                               nvme_tcp_end_request(rq,
+                                               le16_to_cpu(req->status));
                                queue->nr_cqe++;
                        }
                        nvme_tcp_init_recv_ctx(queue);
@@ -788,18 +795,24 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
                return 0;
 
        if (queue->recv_ddgst != queue->exp_ddgst) {
+               struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
+                                       pdu->command_id);
+               struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+               req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
+
                dev_err(queue->ctrl->ctrl.device,
                        "data digest error: recv %#x expected %#x\n",
                        le32_to_cpu(queue->recv_ddgst),
                        le32_to_cpu(queue->exp_ddgst));
-               return -EIO;
        }
 
        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
                struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
                                        pdu->command_id);
+               struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 
-               nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+               nvme_tcp_end_request(rq, le16_to_cpu(req->status));
                queue->nr_cqe++;
        }
 
@@ -2293,6 +2306,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
                return ret;
 
        req->state = NVME_TCP_SEND_CMD_PDU;
+       req->status = cpu_to_le16(NVME_SC_SUCCESS);
        req->offset = 0;
        req->data_sent = 0;
        req->pdu_len = 0;
index 0cb98f2..aa6d84d 100644 (file)
@@ -1015,7 +1015,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
        if (unlikely(ret))
                return ret;
 
-       if (nvmet_req_passthru_ctrl(req))
+       if (nvmet_is_passthru_req(req))
                return nvmet_parse_passthru_admin_cmd(req);
 
        switch (cmd->common.opcode) {
index 2735551..d784f3c 100644 (file)
@@ -1028,7 +1028,7 @@ nvmet_subsys_attr_version_store_locked(struct nvmet_subsys *subsys,
        }
 
        /* passthru subsystems use the underlying controller's version */
-       if (nvmet_passthru_ctrl(subsys))
+       if (nvmet_is_passthru_subsys(subsys))
                return -EINVAL;
 
        ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary);
@@ -1067,7 +1067,8 @@ static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item,
 {
        struct nvmet_subsys *subsys = to_subsys(item);
 
-       return snprintf(page, PAGE_SIZE, "%s\n", subsys->serial);
+       return snprintf(page, PAGE_SIZE, "%*s\n",
+                       NVMET_SN_MAX_SIZE, subsys->serial);
 }
 
 static ssize_t
index 66d05ee..b8425fa 100644 (file)
@@ -553,7 +553,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
        mutex_lock(&subsys->lock);
        ret = 0;
 
-       if (nvmet_passthru_ctrl(subsys)) {
+       if (nvmet_is_passthru_subsys(subsys)) {
                pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
                goto out_unlock;
        }
@@ -869,7 +869,7 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
        if (unlikely(ret))
                return ret;
 
-       if (nvmet_req_passthru_ctrl(req))
+       if (nvmet_is_passthru_req(req))
                return nvmet_parse_passthru_io_cmd(req);
 
        ret = nvmet_req_find_ns(req);
@@ -1206,6 +1206,9 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
        ctrl->cap |= (15ULL << 24);
        /* maximum queue entries supported: */
        ctrl->cap |= NVMET_QUEUE_SIZE - 1;
+
+       if (nvmet_is_passthru_subsys(ctrl->subsys))
+               nvmet_passthrough_override_cap(ctrl);
 }
 
 struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
@@ -1363,8 +1366,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
                goto out_put_subsystem;
        mutex_init(&ctrl->lock);
 
-       nvmet_init_cap(ctrl);
-
        ctrl->port = req->port;
 
        INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
@@ -1378,6 +1379,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
        kref_init(&ctrl->ref);
        ctrl->subsys = subsys;
+       nvmet_init_cap(ctrl);
        WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
 
        ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
index 06dd3d5..7143c7f 100644 (file)
@@ -582,7 +582,7 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
 void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys);
 u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req);
 u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req);
-static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
+static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys)
 {
        return subsys->passthru_ctrl;
 }
@@ -601,18 +601,19 @@ static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
 {
        return 0;
 }
-static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
+static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys)
 {
        return false;
 }
 #endif /* CONFIG_NVME_TARGET_PASSTHRU */
 
-static inline struct nvme_ctrl *
-nvmet_req_passthru_ctrl(struct nvmet_req *req)
+static inline bool nvmet_is_passthru_req(struct nvmet_req *req)
 {
-       return nvmet_passthru_ctrl(nvmet_req_subsys(req));
+       return nvmet_is_passthru_subsys(nvmet_req_subsys(req));
 }
 
+void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl);
+
 u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
 u16 nvmet_report_invalid_opcode(struct nvmet_req *req);
 
index 225cd1f..f0efb35 100644 (file)
@@ -20,6 +20,16 @@ MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU);
  */
 static DEFINE_XARRAY(passthru_subsystems);
 
+void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl)
+{
+       /*
+        * Multiple command set support can only be declared if the underlying
+        * controller actually supports it.
+        */
+       if (!nvme_multi_css(ctrl->subsys->passthru_ctrl))
+               ctrl->cap &= ~(1ULL << 43);
+}
+
 static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
 {
        struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -218,7 +228,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
 
 static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
 {
-       struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
+       struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl;
        struct request_queue *q = ctrl->admin_q;
        struct nvme_ns *ns = NULL;
        struct request *rq = NULL;
@@ -299,7 +309,7 @@ out:
  */
 static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req)
 {
-       struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
+       struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl;
        struct nvme_feat_host_behavior *host;
        u16 status = NVME_SC_INTERNAL;
        int ret;
index 0c0dc2e..3fd74bb 100644 (file)
@@ -1444,6 +1444,9 @@ static int of_fwnode_add_links(struct fwnode_handle *fwnode)
        struct property *p;
        struct device_node *con_np = to_of_node(fwnode);
 
+       if (IS_ENABLED(CONFIG_X86))
+               return 0;
+
        if (!con_np)
                return -EINVAL;
 
index 889d7ce..952a925 100644 (file)
@@ -156,15 +156,6 @@ static inline struct dino_device *DINO_DEV(struct pci_hba_data *hba)
        return container_of(hba, struct dino_device, hba);
 }
 
-/* Check if PCI device is behind a Card-mode Dino. */
-static int pci_dev_is_behind_card_dino(struct pci_dev *dev)
-{
-       struct dino_device *dino_dev;
-
-       dino_dev = DINO_DEV(parisc_walk_tree(dev->bus->bridge));
-       return is_card_dino(&dino_dev->hba.dev->id);
-}
-
 /*
  * Dino Configuration Space Accessor Functions
  */
@@ -447,6 +438,15 @@ static void quirk_cirrus_cardbus(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_CIRRUS, PCI_DEVICE_ID_CIRRUS_6832, quirk_cirrus_cardbus );
 
 #ifdef CONFIG_TULIP
+/* Check if PCI device is behind a Card-mode Dino. */
+static int pci_dev_is_behind_card_dino(struct pci_dev *dev)
+{
+       struct dino_device *dino_dev;
+
+       dino_dev = DINO_DEV(parisc_walk_tree(dev->bus->bridge));
+       return is_card_dino(&dino_dev->hba.dev->id);
+}
+
 static void pci_fixup_tulip(struct pci_dev *dev)
 {
        if (!pci_dev_is_behind_card_dino(dev))
index c76aded..aa29841 100644 (file)
@@ -272,7 +272,7 @@ config PWM_IQS620A
 
 config PWM_JZ4740
        tristate "Ingenic JZ47xx PWM support"
-       depends on MIPS
+       depends on MIPS || COMPILE_TEST
        depends on COMMON_CLK
        select MFD_SYSCON
        help
@@ -284,7 +284,8 @@ config PWM_JZ4740
 
 config PWM_KEEMBAY
        tristate "Intel Keem Bay PWM driver"
-       depends on ARCH_KEEMBAY || (ARM64 && COMPILE_TEST)
+       depends on ARCH_KEEMBAY || COMPILE_TEST
+       depends on COMMON_CLK && HAS_IOMEM
        help
          The platform driver for Intel Keem Bay PWM controller.
 
index 35e894f..4527f09 100644 (file)
@@ -304,7 +304,7 @@ EXPORT_SYMBOL_GPL(pwmchip_add);
  *
  * Returns: 0 on success or a negative error code on failure.
  */
-int pwmchip_remove(struct pwm_chip *chip)
+void pwmchip_remove(struct pwm_chip *chip)
 {
        pwmchip_sysfs_unexport(chip);
 
@@ -318,8 +318,6 @@ int pwmchip_remove(struct pwm_chip *chip)
        free_pwms(chip);
 
        mutex_unlock(&pwm_lock);
-
-       return 0;
 }
 EXPORT_SYMBOL_GPL(pwmchip_remove);
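With pwmchip_remove() returning void, a driver's remove callback no longer has anything to propagate from it; the remaining hunks in this series convert drivers to the pattern sketched below (driver name and fields are assumed for illustration):

	static int foo_pwm_remove(struct platform_device *pdev)
	{
		struct foo_pwm_chip *priv = platform_get_drvdata(pdev);

		pwmchip_remove(&priv->chip);
		clk_disable_unprepare(priv->clk);

		return 0;
	}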
 
index e2a26d9..ad37bc4 100644 (file)
 
 struct ab8500_pwm_chip {
        struct pwm_chip chip;
+       unsigned int hwid;
 };
 
+static struct ab8500_pwm_chip *ab8500_pwm_from_chip(struct pwm_chip *chip)
+{
+       return container_of(chip, struct ab8500_pwm_chip, chip);
+}
+
 static int ab8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                            const struct pwm_state *state)
 {
        int ret;
        u8 reg;
        unsigned int higher_val, lower_val;
+       struct ab8500_pwm_chip *ab8500 = ab8500_pwm_from_chip(chip);
 
        if (state->polarity != PWM_POLARITY_NORMAL)
                return -EINVAL;
@@ -37,7 +44,7 @@ static int ab8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
        if (!state->enabled) {
                ret = abx500_mask_and_set_register_interruptible(chip->dev,
                                        AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG,
-                                       1 << (chip->base - 1), 0);
+                                       1 << ab8500->hwid, 0);
 
                if (ret < 0)
                        dev_err(chip->dev, "%s: Failed to disable PWM, Error %d\n",
@@ -56,7 +63,7 @@ static int ab8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
         */
        higher_val = ((state->duty_cycle & 0x0300) >> 8);
 
-       reg = AB8500_PWM_OUT_CTRL1_REG + ((chip->base - 1) * 2);
+       reg = AB8500_PWM_OUT_CTRL1_REG + (ab8500->hwid * 2);
 
        ret = abx500_set_register_interruptible(chip->dev, AB8500_MISC,
                        reg, (u8)lower_val);
@@ -70,7 +77,7 @@ static int ab8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
        ret = abx500_mask_and_set_register_interruptible(chip->dev,
                                AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG,
-                               1 << (chip->base - 1), 1 << (chip->base - 1));
+                               1 << ab8500->hwid, 1 << ab8500->hwid);
        if (ret < 0)
                dev_err(chip->dev, "%s: Failed to enable PWM, Error %d\n",
                                                        pwm->label, ret);
@@ -88,6 +95,9 @@ static int ab8500_pwm_probe(struct platform_device *pdev)
        struct ab8500_pwm_chip *ab8500;
        int err;
 
+       if (pdev->id < 1 || pdev->id > 31)
+               return dev_err_probe(&pdev->dev, -EINVAL, "Invalid device id %d\n", pdev->id);
+
        /*
         * Nothing to be done in probe, this is required to get the
         * device which is required for ab8500 read and write
@@ -99,27 +109,13 @@ static int ab8500_pwm_probe(struct platform_device *pdev)
        ab8500->chip.dev = &pdev->dev;
        ab8500->chip.ops = &ab8500_pwm_ops;
        ab8500->chip.npwm = 1;
+       ab8500->hwid = pdev->id - 1;
 
-       err = pwmchip_add(&ab8500->chip);
+       err = devm_pwmchip_add(&pdev->dev, &ab8500->chip);
        if (err < 0)
                return dev_err_probe(&pdev->dev, err, "Failed to add pwm chip\n");
 
        dev_dbg(&pdev->dev, "pwm probe successful\n");
-       platform_set_drvdata(pdev, ab8500);
-
-       return 0;
-}
-
-static int ab8500_pwm_remove(struct platform_device *pdev)
-{
-       struct ab8500_pwm_chip *ab8500 = platform_get_drvdata(pdev);
-       int err;
-
-       err = pwmchip_remove(&ab8500->chip);
-       if (err < 0)
-               return err;
-
-       dev_dbg(&pdev->dev, "pwm driver removed\n");
 
        return 0;
 }
@@ -129,7 +125,6 @@ static struct platform_driver ab8500_pwm_driver = {
                .name = "ab8500-pwm",
        },
        .probe = ab8500_pwm_probe,
-       .remove = ab8500_pwm_remove,
 };
 module_platform_driver(ab8500_pwm_driver);
 
index 4459325..a43b2ba 100644 (file)
@@ -281,11 +281,8 @@ static int atmel_hlcdc_pwm_probe(struct platform_device *pdev)
 static int atmel_hlcdc_pwm_remove(struct platform_device *pdev)
 {
        struct atmel_hlcdc_pwm *chip = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&chip->chip);
-       if (ret)
-               return ret;
+       pwmchip_remove(&chip->chip);
 
        clk_disable_unprepare(chip->hlcdc->periph_clk);
 
index bf398f2..36f7ea3 100644 (file)
@@ -503,11 +503,8 @@ err_slow_clk:
 static int atmel_tcb_pwm_remove(struct platform_device *pdev)
 {
        struct atmel_tcb_pwm_chip *tcbpwm = platform_get_drvdata(pdev);
-       int err;
 
-       err = pwmchip_remove(&tcbpwm->chip);
-       if (err < 0)
-               return err;
+       pwmchip_remove(&tcbpwm->chip);
 
        clk_disable_unprepare(tcbpwm->slow_clk);
        clk_put(tcbpwm->slow_clk);
index a8162ba..e748604 100644 (file)
@@ -84,9 +84,19 @@ struct atmel_pwm_chip {
        void __iomem *base;
        const struct atmel_pwm_data *data;
 
-       unsigned int updated_pwms;
-       /* ISR is cleared when read, ensure only one thread does that */
-       struct mutex isr_lock;
+       /*
+        * The hardware supports a mechanism to update a channel's duty cycle at
+        * the end of the currently running period. When such an update is
+        * pending we delay disabling the PWM until the new configuration is
+        * active because otherwise pwm_config(duty_cycle=0); pwm_disable();
+        * might not result in an inactive output.
+        * This bitmask tracks for which channels an update is pending in
+        * hardware.
+        */
+       u32 update_pending;
+
+       /* Protects .update_pending */
+       spinlock_t lock;
 };
 
 static inline struct atmel_pwm_chip *to_atmel_pwm_chip(struct pwm_chip *chip)
@@ -123,6 +133,64 @@ static inline void atmel_pwm_ch_writel(struct atmel_pwm_chip *chip,
        atmel_pwm_writel(chip, base + offset, val);
 }
 
+static void atmel_pwm_update_pending(struct atmel_pwm_chip *chip)
+{
+       /*
+        * Each channel that has its bit in ISR set started a new period since
+        * ISR was cleared and so there is no more update pending.  Note that
+        * reading ISR clears it, so this needs to handle all channels so as
+        * not to lose information.
+        */
+       u32 isr = atmel_pwm_readl(chip, PWM_ISR);
+
+       chip->update_pending &= ~isr;
+}
+
+static void atmel_pwm_set_pending(struct atmel_pwm_chip *chip, unsigned int ch)
+{
+       spin_lock(&chip->lock);
+
+       /*
+        * Clear pending flags in hardware because otherwise there might still
+        * be a stale flag in ISR.
+        */
+       atmel_pwm_update_pending(chip);
+
+       chip->update_pending |= (1 << ch);
+
+       spin_unlock(&chip->lock);
+}
+
+static int atmel_pwm_test_pending(struct atmel_pwm_chip *chip, unsigned int ch)
+{
+       int ret = 0;
+
+       spin_lock(&chip->lock);
+
+       if (chip->update_pending & (1 << ch)) {
+               atmel_pwm_update_pending(chip);
+
+               if (chip->update_pending & (1 << ch))
+                       ret = 1;
+       }
+
+       spin_unlock(&chip->lock);
+
+       return ret;
+}
+
+static int atmel_pwm_wait_nonpending(struct atmel_pwm_chip *chip, unsigned int ch)
+{
+       unsigned long timeout = jiffies + 2 * HZ;
+       int ret;
+
+       while ((ret = atmel_pwm_test_pending(chip, ch)) &&
+              time_before(jiffies, timeout))
+               usleep_range(10, 100);
+
+       return ret ? -ETIMEDOUT : 0;
+}
+
 static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip,
                                             unsigned long clkrate,
                                             const struct pwm_state *state,
@@ -185,6 +253,7 @@ static void atmel_pwm_update_cdty(struct pwm_chip *chip, struct pwm_device *pwm,
 
        atmel_pwm_ch_writel(atmel_pwm, pwm->hwpwm,
                            atmel_pwm->data->regs.duty_upd, cdty);
+       atmel_pwm_set_pending(atmel_pwm, pwm->hwpwm);
 }
 
 static void atmel_pwm_set_cprd_cdty(struct pwm_chip *chip,
@@ -205,20 +274,8 @@ static void atmel_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm,
        struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip);
        unsigned long timeout = jiffies + 2 * HZ;
 
-       /*
-        * Wait for at least a complete period to have passed before disabling a
-        * channel to be sure that CDTY has been updated
-        */
-       mutex_lock(&atmel_pwm->isr_lock);
-       atmel_pwm->updated_pwms |= atmel_pwm_readl(atmel_pwm, PWM_ISR);
-
-       while (!(atmel_pwm->updated_pwms & (1 << pwm->hwpwm)) &&
-              time_before(jiffies, timeout)) {
-               usleep_range(10, 100);
-               atmel_pwm->updated_pwms |= atmel_pwm_readl(atmel_pwm, PWM_ISR);
-       }
+       atmel_pwm_wait_nonpending(atmel_pwm, pwm->hwpwm);
 
-       mutex_unlock(&atmel_pwm->isr_lock);
        atmel_pwm_writel(atmel_pwm, PWM_DIS, 1 << pwm->hwpwm);
 
        /*
@@ -292,10 +349,6 @@ static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                        val |= PWM_CMR_CPOL;
                atmel_pwm_ch_writel(atmel_pwm, pwm->hwpwm, PWM_CMR, val);
                atmel_pwm_set_cprd_cdty(chip, pwm, cprd, cdty);
-               mutex_lock(&atmel_pwm->isr_lock);
-               atmel_pwm->updated_pwms |= atmel_pwm_readl(atmel_pwm, PWM_ISR);
-               atmel_pwm->updated_pwms &= ~(1 << pwm->hwpwm);
-               mutex_unlock(&atmel_pwm->isr_lock);
                atmel_pwm_writel(atmel_pwm, PWM_ENA, 1 << pwm->hwpwm);
        } else if (cstate.enabled) {
                atmel_pwm_disable(chip, pwm, true);
@@ -326,6 +379,9 @@ static void atmel_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
                tmp <<= pres;
                state->period = DIV64_U64_ROUND_UP(tmp, rate);
 
+               /* Wait for an updated duty_cycle queued in hardware */
+               atmel_pwm_wait_nonpending(atmel_pwm, pwm->hwpwm);
+
                cdty = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm,
                                          atmel_pwm->data->regs.duty);
                tmp = (u64)(cprd - cdty) * NSEC_PER_SEC;
@@ -416,9 +472,10 @@ static int atmel_pwm_probe(struct platform_device *pdev)
        if (!atmel_pwm)
                return -ENOMEM;
 
-       mutex_init(&atmel_pwm->isr_lock);
        atmel_pwm->data = of_device_get_match_data(&pdev->dev);
-       atmel_pwm->updated_pwms = 0;
+
+       atmel_pwm->update_pending = 0;
+       spin_lock_init(&atmel_pwm->lock);
 
        atmel_pwm->base = devm_platform_ioremap_resource(pdev, 0);
        if (IS_ERR(atmel_pwm->base))
@@ -460,7 +517,6 @@ static int atmel_pwm_remove(struct platform_device *pdev)
        pwmchip_remove(&atmel_pwm->chip);
 
        clk_unprepare(atmel_pwm->clk);
-       mutex_destroy(&atmel_pwm->isr_lock);
 
        return 0;
 }
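Putting the new pieces together, the intended flow for an enabled channel is: queue the new duty cycle (which marks the channel pending), wait until the hardware latches it at the end of the running period, then disable. A sketch, assuming the chip/pwm/atmel_pwm pointers are in scope as in the driver:

	atmel_pwm_update_cdty(chip, pwm, new_cdty);		/* queues duty, sets update_pending */
	atmel_pwm_wait_nonpending(atmel_pwm, pwm->hwpwm);	/* new duty cycle is now active */
	atmel_pwm_writel(atmel_pwm, PWM_DIS, 1 << pwm->hwpwm);	/* safe to disable the channel */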
index 8c85c66..64148f5 100644 (file)
@@ -267,8 +267,6 @@ static int kona_pwmc_probe(struct platform_device *pdev)
        if (kp == NULL)
                return -ENOMEM;
 
-       platform_set_drvdata(pdev, kp);
-
        kp->chip.dev = &pdev->dev;
        kp->chip.ops = &kona_pwm_ops;
        kp->chip.npwm = 6;
@@ -298,20 +296,13 @@ static int kona_pwmc_probe(struct platform_device *pdev)
 
        clk_disable_unprepare(kp->clk);
 
-       ret = pwmchip_add(&kp->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &kp->chip);
        if (ret < 0)
                dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret);
 
        return ret;
 }
 
-static int kona_pwmc_remove(struct platform_device *pdev)
-{
-       struct kona_pwmc *kp = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&kp->chip);
-}
-
 static const struct of_device_id bcm_kona_pwmc_dt[] = {
        { .compatible = "brcm,kona-pwm" },
        { },
@@ -324,7 +315,6 @@ static struct platform_driver kona_pwmc_driver = {
                .of_match_table = bcm_kona_pwmc_dt,
        },
        .probe = kona_pwmc_probe,
-       .remove = kona_pwmc_remove,
 };
 module_platform_driver(kona_pwmc_driver);
 
index 8b1d1e7..3b529f8 100644 (file)
@@ -282,12 +282,11 @@ out_clk:
 static int brcmstb_pwm_remove(struct platform_device *pdev)
 {
        struct brcmstb_pwm *p = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&p->chip);
+       pwmchip_remove(&p->chip);
        clk_disable_unprepare(p->clk);
 
-       return ret;
+       return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
index 9fffb56..5e29d9c 100644 (file)
@@ -280,7 +280,9 @@ static int cros_ec_pwm_remove(struct platform_device *dev)
        struct cros_ec_pwm_device *ec_pwm = platform_get_drvdata(dev);
        struct pwm_chip *chip = &ec_pwm->chip;
 
-       return pwmchip_remove(chip);
+       pwmchip_remove(chip);
+
+       return 0;
 }
 
 #ifdef CONFIG_OF
index fc3cb7d..c45a75e 100644 (file)
@@ -183,27 +183,18 @@ static int ep93xx_pwm_probe(struct platform_device *pdev)
        ep93xx_pwm->chip.ops = &ep93xx_pwm_ops;
        ep93xx_pwm->chip.npwm = 1;
 
-       ret = pwmchip_add(&ep93xx_pwm->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &ep93xx_pwm->chip);
        if (ret < 0)
                return ret;
 
-       platform_set_drvdata(pdev, ep93xx_pwm);
        return 0;
 }
 
-static int ep93xx_pwm_remove(struct platform_device *pdev)
-{
-       struct ep93xx_pwm *ep93xx_pwm = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&ep93xx_pwm->chip);
-}
-
 static struct platform_driver ep93xx_pwm_driver = {
        .driver = {
                .name = "ep93xx-pwm",
        },
        .probe = ep93xx_pwm_probe,
-       .remove = ep93xx_pwm_remove,
 };
 module_platform_driver(ep93xx_pwm_driver);
 
index 96ccd77..0247757 100644 (file)
@@ -453,7 +453,7 @@ static int fsl_pwm_probe(struct platform_device *pdev)
        fpc->chip.ops = &fsl_pwm_ops;
        fpc->chip.npwm = 8;
 
-       ret = pwmchip_add(&fpc->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &fpc->chip);
        if (ret < 0) {
                dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret);
                return ret;
@@ -464,13 +464,6 @@ static int fsl_pwm_probe(struct platform_device *pdev)
        return fsl_pwm_init(fpc);
 }
 
-static int fsl_pwm_remove(struct platform_device *pdev)
-{
-       struct fsl_pwm_chip *fpc = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&fpc->chip);
-}
-
 #ifdef CONFIG_PM_SLEEP
 static int fsl_pwm_suspend(struct device *dev)
 {
@@ -552,7 +545,6 @@ static struct platform_driver fsl_pwm_driver = {
                .pm = &fsl_pwm_pm_ops,
        },
        .probe = fsl_pwm_probe,
-       .remove = fsl_pwm_remove,
 };
 module_platform_driver(fsl_pwm_driver);
 
index 4a6e9ad..333f1b1 100644 (file)
@@ -248,13 +248,15 @@ static int hibvt_pwm_remove(struct platform_device *pdev)
 
        pwm_chip = platform_get_drvdata(pdev);
 
+       pwmchip_remove(&pwm_chip->chip);
+
        reset_control_assert(pwm_chip->rstc);
        msleep(30);
        reset_control_deassert(pwm_chip->rstc);
 
        clk_disable_unprepare(pwm_chip->clk);
 
-       return pwmchip_remove(&pwm_chip->chip);
+       return 0;
 }
 
 static const struct of_device_id hibvt_pwm_of_match[] = {
index 11b16ec..f97f825 100644 (file)
@@ -326,28 +326,14 @@ err_pm_disable:
 static int img_pwm_remove(struct platform_device *pdev)
 {
        struct img_pwm_chip *pwm_chip = platform_get_drvdata(pdev);
-       u32 val;
-       unsigned int i;
-       int ret;
-
-       ret = pm_runtime_get_sync(&pdev->dev);
-       if (ret < 0) {
-               pm_runtime_put(&pdev->dev);
-               return ret;
-       }
-
-       for (i = 0; i < pwm_chip->chip.npwm; i++) {
-               val = img_pwm_readl(pwm_chip, PWM_CTRL_CFG);
-               val &= ~BIT(i);
-               img_pwm_writel(pwm_chip, PWM_CTRL_CFG, val);
-       }
 
-       pm_runtime_put(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
        if (!pm_runtime_status_suspended(&pdev->dev))
                img_pwm_runtime_suspend(&pdev->dev);
 
-       return pwmchip_remove(&pwm_chip->chip);
+       pwmchip_remove(&pwm_chip->chip);
+
+       return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
index dbb5049..e5e7b7c 100644 (file)
@@ -382,11 +382,12 @@ static int pwm_imx_tpm_probe(struct platform_device *pdev)
 static int pwm_imx_tpm_remove(struct platform_device *pdev)
 {
        struct imx_tpm_pwm_chip *tpm = platform_get_drvdata(pdev);
-       int ret = pwmchip_remove(&tpm->chip);
+
+       pwmchip_remove(&tpm->chip);
 
        clk_disable_unprepare(tpm->clk);
 
-       return ret;
+       return 0;
 }
 
 static int __maybe_unused pwm_imx_tpm_suspend(struct device *dev)
index f6588a9..ea91a2f 100644 (file)
@@ -313,8 +313,6 @@ static int pwm_imx27_probe(struct platform_device *pdev)
        if (imx == NULL)
                return -ENOMEM;
 
-       platform_set_drvdata(pdev, imx);
-
        imx->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
        if (IS_ERR(imx->clk_ipg))
                return dev_err_probe(&pdev->dev, PTR_ERR(imx->clk_ipg),
@@ -342,16 +340,7 @@ static int pwm_imx27_probe(struct platform_device *pdev)
        if (!(pwmcr & MX3_PWMCR_EN))
                pwm_imx27_clk_disable_unprepare(imx);
 
-       return pwmchip_add(&imx->chip);
-}
-
-static int pwm_imx27_remove(struct platform_device *pdev)
-{
-       struct pwm_imx27_chip *imx;
-
-       imx = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&imx->chip);
+       return devm_pwmchip_add(&pdev->dev, &imx->chip);
 }
 
 static struct platform_driver imx_pwm_driver = {
@@ -360,7 +349,6 @@ static struct platform_driver imx_pwm_driver = {
                .of_match_table = pwm_imx27_dt_ids,
        },
        .probe = pwm_imx27_probe,
-       .remove = pwm_imx27_remove,
 };
 module_platform_driver(imx_pwm_driver);
 
index 015f5eb..b66c350 100644 (file)
@@ -176,8 +176,6 @@ static int lgm_pwm_probe(struct platform_device *pdev)
        if (!pc)
                return -ENOMEM;
 
-       platform_set_drvdata(pdev, pc);
-
        io_base = devm_platform_ioremap_resource(pdev, 0);
        if (IS_ERR(io_base))
                return PTR_ERR(io_base);
@@ -210,20 +208,13 @@ static int lgm_pwm_probe(struct platform_device *pdev)
 
        lgm_pwm_init(pc);
 
-       ret = pwmchip_add(&pc->chip);
+       ret = devm_pwmchip_add(dev, &pc->chip);
        if (ret < 0)
                return dev_err_probe(dev, ret, "failed to add PWM chip\n");
 
        return 0;
 }
 
-static int lgm_pwm_remove(struct platform_device *pdev)
-{
-       struct lgm_pwm_chip *pc = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&pc->chip);
-}
-
 static const struct of_device_id lgm_pwm_of_match[] = {
        { .compatible = "intel,lgm-pwm" },
        { }
@@ -236,7 +227,6 @@ static struct platform_driver lgm_pwm_driver = {
                .of_match_table = lgm_pwm_of_match,
        },
        .probe = lgm_pwm_probe,
-       .remove = lgm_pwm_remove,
 };
 module_platform_driver(lgm_pwm_driver);
 
index 6c6e26d..54bd95a 100644 (file)
@@ -189,7 +189,6 @@ static int iqs620_pwm_probe(struct platform_device *pdev)
        if (!iqs620_pwm)
                return -ENOMEM;
 
-       platform_set_drvdata(pdev, iqs620_pwm);
        iqs620_pwm->iqs62x = iqs62x;
 
        ret = regmap_read(iqs62x->regmap, IQS620_PWR_SETTINGS, &val);
@@ -224,31 +223,18 @@ static int iqs620_pwm_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       ret = pwmchip_add(&iqs620_pwm->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &iqs620_pwm->chip);
        if (ret)
                dev_err(&pdev->dev, "Failed to add device: %d\n", ret);
 
        return ret;
 }
 
-static int iqs620_pwm_remove(struct platform_device *pdev)
-{
-       struct iqs620_pwm_private *iqs620_pwm = platform_get_drvdata(pdev);
-       int ret;
-
-       ret = pwmchip_remove(&iqs620_pwm->chip);
-       if (ret)
-               dev_err(&pdev->dev, "Failed to remove device: %d\n", ret);
-
-       return ret;
-}
-
 static struct platform_driver iqs620_pwm_platform_driver = {
        .driver = {
                .name = "iqs620a-pwm",
        },
        .probe = iqs620_pwm_probe,
-       .remove = iqs620_pwm_remove,
 };
 module_platform_driver(iqs620_pwm_platform_driver);
 
index 990e790..23dc1fb 100644 (file)
@@ -245,16 +245,7 @@ static int jz4740_pwm_probe(struct platform_device *pdev)
        jz4740->chip.ops = &jz4740_pwm_ops;
        jz4740->chip.npwm = info->num_pwms;
 
-       platform_set_drvdata(pdev, jz4740);
-
-       return pwmchip_add(&jz4740->chip);
-}
-
-static int jz4740_pwm_remove(struct platform_device *pdev)
-{
-       struct jz4740_pwm_chip *jz4740 = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&jz4740->chip);
+       return devm_pwmchip_add(dev, &jz4740->chip);
 }
 
 static const struct soc_info __maybe_unused jz4740_soc_info = {
@@ -280,7 +271,6 @@ static struct platform_driver jz4740_pwm_driver = {
                .of_match_table = of_match_ptr(jz4740_pwm_dt_ids),
        },
        .probe = jz4740_pwm_probe,
-       .remove = jz4740_pwm_remove,
 };
 module_platform_driver(jz4740_pwm_driver);
 
index 521a825..733811b 100644 (file)
@@ -207,22 +207,13 @@ static int keembay_pwm_probe(struct platform_device *pdev)
        priv->chip.ops = &keembay_pwm_ops;
        priv->chip.npwm = KMB_TOTAL_PWM_CHANNELS;
 
-       ret = pwmchip_add(&priv->chip);
+       ret = devm_pwmchip_add(dev, &priv->chip);
        if (ret)
                return dev_err_probe(dev, ret, "Failed to add PWM chip\n");
 
-       platform_set_drvdata(pdev, priv);
-
        return 0;
 }
 
-static int keembay_pwm_remove(struct platform_device *pdev)
-{
-       struct keembay_pwm *priv = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&priv->chip);
-}
-
 static const struct of_device_id keembay_pwm_of_match[] = {
        { .compatible = "intel,keembay-pwm" },
        { }
@@ -231,7 +222,6 @@ MODULE_DEVICE_TABLE(of, keembay_pwm_of_match);
 
 static struct platform_driver keembay_pwm_driver = {
        .probe  = keembay_pwm_probe,
-       .remove = keembay_pwm_remove,
        .driver = {
                .name = "pwm-keembay",
                .of_match_table = keembay_pwm_of_match,
index 7551253..ea17d44 100644 (file)
@@ -276,16 +276,7 @@ static int lp3943_pwm_probe(struct platform_device *pdev)
        lp3943_pwm->chip.ops = &lp3943_pwm_ops;
        lp3943_pwm->chip.npwm = LP3943_NUM_PWMS;
 
-       platform_set_drvdata(pdev, lp3943_pwm);
-
-       return pwmchip_add(&lp3943_pwm->chip);
-}
-
-static int lp3943_pwm_remove(struct platform_device *pdev)
-{
-       struct lp3943_pwm *lp3943_pwm = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&lp3943_pwm->chip);
+       return devm_pwmchip_add(&pdev->dev, &lp3943_pwm->chip);
 }
 
 #ifdef CONFIG_OF
@@ -298,7 +289,6 @@ MODULE_DEVICE_TABLE(of, lp3943_pwm_of_match);
 
 static struct platform_driver lp3943_pwm_driver = {
        .probe = lp3943_pwm_probe,
-       .remove = lp3943_pwm_remove,
        .driver = {
                .name = "lp3943-pwm",
                .of_match_table = of_match_ptr(lp3943_pwm_of_match),
index 2834a0f..ddeab56 100644 (file)
@@ -117,29 +117,20 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev)
        lpc32xx->chip.ops = &lpc32xx_pwm_ops;
        lpc32xx->chip.npwm = 1;
 
-       ret = pwmchip_add(&lpc32xx->chip);
-       if (ret < 0) {
-               dev_err(&pdev->dev, "failed to add PWM chip, error %d\n", ret);
-               return ret;
-       }
-
-       /* When PWM is disable, configure the output to the default value */
+       /* If PWM is disabled, configure the output to the default value */
        val = readl(lpc32xx->base + (lpc32xx->chip.pwms[0].hwpwm << 2));
        val &= ~PWM_PIN_LEVEL;
        writel(val, lpc32xx->base + (lpc32xx->chip.pwms[0].hwpwm << 2));
 
-       platform_set_drvdata(pdev, lpc32xx);
+       ret = devm_pwmchip_add(&pdev->dev, &lpc32xx->chip);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "failed to add PWM chip, error %d\n", ret);
+               return ret;
+       }
 
        return 0;
 }
 
-static int lpc32xx_pwm_remove(struct platform_device *pdev)
-{
-       struct lpc32xx_pwm_chip *lpc32xx = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&lpc32xx->chip);
-}
-
 static const struct of_device_id lpc32xx_pwm_dt_ids[] = {
        { .compatible = "nxp,lpc3220-pwm", },
        { /* sentinel */ }
@@ -152,7 +143,6 @@ static struct platform_driver lpc32xx_pwm_driver = {
                .of_match_table = lpc32xx_pwm_dt_ids,
        },
        .probe = lpc32xx_pwm_probe,
-       .remove = lpc32xx_pwm_remove,
 };
 module_platform_driver(lpc32xx_pwm_driver);
 
index b4a3106..0d4dd80 100644 (file)
@@ -253,13 +253,11 @@ static int pwm_mediatek_probe(struct platform_device *pdev)
                }
        }
 
-       platform_set_drvdata(pdev, pc);
-
        pc->chip.dev = &pdev->dev;
        pc->chip.ops = &pwm_mediatek_ops;
        pc->chip.npwm = pc->soc->num_pwms;
 
-       ret = pwmchip_add(&pc->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &pc->chip);
        if (ret < 0) {
                dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
                return ret;
@@ -268,13 +266,6 @@ static int pwm_mediatek_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int pwm_mediatek_remove(struct platform_device *pdev)
-{
-       struct pwm_mediatek_chip *pc = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&pc->chip);
-}
-
 static const struct pwm_mediatek_of_data mt2712_pwm_data = {
        .num_pwms = 8,
        .pwm45_fixup = false,
@@ -335,7 +326,6 @@ static struct platform_driver pwm_mediatek_driver = {
                .of_match_table = pwm_mediatek_of_match,
        },
        .probe = pwm_mediatek_probe,
-       .remove = pwm_mediatek_remove,
 };
 module_platform_driver(pwm_mediatek_driver);
 
index 9b3ba40..c605013 100644 (file)
@@ -5,6 +5,7 @@
  * Author: YH Huang <yh.huang@mediatek.com>
  */
 
+#include <linux/bitfield.h>
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/io.h>
@@ -47,6 +48,7 @@ struct mtk_disp_pwm {
        struct clk *clk_main;
        struct clk *clk_mm;
        void __iomem *base;
+       bool enabled;
 };
 
 static inline struct mtk_disp_pwm *to_mtk_disp_pwm(struct pwm_chip *chip)
@@ -66,14 +68,47 @@ static void mtk_disp_pwm_update_bits(struct mtk_disp_pwm *mdp, u32 offset,
        writel(value, address);
 }
 
-static int mtk_disp_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-                              int duty_ns, int period_ns)
+static int mtk_disp_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+                             const struct pwm_state *state)
 {
        struct mtk_disp_pwm *mdp = to_mtk_disp_pwm(chip);
        u32 clk_div, period, high_width, value;
        u64 div, rate;
        int err;
 
+       if (state->polarity != PWM_POLARITY_NORMAL)
+               return -EINVAL;
+
+       if (!state->enabled) {
+               mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
+                                        0x0);
+
+               if (mdp->enabled) {
+                       clk_disable_unprepare(mdp->clk_mm);
+                       clk_disable_unprepare(mdp->clk_main);
+               }
+
+               mdp->enabled = false;
+               return 0;
+       }
+
+       if (!mdp->enabled) {
+               err = clk_prepare_enable(mdp->clk_main);
+               if (err < 0) {
+                       dev_err(chip->dev, "Can't enable mdp->clk_main: %pe\n",
+                               ERR_PTR(err));
+                       return err;
+               }
+
+               err = clk_prepare_enable(mdp->clk_mm);
+               if (err < 0) {
+                       dev_err(chip->dev, "Can't enable mdp->clk_mm: %pe\n",
+                               ERR_PTR(err));
+                       clk_disable_unprepare(mdp->clk_main);
+                       return err;
+               }
+       }
+
        /*
         * Find period, high_width and clk_div to suit duty_ns and period_ns.
         * Calculate proper div value to keep period value in the bound.
@@ -85,29 +120,24 @@ static int mtk_disp_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
         * high_width = (PWM_CLK_RATE * duty_ns) / (10^9 * (clk_div + 1))
         */
        rate = clk_get_rate(mdp->clk_main);
-       clk_div = div_u64(rate * period_ns, NSEC_PER_SEC) >>
+       clk_div = mul_u64_u64_div_u64(state->period, rate, NSEC_PER_SEC) >>
                          PWM_PERIOD_BIT_WIDTH;
-       if (clk_div > PWM_CLKDIV_MAX)
+       if (clk_div > PWM_CLKDIV_MAX) {
+               if (!mdp->enabled) {
+                       clk_disable_unprepare(mdp->clk_mm);
+                       clk_disable_unprepare(mdp->clk_main);
+               }
                return -EINVAL;
+       }
 
        div = NSEC_PER_SEC * (clk_div + 1);
-       period = div64_u64(rate * period_ns, div);
+       period = mul_u64_u64_div_u64(state->period, rate, div);
        if (period > 0)
                period--;
 
-       high_width = div64_u64(rate * duty_ns, div);
+       high_width = mul_u64_u64_div_u64(state->duty_cycle, rate, div);
        value = period | (high_width << PWM_HIGH_WIDTH_SHIFT);
 
-       err = clk_enable(mdp->clk_main);
-       if (err < 0)
-               return err;
-
-       err = clk_enable(mdp->clk_mm);
-       if (err < 0) {
-               clk_disable(mdp->clk_main);
-               return err;
-       }
-
        mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
                                 PWM_CLKDIV_MASK,
                                 clk_div << PWM_CLKDIV_SHIFT);
@@ -122,50 +152,70 @@ static int mtk_disp_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                mtk_disp_pwm_update_bits(mdp, mdp->data->commit,
                                         mdp->data->commit_mask,
                                         0x0);
+       } else {
+               /*
+                * For MT2701, disable double buffer before writing register
+                * and select manual mode and use PWM_PERIOD/PWM_HIGH_WIDTH.
+                */
+               mtk_disp_pwm_update_bits(mdp, mdp->data->bls_debug,
+                                        mdp->data->bls_debug_mask,
+                                        mdp->data->bls_debug_mask);
+               mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
+                                        mdp->data->con0_sel,
+                                        mdp->data->con0_sel);
        }
 
-       clk_disable(mdp->clk_mm);
-       clk_disable(mdp->clk_main);
+       mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
+                                mdp->data->enable_mask);
+       mdp->enabled = true;
 
        return 0;
 }
 
-static int mtk_disp_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
+static void mtk_disp_pwm_get_state(struct pwm_chip *chip,
+                                  struct pwm_device *pwm,
+                                  struct pwm_state *state)
 {
        struct mtk_disp_pwm *mdp = to_mtk_disp_pwm(chip);
+       u64 rate, period, high_width;
+       u32 clk_div, con0, con1;
        int err;
 
-       err = clk_enable(mdp->clk_main);
-       if (err < 0)
-               return err;
-
-       err = clk_enable(mdp->clk_mm);
+       err = clk_prepare_enable(mdp->clk_main);
        if (err < 0) {
-               clk_disable(mdp->clk_main);
-               return err;
+               dev_err(chip->dev, "Can't enable mdp->clk_main: %pe\n", ERR_PTR(err));
+               return;
        }
 
-       mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
-                                mdp->data->enable_mask);
-
-       return 0;
-}
-
-static void mtk_disp_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
-{
-       struct mtk_disp_pwm *mdp = to_mtk_disp_pwm(chip);
-
-       mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
-                                0x0);
+       err = clk_prepare_enable(mdp->clk_mm);
+       if (err < 0) {
+               dev_err(chip->dev, "Can't enable mdp->clk_mm: %pe\n", ERR_PTR(err));
+               clk_disable_unprepare(mdp->clk_main);
+               return;
+       }
 
-       clk_disable(mdp->clk_mm);
-       clk_disable(mdp->clk_main);
+       rate = clk_get_rate(mdp->clk_main);
+       con0 = readl(mdp->base + mdp->data->con0);
+       con1 = readl(mdp->base + mdp->data->con1);
+       state->enabled = !!(con0 & BIT(0));
+       clk_div = FIELD_GET(PWM_CLKDIV_MASK, con0);
+       period = FIELD_GET(PWM_PERIOD_MASK, con1);
+       /*
+        * period has 12 bits, clk_div 11 and NSEC_PER_SEC has 30,
+        * so period * (clk_div + 1) * NSEC_PER_SEC doesn't overflow.
+        */
+       state->period = DIV64_U64_ROUND_UP(period * (clk_div + 1) * NSEC_PER_SEC, rate);
+       high_width = FIELD_GET(PWM_HIGH_WIDTH_MASK, con1);
+       state->duty_cycle = DIV64_U64_ROUND_UP(high_width * (clk_div + 1) * NSEC_PER_SEC,
+                                              rate);
+       state->polarity = PWM_POLARITY_NORMAL;
+       clk_disable_unprepare(mdp->clk_mm);
+       clk_disable_unprepare(mdp->clk_main);
 }
 
 static const struct pwm_ops mtk_disp_pwm_ops = {
-       .config = mtk_disp_pwm_config,
-       .enable = mtk_disp_pwm_enable,
-       .disable = mtk_disp_pwm_disable,
+       .apply = mtk_disp_pwm_apply,
+       .get_state = mtk_disp_pwm_get_state,
        .owner = THIS_MODULE,
 };
 
@@ -192,58 +242,28 @@ static int mtk_disp_pwm_probe(struct platform_device *pdev)
        if (IS_ERR(mdp->clk_mm))
                return PTR_ERR(mdp->clk_mm);
 
-       ret = clk_prepare(mdp->clk_main);
-       if (ret < 0)
-               return ret;
-
-       ret = clk_prepare(mdp->clk_mm);
-       if (ret < 0)
-               goto disable_clk_main;
-
        mdp->chip.dev = &pdev->dev;
        mdp->chip.ops = &mtk_disp_pwm_ops;
        mdp->chip.npwm = 1;
 
        ret = pwmchip_add(&mdp->chip);
        if (ret < 0) {
-               dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
-               goto disable_clk_mm;
+               dev_err(&pdev->dev, "pwmchip_add() failed: %pe\n", ERR_PTR(ret));
+               return ret;
        }
 
        platform_set_drvdata(pdev, mdp);
 
-       /*
-        * For MT2701, disable double buffer before writing register
-        * and select manual mode and use PWM_PERIOD/PWM_HIGH_WIDTH.
-        */
-       if (!mdp->data->has_commit) {
-               mtk_disp_pwm_update_bits(mdp, mdp->data->bls_debug,
-                                        mdp->data->bls_debug_mask,
-                                        mdp->data->bls_debug_mask);
-               mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
-                                        mdp->data->con0_sel,
-                                        mdp->data->con0_sel);
-       }
-
        return 0;
-
-disable_clk_mm:
-       clk_unprepare(mdp->clk_mm);
-disable_clk_main:
-       clk_unprepare(mdp->clk_main);
-       return ret;
 }
 
 static int mtk_disp_pwm_remove(struct platform_device *pdev)
 {
        struct mtk_disp_pwm *mdp = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&mdp->chip);
-       clk_unprepare(mdp->clk_mm);
-       clk_unprepare(mdp->clk_main);
+       pwmchip_remove(&mdp->chip);
 
-       return ret;
+       return 0;
 }
 
 static const struct mtk_pwm_data mt2701_pwm_data = {
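
The arithmetic in the new mtk_disp_pwm_apply() above is easier to follow with concrete numbers; the switch to mul_u64_u64_div_u64() is what keeps rate * period from overflowing 64 bits before the division. As a worked example, assuming a 26 MHz source clock and the 12-bit period field mentioned in the get_state() comment (both assumptions for illustration, not values taken from this patch), a request for period = 1 ms with duty_cycle = 0.5 ms works out to:

        clk_div    = (1000000 ns * 26000000 Hz / 10^9) >> 12 = 26000 >> 12 = 6
        div        = 10^9 * (clk_div + 1)                     = 7 * 10^9
        period     = 1000000 * 26000000 / div - 1             = 3714 - 1 = 3713
        high_width =  500000 * 26000000 / div                 = 1857

so the counter runs at 26 MHz / 7 and counts 3714 ticks per period, 1857 of them high.
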
index a221808..766dbc5 100644 (file)
@@ -145,30 +145,18 @@ static int mxs_pwm_probe(struct platform_device *pdev)
                return ret;
        }
 
-       ret = pwmchip_add(&mxs->chip);
+       /* FIXME: Only do this if the PWM isn't already running */
+       ret = stmp_reset_block(mxs->base);
+       if (ret)
+               return dev_err_probe(&pdev->dev, ret, "failed to reset PWM\n");
+
+       ret = devm_pwmchip_add(&pdev->dev, &mxs->chip);
        if (ret < 0) {
                dev_err(&pdev->dev, "failed to add pwm chip %d\n", ret);
                return ret;
        }
 
-       platform_set_drvdata(pdev, mxs);
-
-       ret = stmp_reset_block(mxs->base);
-       if (ret)
-               goto pwm_remove;
-
        return 0;
-
-pwm_remove:
-       pwmchip_remove(&mxs->chip);
-       return ret;
-}
-
-static int mxs_pwm_remove(struct platform_device *pdev)
-{
-       struct mxs_pwm_chip *mxs = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&mxs->chip);
 }
 
 static const struct of_device_id mxs_pwm_dt_ids[] = {
@@ -183,7 +171,6 @@ static struct platform_driver mxs_pwm_driver = {
                .of_match_table = mxs_pwm_dt_ids,
        },
        .probe = mxs_pwm_probe,
-       .remove = mxs_pwm_remove,
 };
 module_platform_driver(mxs_pwm_driver);
 
index 50c454c..ab63b08 100644 (file)
@@ -150,23 +150,12 @@ static int ntxec_pwm_probe(struct platform_device *pdev)
        priv->ec = ec;
        priv->dev = &pdev->dev;
 
-       platform_set_drvdata(pdev, priv);
-
        chip = &priv->chip;
        chip->dev = &pdev->dev;
        chip->ops = &ntxec_pwm_ops;
-       chip->base = -1;
        chip->npwm = 1;
 
-       return pwmchip_add(chip);
-}
-
-static int ntxec_pwm_remove(struct platform_device *pdev)
-{
-       struct ntxec_pwm *priv = platform_get_drvdata(pdev);
-       struct pwm_chip *chip = &priv->chip;
-
-       return pwmchip_remove(chip);
+       return devm_pwmchip_add(&pdev->dev, chip);
 }
 
 static struct platform_driver ntxec_pwm_driver = {
@@ -174,7 +163,6 @@ static struct platform_driver ntxec_pwm_driver = {
                .name = "ntxec-pwm",
        },
        .probe = ntxec_pwm_probe,
-       .remove = ntxec_pwm_remove,
 };
 module_platform_driver(ntxec_pwm_driver);
 
index 507a2d9..fa800fc 100644 (file)
@@ -444,11 +444,8 @@ err_find_timer_pdev:
 static int pwm_omap_dmtimer_remove(struct platform_device *pdev)
 {
        struct pwm_omap_dmtimer_chip *omap = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&omap->chip);
-       if (ret)
-               return ret;
+       pwmchip_remove(&omap->chip);
 
        if (pm_runtime_active(&omap->dm_timer_pdev->dev))
                omap->pdata->stop(omap->dm_timer);
index 42ed770..c56001a 100644 (file)
@@ -601,11 +601,8 @@ static int pca9685_pwm_probe(struct i2c_client *client,
 static int pca9685_pwm_remove(struct i2c_client *client)
 {
        struct pca9685 *pca = i2c_get_clientdata(client);
-       int ret;
 
-       ret = pwmchip_remove(&pca->chip);
-       if (ret)
-               return ret;
+       pwmchip_remove(&pca->chip);
 
        if (!pm_runtime_enabled(&client->dev)) {
                /* Put chip in sleep state if runtime PM is disabled */
index e091a52..a9efdcf 100644 (file)
@@ -195,32 +195,21 @@ static int pwm_probe(struct platform_device *pdev)
        if (IS_ERR(pc->mmio_base))
                return PTR_ERR(pc->mmio_base);
 
-       ret = pwmchip_add(&pc->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &pc->chip);
        if (ret < 0) {
                dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
                return ret;
        }
 
-       platform_set_drvdata(pdev, pc);
        return 0;
 }
 
-static int pwm_remove(struct platform_device *pdev)
-{
-       struct pxa_pwm_chip *pc;
-
-       pc = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&pc->chip);
-}
-
 static struct platform_driver pwm_driver = {
        .driver         = {
                .name   = "pxa25x-pwm",
                .of_match_table = pwm_of_match,
        },
        .probe          = pwm_probe,
-       .remove         = pwm_remove,
        .id_table       = pwm_id_table,
 };
 
index 043fc32..579a152 100644 (file)
@@ -166,8 +166,6 @@ static int raspberrypi_pwm_probe(struct platform_device *pdev)
        rpipwm->chip.base = -1;
        rpipwm->chip.npwm = RASPBERRYPI_FIRMWARE_PWM_NUM;
 
-       platform_set_drvdata(pdev, rpipwm);
-
        ret = raspberrypi_pwm_get_property(rpipwm->firmware, RPI_PWM_CUR_DUTY_REG,
                                           &rpipwm->duty_cycle);
        if (ret) {
@@ -175,14 +173,7 @@ static int raspberrypi_pwm_probe(struct platform_device *pdev)
                return ret;
        }
 
-       return pwmchip_add(&rpipwm->chip);
-}
-
-static int raspberrypi_pwm_remove(struct platform_device *pdev)
-{
-       struct raspberrypi_pwm *rpipwm = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&rpipwm->chip);
+       return devm_pwmchip_add(dev, &rpipwm->chip);
 }
 
 static const struct of_device_id raspberrypi_pwm_of_match[] = {
@@ -197,7 +188,6 @@ static struct platform_driver raspberrypi_pwm_driver = {
                .of_match_table = raspberrypi_pwm_of_match,
        },
        .probe = raspberrypi_pwm_probe,
-       .remove = raspberrypi_pwm_remove,
 };
 module_platform_driver(raspberrypi_pwm_driver);
 
index 9daca0c..b437192 100644 (file)
@@ -241,13 +241,12 @@ static int rcar_pwm_probe(struct platform_device *pdev)
 static int rcar_pwm_remove(struct platform_device *pdev)
 {
        struct rcar_pwm_chip *rcar_pwm = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&rcar_pwm->chip);
+       pwmchip_remove(&rcar_pwm->chip);
 
        pm_runtime_disable(&pdev->dev);
 
-       return ret;
+       return 0;
 }
 
 static const struct of_device_id rcar_pwm_of_table[] = {
index b853e79..4381df9 100644 (file)
@@ -425,13 +425,12 @@ static int tpu_probe(struct platform_device *pdev)
 static int tpu_remove(struct platform_device *pdev)
 {
        struct tpu_device *tpu = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&tpu->chip);
+       pwmchip_remove(&tpu->chip);
 
        pm_runtime_disable(&pdev->dev);
 
-       return ret;
+       return 0;
 }
 
 #ifdef CONFIG_OF
index cbe9008..f3647b3 100644 (file)
@@ -384,24 +384,12 @@ static int rockchip_pwm_remove(struct platform_device *pdev)
 {
        struct rockchip_pwm_chip *pc = platform_get_drvdata(pdev);
 
-       /*
-        * Disable the PWM clk before unpreparing it if the PWM device is still
-        * running. This should only happen when the last PWM user left it
-        * enabled, or when nobody requested a PWM that was previously enabled
-        * by the bootloader.
-        *
-        * FIXME: Maybe the core should disable all PWM devices in
-        * pwmchip_remove(). In this case we'd only have to call
-        * clk_unprepare() after pwmchip_remove().
-        *
-        */
-       if (pwm_is_enabled(pc->chip.pwms))
-               clk_disable(pc->clk);
+       pwmchip_remove(&pc->chip);
 
        clk_unprepare(pc->pclk);
        clk_unprepare(pc->clk);
 
-       return pwmchip_remove(&pc->chip);
+       return 0;
 }
 
 static struct platform_driver rockchip_pwm_driver = {
index f6c528f..dd94c43 100644 (file)
@@ -580,11 +580,8 @@ static int pwm_samsung_probe(struct platform_device *pdev)
 static int pwm_samsung_remove(struct platform_device *pdev)
 {
        struct samsung_pwm_chip *chip = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&chip->chip);
-       if (ret < 0)
-               return ret;
+       pwmchip_remove(&chip->chip);
 
        clk_disable_unprepare(chip->base_clk);
 
index 420edc4..253c4a1 100644 (file)
@@ -291,7 +291,7 @@ static int pwm_sifive_remove(struct platform_device *dev)
        struct pwm_sifive_ddata *ddata = platform_get_drvdata(dev);
        bool is_enabled = false;
        struct pwm_device *pwm;
-       int ret, ch;
+       int ch;
 
        for (ch = 0; ch < ddata->chip.npwm; ch++) {
                pwm = &ddata->chip.pwms[ch];
@@ -304,10 +304,10 @@ static int pwm_sifive_remove(struct platform_device *dev)
                clk_disable(ddata->clk);
 
        clk_disable_unprepare(ddata->clk);
-       ret = pwmchip_remove(&ddata->chip);
+       pwmchip_remove(&ddata->chip);
        clk_notifier_unregister(ddata->clk, &ddata->notifier);
 
-       return ret;
+       return 0;
 }
 
 static const struct of_device_id pwm_sifive_of_match[] = {
index 7a69c1a..589aeaa 100644 (file)
@@ -231,9 +231,7 @@ static int sl28cpld_pwm_probe(struct platform_device *pdev)
        chip->ops = &sl28cpld_pwm_ops;
        chip->npwm = 1;
 
-       platform_set_drvdata(pdev, priv);
-
-       ret = pwmchip_add(&priv->pwm_chip);
+       ret = devm_pwmchip_add(&pdev->dev, &priv->pwm_chip);
        if (ret) {
                dev_err(&pdev->dev, "failed to add PWM chip (%pe)",
                        ERR_PTR(ret));
@@ -243,13 +241,6 @@ static int sl28cpld_pwm_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int sl28cpld_pwm_remove(struct platform_device *pdev)
-{
-       struct sl28cpld_pwm *priv = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&priv->pwm_chip);
-}
-
 static const struct of_device_id sl28cpld_pwm_of_match[] = {
        { .compatible = "kontron,sl28cpld-pwm" },
        {}
@@ -258,7 +249,6 @@ MODULE_DEVICE_TABLE(of, sl28cpld_pwm_of_match);
 
 static struct platform_driver sl28cpld_pwm_driver = {
        .probe = sl28cpld_pwm_probe,
-       .remove = sl28cpld_pwm_remove,
        .driver = {
                .name = "sl28cpld-pwm",
                .of_match_table = sl28cpld_pwm_of_match,
index 93dd036..3115abb 100644 (file)
@@ -209,7 +209,7 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev)
        priv->chip.ops = &stm32_pwm_lp_ops;
        priv->chip.npwm = 1;
 
-       ret = pwmchip_add(&priv->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &priv->chip);
        if (ret < 0)
                return ret;
 
@@ -218,15 +218,6 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int stm32_pwm_lp_remove(struct platform_device *pdev)
-{
-       struct stm32_pwm_lp *priv = platform_get_drvdata(pdev);
-
-       pwm_disable(&priv->chip.pwms[0]);
-
-       return pwmchip_remove(&priv->chip);
-}
-
 static int __maybe_unused stm32_pwm_lp_suspend(struct device *dev)
 {
        struct stm32_pwm_lp *priv = dev_get_drvdata(dev);
@@ -258,7 +249,6 @@ MODULE_DEVICE_TABLE(of, stm32_pwm_lp_of_match);
 
 static struct platform_driver stm32_pwm_lp_driver = {
        .probe  = stm32_pwm_lp_probe,
-       .remove = stm32_pwm_lp_remove,
        .driver = {
                .name = "stm32-pwm-lp",
                .of_match_table = of_match_ptr(stm32_pwm_lp_of_match),
index c952604..91ca676 100644 (file)
@@ -484,11 +484,8 @@ err_bus:
 static int sun4i_pwm_remove(struct platform_device *pdev)
 {
        struct sun4i_pwm_chip *pwm = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = pwmchip_remove(&pwm->chip);
-       if (ret)
-               return ret;
+       pwmchip_remove(&pwm->chip);
 
        clk_disable_unprepare(pwm->bus_clk);
        reset_control_assert(pwm->rst);
index 35eb19a..4701f0c 100644 (file)
@@ -253,7 +253,7 @@ static int ecap_pwm_probe(struct platform_device *pdev)
        if (IS_ERR(pc->mmio_base))
                return PTR_ERR(pc->mmio_base);
 
-       ret = pwmchip_add(&pc->chip);
+       ret = devm_pwmchip_add(&pdev->dev, &pc->chip);
        if (ret < 0) {
                dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
                return ret;
@@ -267,11 +267,9 @@ static int ecap_pwm_probe(struct platform_device *pdev)
 
 static int ecap_pwm_remove(struct platform_device *pdev)
 {
-       struct ecap_pwm_chip *pc = platform_get_drvdata(pdev);
-
        pm_runtime_disable(&pdev->dev);
 
-       return pwmchip_remove(&pc->chip);
+       return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
index 17909fa..5b723a4 100644 (file)
@@ -485,11 +485,13 @@ static int ehrpwm_pwm_remove(struct platform_device *pdev)
 {
        struct ehrpwm_pwm_chip *pc = platform_get_drvdata(pdev);
 
+       pwmchip_remove(&pc->chip);
+
        clk_unprepare(pc->tbclk);
 
        pm_runtime_disable(&pdev->dev);
 
-       return pwmchip_remove(&pc->chip);
+       return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
index 6c8df5f..49d9f7a 100644 (file)
@@ -276,7 +276,6 @@ static const struct pwm_ops twl6030_pwmled_ops = {
 static int twl_pwmled_probe(struct platform_device *pdev)
 {
        struct twl_pwmled_chip *twl;
-       int ret;
 
        twl = devm_kzalloc(&pdev->dev, sizeof(*twl), GFP_KERNEL);
        if (!twl)
@@ -294,20 +293,7 @@ static int twl_pwmled_probe(struct platform_device *pdev)
 
        mutex_init(&twl->mutex);
 
-       ret = pwmchip_add(&twl->chip);
-       if (ret < 0)
-               return ret;
-
-       platform_set_drvdata(pdev, twl);
-
-       return 0;
-}
-
-static int twl_pwmled_remove(struct platform_device *pdev)
-{
-       struct twl_pwmled_chip *twl = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&twl->chip);
+       return devm_pwmchip_add(&pdev->dev, &twl->chip);
 }
 
 #ifdef CONFIG_OF
@@ -325,7 +311,6 @@ static struct platform_driver twl_pwmled_driver = {
                .of_match_table = of_match_ptr(twl_pwmled_of_match),
        },
        .probe = twl_pwmled_probe,
-       .remove = twl_pwmled_remove,
 };
 module_platform_driver(twl_pwmled_driver);
 
index e83a826..203194f 100644 (file)
@@ -298,7 +298,6 @@ static const struct pwm_ops twl6030_pwm_ops = {
 static int twl_pwm_probe(struct platform_device *pdev)
 {
        struct twl_pwm_chip *twl;
-       int ret;
 
        twl = devm_kzalloc(&pdev->dev, sizeof(*twl), GFP_KERNEL);
        if (!twl)
@@ -314,20 +313,7 @@ static int twl_pwm_probe(struct platform_device *pdev)
 
        mutex_init(&twl->mutex);
 
-       ret = pwmchip_add(&twl->chip);
-       if (ret < 0)
-               return ret;
-
-       platform_set_drvdata(pdev, twl);
-
-       return 0;
-}
-
-static int twl_pwm_remove(struct platform_device *pdev)
-{
-       struct twl_pwm_chip *twl = platform_get_drvdata(pdev);
-
-       return pwmchip_remove(&twl->chip);
+       return devm_pwmchip_add(&pdev->dev, &twl->chip);
 }
 
 #ifdef CONFIG_OF
@@ -345,7 +331,6 @@ static struct platform_driver twl_pwm_driver = {
                .of_match_table = of_match_ptr(twl_pwm_of_match),
        },
        .probe = twl_pwm_probe,
-       .remove = twl_pwm_remove,
 };
 module_platform_driver(twl_pwm_driver);
 
index 12153d5..e1bc521 100644 (file)
@@ -75,6 +75,15 @@ config RTC_DEBUG
          Say yes here to enable debugging support in the RTC framework
          and individual RTC drivers.
 
+config RTC_LIB_KUNIT_TEST
+       tristate "KUnit test for RTC lib functions" if !KUNIT_ALL_TESTS
+       depends on KUNIT
+       default KUNIT_ALL_TESTS
+       help
+         Enable this option to test RTC library functions.
+
+         If unsure, say N.
+
 config RTC_NVMEM
        bool "RTC non volatile storage support"
        select NVMEM
@@ -624,6 +633,7 @@ config RTC_DRV_FM3130
 
 config RTC_DRV_RX8010
        tristate "Epson RX8010SJ"
+       select REGMAP_I2C
        help
          If you say yes here you get support for the Epson RX8010SJ RTC
          chip.
index 2dd0dd9..5ceeafe 100644 (file)
@@ -15,6 +15,8 @@ rtc-core-$(CONFIG_RTC_INTF_DEV)               += dev.o
 rtc-core-$(CONFIG_RTC_INTF_PROC)       += proc.o
 rtc-core-$(CONFIG_RTC_INTF_SYSFS)      += sysfs.o
 
+obj-$(CONFIG_RTC_LIB_KUNIT_TEST)       += lib_test.o
+
 # Keep the list ordered.
 
 obj-$(CONFIG_RTC_DRV_88PM80X)  += rtc-88pm80x.o
index 2328458..fe36165 100644 (file)
@@ -6,6 +6,8 @@
  * Author: Alessandro Zummo <a.zummo@towertech.it>
  *
  * based on arch/arm/common/rtctime.c and other bits
+ *
+ * Author: Cassio Neri <cassio.neri@gmail.com> (rtc_time64_to_tm)
  */
 
 #include <linux/export.h>
@@ -22,8 +24,6 @@ static const unsigned short rtc_ydays[2][13] = {
        { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 }
 };
 
-#define LEAPS_THRU_END_OF(y) ((y) / 4 - (y) / 100 + (y) / 400)
-
 /*
  * The number of days in the month.
  */
@@ -42,42 +42,95 @@ int rtc_year_days(unsigned int day, unsigned int month, unsigned int year)
 }
 EXPORT_SYMBOL(rtc_year_days);
 
-/*
- * rtc_time64_to_tm - Converts time64_t to rtc_time.
- * Convert seconds since 01-01-1970 00:00:00 to Gregorian date.
+/**
+ * rtc_time64_to_tm - converts time64_t to rtc_time.
+ *
+ * @time:      The number of seconds since 01-01-1970 00:00:00.
+ *             (Must be positive.)
+ * @tm:                Pointer to the struct rtc_time.
  */
 void rtc_time64_to_tm(time64_t time, struct rtc_time *tm)
 {
-       unsigned int month, year, secs;
+       unsigned int secs;
        int days;
 
+       u64 u64tmp;
+       u32 u32tmp, udays, century, day_of_century, year_of_century, year,
+               day_of_year, month, day;
+       bool is_Jan_or_Feb, is_leap_year;
+
        /* time must be positive */
        days = div_s64_rem(time, 86400, &secs);
 
        /* day of the week, 1970-01-01 was a Thursday */
        tm->tm_wday = (days + 4) % 7;
 
-       year = 1970 + days / 365;
-       days -= (year - 1970) * 365
-               + LEAPS_THRU_END_OF(year - 1)
-               - LEAPS_THRU_END_OF(1970 - 1);
-       while (days < 0) {
-               year -= 1;
-               days += 365 + is_leap_year(year);
-       }
-       tm->tm_year = year - 1900;
-       tm->tm_yday = days + 1;
-
-       for (month = 0; month < 11; month++) {
-               int newdays;
-
-               newdays = days - rtc_month_days(month, year);
-               if (newdays < 0)
-                       break;
-               days = newdays;
-       }
-       tm->tm_mon = month;
-       tm->tm_mday = days + 1;
+       /*
+        * The following algorithm is, basically, Proposition 6.3 of Neri
+        * and Schneider [1]. In a few words: it works on the computational
+        * (fictitious) calendar where the year starts in March, month = 2
+        * (*), and finishes in February, month = 13. This calendar is
+        * mathematically convenient because the day of the year does not
+        * depend on whether the year is leap or not. For instance:
+        *
+        * March 1st            0-th day of the year;
+        * ...
+        * April 1st            31-st day of the year;
+        * ...
+        * January 1st          306-th day of the year; (Important!)
+        * ...
+        * February 28th        364-th day of the year;
+        * February 29th        365-th day of the year (if it exists).
+        *
+        * After having worked out the date in the computational calendar
+        * (using just arithmetics) it's easy to convert it to the
+        * corresponding date in the Gregorian calendar.
+        *
+        * [1] "Euclidean Affine Functions and Applications to Calendar
+        * Algorithms". https://arxiv.org/abs/2102.06959
+        *
+        * (*) The numbering of months follows rtc_time more closely and
+        * thus, is slightly different from [1].
+        */
+
+       udays           = ((u32) days) + 719468;
+
+       u32tmp          = 4 * udays + 3;
+       century         = u32tmp / 146097;
+       day_of_century  = u32tmp % 146097 / 4;
+
+       u32tmp          = 4 * day_of_century + 3;
+       u64tmp          = 2939745ULL * u32tmp;
+       year_of_century = upper_32_bits(u64tmp);
+       day_of_year     = lower_32_bits(u64tmp) / 2939745 / 4;
+
+       year            = 100 * century + year_of_century;
+       is_leap_year    = year_of_century != 0 ?
+               year_of_century % 4 == 0 : century % 4 == 0;
+
+       u32tmp          = 2141 * day_of_year + 132377;
+       month           = u32tmp >> 16;
+       day             = ((u16) u32tmp) / 2141;
+
+       /*
+        * Recall that January 01 is the 306-th day of the year in the
+        * computational (not Gregorian) calendar.
+        */
+       is_Jan_or_Feb   = day_of_year >= 306;
+
+       /* Converts to the Gregorian calendar. */
+       year            = year + is_Jan_or_Feb;
+       month           = is_Jan_or_Feb ? month - 12 : month;
+       day             = day + 1;
+
+       day_of_year     = is_Jan_or_Feb ?
+               day_of_year - 306 : day_of_year + 31 + 28 + is_leap_year;
+
+       /* Converts to rtc_time's format. */
+       tm->tm_year     = (int) (year - 1900);
+       tm->tm_mon      = (int) month;
+       tm->tm_mday     = (int) day;
+       tm->tm_yday     = (int) day_of_year + 1;
 
        tm->tm_hour = secs / 3600;
        secs -= tm->tm_hour * 3600;
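
To make the new Euclidean-affine computation in rtc_time64_to_tm() above concrete, here is a small userspace walk-through of the same steps for time = 0, i.e. 1970-01-01. It illustrates the arithmetic only, is not kernel code, and the intermediate values in the comments were worked out by hand:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t days = 0;                         /* time = 0 -> 1970-01-01 */
        uint32_t udays = days + 719468;            /* rebase onto 0000-03-01 */

        uint32_t u32tmp = 4 * udays + 3;                        /* 2877875 */
        uint32_t century = u32tmp / 146097;                     /* 19 */
        uint32_t day_of_century = u32tmp % 146097 / 4;          /* 25508 */

        uint64_t u64tmp = 2939745ULL * (4 * day_of_century + 3);
        uint32_t year_of_century = u64tmp >> 32;                /* 69 */
        uint32_t day_of_year = (uint32_t)u64tmp / 2939745 / 4;  /* 306 */

        uint32_t year = 100 * century + year_of_century;        /* 1969 */

        u32tmp = 2141 * day_of_year + 132377;
        uint32_t month = u32tmp >> 16;                          /* 12 */
        uint32_t day = (uint16_t)u32tmp / 2141;                 /* 0 */

        int is_jan_or_feb = day_of_year >= 306;                 /* true */

        year += is_jan_or_feb;                                  /* 1970 */
        month = is_jan_or_feb ? month - 12 : month;             /* 0 == January */
        day += 1;                                               /* 1 */

        printf("%u-%02u-%02u\n", year, month + 1, day);         /* 1970-01-01 */
        return 0;
}
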
diff --git a/drivers/rtc/lib_test.c b/drivers/rtc/lib_test.c
new file mode 100644 (file)
index 0000000..d5caf36
--- /dev/null
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <kunit/test.h>
+#include <linux/rtc.h>
+
+/*
+ * Advance a date by one day.
+ */
+static void advance_date(int *year, int *month, int *mday, int *yday)
+{
+       if (*mday != rtc_month_days(*month - 1, *year)) {
+               ++*mday;
+               ++*yday;
+               return;
+       }
+
+       *mday = 1;
+       if (*month != 12) {
+               ++*month;
+               ++*yday;
+               return;
+       }
+
+       *month = 1;
+       *yday  = 1;
+       ++*year;
+}
+
+/*
+ * Checks every day in a 160000 years interval starting on 1970-01-01
+ * against the expected result.
+ */
+static void rtc_time64_to_tm_test_date_range(struct kunit *test)
+{
+       /*
+        * 160000 years = (160000 / 400) * 400 years
+        *              = (160000 / 400) * 146097 days
+        *              = (160000 / 400) * 146097 * 86400 seconds
+        */
+       time64_t total_secs = ((time64_t) 160000) / 400 * 146097 * 86400;
+
+       int year        = 1970;
+       int month       = 1;
+       int mday        = 1;
+       int yday        = 1;
+
+       struct rtc_time result;
+       time64_t secs;
+       s64 days;
+
+       for (secs = 0; secs <= total_secs; secs += 86400) {
+
+               rtc_time64_to_tm(secs, &result);
+
+               days = div_s64(secs, 86400);
+
+               #define FAIL_MSG "%d/%02d/%02d (%2d) : %ld", \
+                       year, month, mday, yday, days
+
+               KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
+               KUNIT_ASSERT_EQ_MSG(test, month - 1, result.tm_mon, FAIL_MSG);
+               KUNIT_ASSERT_EQ_MSG(test, mday, result.tm_mday, FAIL_MSG);
+               KUNIT_ASSERT_EQ_MSG(test, yday, result.tm_yday, FAIL_MSG);
+
+               advance_date(&year, &month, &mday, &yday);
+       }
+}
+
+static struct kunit_case rtc_lib_test_cases[] = {
+       KUNIT_CASE(rtc_time64_to_tm_test_date_range),
+       {}
+};
+
+static struct kunit_suite rtc_lib_test_suite = {
+       .name = "rtc_lib_test_cases",
+       .test_cases = rtc_lib_test_cases,
+};
+
+kunit_test_suite(rtc_lib_test_suite);
+
+MODULE_LICENSE("GPL");
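
The new suite is driven by the standard KUnit tooling; a minimal .kunitconfig fragment for it might look like this (a sketch, not part of the patch; CONFIG_RTC_CLASS is listed only because the test option sits inside the RTC menu):

CONFIG_KUNIT=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_LIB_KUNIT_TEST=y
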
index 670fd8a..eb15067 100644 (file)
@@ -229,19 +229,13 @@ static int cmos_read_time(struct device *dev, struct rtc_time *t)
        if (!pm_trace_rtc_valid())
                return -EIO;
 
-       /* REVISIT:  if the clock has a "century" register, use
-        * that instead of the heuristic in mc146818_get_time().
-        * That'll make Y3K compatility (year > 2070) easy!
-        */
        mc146818_get_time(t);
        return 0;
 }
 
 static int cmos_set_time(struct device *dev, struct rtc_time *t)
 {
-       /* REVISIT:  set the "century" register if available
-        *
-        * NOTE: this ignores the issue whereby updating the seconds
+       /* NOTE: this ignores the issue whereby updating the seconds
         * takes effect exactly 500ms after we write the register.
         * (Also queueing and other delays before we get this far.)
         */
index c914091..d38aaf0 100644 (file)
 #define RX8025_ADJ_DATA_MAX    62
 #define RX8025_ADJ_DATA_MIN    -62
 
+enum rx_model {
+       model_rx_unknown,
+       model_rx_8025,
+       model_rx_8035,
+       model_last
+};
+
 static const struct i2c_device_id rx8025_id[] = {
-       { "rx8025", 0 },
+       { "rx8025", model_rx_8025 },
+       { "rx8035", model_rx_8035 },
        { }
 };
 MODULE_DEVICE_TABLE(i2c, rx8025_id);
 
 struct rx8025_data {
        struct rtc_device *rtc;
+       enum rx_model model;
        u8 ctrl1;
 };
 
@@ -100,10 +109,26 @@ static s32 rx8025_write_regs(const struct i2c_client *client,
                                              length, values);
 }
 
+static int rx8025_is_osc_stopped(enum rx_model model, int ctrl2)
+{
+       int xstp = ctrl2 & RX8025_BIT_CTRL2_XST;
+       /* XSTP bit has different polarity on RX-8025 vs RX-8035.
+        * RX-8025: 0 == oscillator stopped
+        * RX-8035: 1 == oscillator stopped
+        */
+
+       if (model == model_rx_8025)
+               xstp = !xstp;
+
+       return xstp;
+}
+
 static int rx8025_check_validity(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
+       struct rx8025_data *drvdata = dev_get_drvdata(dev);
        int ctrl2;
+       int xstp;
 
        ctrl2 = rx8025_read_reg(client, RX8025_REG_CTRL2);
        if (ctrl2 < 0)
@@ -117,7 +142,8 @@ static int rx8025_check_validity(struct device *dev)
                return -EINVAL;
        }
 
-       if (!(ctrl2 & RX8025_BIT_CTRL2_XST)) {
+       xstp = rx8025_is_osc_stopped(drvdata->model, ctrl2);
+       if (xstp) {
                dev_warn(dev, "crystal stopped, date is invalid\n");
                return -EINVAL;
        }
@@ -127,6 +153,7 @@ static int rx8025_check_validity(struct device *dev)
 
 static int rx8025_reset_validity(struct i2c_client *client)
 {
+       struct rx8025_data *drvdata = i2c_get_clientdata(client);
        int ctrl2 = rx8025_read_reg(client, RX8025_REG_CTRL2);
 
        if (ctrl2 < 0)
@@ -134,22 +161,28 @@ static int rx8025_reset_validity(struct i2c_client *client)
 
        ctrl2 &= ~(RX8025_BIT_CTRL2_PON | RX8025_BIT_CTRL2_VDET);
 
+       if (drvdata->model == model_rx_8025)
+               ctrl2 |= RX8025_BIT_CTRL2_XST;
+       else
+               ctrl2 &= ~(RX8025_BIT_CTRL2_XST);
+
        return rx8025_write_reg(client, RX8025_REG_CTRL2,
-                               ctrl2 | RX8025_BIT_CTRL2_XST);
+                               ctrl2);
 }
 
 static irqreturn_t rx8025_handle_irq(int irq, void *dev_id)
 {
        struct i2c_client *client = dev_id;
        struct rx8025_data *rx8025 = i2c_get_clientdata(client);
-       int status;
+       int status, xstp;
 
        rtc_lock(rx8025->rtc);
        status = rx8025_read_reg(client, RX8025_REG_CTRL2);
        if (status < 0)
                goto out;
 
-       if (!(status & RX8025_BIT_CTRL2_XST))
+       xstp = rx8025_is_osc_stopped(rx8025->model, status);
+       if (xstp)
                dev_warn(&client->dev, "Oscillation stop was detected,"
                         "you may have to readjust the clock\n");
 
@@ -519,6 +552,9 @@ static int rx8025_probe(struct i2c_client *client,
 
        i2c_set_clientdata(client, rx8025);
 
+       if (id)
+               rx8025->model = id->driver_data;
+
        err = rx8025_init_client(client);
        if (err)
                return err;
index 6b56f8e..fb9c6b7 100644 (file)
@@ -204,15 +204,9 @@ static int s5m8767_tm_to_data(struct rtc_time *tm, u8 *data)
        data[RTC_WEEKDAY] = 1 << tm->tm_wday;
        data[RTC_DATE] = tm->tm_mday;
        data[RTC_MONTH] = tm->tm_mon + 1;
-       data[RTC_YEAR1] = tm->tm_year > 100 ? (tm->tm_year - 100) : 0;
+       data[RTC_YEAR1] = tm->tm_year - 100;
 
-       if (tm->tm_year < 100) {
-               pr_err("RTC cannot handle the year %d\n",
-                      1900 + tm->tm_year);
-               return -EINVAL;
-       } else {
-               return 0;
-       }
+       return 0;
 }
 
 /*
@@ -786,29 +780,35 @@ static int s5m_rtc_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
-       device_init_wakeup(&pdev->dev, 1);
-
-       info->rtc_dev = devm_rtc_device_register(&pdev->dev, "s5m-rtc",
-                                                &s5m_rtc_ops, THIS_MODULE);
-
+       info->rtc_dev = devm_rtc_allocate_device(&pdev->dev);
        if (IS_ERR(info->rtc_dev))
                return PTR_ERR(info->rtc_dev);
 
-       if (!info->irq) {
-               dev_info(&pdev->dev, "Alarm IRQ not available\n");
-               return 0;
+       info->rtc_dev->ops = &s5m_rtc_ops;
+
+       if (info->device_type == S5M8763X) {
+               info->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_0000;
+               info->rtc_dev->range_max = RTC_TIMESTAMP_END_9999;
+       } else {
+               info->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_2000;
+               info->rtc_dev->range_max = RTC_TIMESTAMP_END_2099;
        }
 
-       ret = devm_request_threaded_irq(&pdev->dev, info->irq, NULL,
-                                       s5m_rtc_alarm_irq, 0, "rtc-alarm0",
-                                       info);
-       if (ret < 0) {
-               dev_err(&pdev->dev, "Failed to request alarm IRQ: %d: %d\n",
-                       info->irq, ret);
-               return ret;
+       if (!info->irq) {
+               clear_bit(RTC_FEATURE_ALARM, info->rtc_dev->features);
+       } else {
+               ret = devm_request_threaded_irq(&pdev->dev, info->irq, NULL,
+                                               s5m_rtc_alarm_irq, 0, "rtc-alarm0",
+                                               info);
+               if (ret < 0) {
+                       dev_err(&pdev->dev, "Failed to request alarm IRQ: %d: %d\n",
+                               info->irq, ret);
+                       return ret;
+               }
+               device_init_wakeup(&pdev->dev, 1);
        }
 
-       return 0;
+       return devm_rtc_register_device(info->rtc_dev);
 }
 
 #ifdef CONFIG_PM_SLEEP
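
The rtc-s5m change above moves from devm_rtc_device_register() to the split allocate/register flow so that the valid time range and the alarm capability can be declared before the device becomes visible. The general shape of that flow, sketched with hypothetical foo_* names rather than the driver's own code:

#include <linux/platform_device.h>
#include <linux/rtc.h>

/* placeholder; a real driver provides read_time/set_time/alarm callbacks */
static const struct rtc_class_ops foo_rtc_ops;

static int foo_rtc_probe(struct platform_device *pdev)
{
        struct rtc_device *rtc;
        int irq = platform_get_irq_optional(pdev, 0);

        rtc = devm_rtc_allocate_device(&pdev->dev);
        if (IS_ERR(rtc))
                return PTR_ERR(rtc);

        rtc->ops = &foo_rtc_ops;
        rtc->range_min = RTC_TIMESTAMP_BEGIN_2000;
        rtc->range_max = RTC_TIMESTAMP_END_2099;

        /* only advertise alarms when an interrupt is actually wired up */
        if (irq < 0)
                clear_bit(RTC_FEATURE_ALARM, rtc->features);

        return devm_rtc_register_device(rtc);
}
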
index bc89c62..75e4c2d 100644 (file)
@@ -467,6 +467,6 @@ static struct platform_driver tps65910_rtc_driver = {
 };
 
 module_platform_driver(tps65910_rtc_driver);
-MODULE_ALIAS("platform:rtc-tps65910");
+MODULE_ALIAS("platform:tps65910-rtc");
 MODULE_AUTHOR("Venu Byravarasu <vbyravarasu@nvidia.com>");
 MODULE_LICENSE("GPL");
index 823354a..19926be 100644 (file)
@@ -108,9 +108,12 @@ static struct attribute *imok_attr[] = {
        NULL
 };
 
+static const struct attribute_group imok_attribute_group = {
+       .attrs = imok_attr,
+};
+
 static const struct attribute_group data_attribute_group = {
        .bin_attrs = data_attributes,
-       .attrs = imok_attr,
 };
 
 static ssize_t available_uuids_show(struct device *dev,
@@ -522,6 +525,12 @@ static int int3400_thermal_probe(struct platform_device *pdev)
        if (result)
                goto free_rel_misc;
 
+       if (acpi_has_method(priv->adev->handle, "IMOK")) {
+               result = sysfs_create_group(&pdev->dev.kobj, &imok_attribute_group);
+               if (result)
+                       goto free_imok;
+       }
+
        if (priv->data_vault) {
                result = sysfs_create_group(&pdev->dev.kobj,
                                            &data_attribute_group);
@@ -545,6 +554,8 @@ free_sysfs:
        }
 free_uuid:
        sysfs_remove_group(&pdev->dev.kobj, &uuid_attribute_group);
+free_imok:
+       sysfs_remove_group(&pdev->dev.kobj, &imok_attribute_group);
 free_rel_misc:
        if (!priv->rel_misc_dev_res)
                acpi_thermal_rel_misc_device_remove(priv->adev->handle);
@@ -573,6 +584,7 @@ static int int3400_thermal_remove(struct platform_device *pdev)
        if (priv->data_vault)
                sysfs_remove_group(&pdev->dev.kobj, &data_attribute_group);
        sysfs_remove_group(&pdev->dev.kobj, &uuid_attribute_group);
+       sysfs_remove_group(&pdev->dev.kobj, &imok_attribute_group);
        thermal_zone_device_unregister(priv->thermal);
        kfree(priv->data_vault);
        kfree(priv->trts);
index b0eb5ec..a5b58ea 100644 (file)
@@ -528,7 +528,7 @@ static int start_power_clamp(void)
 
        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
-       get_online_cpus();
+       cpus_read_lock();
 
        /* prefer BSP */
        control_cpu = 0;
@@ -542,7 +542,7 @@ static int start_power_clamp(void)
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
-       put_online_cpus();
+       cpus_read_unlock();
 
        return 0;
 }
index 8ec10d5..cd80c7d 100644 (file)
@@ -79,6 +79,8 @@ static const struct x86_cpu_id tcc_ids[] __initconst = {
        X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL),
        X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL),
        X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL),
+       X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL),
        {}
 };
 
index 8d5ac2d..7d942f7 100644 (file)
@@ -31,3 +31,13 @@ config QCOM_SPMI_TEMP_ALARM
          trip points. The temperature reported by the thermal sensor reflects the
          real time die temperature if an ADC is present or an estimate of the
          temperature based upon the over temperature stage value.
+
+config QCOM_LMH
+       tristate "Qualcomm Limits Management Hardware"
+       depends on ARCH_QCOM
+       help
+         This enables initialization of Qualcomm limits management
+         hardware (LMh). LMh allows for hardware-enforced mitigation for CPUs based on
+         input from temperature and current sensors.  On many newer Qualcomm SoCs
+         LMh is configured in the firmware and this feature need not be enabled.
+         However, on certain SoCs like sdm845 LMh has to be configured from the kernel.
index 252ea7d..0fa2512 100644 (file)
@@ -5,3 +5,4 @@ qcom_tsens-y                    += tsens.o tsens-v2.o tsens-v1.o tsens-v0_1.o \
                                   tsens-8960.o
 obj-$(CONFIG_QCOM_SPMI_ADC_TM5)        += qcom-spmi-adc-tm5.o
 obj-$(CONFIG_QCOM_SPMI_TEMP_ALARM)     += qcom-spmi-temp-alarm.o
+obj-$(CONFIG_QCOM_LMH)         += lmh.o
diff --git a/drivers/thermal/qcom/lmh.c b/drivers/thermal/qcom/lmh.c
new file mode 100644 (file)
index 0000000..eafa752
--- /dev/null
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2021, Linaro Limited. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/qcom_scm.h>
+
+#define LMH_NODE_DCVS                  0x44435653
+#define LMH_CLUSTER0_NODE_ID           0x6370302D
+#define LMH_CLUSTER1_NODE_ID           0x6370312D
+
+#define LMH_SUB_FN_THERMAL             0x54484D4C
+#define LMH_SUB_FN_CRNT                        0x43524E54
+#define LMH_SUB_FN_REL                 0x52454C00
+#define LMH_SUB_FN_BCL                 0x42434C00
+
+#define LMH_ALGO_MODE_ENABLE           0x454E424C
+#define LMH_TH_HI_THRESHOLD            0x48494748
+#define LMH_TH_LOW_THRESHOLD           0x4C4F5700
+#define LMH_TH_ARM_THRESHOLD           0x41524D00
+
+#define LMH_REG_DCVS_INTR_CLR          0x8
+
+struct lmh_hw_data {
+       void __iomem *base;
+       struct irq_domain *domain;
+       int irq;
+};
+
+static irqreturn_t lmh_handle_irq(int hw_irq, void *data)
+{
+       struct lmh_hw_data *lmh_data = data;
+       int irq = irq_find_mapping(lmh_data->domain, 0);
+
+       /* Call the cpufreq driver to handle the interrupt */
+       if (irq)
+               generic_handle_irq(irq);
+
+       return 0;
+}
+
+static void lmh_enable_interrupt(struct irq_data *d)
+{
+       struct lmh_hw_data *lmh_data = irq_data_get_irq_chip_data(d);
+
+       /* Clear the existing interrupt */
+       writel(0xff, lmh_data->base + LMH_REG_DCVS_INTR_CLR);
+       enable_irq(lmh_data->irq);
+}
+
+static void lmh_disable_interrupt(struct irq_data *d)
+{
+       struct lmh_hw_data *lmh_data = irq_data_get_irq_chip_data(d);
+
+       disable_irq_nosync(lmh_data->irq);
+}
+
+static struct irq_chip lmh_irq_chip = {
+       .name           = "lmh",
+       .irq_enable     = lmh_enable_interrupt,
+       .irq_disable    = lmh_disable_interrupt
+};
+
+static int lmh_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
+{
+       struct lmh_hw_data *lmh_data = d->host_data;
+
+       irq_set_chip_and_handler(irq, &lmh_irq_chip, handle_simple_irq);
+       irq_set_chip_data(irq, lmh_data);
+
+       return 0;
+}
+
+static const struct irq_domain_ops lmh_irq_ops = {
+       .map = lmh_irq_map,
+       .xlate = irq_domain_xlate_onecell,
+};
+
+static int lmh_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *np = dev->of_node;
+       struct device_node *cpu_node;
+       struct lmh_hw_data *lmh_data;
+       int temp_low, temp_high, temp_arm, cpu_id, ret;
+       u32 node_id;
+
+       lmh_data = devm_kzalloc(dev, sizeof(*lmh_data), GFP_KERNEL);
+       if (!lmh_data)
+               return -ENOMEM;
+
+       lmh_data->base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(lmh_data->base))
+               return PTR_ERR(lmh_data->base);
+
+       cpu_node = of_parse_phandle(np, "cpus", 0);
+       if (!cpu_node)
+               return -EINVAL;
+       cpu_id = of_cpu_node_to_id(cpu_node);
+       of_node_put(cpu_node);
+
+       ret = of_property_read_u32(np, "qcom,lmh-temp-high-millicelsius", &temp_high);
+       if (ret) {
+               dev_err(dev, "missing qcom,lmh-temp-high-millicelsius property\n");
+               return ret;
+       }
+
+       ret = of_property_read_u32(np, "qcom,lmh-temp-low-millicelsius", &temp_low);
+       if (ret) {
+               dev_err(dev, "missing qcom,lmh-temp-low-millicelsius property\n");
+               return ret;
+       }
+
+       ret = of_property_read_u32(np, "qcom,lmh-temp-arm-millicelsius", &temp_arm);
+       if (ret) {
+               dev_err(dev, "missing qcom,lmh-temp-arm-millicelsius property\n");
+               return ret;
+       }
+
+       /*
+        * Only sdm845 currently has LMh hardware enabled from the HLOS. If this is needed
+        * for other platforms, revisit this to check whether the <cpu-id, node-id> pair should
+        * be part of a DT match table.
+        */
+       if (cpu_id == 0) {
+               node_id = LMH_CLUSTER0_NODE_ID;
+       } else if (cpu_id == 4) {
+               node_id = LMH_CLUSTER1_NODE_ID;
+       } else {
+               dev_err(dev, "Wrong CPU id associated with LMh node\n");
+               return -EINVAL;
+       }
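A hypothetical sketch only (the struct and names below are invented here, not part of the patch): if more SoCs ever need different <cpu-id, node-id> pairs, the mapping could move into per-compatible match data along these lines:

	struct lmh_cpu_node_map {
		int cpu_id;
		u32 node_id;
	};

	static const struct lmh_cpu_node_map sdm845_lmh_nodes[] = {
		{ .cpu_id = 0, .node_id = LMH_CLUSTER0_NODE_ID },
		{ .cpu_id = 4, .node_id = LMH_CLUSTER1_NODE_ID },
	};

	/* ...looked up via of_device_get_match_data() in probe instead of the if/else above. */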
+
+       if (!qcom_scm_lmh_dcvsh_available())
+               return -EINVAL;
+
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_CRNT, LMH_ALGO_MODE_ENABLE, 1,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret)
+               dev_err(dev, "Error %d enabling current subfunction\n", ret);
+
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_REL, LMH_ALGO_MODE_ENABLE, 1,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret)
+               dev_err(dev, "Error %d enabling reliability subfunction\n", ret);
+
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_BCL, LMH_ALGO_MODE_ENABLE, 1,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret)
+               dev_err(dev, "Error %d enabling BCL subfunction\n", ret);
+
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_THERMAL, LMH_ALGO_MODE_ENABLE, 1,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret) {
+               dev_err(dev, "Error %d enabling thermal subfunction\n", ret);
+               return ret;
+       }
+
+       ret = qcom_scm_lmh_profile_change(0x1);
+       if (ret) {
+               dev_err(dev, "Error %d changing profile\n", ret);
+               return ret;
+       }
+
+       /* Set default thermal trips */
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_THERMAL, LMH_TH_ARM_THRESHOLD, temp_arm,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret) {
+               dev_err(dev, "Error %d setting thermal ARM threshold\n", ret);
+               return ret;
+       }
+
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_THERMAL, LMH_TH_HI_THRESHOLD, temp_high,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret) {
+               dev_err(dev, "Error %d setting thermal HI threshold\n", ret);
+               return ret;
+       }
+
+       ret = qcom_scm_lmh_dcvsh(LMH_SUB_FN_THERMAL, LMH_TH_LOW_THRESHOLD, temp_low,
+                                LMH_NODE_DCVS, node_id, 0);
+       if (ret) {
+               dev_err(dev, "Error %d setting thermal LOW threshold\n", ret);
+               return ret;
+       }
+
+       lmh_data->irq = platform_get_irq(pdev, 0);
+       lmh_data->domain = irq_domain_add_linear(np, 1, &lmh_irq_ops, lmh_data);
+       if (!lmh_data->domain) {
+               dev_err(dev, "Error adding irq_domain\n");
+               return -EINVAL;
+       }
+
+       /* Disable the irq and let cpufreq enable it when ready to handle the interrupt */
+       irq_set_status_flags(lmh_data->irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(dev, lmh_data->irq, lmh_handle_irq,
+                              IRQF_ONESHOT | IRQF_NO_SUSPEND,
+                              "lmh-irq", lmh_data);
+       if (ret) {
+               dev_err(dev, "Error %d registering irq %x\n", ret, lmh_data->irq);
+               irq_domain_remove(lmh_data->domain);
+               return ret;
+       }
+
+       return 0;
+}
+
+static const struct of_device_id lmh_table[] = {
+       { .compatible = "qcom,sdm845-lmh", },
+       {}
+};
+MODULE_DEVICE_TABLE(of, lmh_table);
+
+static struct platform_driver lmh_driver = {
+       .probe = lmh_probe,
+       .driver = {
+               .name = "qcom-lmh",
+               .of_match_table = lmh_table,
+               .suppress_bind_attrs = true,
+       },
+};
+module_platform_driver(lmh_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("QCOM LMh driver");
index 232fd0b..8494cc0 100644 (file)
@@ -359,6 +359,12 @@ static int adc_tm5_register_tzd(struct adc_tm5_chip *adc_tm)
                                                           &adc_tm->channels[i],
                                                           &adc_tm5_ops);
                if (IS_ERR(tzd)) {
+                       if (PTR_ERR(tzd) == -ENODEV) {
+                               dev_warn(adc_tm->dev, "thermal sensor on channel %d is not used\n",
+                                        adc_tm->channels[i].channel);
+                               continue;
+                       }
+
                        dev_err(adc_tm->dev, "Error registering TZ zone for channel %d: %ld\n",
                                adc_tm->channels[i].channel, PTR_ERR(tzd));
                        return PTR_ERR(tzd);
index fdf16aa..85228d3 100644 (file)
@@ -84,7 +84,7 @@ struct rcar_gen3_thermal_tsc {
        struct thermal_zone_device *zone;
        struct equation_coefs coef;
        int tj_t;
-       int id; /* thermal channel id */
+       unsigned int id; /* thermal channel id */
 };
 
 struct rcar_gen3_thermal_priv {
@@ -190,10 +190,64 @@ static int rcar_gen3_thermal_get_temp(void *devdata, int *temp)
        return 0;
 }
 
-static const struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
+static int rcar_gen3_thermal_mcelsius_to_temp(struct rcar_gen3_thermal_tsc *tsc,
+                                             int mcelsius)
+{
+       int celsius, val;
+
+       celsius = DIV_ROUND_CLOSEST(mcelsius, 1000);
+       if (celsius <= INT_FIXPT(tsc->tj_t))
+               val = celsius * tsc->coef.a1 + tsc->coef.b1;
+       else
+               val = celsius * tsc->coef.a2 + tsc->coef.b2;
+
+       return INT_FIXPT(val);
+}
+
+static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
+{
+       struct rcar_gen3_thermal_tsc *tsc = devdata;
+       u32 irqmsk = 0;
+
+       if (low != -INT_MAX) {
+               irqmsk |= IRQ_TEMPD1;
+               rcar_gen3_thermal_write(tsc, REG_GEN3_IRQTEMP1,
+                                       rcar_gen3_thermal_mcelsius_to_temp(tsc, low));
+       }
+
+       if (high != INT_MAX) {
+               irqmsk |= IRQ_TEMP2;
+               rcar_gen3_thermal_write(tsc, REG_GEN3_IRQTEMP2,
+                                       rcar_gen3_thermal_mcelsius_to_temp(tsc, high));
+       }
+
+       rcar_gen3_thermal_write(tsc, REG_GEN3_IRQMSK, irqmsk);
+
+       return 0;
+}
+
+static struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
        .get_temp       = rcar_gen3_thermal_get_temp,
+       .set_trips      = rcar_gen3_thermal_set_trips,
 };
 
+static irqreturn_t rcar_gen3_thermal_irq(int irq, void *data)
+{
+       struct rcar_gen3_thermal_priv *priv = data;
+       unsigned int i;
+       u32 status;
+
+       for (i = 0; i < priv->num_tscs; i++) {
+               status = rcar_gen3_thermal_read(priv->tscs[i], REG_GEN3_IRQSTR);
+               rcar_gen3_thermal_write(priv->tscs[i], REG_GEN3_IRQSTR, 0);
+               if (status)
+                       thermal_zone_device_update(priv->tscs[i]->zone,
+                                                  THERMAL_EVENT_UNSPECIFIED);
+       }
+
+       return IRQ_HANDLED;
+}
+
 static const struct soc_device_attribute r8a7795es1[] = {
        { .soc_id = "r8a7795", .revision = "ES1.*" },
        { /* sentinel */ }
@@ -210,6 +264,9 @@ static void rcar_gen3_thermal_init_r8a7795es1(struct rcar_gen3_thermal_tsc *tsc)
 
        rcar_gen3_thermal_write(tsc, REG_GEN3_IRQCTL, 0x3F);
        rcar_gen3_thermal_write(tsc, REG_GEN3_IRQMSK, 0);
+       if (tsc->zone->ops->set_trips)
+               rcar_gen3_thermal_write(tsc, REG_GEN3_IRQEN,
+                                       IRQ_TEMPD1 | IRQ_TEMP2);
 
        rcar_gen3_thermal_write(tsc, REG_GEN3_CTSR,
                                CTSR_PONM | CTSR_AOUT | CTSR_THBGR | CTSR_VMEN);
@@ -235,6 +292,9 @@ static void rcar_gen3_thermal_init(struct rcar_gen3_thermal_tsc *tsc)
 
        rcar_gen3_thermal_write(tsc, REG_GEN3_IRQCTL, 0);
        rcar_gen3_thermal_write(tsc, REG_GEN3_IRQMSK, 0);
+       if (tsc->zone->ops->set_trips)
+               rcar_gen3_thermal_write(tsc, REG_GEN3_IRQEN,
+                                       IRQ_TEMPD1 | IRQ_TEMP2);
 
        reg_val = rcar_gen3_thermal_read(tsc, REG_GEN3_THCTR);
        reg_val |= THCTR_THSST;
@@ -303,6 +363,34 @@ static void rcar_gen3_hwmon_action(void *data)
        thermal_remove_hwmon_sysfs(zone);
 }
 
+static int rcar_gen3_thermal_request_irqs(struct rcar_gen3_thermal_priv *priv,
+                                         struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       unsigned int i;
+       char *irqname;
+       int ret, irq;
+
+       for (i = 0; i < 2; i++) {
+               irq = platform_get_irq_optional(pdev, i);
+               if (irq < 0)
+                       return irq;
+
+               irqname = devm_kasprintf(dev, GFP_KERNEL, "%s:ch%d",
+                                        dev_name(dev), i);
+               if (!irqname)
+                       return -ENOMEM;
+
+               ret = devm_request_threaded_irq(dev, irq, NULL,
+                                               rcar_gen3_thermal_irq,
+                                               IRQF_ONESHOT, irqname, priv);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 static int rcar_gen3_thermal_probe(struct platform_device *pdev)
 {
        struct rcar_gen3_thermal_priv *priv;
@@ -310,7 +398,8 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
        const int *ths_tj_1 = of_device_get_match_data(dev);
        struct resource *res;
        struct thermal_zone_device *zone;
-       int ret, i;
+       unsigned int i;
+       int ret;
 
        /* default values if FUSEs are missing */
        /* TODO: Read values from hardware on supported platforms */
@@ -326,6 +415,9 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, priv);
 
+       if (rcar_gen3_thermal_request_irqs(priv, pdev))
+               rcar_gen3_tz_of_ops.set_trips = NULL;
+
        pm_runtime_enable(dev);
        pm_runtime_get_sync(dev);
 
@@ -351,9 +443,6 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
 
                priv->tscs[i] = tsc;
 
-               priv->thermal_init(tsc);
-               rcar_gen3_thermal_calc_coefs(tsc, ptat, thcodes[i], *ths_tj_1);
-
                zone = devm_thermal_zone_of_sensor_register(dev, i, tsc,
                                                            &rcar_gen3_tz_of_ops);
                if (IS_ERR(zone)) {
@@ -363,6 +452,9 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
                }
                tsc->zone = zone;
 
+               priv->thermal_init(tsc);
+               rcar_gen3_thermal_calc_coefs(tsc, ptat, thcodes[i], *ths_tj_1);
+
                tsc->zone->tzp->no_hwmon = false;
                ret = thermal_add_hwmon_sysfs(tsc->zone);
                if (ret)
@@ -376,7 +468,7 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
                if (ret < 0)
                        goto error_unregister;
 
-               dev_info(dev, "TSC%d: Loaded %d trip points\n", i, ret);
+               dev_info(dev, "TSC%u: Loaded %d trip points\n", i, ret);
        }
 
        priv->num_tscs = i;
@@ -401,8 +493,12 @@ static int __maybe_unused rcar_gen3_thermal_resume(struct device *dev)
 
        for (i = 0; i < priv->num_tscs; i++) {
                struct rcar_gen3_thermal_tsc *tsc = priv->tscs[i];
+               struct thermal_zone_device *zone = tsc->zone;
 
                priv->thermal_init(tsc);
+               if (zone->ops->set_trips)
+                       rcar_gen3_thermal_set_trips(tsc, zone->prev_low_trip,
+                                                   zone->prev_high_trip);
        }
 
        return 0;
index e9a90bc..f4ab4c5 100644 (file)
@@ -1073,6 +1073,7 @@ static int exynos_tmu_probe(struct platform_device *pdev)
                data->sclk = devm_clk_get(&pdev->dev, "tmu_sclk");
                if (IS_ERR(data->sclk)) {
                        dev_err(&pdev->dev, "Failed to get sclk\n");
+                       ret = PTR_ERR(data->sclk);
                        goto err_clk;
                } else {
                        ret = clk_prepare_enable(data->sclk);
index 46c2215..cfa41d8 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menu "NVIDIA Tegra thermal drivers"
-depends on ARCH_TEGRA
+depends on ARCH_TEGRA || COMPILE_TEST
 
 config TEGRA_SOCTHERM
        tristate "Tegra SOCTHERM thermal management"
@@ -18,4 +18,11 @@ config TEGRA_BPMP_THERMAL
          Enable this option for support for sensing system temperature of NVIDIA
          Tegra systems-on-chip with the BPMP coprocessor (Tegra186).
 
+config TEGRA30_TSENSOR
+       tristate "Tegra30 Thermal Sensor"
+       depends on ARCH_TEGRA_3x_SOC || COMPILE_TEST
+       help
+         Enable this option to support thermal management of the NVIDIA Tegra30
+         system-on-chip.
+
 endmenu
index 0f2b66e..eb27d19 100644 (file)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_TEGRA_SOCTHERM)           += tegra-soctherm.o
 obj-$(CONFIG_TEGRA_BPMP_THERMAL)       += tegra-bpmp-thermal.o
+obj-$(CONFIG_TEGRA30_TSENSOR)          += tegra30-tsensor.o
 
 tegra-soctherm-y                               := soctherm.o soctherm-fuse.o
 tegra-soctherm-$(CONFIG_ARCH_TEGRA_124_SOC)    += tegra124-soctherm.o
index 8e303e9..210325f 100644 (file)
@@ -450,8 +450,8 @@ static int enforce_temp_range(struct device *dev, int trip_temp)
 
        temp = clamp_val(trip_temp, min_low_temp, max_high_temp);
        if (temp != trip_temp)
-               dev_info(dev, "soctherm: trip temperature %d forced to %d\n",
-                        trip_temp, temp);
+               dev_dbg(dev, "soctherm: trip temperature %d forced to %d\n",
+                       trip_temp, temp);
        return temp;
 }
 
diff --git a/drivers/thermal/tegra/tegra30-tsensor.c b/drivers/thermal/tegra/tegra30-tsensor.c
new file mode 100644 (file)
index 0000000..9b6b693
--- /dev/null
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tegra30 SoC Thermal Sensor driver
+ *
+ * Based on downstream HWMON driver from NVIDIA.
+ * Copyright (C) 2011 NVIDIA Corporation
+ *
+ * Author: Dmitry Osipenko <digetx@gmail.com>
+ * Copyright (C) 2021 GRATE-DRIVER project
+ */
+
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/math.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/reset.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+#include <linux/types.h>
+
+#include <soc/tegra/fuse.h>
+
+#include "../thermal_core.h"
+#include "../thermal_hwmon.h"
+
+#define TSENSOR_SENSOR0_CONFIG0                                0x0
+#define TSENSOR_SENSOR0_CONFIG0_SENSOR_STOP            BIT(0)
+#define TSENSOR_SENSOR0_CONFIG0_HW_FREQ_DIV_EN         BIT(1)
+#define TSENSOR_SENSOR0_CONFIG0_THERMAL_RST_EN         BIT(2)
+#define TSENSOR_SENSOR0_CONFIG0_DVFS_EN                        BIT(3)
+#define TSENSOR_SENSOR0_CONFIG0_INTR_OVERFLOW_EN       BIT(4)
+#define TSENSOR_SENSOR0_CONFIG0_INTR_HW_FREQ_DIV_EN    BIT(5)
+#define TSENSOR_SENSOR0_CONFIG0_INTR_THERMAL_RST_EN    BIT(6)
+#define TSENSOR_SENSOR0_CONFIG0_M                      GENMASK(23,  8)
+#define TSENSOR_SENSOR0_CONFIG0_N                      GENMASK(31, 24)
+
+#define TSENSOR_SENSOR0_CONFIG1                                0x8
+#define TSENSOR_SENSOR0_CONFIG1_TH1                    GENMASK(15,  0)
+#define TSENSOR_SENSOR0_CONFIG1_TH2                    GENMASK(31, 16)
+
+#define TSENSOR_SENSOR0_CONFIG2                                0xc
+#define TSENSOR_SENSOR0_CONFIG2_TH3                    GENMASK(15,  0)
+
+#define TSENSOR_SENSOR0_STATUS0                                0x18
+#define TSENSOR_SENSOR0_STATUS0_STATE                  GENMASK(2, 0)
+#define TSENSOR_SENSOR0_STATUS0_INTR                   BIT(8)
+#define TSENSOR_SENSOR0_STATUS0_CURRENT_VALID          BIT(9)
+
+#define TSENSOR_SENSOR0_TS_STATUS1                     0x1c
+#define TSENSOR_SENSOR0_TS_STATUS1_CURRENT_COUNT       GENMASK(31, 16)
+
+#define TEGRA30_FUSE_TEST_PROG_VER                     0x28
+
+#define TEGRA30_FUSE_TSENSOR_CALIB                     0x98
+#define TEGRA30_FUSE_TSENSOR_CALIB_LOW                 GENMASK(15,  0)
+#define TEGRA30_FUSE_TSENSOR_CALIB_HIGH                        GENMASK(31, 16)
+
+#define TEGRA30_FUSE_SPARE_BIT                         0x144
+
+struct tegra_tsensor;
+
+struct tegra_tsensor_calibration_data {
+       int a, b, m, n, p, r;
+};
+
+struct tegra_tsensor_channel {
+       void __iomem *regs;
+       unsigned int id;
+       struct tegra_tsensor *ts;
+       struct thermal_zone_device *tzd;
+};
+
+struct tegra_tsensor {
+       void __iomem *regs;
+       bool swap_channels;
+       struct clk *clk;
+       struct device *dev;
+       struct reset_control *rst;
+       struct tegra_tsensor_channel ch[2];
+       struct tegra_tsensor_calibration_data calib;
+};
+
+static int tegra_tsensor_hw_enable(const struct tegra_tsensor *ts)
+{
+       u32 val;
+       int err;
+
+       err = reset_control_assert(ts->rst);
+       if (err) {
+               dev_err(ts->dev, "failed to assert hardware reset: %d\n", err);
+               return err;
+       }
+
+       err = clk_prepare_enable(ts->clk);
+       if (err) {
+               dev_err(ts->dev, "failed to enable clock: %d\n", err);
+               return err;
+       }
+
+       fsleep(1000);
+
+       err = reset_control_deassert(ts->rst);
+       if (err) {
+               dev_err(ts->dev, "failed to deassert hardware reset: %d\n", err);
+               goto disable_clk;
+       }
+
+       /*
+        * Sensors are enabled after reset by default, but do not start gauging
+        * until the clock counter is programmed.
+        *
+        * M: number of reference clock pulses after which every
+        *    temperature / voltage measurement is made
+        *
+        * N: number of reference clock counts for which the counter runs
+        */
+       val  = FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_M, 12500);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_N, 255);
+
+       /* apply the same configuration to both channels */
+       writel_relaxed(val, ts->regs + 0x40 + TSENSOR_SENSOR0_CONFIG0);
+       writel_relaxed(val, ts->regs + 0x80 + TSENSOR_SENSOR0_CONFIG0);
+
+       return 0;
+
+disable_clk:
+       clk_disable_unprepare(ts->clk);
+
+       return err;
+}
+
+static int tegra_tsensor_hw_disable(const struct tegra_tsensor *ts)
+{
+       int err;
+
+       err = reset_control_assert(ts->rst);
+       if (err) {
+               dev_err(ts->dev, "failed to assert hardware reset: %d\n", err);
+               return err;
+       }
+
+       clk_disable_unprepare(ts->clk);
+
+       return 0;
+}
+
+static void devm_tegra_tsensor_hw_disable(void *data)
+{
+       const struct tegra_tsensor *ts = data;
+
+       tegra_tsensor_hw_disable(ts);
+}
+
+static int tegra_tsensor_get_temp(void *data, int *temp)
+{
+       const struct tegra_tsensor_channel *tsc = data;
+       const struct tegra_tsensor *ts = tsc->ts;
+       int err, c1, c2, c3, c4, counter;
+       u32 val;
+
+       /*
+        * The counter will be invalid if the hardware is misprogrammed or if not
+        * enough time has passed since the sensor was enabled.
+        */
+       err = readl_relaxed_poll_timeout(tsc->regs + TSENSOR_SENSOR0_STATUS0, val,
+                                        val & TSENSOR_SENSOR0_STATUS0_CURRENT_VALID,
+                                        21 * USEC_PER_MSEC,
+                                        21 * USEC_PER_MSEC * 50);
+       if (err) {
+               dev_err_once(ts->dev, "ch%u: counter invalid\n", tsc->id);
+               return err;
+       }
+
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_TS_STATUS1);
+       counter = FIELD_GET(TSENSOR_SENSOR0_TS_STATUS1_CURRENT_COUNT, val);
+
+       /*
+        * This shouldn't happen with a valid counter status; nevertheless,
+        * let's verify the value since it lives in a register separate from
+        * the status register.
+        */
+       if (counter == 0xffff) {
+               dev_err_once(ts->dev, "ch%u: counter overflow\n", tsc->id);
+               return -EINVAL;
+       }
+
+       /*
+        * temperature = a * counter + b
+        * temperature = m * (temperature ^ 2) + n * temperature + p
+        */
+       c1 = DIV_ROUND_CLOSEST(ts->calib.a * counter + ts->calib.b, 1000000);
+       c1 = c1 ?: 1;
+       c2 = DIV_ROUND_CLOSEST(ts->calib.p, c1);
+       c3 = c1 * ts->calib.m;
+       c4 = ts->calib.n;
+
+       *temp = DIV_ROUND_CLOSEST(c1 * (c2 + c3 + c4), 1000);
+
+       return 0;
+}
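A hedged note on the arithmetic above: with the calibration coefficients premultiplied by 1000000, c1 is the linear sensor temperature in degrees Celsius, and the factored product c1 * (c2 + c3 + c4) expands (up to integer rounding) to m * c1^2 + n * c1 + p; the final division by 1000 puts the result in millicelsius. A direct, unfactored form of the same quadratic, for illustration only (hypothetical helper name):

	static int tegra_tsensor_quadratic_mc(const struct tegra_tsensor *ts, int t_celsius)
	{
		/* m * t^2 + n * t + p, scaled down to millicelsius */
		return DIV_ROUND_CLOSEST(ts->calib.m * t_celsius * t_celsius +
					 ts->calib.n * t_celsius + ts->calib.p, 1000);
	}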
+
+static int tegra_tsensor_temp_to_counter(const struct tegra_tsensor *ts, int temp)
+{
+       int c1, c2;
+
+       c1 = DIV_ROUND_CLOSEST(ts->calib.p - temp * 1000, ts->calib.m);
+       c2 = -ts->calib.r - int_sqrt(ts->calib.r * ts->calib.r - c1);
+
+       return DIV_ROUND_CLOSEST(c2 * 1000000 - ts->calib.b, ts->calib.a);
+}
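A short worked note on the inverse above, using the same formulas: solving m * t^2 + n * t + p = T * 1000 for a target T in millicelsius and completing the square with r = n / (2 * m) gives t = -r - sqrt(r^2 - (p - T * 1000) / m) for the physically meaningful root, which is exactly c2 computed from c1; the counter then follows from inverting the linear calibration t = (a * counter + b) / 1000000, i.e. counter = (t * 1000000 - b) / a.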
+
+static int tegra_tsensor_set_trips(void *data, int low, int high)
+{
+       const struct tegra_tsensor_channel *tsc = data;
+       const struct tegra_tsensor *ts = tsc->ts;
+       u32 val;
+
+       /*
+        * TSENSOR doesn't trigger interrupt on the "low" temperature breach,
+        * hence bail out if high temperature is unspecified.
+        */
+       if (high == INT_MAX)
+               return 0;
+
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_CONFIG1);
+       val &= ~TSENSOR_SENSOR0_CONFIG1_TH1;
+
+       high = tegra_tsensor_temp_to_counter(ts, high);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG1_TH1, high);
+       writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_CONFIG1);
+
+       return 0;
+}
+
+static const struct thermal_zone_of_device_ops ops = {
+       .get_temp = tegra_tsensor_get_temp,
+       .set_trips = tegra_tsensor_set_trips,
+};
+
+static bool
+tegra_tsensor_handle_channel_interrupt(const struct tegra_tsensor *ts,
+                                      unsigned int id)
+{
+       const struct tegra_tsensor_channel *tsc = &ts->ch[id];
+       u32 val;
+
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_STATUS0);
+       writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_STATUS0);
+
+       if (FIELD_GET(TSENSOR_SENSOR0_STATUS0_STATE, val) == 5)
+               dev_err_ratelimited(ts->dev, "ch%u: counter overflowed\n", id);
+
+       if (!FIELD_GET(TSENSOR_SENSOR0_STATUS0_INTR, val))
+               return false;
+
+       thermal_zone_device_update(tsc->tzd, THERMAL_EVENT_UNSPECIFIED);
+
+       return true;
+}
+
+static irqreturn_t tegra_tsensor_isr(int irq, void *data)
+{
+       const struct tegra_tsensor *ts = data;
+       bool handled = false;
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(ts->ch); i++)
+               handled |= tegra_tsensor_handle_channel_interrupt(ts, i);
+
+       return handled ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static int tegra_tsensor_disable_hw_channel(const struct tegra_tsensor *ts,
+                                           unsigned int id)
+{
+       const struct tegra_tsensor_channel *tsc = &ts->ch[id];
+       struct thermal_zone_device *tzd = tsc->tzd;
+       u32 val;
+       int err;
+
+       if (!tzd)
+               goto stop_channel;
+
+       err = thermal_zone_device_disable(tzd);
+       if (err) {
+               dev_err(ts->dev, "ch%u: failed to disable zone: %d\n", id, err);
+               return err;
+       }
+
+stop_channel:
+       /* stop channel gracefully */
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_CONFIG0);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_SENSOR_STOP, 1);
+       writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_CONFIG0);
+
+       return 0;
+}
+
+static void tegra_tsensor_get_hw_channel_trips(struct thermal_zone_device *tzd,
+                                              int *hot_trip, int *crit_trip)
+{
+       unsigned int i;
+
+       /*
+        * 90C is the maximum critical temperature across all Tegra30 SoC variants;
+        * use it for the default trip if unspecified in the device tree.
+        */
+       *hot_trip  = 85000;
+       *crit_trip = 90000;
+
+       for (i = 0; i < tzd->trips; i++) {
+               enum thermal_trip_type type;
+               int trip_temp;
+
+               tzd->ops->get_trip_temp(tzd, i, &trip_temp);
+               tzd->ops->get_trip_type(tzd, i, &type);
+
+               if (type == THERMAL_TRIP_HOT)
+                       *hot_trip = trip_temp;
+
+               if (type == THERMAL_TRIP_CRITICAL)
+                       *crit_trip = trip_temp;
+       }
+
+       /* clamp hardware trips to the calibration limits */
+       *hot_trip = clamp(*hot_trip, 25000, 90000);
+
+       /*
+        * The kernel performs a normal system shutdown once it sees that
+        * the critical temperature is breached, hence set the hardware
+        * limit 5C higher in order to allow the system to shut down
+        * gracefully before the signal is sent to the Power Management
+        * controller.
+        */
+       *crit_trip = clamp(*crit_trip + 5000, 25000, 90000);
+}
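A quick worked example of the clamping above, using the defaults in this function: a device-tree critical trip of 90000 millicelsius gains 5000 of headroom, and clamp(95000, 25000, 90000) brings it back to the 90000 calibration ceiling, while a critical trip of 80000 becomes 85000 and passes through unclamped.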
+
+static int tegra_tsensor_enable_hw_channel(const struct tegra_tsensor *ts,
+                                          unsigned int id)
+{
+       const struct tegra_tsensor_channel *tsc = &ts->ch[id];
+       struct thermal_zone_device *tzd = tsc->tzd;
+       int err, hot_trip = 0, crit_trip = 0;
+       u32 val;
+
+       if (!tzd) {
+               val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_CONFIG0);
+               val &= ~TSENSOR_SENSOR0_CONFIG0_SENSOR_STOP;
+               writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_CONFIG0);
+
+               return 0;
+       }
+
+       tegra_tsensor_get_hw_channel_trips(tzd, &hot_trip, &crit_trip);
+
+       /* prevent potential racing with tegra_tsensor_set_trips() */
+       mutex_lock(&tzd->lock);
+
+       dev_info_once(ts->dev, "ch%u: PMC emergency shutdown trip set to %dC\n",
+                     id, DIV_ROUND_CLOSEST(crit_trip, 1000));
+
+       hot_trip  = tegra_tsensor_temp_to_counter(ts, hot_trip);
+       crit_trip = tegra_tsensor_temp_to_counter(ts, crit_trip);
+
+       /* program LEVEL2 counter threshold */
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_CONFIG1);
+       val &= ~TSENSOR_SENSOR0_CONFIG1_TH2;
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG1_TH2, hot_trip);
+       writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_CONFIG1);
+
+       /* program LEVEL3 counter threshold */
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_CONFIG2);
+       val &= ~TSENSOR_SENSOR0_CONFIG2_TH3;
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG2_TH3, crit_trip);
+       writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_CONFIG2);
+
+       /*
+        * Enable sensor, emergency shutdown, interrupts for level 1/2/3
+        * breaches and counter overflow condition.
+        *
+        * Disable DIV2 throttle for now since we need to figure out how
+        * to integrate it properly with the thermal framework.
+        *
+        * Thermal levels supported by hardware:
+        *
+        *     Level 0 = cold
+        *     Level 1 = passive cooling (cpufreq DVFS)
+        *     Level 2 = passive cooling assisted by hardware (DIV2)
+        *     Level 3 = emergency shutdown assisted by hardware (PMC)
+        */
+       val = readl_relaxed(tsc->regs + TSENSOR_SENSOR0_CONFIG0);
+       val &= ~TSENSOR_SENSOR0_CONFIG0_SENSOR_STOP;
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_DVFS_EN, 1);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_HW_FREQ_DIV_EN, 0);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_THERMAL_RST_EN, 1);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_INTR_OVERFLOW_EN, 1);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_INTR_HW_FREQ_DIV_EN, 1);
+       val |= FIELD_PREP(TSENSOR_SENSOR0_CONFIG0_INTR_THERMAL_RST_EN, 1);
+       writel_relaxed(val, tsc->regs + TSENSOR_SENSOR0_CONFIG0);
+
+       mutex_unlock(&tzd->lock);
+
+       err = thermal_zone_device_enable(tzd);
+       if (err) {
+               dev_err(ts->dev, "ch%u: failed to enable zone: %d\n", id, err);
+               return err;
+       }
+
+       return 0;
+}
+
+static bool tegra_tsensor_fuse_read_spare(unsigned int spare)
+{
+       u32 val = 0;
+
+       tegra_fuse_readl(TEGRA30_FUSE_SPARE_BIT + spare * 4, &val);
+
+       return !!val;
+}
+
+static int tegra_tsensor_nvmem_setup(struct tegra_tsensor *ts)
+{
+       u32 i, ate_ver = 0, cal = 0, t1_25C = 0, t2_90C = 0;
+       int err, c1_25C, c2_90C;
+
+       err = tegra_fuse_readl(TEGRA30_FUSE_TEST_PROG_VER, &ate_ver);
+       if (err) {
+               dev_err_probe(ts->dev, err, "failed to get ATE version\n");
+               return err;
+       }
+
+       if (ate_ver < 8) {
+               dev_info(ts->dev, "unsupported ATE version: %u\n", ate_ver);
+               return -ENODEV;
+       }
+
+       /*
+        * We have two TSENSOR channels in two different spots on the SoC.
+        * The second channel provides more accurate data on older SoC versions,
+        * so use it as the primary channel.
+        */
+       if (ate_ver <= 21) {
+               dev_info_once(ts->dev,
+                             "older ATE version detected, channels remapped\n");
+               ts->swap_channels = true;
+       }
+
+       err = tegra_fuse_readl(TEGRA30_FUSE_TSENSOR_CALIB, &cal);
+       if (err) {
+               dev_err(ts->dev, "failed to get calibration data: %d\n", err);
+               return err;
+       }
+
+       /* get calibrated counter values for 25C/90C thresholds */
+       c1_25C = FIELD_GET(TEGRA30_FUSE_TSENSOR_CALIB_LOW, cal);
+       c2_90C = FIELD_GET(TEGRA30_FUSE_TSENSOR_CALIB_HIGH, cal);
+
+       /* and calibrated temperatures corresponding to the counter values */
+       for (i = 0; i < 7; i++) {
+               t1_25C |= tegra_tsensor_fuse_read_spare(14 + i) << i;
+               t1_25C |= tegra_tsensor_fuse_read_spare(21 + i) << i;
+
+               t2_90C |= tegra_tsensor_fuse_read_spare(0 + i) << i;
+               t2_90C |= tegra_tsensor_fuse_read_spare(7 + i) << i;
+       }
+
+       if (c2_90C - c1_25C <= t2_90C - t1_25C) {
+               dev_err(ts->dev, "invalid calibration data: %d %d %u %u\n",
+                       c2_90C, c1_25C, t2_90C, t1_25C);
+               return -EINVAL;
+       }
+
+       /* all calibration coefficients are premultiplied by 1000000 */
+
+       ts->calib.a = DIV_ROUND_CLOSEST((t2_90C - t1_25C) * 1000000,
+                                       (c2_90C - c1_25C));
+
+       ts->calib.b = t1_25C * 1000000 - ts->calib.a * c1_25C;
+
+       if (tegra_sku_info.revision == TEGRA_REVISION_A01) {
+               ts->calib.m =     -2775;
+               ts->calib.n =   1338811;
+               ts->calib.p =  -7300000;
+       } else {
+               ts->calib.m =     -3512;
+               ts->calib.n =   1528943;
+               ts->calib.p = -11100000;
+       }
+
+       /* except the coefficient of a reduced quadratic equation */
+       ts->calib.r = DIV_ROUND_CLOSEST(ts->calib.n, ts->calib.m * 2);
+
+       dev_info_once(ts->dev,
+                     "calibration: %d %d %u %u ATE ver: %u SoC rev: %u\n",
+                     c2_90C, c1_25C, t2_90C, t1_25C, ate_ver,
+                     tegra_sku_info.revision);
+
+       return 0;
+}
+
+static int tegra_tsensor_register_channel(struct tegra_tsensor *ts,
+                                         unsigned int id)
+{
+       struct tegra_tsensor_channel *tsc = &ts->ch[id];
+       unsigned int hw_id = ts->swap_channels ? !id : id;
+
+       tsc->ts = ts;
+       tsc->id = id;
+       tsc->regs = ts->regs + 0x40 * (hw_id + 1);
+
+       tsc->tzd = devm_thermal_zone_of_sensor_register(ts->dev, id, tsc, &ops);
+       if (IS_ERR(tsc->tzd)) {
+               if (PTR_ERR(tsc->tzd) != -ENODEV)
+                       return dev_err_probe(ts->dev, PTR_ERR(tsc->tzd),
+                                            "failed to register thermal zone\n");
+
+               /*
+                * It's okay if the sensor isn't assigned to any thermal zone
+                * in the device tree.
+                */
+               tsc->tzd = NULL;
+               return 0;
+       }
+
+       if (devm_thermal_add_hwmon_sysfs(tsc->tzd))
+               dev_warn(ts->dev, "failed to add hwmon sysfs attributes\n");
+
+       return 0;
+}
+
+static int tegra_tsensor_probe(struct platform_device *pdev)
+{
+       struct tegra_tsensor *ts;
+       unsigned int i;
+       int err, irq;
+
+       ts = devm_kzalloc(&pdev->dev, sizeof(*ts), GFP_KERNEL);
+       if (!ts)
+               return -ENOMEM;
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0)
+               return irq;
+
+       ts->dev = &pdev->dev;
+       platform_set_drvdata(pdev, ts);
+
+       ts->regs = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(ts->regs))
+               return PTR_ERR(ts->regs);
+
+       ts->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(ts->clk))
+               return dev_err_probe(&pdev->dev, PTR_ERR(ts->clk),
+                                    "failed to get clock\n");
+
+       ts->rst = devm_reset_control_get_exclusive(&pdev->dev, NULL);
+       if (IS_ERR(ts->rst))
+               return dev_err_probe(&pdev->dev, PTR_ERR(ts->rst),
+                                    "failed to get reset control\n");
+
+       err = tegra_tsensor_nvmem_setup(ts);
+       if (err)
+               return err;
+
+       err = tegra_tsensor_hw_enable(ts);
+       if (err)
+               return err;
+
+       err = devm_add_action_or_reset(&pdev->dev,
+                                      devm_tegra_tsensor_hw_disable,
+                                      ts);
+       if (err)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(ts->ch); i++) {
+               err = tegra_tsensor_register_channel(ts, i);
+               if (err)
+                       return err;
+       }
+
+       err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+                                       tegra_tsensor_isr, IRQF_ONESHOT,
+                                       "tegra_tsensor", ts);
+       if (err)
+               return dev_err_probe(&pdev->dev, err,
+                                    "failed to request interrupt\n");
+
+       for (i = 0; i < ARRAY_SIZE(ts->ch); i++) {
+               err = tegra_tsensor_enable_hw_channel(ts, i);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+static int __maybe_unused tegra_tsensor_suspend(struct device *dev)
+{
+       struct tegra_tsensor *ts = dev_get_drvdata(dev);
+       unsigned int i;
+       int err;
+
+       for (i = 0; i < ARRAY_SIZE(ts->ch); i++) {
+               err = tegra_tsensor_disable_hw_channel(ts, i);
+               if (err)
+                       goto enable_channel;
+       }
+
+       err = tegra_tsensor_hw_disable(ts);
+       if (err)
+               goto enable_channel;
+
+       return 0;
+
+enable_channel:
+       while (i--)
+               tegra_tsensor_enable_hw_channel(ts, i);
+
+       return err;
+}
+
+static int __maybe_unused tegra_tsensor_resume(struct device *dev)
+{
+       struct tegra_tsensor *ts = dev_get_drvdata(dev);
+       unsigned int i;
+       int err;
+
+       err = tegra_tsensor_hw_enable(ts);
+       if (err)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(ts->ch); i++) {
+               err = tegra_tsensor_enable_hw_channel(ts, i);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+static const struct dev_pm_ops tegra_tsensor_pm_ops = {
+       SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_tsensor_suspend,
+                                     tegra_tsensor_resume)
+};
+
+static const struct of_device_id tegra_tsensor_of_match[] = {
+       { .compatible = "nvidia,tegra30-tsensor", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, tegra_tsensor_of_match);
+
+static struct platform_driver tegra_tsensor_driver = {
+       .probe = tegra_tsensor_probe,
+       .driver = {
+               .name = "tegra30-tsensor",
+               .of_match_table = tegra_tsensor_of_match,
+               .pm = &tegra_tsensor_pm_ops,
+       },
+};
+module_platform_driver(tegra_tsensor_driver);
+
+MODULE_DESCRIPTION("NVIDIA Tegra30 Thermal Sensor driver");
+MODULE_AUTHOR("Dmitry Osipenko <digetx@gmail.com>");
+MODULE_LICENSE("GPL");
index a503c1b..3d91982 100644 (file)
@@ -33,6 +33,16 @@ config VDPA_SIM_BLOCK
          vDPA block device simulator which terminates IO request in a
          memory buffer.
 
+config VDPA_USER
+       tristate "VDUSE (vDPA Device in Userspace) support"
+       depends on EVENTFD && MMU && HAS_DMA
+       select DMA_OPS
+       select VHOST_IOTLB
+       select IOMMU_IOVA
+       help
+         With VDUSE it is possible to emulate a vDPA Device
+         in a userspace program.
+
 config IFCVF
        tristate "Intel IFC VF vDPA driver"
        depends on PCI_MSI
@@ -53,6 +63,7 @@ config MLX5_VDPA
 config MLX5_VDPA_NET
        tristate "vDPA driver for ConnectX devices"
        select MLX5_VDPA
+       select VHOST_RING
        depends on MLX5_CORE
        help
          VDPA network driver for ConnectX6 and newer. Provides offloading
index 67fe7f3..f02ebed 100644 (file)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VDPA) += vdpa.o
 obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_VDPA_USER) += vdpa_user/
 obj-$(CONFIG_IFCVF)    += ifcvf/
 obj-$(CONFIG_MLX5_VDPA) += mlx5/
 obj-$(CONFIG_VP_VDPA)    += virtio_pci/
index 6e197fe..2808f1b 100644 (file)
@@ -158,7 +158,9 @@ next:
                return -EIO;
        }
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+       hw->nr_vring = ifc_ioread16(&hw->common_cfg->num_queues);
+
+       for (i = 0; i < hw->nr_vring; i++) {
                ifc_iowrite16(i, &hw->common_cfg->queue_select);
                notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off);
                hw->vring[i].notify_addr = hw->notify_base +
@@ -304,7 +306,7 @@ u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
        u32 q_pair_id;
 
        ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
-       q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+       q_pair_id = qid / hw->nr_vring;
        avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
        last_avail_idx = ifc_ioread16(avail_idx_addr);
 
@@ -318,7 +320,7 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num)
        u32 q_pair_id;
 
        ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
-       q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+       q_pair_id = qid / hw->nr_vring;
        avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
        hw->vring[qid].last_avail_idx = num;
        ifc_iowrite16(num, avail_idx_addr);
index 2996db0..09918af 100644 (file)
 #define N3000_DEVICE_ID                0x1041
 #define N3000_SUBSYS_DEVICE_ID 0x001A
 
-#define IFCVF_NET_SUPPORTED_FEATURES \
-               ((1ULL << VIRTIO_NET_F_MAC)                     | \
-                (1ULL << VIRTIO_F_ANY_LAYOUT)                  | \
-                (1ULL << VIRTIO_F_VERSION_1)                   | \
-                (1ULL << VIRTIO_NET_F_STATUS)                  | \
-                (1ULL << VIRTIO_F_ORDER_PLATFORM)              | \
-                (1ULL << VIRTIO_F_ACCESS_PLATFORM)             | \
-                (1ULL << VIRTIO_NET_F_MRG_RXBUF))
-
-/* Only one queue pair for now. */
-#define IFCVF_MAX_QUEUE_PAIRS  1
+/* Max 8 data queue pairs (16 queues) and one control vq for now. */
+#define IFCVF_MAX_QUEUES       17
 
 #define IFCVF_QUEUE_ALIGNMENT  PAGE_SIZE
 #define IFCVF_QUEUE_MAX                32768
@@ -51,8 +42,6 @@
 #define ifcvf_private_to_vf(adapter) \
        (&((struct ifcvf_adapter *)adapter)->vf)
 
-#define IFCVF_MAX_INTR (IFCVF_MAX_QUEUE_PAIRS * 2 + 1)
-
 struct vring_info {
        u64 desc;
        u64 avail;
@@ -83,7 +72,7 @@ struct ifcvf_hw {
        u32 dev_type;
        struct virtio_pci_common_cfg __iomem *common_cfg;
        void __iomem *net_cfg;
-       struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2];
+       struct vring_info vring[IFCVF_MAX_QUEUES];
        void __iomem * const *base;
        char config_msix_name[256];
        struct vdpa_callback config_cb;
@@ -103,7 +92,13 @@ struct ifcvf_vring_lm_cfg {
 
 struct ifcvf_lm_cfg {
        u8 reserved[IFCVF_LM_RING_STATE_OFFSET];
-       struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUE_PAIRS];
+       struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUES];
+};
+
+struct ifcvf_vdpa_mgmt_dev {
+       struct vdpa_mgmt_dev mdev;
+       struct ifcvf_adapter *adapter;
+       struct pci_dev *pdev;
 };
 
 int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev);
index 351c6cf..dcd648e 100644 (file)
@@ -63,9 +63,13 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
        struct pci_dev *pdev = adapter->pdev;
        struct ifcvf_hw *vf = &adapter->vf;
        int vector, i, ret, irq;
+       u16 max_intr;
 
-       ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
-                                   IFCVF_MAX_INTR, PCI_IRQ_MSIX);
+       /* all queues and config interrupt  */
+       max_intr = vf->nr_vring + 1;
+
+       ret = pci_alloc_irq_vectors(pdev, max_intr,
+                                   max_intr, PCI_IRQ_MSIX);
        if (ret < 0) {
                IFCVF_ERR(pdev, "Failed to alloc IRQ vectors\n");
                return ret;
@@ -83,7 +87,7 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
                return ret;
        }
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+       for (i = 0; i < vf->nr_vring; i++) {
                snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d\n",
                         pci_name(pdev), i);
                vector = i + IFCVF_MSI_QUEUE_OFF;
@@ -112,7 +116,6 @@ static int ifcvf_start_datapath(void *private)
        u8 status;
        int ret;
 
-       vf->nr_vring = IFCVF_MAX_QUEUE_PAIRS * 2;
        ret = ifcvf_start_hw(vf);
        if (ret < 0) {
                status = ifcvf_get_status(vf);
@@ -128,7 +131,7 @@ static int ifcvf_stop_datapath(void *private)
        struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
        int i;
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+       for (i = 0; i < vf->nr_vring; i++)
                vf->vring[i].cb.callback = NULL;
 
        ifcvf_stop_hw(vf);
@@ -141,7 +144,7 @@ static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
        struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter);
        int i;
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+       for (i = 0; i < vf->nr_vring; i++) {
                vf->vring[i].last_avail_idx = 0;
                vf->vring[i].desc = 0;
                vf->vring[i].avail = 0;
@@ -171,17 +174,12 @@ static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
        struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
        struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
        struct pci_dev *pdev = adapter->pdev;
-
+       u32 type = vf->dev_type;
        u64 features;
 
-       switch (vf->dev_type) {
-       case VIRTIO_ID_NET:
-               features = ifcvf_get_features(vf) & IFCVF_NET_SUPPORTED_FEATURES;
-               break;
-       case VIRTIO_ID_BLOCK:
+       if (type == VIRTIO_ID_NET || type == VIRTIO_ID_BLOCK)
                features = ifcvf_get_features(vf);
-               break;
-       default:
+       else {
                features = 0;
                IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type);
        }
@@ -218,23 +216,12 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
        int ret;
 
        vf  = vdpa_to_vf(vdpa_dev);
-       adapter = dev_get_drvdata(vdpa_dev->dev.parent);
+       adapter = vdpa_to_adapter(vdpa_dev);
        status_old = ifcvf_get_status(vf);
 
        if (status_old == status)
                return;
 
-       if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) &&
-           !(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
-               ifcvf_stop_datapath(adapter);
-               ifcvf_free_irq(adapter, IFCVF_MAX_QUEUE_PAIRS * 2);
-       }
-
-       if (status == 0) {
-               ifcvf_reset_vring(adapter);
-               return;
-       }
-
        if ((status & VIRTIO_CONFIG_S_DRIVER_OK) &&
            !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) {
                ret = ifcvf_request_irq(adapter);
@@ -254,6 +241,29 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
        ifcvf_set_status(vf, status);
 }
 
+static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_adapter *adapter;
+       struct ifcvf_hw *vf;
+       u8 status_old;
+
+       vf  = vdpa_to_vf(vdpa_dev);
+       adapter = vdpa_to_adapter(vdpa_dev);
+       status_old = ifcvf_get_status(vf);
+
+       if (status_old == 0)
+               return 0;
+
+       if (status_old & VIRTIO_CONFIG_S_DRIVER_OK) {
+               ifcvf_stop_datapath(adapter);
+               ifcvf_free_irq(adapter, vf->nr_vring);
+       }
+
+       ifcvf_reset_vring(adapter);
+
+       return 0;
+}
+
 static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
 {
        return IFCVF_QUEUE_MAX;
@@ -437,6 +447,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
        .set_features   = ifcvf_vdpa_set_features,
        .get_status     = ifcvf_vdpa_get_status,
        .set_status     = ifcvf_vdpa_set_status,
+       .reset          = ifcvf_vdpa_reset,
        .get_vq_num_max = ifcvf_vdpa_get_vq_num_max,
        .get_vq_state   = ifcvf_vdpa_get_vq_state,
        .set_vq_state   = ifcvf_vdpa_set_vq_state,
@@ -458,63 +469,63 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
        .get_vq_notification = ifcvf_get_vq_notification,
 };
 
-static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static struct virtio_device_id id_table_net[] = {
+       {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID},
+       {0},
+};
+
+static struct virtio_device_id id_table_blk[] = {
+       {VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID},
+       {0},
+};
+
+static u32 get_dev_type(struct pci_dev *pdev)
 {
-       struct device *dev = &pdev->dev;
-       struct ifcvf_adapter *adapter;
-       struct ifcvf_hw *vf;
-       int ret, i;
+       u32 dev_type;
 
-       ret = pcim_enable_device(pdev);
-       if (ret) {
-               IFCVF_ERR(pdev, "Failed to enable device\n");
-               return ret;
-       }
+       /* This driver drives both modern virtio devices and transitional
+        * devices in modern mode.
+        * vDPA requires the feature bit VIRTIO_F_ACCESS_PLATFORM,
+        * so legacy devices and transitional devices in legacy
+        * mode will not work for vDPA; this driver will not
+        * drive devices with a legacy interface.
+        */
 
-       ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
-                                IFCVF_DRIVER_NAME);
-       if (ret) {
-               IFCVF_ERR(pdev, "Failed to request MMIO region\n");
-               return ret;
-       }
+       if (pdev->device < 0x1040)
+               dev_type =  pdev->subsystem_device;
+       else
+               dev_type =  pdev->device - 0x1040;
 
-       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
-       if (ret) {
-               IFCVF_ERR(pdev, "No usable DMA configuration\n");
-               return ret;
-       }
+       return dev_type;
+}
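A worked example of the mapping above, using the N3000 ID defined earlier in this file: PCI device ID 0x1041 is a modern virtio ID (>= 0x1040), so dev_type = 0x1041 - 0x1040 = 1, i.e. VIRTIO_ID_NET; a transitional device (ID below 0x1040) instead reports its virtio device type in the PCI subsystem device ID.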
 
-       ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
-       if (ret) {
-               IFCVF_ERR(pdev,
-                         "Failed for adding devres for freeing irq vectors\n");
-               return ret;
-       }
+static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+{
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+       struct ifcvf_adapter *adapter;
+       struct pci_dev *pdev;
+       struct ifcvf_hw *vf;
+       struct device *dev;
+       int ret, i;
 
+       ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
+       if (ifcvf_mgmt_dev->adapter)
+               return -EOPNOTSUPP;
+
+       pdev = ifcvf_mgmt_dev->pdev;
+       dev = &pdev->dev;
        adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
-                                   dev, &ifc_vdpa_ops, NULL);
+                                   dev, &ifc_vdpa_ops, name, false);
        if (IS_ERR(adapter)) {
                IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
                return PTR_ERR(adapter);
        }
 
-       pci_set_master(pdev);
-       pci_set_drvdata(pdev, adapter);
+       ifcvf_mgmt_dev->adapter = adapter;
+       pci_set_drvdata(pdev, ifcvf_mgmt_dev);
 
        vf = &adapter->vf;
-
-       /* This drirver drives both modern virtio devices and transitional
-        * devices in modern mode.
-        * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM,
-        * so legacy devices and transitional devices in legacy
-        * mode will not work for vDPA, this driver will not
-        * drive devices with legacy interface.
-        */
-       if (pdev->device < 0x1040)
-               vf->dev_type =  pdev->subsystem_device;
-       else
-               vf->dev_type =  pdev->device - 0x1040;
-
+       vf->dev_type = get_dev_type(pdev);
        vf->base = pcim_iomap_table(pdev);
 
        adapter->pdev = pdev;
@@ -526,14 +537,15 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                goto err;
        }
 
-       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+       for (i = 0; i < vf->nr_vring; i++)
                vf->vring[i].irq = -EINVAL;
 
        vf->hw_features = ifcvf_get_hw_features(vf);
 
-       ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2);
+       adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev;
+       ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring);
        if (ret) {
-               IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
+               IFCVF_ERR(pdev, "Failed to register to vDPA bus");
                goto err;
        }
 
@@ -544,11 +556,100 @@ err:
        return ret;
 }
 
+static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+
+       ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
+       _vdpa_unregister_device(dev);
+       ifcvf_mgmt_dev->adapter = NULL;
+}
+
+static const struct vdpa_mgmtdev_ops ifcvf_vdpa_mgmt_dev_ops = {
+       .dev_add = ifcvf_vdpa_dev_add,
+       .dev_del = ifcvf_vdpa_dev_del
+};
+
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
+       struct device *dev = &pdev->dev;
+       u32 dev_type;
+       int ret;
+
+       ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL);
+       if (!ifcvf_mgmt_dev) {
+               IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n");
+               return -ENOMEM;
+       }
+
+       dev_type = get_dev_type(pdev);
+       switch (dev_type) {
+       case VIRTIO_ID_NET:
+               ifcvf_mgmt_dev->mdev.id_table = id_table_net;
+               break;
+       case VIRTIO_ID_BLOCK:
+               ifcvf_mgmt_dev->mdev.id_table = id_table_blk;
+               break;
+       default:
+               IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type);
+               ret = -EOPNOTSUPP;
+               goto err;
+       }
+
+       ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops;
+       ifcvf_mgmt_dev->mdev.device = dev;
+       ifcvf_mgmt_dev->pdev = pdev;
+
+       ret = pcim_enable_device(pdev);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to enable device\n");
+               goto err;
+       }
+
+       ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
+                                IFCVF_DRIVER_NAME);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to request MMIO region\n");
+               goto err;
+       }
+
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+       if (ret) {
+               IFCVF_ERR(pdev, "No usable DMA configuration\n");
+               goto err;
+       }
+
+       ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
+       if (ret) {
+               IFCVF_ERR(pdev,
+                         "Failed for adding devres for freeing irq vectors\n");
+               goto err;
+       }
+
+       pci_set_master(pdev);
+
+       ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev);
+       if (ret) {
+               IFCVF_ERR(pdev,
+                         "Failed to initialize the management interfaces\n");
+               goto err;
+       }
+
+       return 0;
+
+err:
+       kfree(ifcvf_mgmt_dev);
+       return ret;
+}
+
 static void ifcvf_remove(struct pci_dev *pdev)
 {
-       struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+       struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
 
-       vdpa_unregister_device(&adapter->vdpa);
+       ifcvf_mgmt_dev = pci_get_drvdata(pdev);
+       vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev);
+       kfree(ifcvf_mgmt_dev);
 }
 
 static struct pci_device_id ifcvf_pci_ids[] = {
index 0002b21..01a848a 100644 (file)
@@ -5,7 +5,7 @@
 #define __MLX5_VDPA_H__
 
 #include <linux/etherdevice.h>
-#include <linux/if_vlan.h>
+#include <linux/vringh.h>
 #include <linux/vdpa.h>
 #include <linux/mlx5/driver.h>
 
@@ -48,6 +48,26 @@ struct mlx5_vdpa_resources {
        bool valid;
 };
 
+struct mlx5_control_vq {
+       struct vhost_iotlb *iotlb;
+       /* spinlock to synchronize iommu table */
+       spinlock_t iommu_lock;
+       struct vringh vring;
+       bool ready;
+       u64 desc_addr;
+       u64 device_addr;
+       u64 driver_addr;
+       struct vdpa_callback event_cb;
+       struct vringh_kiov riov;
+       struct vringh_kiov wiov;
+       unsigned short head;
+};
+
+struct mlx5_ctrl_wq_ent {
+       struct work_struct work;
+       struct mlx5_vdpa_dev *mvdev;
+};
+
 struct mlx5_vdpa_dev {
        struct vdpa_device vdev;
        struct mlx5_core_dev *mdev;
@@ -57,9 +77,12 @@ struct mlx5_vdpa_dev {
        u64 actual_features;
        u8 status;
        u32 max_vqs;
+       u16 max_idx;
        u32 generation;
 
        struct mlx5_vdpa_mr mr;
+       struct mlx5_control_vq cvq;
+       struct workqueue_struct *wq;
 };
 
 int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
@@ -68,6 +91,7 @@ int mlx5_vdpa_get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey);
 int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn);
 void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn);
 int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn);
+int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn);
 void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn);
 int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn);
 void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn);
index e59135f..ff010c6 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
 
+#include <linux/vhost_types.h>
 #include <linux/vdpa.h>
 #include <linux/gcd.h>
 #include <linux/string.h>
@@ -451,33 +452,30 @@ static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
        mlx5_vdpa_destroy_mkey(mvdev, &mr->mkey);
 }
 
-static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src)
 {
-       struct mlx5_vdpa_mr *mr = &mvdev->mr;
+       struct vhost_iotlb_map *map;
+       u64 start = 0, last = ULLONG_MAX;
        int err;
 
-       if (mr->initialized)
-               return 0;
-
-       if (iotlb)
-               err = create_user_mr(mvdev, iotlb);
-       else
-               err = create_dma_mr(mvdev, mr);
-
-       if (!err)
-               mr->initialized = true;
+       if (!src) {
+               err = vhost_iotlb_add_range(mvdev->cvq.iotlb, start, last, start, VHOST_ACCESS_RW);
+               return err;
+       }
 
-       return err;
+       for (map = vhost_iotlb_itree_first(src, start, last); map;
+               map = vhost_iotlb_itree_next(map, start, last)) {
+               err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, map->last,
+                                           map->addr, map->perm);
+               if (err)
+                       return err;
+       }
+       return 0;
 }
 
-int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+static void prune_iotlb(struct mlx5_vdpa_dev *mvdev)
 {
-       int err;
-
-       mutex_lock(&mvdev->mr.mkey_mtx);
-       err = _mlx5_vdpa_create_mr(mvdev, iotlb);
-       mutex_unlock(&mvdev->mr.mkey_mtx);
-       return err;
+       vhost_iotlb_del_range(mvdev->cvq.iotlb, 0, ULLONG_MAX);
 }
 
 static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr)
@@ -501,6 +499,7 @@ void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
        if (!mr->initialized)
                goto out;
 
+       prune_iotlb(mvdev);
        if (mr->user_mr)
                destroy_user_mr(mvdev, mr);
        else
@@ -512,6 +511,48 @@ out:
        mutex_unlock(&mr->mkey_mtx);
 }
 
+static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+       struct mlx5_vdpa_mr *mr = &mvdev->mr;
+       int err;
+
+       if (mr->initialized)
+               return 0;
+
+       if (iotlb)
+               err = create_user_mr(mvdev, iotlb);
+       else
+               err = create_dma_mr(mvdev, mr);
+
+       if (err)
+               return err;
+
+       err = dup_iotlb(mvdev, iotlb);
+       if (err)
+               goto out_err;
+
+       mr->initialized = true;
+       return 0;
+
+out_err:
+       if (iotlb)
+               destroy_user_mr(mvdev, mr);
+       else
+               destroy_dma_mr(mvdev, mr);
+
+       return err;
+}
+
+int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+       int err;
+
+       mutex_lock(&mvdev->mr.mkey_mtx);
+       err = _mlx5_vdpa_create_mr(mvdev, iotlb);
+       mutex_unlock(&mvdev->mr.mkey_mtx);
+       return err;
+}
+
 int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
                             bool *change_map)
 {
index d460621..15e266d 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
 
+#include <linux/iova.h>
 #include <linux/mlx5/driver.h>
 #include "mlx5_vdpa.h"
 
@@ -128,6 +129,16 @@ int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *
        return err;
 }
 
+int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn)
+{
+       u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {};
+
+       MLX5_SET(modify_rqt_in, in, uid, mvdev->res.uid);
+       MLX5_SET(modify_rqt_in, in, rqtn, rqtn);
+       MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT);
+       return mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out));
+}
+
 void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn)
 {
        u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {};
@@ -221,6 +232,22 @@ int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *m
        return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, in);
 }
 
+static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev)
+{
+       mvdev->cvq.iotlb = vhost_iotlb_alloc(0, 0);
+       if (!mvdev->cvq.iotlb)
+               return -ENOMEM;
+
+       vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, &mvdev->cvq.iommu_lock);
+
+       return 0;
+}
+
+static void cleanup_ctrl_vq(struct mlx5_vdpa_dev *mvdev)
+{
+       vhost_iotlb_free(mvdev->cvq.iotlb);
+}
+
 int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
 {
        u64 offset = MLX5_CAP64_DEV_VDPA_EMULATION(mvdev->mdev, doorbell_bar_offset);
@@ -260,10 +287,17 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
                err = -ENOMEM;
                goto err_key;
        }
+
+       err = init_ctrl_vq(mvdev);
+       if (err)
+               goto err_ctrl;
+
        res->valid = true;
 
        return 0;
 
+err_ctrl:
+       iounmap(res->kick_addr);
 err_key:
        dealloc_pd(mvdev, res->pdn, res->uid);
 err_pd:
@@ -282,6 +316,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
        if (!res->valid)
                return;
 
+       cleanup_ctrl_vq(mvdev);
        iounmap(res->kick_addr);
        res->kick_addr = NULL;
        dealloc_pd(mvdev, res->pdn, res->uid);
index 5906cad..294ba05 100644 (file)
@@ -45,6 +45,8 @@ MODULE_LICENSE("Dual BSD/GPL");
        (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
         VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
 
+#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
+
 struct mlx5_vdpa_net_resources {
        u32 tisn;
        u32 tdn;
@@ -90,7 +92,6 @@ struct mlx5_vq_restore_info {
        u16 avail_index;
        u16 used_index;
        bool ready;
-       struct vdpa_callback cb;
        bool restore;
 };
 
@@ -100,7 +101,6 @@ struct mlx5_vdpa_virtqueue {
        u64 device_addr;
        u64 driver_addr;
        u32 num_ent;
-       struct vdpa_callback event_cb;
 
        /* Resources for implementing the notification channel from the device
         * to the driver. fwqp is the firmware end of an RC connection; the
@@ -135,11 +135,20 @@ struct mlx5_vdpa_virtqueue {
  */
 #define MLX5_MAX_SUPPORTED_VQS 16
 
+static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
+{
+       if (unlikely(idx > mvdev->max_idx))
+               return false;
+
+       return true;
+}
+
 struct mlx5_vdpa_net {
        struct mlx5_vdpa_dev mvdev;
        struct mlx5_vdpa_net_resources res;
        struct virtio_net_config config;
        struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS];
+       struct vdpa_callback event_cbs[MLX5_MAX_SUPPORTED_VQS + 1];
 
        /* Serialize vq resources creation and destruction. This is required
         * since memory map might change and we need to destroy and create
@@ -151,15 +160,18 @@ struct mlx5_vdpa_net {
        struct mlx5_flow_handle *rx_rule;
        bool setup;
        u16 mtu;
+       u32 cur_num_vqs;
 };
 
 static void free_resources(struct mlx5_vdpa_net *ndev);
 static void init_mvqs(struct mlx5_vdpa_net *ndev);
-static int setup_driver(struct mlx5_vdpa_net *ndev);
+static int setup_driver(struct mlx5_vdpa_dev *mvdev);
 static void teardown_driver(struct mlx5_vdpa_net *ndev);
 
 static bool mlx5_vdpa_debug;
 
+#define MLX5_CVQ_MAX_ENT 16
+
 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
        do {                                                                                       \
                if (features & BIT_ULL(_feature))                                                  \
@@ -172,11 +184,41 @@ static bool mlx5_vdpa_debug;
                        mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
        } while (0)
 
+/* TODO: cross-endian support */
+static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
+{
+       return virtio_legacy_is_little_endian() ||
+               (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
+}
+
+static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
+{
+       return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
+static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
+{
+       return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
 static inline u32 mlx5_vdpa_max_qps(int max_vqs)
 {
        return max_vqs / 2;
 }
 
+static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
+{
+       if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
+               return 2;
+
+       return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs);
+}
+
+static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
+{
+       return idx == ctrl_vq_idx(mvdev);
+}
+
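
To make the numbering above concrete, a small illustrative sketch, assuming a device that advertises max_vqs = 16; example_cvq_index() is a hypothetical name used only here and simply mirrors ctrl_vq_idx():

	/* Data virtqueues come in RX/TX pairs; the control virtqueue follows
	 * the last data virtqueue when VIRTIO_NET_F_MQ is negotiated, and is
	 * fixed at index 2 otherwise.
	 */
	static u16 example_cvq_index(u64 negotiated_features, u32 max_vqs)
	{
		if (!(negotiated_features & BIT_ULL(VIRTIO_NET_F_MQ)))
			return 2;

		return 2 * (max_vqs / 2);	/* 16 when max_vqs == 16 */
	}
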
 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
 {
        if (status & ~VALID_STATUS_MASK)
@@ -481,6 +523,10 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
 
 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
 {
+       struct mlx5_vdpa_net *ndev = mvq->ndev;
+       struct vdpa_callback *event_cb;
+
+       event_cb = &ndev->event_cbs[mvq->index];
        mlx5_cq_set_ci(&mvq->cq.mcq);
 
        /* make sure CQ consumer update is visible to the hardware before updating
@@ -488,8 +534,8 @@ static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int nu
         */
        dma_wmb();
        rx_post(&mvq->vqqp, num);
-       if (mvq->event_cb.callback)
-               mvq->event_cb.callback(mvq->event_cb.private);
+       if (event_cb->callback)
+               event_cb->callback(event_cb->private);
 }
 
 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
@@ -1100,10 +1146,8 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
        if (!mvq->num_ent)
                return 0;
 
-       if (mvq->initialized) {
-               mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n");
-               return -EINVAL;
-       }
+       if (mvq->initialized)
+               return 0;
 
        err = cq_create(ndev, idx, mvq->num_ent);
        if (err)
@@ -1190,19 +1234,20 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *
 
 static int create_rqt(struct mlx5_vdpa_net *ndev)
 {
-       int log_max_rqt;
        __be32 *list;
+       int max_rqt;
        void *rqtc;
        int inlen;
        void *in;
        int i, j;
        int err;
 
-       log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
-       if (log_max_rqt < 1)
+       max_rqt = min_t(int, MLX5_MAX_SUPPORTED_VQS / 2,
+                       1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
+       if (max_rqt < 1)
                return -EOPNOTSUPP;
 
-       inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num);
+       inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
        in = kzalloc(inlen, GFP_KERNEL);
        if (!in)
                return -ENOMEM;
@@ -1211,10 +1256,9 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
        rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
        MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
-       MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt);
-       MLX5_SET(rqtc, rqtc, rqt_actual_size, 1);
+       MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt);
        list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
-       for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) {
+       for (i = 0, j = 0; j < max_rqt; j++) {
                if (!ndev->vqs[j].initialized)
                        continue;
 
@@ -1223,6 +1267,7 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
                        i++;
                }
        }
+       MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
 
        err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
        kfree(in);
@@ -1232,6 +1277,52 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
        return 0;
 }
 
+#define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
+
+static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
+{
+       __be32 *list;
+       int max_rqt;
+       void *rqtc;
+       int inlen;
+       void *in;
+       int i, j;
+       int err;
+
+       max_rqt = min_t(int, ndev->cur_num_vqs / 2,
+                       1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size));
+       if (max_rqt < 1)
+               return -EOPNOTSUPP;
+
+       inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num);
+       in = kzalloc(inlen, GFP_KERNEL);
+       if (!in)
+               return -ENOMEM;
+
+       MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
+       MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
+       rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
+       MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
+
+       list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
+       for (i = 0, j = 0; j < num; j++) {
+               if (!ndev->vqs[j].initialized)
+                       continue;
+
+               if (!vq_is_tx(ndev->vqs[j].index)) {
+                       list[i] = cpu_to_be32(ndev->vqs[j].virtq_id);
+                       i++;
+               }
+       }
+       MLX5_SET(rqtc, rqtc, rqt_actual_size, i);
+       err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
+       kfree(in);
+       if (err)
+               return err;
+
+       return 0;
+}
+
 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
 {
        mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
@@ -1345,12 +1436,206 @@ static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
        ndev->rx_rule = NULL;
 }
 
+static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
+       virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+       struct mlx5_core_dev *pfmdev;
+       size_t read;
+       u8 mac[ETH_ALEN];
+
+       pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
+       switch (cmd) {
+       case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+               read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
+               if (read != ETH_ALEN)
+                       break;
+
+               if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
+                       status = VIRTIO_NET_OK;
+                       break;
+               }
+
+               if (!is_zero_ether_addr(ndev->config.mac)) {
+                       if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
+                               mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
+                                              ndev->config.mac);
+                               break;
+                       }
+               }
+
+               if (mlx5_mpfs_add_mac(pfmdev, mac)) {
+                       mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
+                                      mac);
+                       break;
+               }
+
+               memcpy(ndev->config.mac, mac, ETH_ALEN);
+               status = VIRTIO_NET_OK;
+               break;
+
+       default:
+               break;
+       }
+
+       return status;
+}
+
+static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
+{
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       int cur_qps = ndev->cur_num_vqs / 2;
+       int err;
+       int i;
+
+       if (cur_qps > newqps) {
+               err = modify_rqt(ndev, 2 * newqps);
+               if (err)
+                       return err;
+
+               for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
+                       teardown_vq(ndev, &ndev->vqs[i]);
+
+               ndev->cur_num_vqs = 2 * newqps;
+       } else {
+               ndev->cur_num_vqs = 2 * newqps;
+               for (i = cur_qps * 2; i < 2 * newqps; i++) {
+                       err = setup_vq(ndev, &ndev->vqs[i]);
+                       if (err)
+                               goto clean_added;
+               }
+               err = modify_rqt(ndev, 2 * newqps);
+               if (err)
+                       goto clean_added;
+       }
+       return 0;
+
+clean_added:
+       for (--i; i >= cur_qps; --i)
+               teardown_vq(ndev, &ndev->vqs[i]);
+
+       return err;
+}
+
+static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
+{
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
+       struct virtio_net_ctrl_mq mq;
+       size_t read;
+       u16 newqps;
+
+       switch (cmd) {
+       case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
+               read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
+               if (read != sizeof(mq))
+                       break;
+
+               newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
+               if (ndev->cur_num_vqs == 2 * newqps) {
+                       status = VIRTIO_NET_OK;
+                       break;
+               }
+
+               if (newqps & (newqps - 1))
+                       break;
+
+               if (!change_num_qps(mvdev, newqps))
+                       status = VIRTIO_NET_OK;
+
+               break;
+       default:
+               break;
+       }
+
+       return status;
+}
+
+static void mlx5_cvq_kick_handler(struct work_struct *work)
+{
+       virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+       struct virtio_net_ctrl_hdr ctrl;
+       struct mlx5_ctrl_wq_ent *wqent;
+       struct mlx5_vdpa_dev *mvdev;
+       struct mlx5_control_vq *cvq;
+       struct mlx5_vdpa_net *ndev;
+       size_t read, write;
+       int err;
+
+       wqent = container_of(work, struct mlx5_ctrl_wq_ent, work);
+       mvdev = wqent->mvdev;
+       ndev = to_mlx5_vdpa_ndev(mvdev);
+       cvq = &mvdev->cvq;
+       if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+               goto out;
+
+       if (!cvq->ready)
+               goto out;
+
+       while (true) {
+               err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
+                                          GFP_ATOMIC);
+               if (err <= 0)
+                       break;
+
+               read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
+               if (read != sizeof(ctrl))
+                       break;
+
+               switch (ctrl.class) {
+               case VIRTIO_NET_CTRL_MAC:
+                       status = handle_ctrl_mac(mvdev, ctrl.cmd);
+                       break;
+               case VIRTIO_NET_CTRL_MQ:
+                       status = handle_ctrl_mq(mvdev, ctrl.cmd);
+                       break;
+
+               default:
+                       break;
+               }
+
+               /* Make sure data is written before advancing index */
+               smp_wmb();
+
+               write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
+               vringh_complete_iotlb(&cvq->vring, cvq->head, write);
+               vringh_kiov_cleanup(&cvq->riov);
+               vringh_kiov_cleanup(&cvq->wiov);
+
+               if (vringh_need_notify_iotlb(&cvq->vring))
+                       vringh_notify(&cvq->vring);
+       }
+out:
+       kfree(wqent);
+}
+
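
For reference, a sketch of the buffer layout the handler above pulls from the control virtqueue, built from the uapi virtio_net definitions; vdev and the value 4 are illustrative assumptions for a guest enabling four queue pairs:

	struct {
		struct virtio_net_ctrl_hdr hdr;	/* read first by the handler */
		struct virtio_net_ctrl_mq mq;	/* command-specific payload  */
	} req = {
		.hdr = { .class = VIRTIO_NET_CTRL_MQ,
			 .cmd   = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET },
		.mq  = { .virtqueue_pairs = cpu_to_virtio16(vdev, 4) },
	};
	virtio_net_ctrl_ack ack = VIRTIO_NET_ERR;	/* device writes VIRTIO_NET_OK back */
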
 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
+       struct mlx5_ctrl_wq_ent *wqent;
+
+       if (!is_index_valid(mvdev, idx))
+               return;
+
+       if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
+               if (!mvdev->cvq.ready)
+                       return;
+
+               wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
+               if (!wqent)
+                       return;
 
+               wqent->mvdev = mvdev;
+               INIT_WORK(&wqent->work, mlx5_cvq_kick_handler);
+               queue_work(mvdev->wq, &wqent->work);
+               return;
+       }
+
+       mvq = &ndev->vqs[idx];
        if (unlikely(!mvq->ready))
                return;
 
@@ -1362,8 +1647,19 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
+
+       if (!is_index_valid(mvdev, idx))
+               return -EINVAL;
 
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               mvdev->cvq.desc_addr = desc_area;
+               mvdev->cvq.device_addr = device_area;
+               mvdev->cvq.driver_addr = driver_area;
+               return 0;
+       }
+
+       mvq = &ndev->vqs[idx];
        mvq->desc_addr = desc_area;
        mvq->device_addr = device_area;
        mvq->driver_addr = driver_area;
@@ -1376,6 +1672,9 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        struct mlx5_vdpa_virtqueue *mvq;
 
+       if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
+               return;
+
        mvq = &ndev->vqs[idx];
        mvq->num_ent = num;
 }
@@ -1384,17 +1683,46 @@ static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_c
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx];
 
-       vq->event_cb = *cb;
+       ndev->event_cbs[idx] = *cb;
+}
+
+static void mlx5_cvq_notify(struct vringh *vring)
+{
+       struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
+
+       if (!cvq->event_cb.callback)
+               return;
+
+       cvq->event_cb.callback(cvq->event_cb.private);
+}
+
+static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
+{
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
+
+       cvq->ready = ready;
+       if (!ready)
+               return;
+
+       cvq->vring.notify = mlx5_cvq_notify;
 }
 
 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
+
+       if (!is_index_valid(mvdev, idx))
+               return;
+
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               set_cvq_ready(mvdev, ready);
+               return;
+       }
 
+       mvq = &ndev->vqs[idx];
        if (!ready)
                suspend_vq(ndev, mvq);
 
@@ -1405,9 +1733,14 @@ static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
 
-       return mvq->ready;
+       if (!is_index_valid(mvdev, idx))
+               return false;
+
+       if (is_ctrl_vq_idx(mvdev, idx))
+               return mvdev->cvq.ready;
+
+       return ndev->vqs[idx].ready;
 }
 
 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
@@ -1415,8 +1748,17 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
 
+       if (!is_index_valid(mvdev, idx))
+               return -EINVAL;
+
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
+               return 0;
+       }
+
+       mvq = &ndev->vqs[idx];
        if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
                mlx5_vdpa_warn(mvdev, "can't modify available index\n");
                return -EINVAL;
@@ -1431,10 +1773,19 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
        struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-       struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
+       struct mlx5_vdpa_virtqueue *mvq;
        struct mlx5_virtq_attr attr;
        int err;
 
+       if (!is_index_valid(mvdev, idx))
+               return -EINVAL;
+
+       if (is_ctrl_vq_idx(mvdev, idx)) {
+               state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
+               return 0;
+       }
+
+       mvq = &ndev->vqs[idx];
        /* If the virtq object was destroyed, use the value saved at
         * the last minute of suspend_vq. This caters for userspace
         * that cares about emulating the index after vq is stopped.
@@ -1491,10 +1842,14 @@ static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
        u16 dev_features;
 
        dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask);
-       ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features);
+       ndev->mvdev.mlx_features |= mlx_to_vritio_features(dev_features);
        if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
                ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
        ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
+       ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
+       ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
+       ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MQ);
+
        print_features(mvdev, ndev->mvdev.mlx_features, false);
        return ndev->mvdev.mlx_features;
 }
@@ -1507,17 +1862,29 @@ static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
        return 0;
 }
 
-static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
+static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
 {
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+       struct mlx5_control_vq *cvq = &mvdev->cvq;
        int err;
        int i;
 
-       for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) {
+       for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) {
                err = setup_vq(ndev, &ndev->vqs[i]);
                if (err)
                        goto err_vq;
        }
 
+       if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
+               err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
+                                       MLX5_CVQ_MAX_ENT, false,
+                                       (struct vring_desc *)(uintptr_t)cvq->desc_addr,
+                                       (struct vring_avail *)(uintptr_t)cvq->driver_addr,
+                                       (struct vring_used *)(uintptr_t)cvq->device_addr);
+               if (err)
+                       goto err_vq;
+       }
+
        return 0;
 
 err_vq:
@@ -1541,16 +1908,22 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
        }
 }
 
-/* TODO: cross-endian support */
-static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
-{
-       return virtio_legacy_is_little_endian() ||
-               (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
-}
-
-static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
+static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
 {
-       return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
+       if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
+               if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
+                       /* MQ supported. CVQ index is right above the last data virtqueue's */
+                       mvdev->max_idx = mvdev->max_vqs;
+               } else {
+                       /* Only CVQ supported. Data virtqueues occupy indices 0 and 1.
+                        * CVQ gets index 2
+                        */
+                       mvdev->max_idx = 2;
+               }
+       } else {
+               /* Two data virtqueues only: one for rx and one for tx */
+               mvdev->max_idx = 1;
+       }
 }
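
Spelled out for a device with max_vqs = 16, the ceiling that is_index_valid() enforces after this call works out as follows (illustrative only):

	/*   negotiated features      usable indices   max_idx
	 *   CTRL_VQ and MQ           0..16            16
	 *   CTRL_VQ only             0..2             2
	 *   neither                  0..1             1
	 */
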
 
 static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
@@ -1568,6 +1941,7 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
        ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
        ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
        ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+       update_cvq_info(mvdev);
        return err;
 }
 
@@ -1605,15 +1979,14 @@ static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
 {
        struct mlx5_vq_restore_info *ri = &mvq->ri;
-       struct mlx5_virtq_attr attr;
+       struct mlx5_virtq_attr attr = {};
        int err;
 
-       if (!mvq->initialized)
-               return 0;
-
-       err = query_virtqueue(ndev, mvq, &attr);
-       if (err)
-               return err;
+       if (mvq->initialized) {
+               err = query_virtqueue(ndev, mvq, &attr);
+               if (err)
+                       return err;
+       }
 
        ri->avail_index = attr.available_index;
        ri->used_index = attr.used_index;
@@ -1622,7 +1995,6 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
        ri->desc_addr = mvq->desc_addr;
        ri->device_addr = mvq->device_addr;
        ri->driver_addr = mvq->driver_addr;
-       ri->cb = mvq->event_cb;
        ri->restore = true;
        return 0;
 }
@@ -1667,12 +2039,12 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)
                mvq->desc_addr = ri->desc_addr;
                mvq->device_addr = ri->device_addr;
                mvq->driver_addr = ri->driver_addr;
-               mvq->event_cb = ri->cb;
        }
 }
 
-static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb)
+static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
 {
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        int err;
 
        suspend_vqs(ndev);
@@ -1681,58 +2053,59 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *
                goto err_mr;
 
        teardown_driver(ndev);
-       mlx5_vdpa_destroy_mr(&ndev->mvdev);
-       err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb);
+       mlx5_vdpa_destroy_mr(mvdev);
+       err = mlx5_vdpa_create_mr(mvdev, iotlb);
        if (err)
                goto err_mr;
 
-       if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
                return 0;
 
        restore_channels_info(ndev);
-       err = setup_driver(ndev);
+       err = setup_driver(mvdev);
        if (err)
                goto err_setup;
 
        return 0;
 
 err_setup:
-       mlx5_vdpa_destroy_mr(&ndev->mvdev);
+       mlx5_vdpa_destroy_mr(mvdev);
 err_mr:
        return err;
 }
 
-static int setup_driver(struct mlx5_vdpa_net *ndev)
+static int setup_driver(struct mlx5_vdpa_dev *mvdev)
 {
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        int err;
 
        mutex_lock(&ndev->reslock);
        if (ndev->setup) {
-               mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n");
+               mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
                err = 0;
                goto out;
        }
-       err = setup_virtqueues(ndev);
+       err = setup_virtqueues(mvdev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n");
+               mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
                goto out;
        }
 
        err = create_rqt(ndev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n");
+               mlx5_vdpa_warn(mvdev, "create_rqt\n");
                goto err_rqt;
        }
 
        err = create_tir(ndev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n");
+               mlx5_vdpa_warn(mvdev, "create_tir\n");
                goto err_tir;
        }
 
        err = add_fwd_to_tir(ndev);
        if (err) {
-               mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n");
+               mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n");
                goto err_fwd;
        }
        ndev->setup = true;
@@ -1781,24 +2154,10 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
        int err;
 
        print_status(mvdev, status, true);
-       if (!status) {
-               mlx5_vdpa_info(mvdev, "performing device reset\n");
-               teardown_driver(ndev);
-               clear_vqs_ready(ndev);
-               mlx5_vdpa_destroy_mr(&ndev->mvdev);
-               ndev->mvdev.status = 0;
-               ndev->mvdev.mlx_features = 0;
-               ++mvdev->generation;
-               if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
-                       if (mlx5_vdpa_create_mr(mvdev, NULL))
-                               mlx5_vdpa_warn(mvdev, "create MR failed\n");
-               }
-               return;
-       }
 
        if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
                if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
-                       err = setup_driver(ndev);
+                       err = setup_driver(mvdev);
                        if (err) {
                                mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
                                goto err_setup;
@@ -1817,6 +2176,29 @@ err_setup:
        ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 }
 
+static int mlx5_vdpa_reset(struct vdpa_device *vdev)
+{
+       struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+       print_status(mvdev, 0, true);
+       mlx5_vdpa_info(mvdev, "performing device reset\n");
+       teardown_driver(ndev);
+       clear_vqs_ready(ndev);
+       mlx5_vdpa_destroy_mr(&ndev->mvdev);
+       ndev->mvdev.status = 0;
+       ndev->mvdev.mlx_features = 0;
+       memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs));
+       ndev->mvdev.actual_features = 0;
+       ++mvdev->generation;
+       if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
+               if (mlx5_vdpa_create_mr(mvdev, NULL))
+                       mlx5_vdpa_warn(mvdev, "create MR failed\n");
+       }
+
+       return 0;
+}
+
 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
 {
        return sizeof(struct virtio_net_config);
@@ -1848,7 +2230,6 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
 {
        struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
-       struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
        bool change_map;
        int err;
 
@@ -1859,7 +2240,7 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb
        }
 
        if (change_map)
-               return mlx5_vdpa_change_map(ndev, iotlb);
+               return mlx5_vdpa_change_map(mvdev, iotlb);
 
        return 0;
 }
@@ -1889,6 +2270,9 @@ static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device
        struct mlx5_vdpa_net *ndev;
        phys_addr_t addr;
 
+       if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
+               return ret;
+
        /* If SF BAR size is smaller than PAGE_SIZE, do not use direct
         * notification to avoid the risk of mapping pages that contain BAR of more
         * than one SF
@@ -1928,6 +2312,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
        .get_vendor_id = mlx5_vdpa_get_vendor_id,
        .get_status = mlx5_vdpa_get_status,
        .set_status = mlx5_vdpa_set_status,
+       .reset = mlx5_vdpa_reset,
        .get_config_size = mlx5_vdpa_get_config_size,
        .get_config = mlx5_vdpa_get_config,
        .set_config = mlx5_vdpa_set_config,
@@ -2040,7 +2425,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
        max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
        ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-                                name);
+                                name, false);
        if (IS_ERR(ndev))
                return PTR_ERR(ndev);
 
@@ -2063,8 +2448,11 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
                err = mlx5_mpfs_add_mac(pfmdev, config->mac);
                if (err)
                        goto err_mtu;
+
+               ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
        }
 
+       config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs));
        mvdev->vdev.dma_dev = &mdev->pdev->dev;
        err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
        if (err)
@@ -2080,8 +2468,15 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
        if (err)
                goto err_mr;
 
+       mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_ctrl_wq");
+       if (!mvdev->wq) {
+               err = -ENOMEM;
+               goto err_res2;
+       }
+
+       ndev->cur_num_vqs = 2 * mlx5_vdpa_max_qps(max_vqs);
        mvdev->vdev.mdev = &mgtdev->mgtdev;
-       err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
+       err = _vdpa_register_device(&mvdev->vdev, ndev->cur_num_vqs + 1);
        if (err)
                goto err_reg;
 
@@ -2089,6 +2484,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
        return 0;
 
 err_reg:
+       destroy_workqueue(mvdev->wq);
+err_res2:
        free_resources(ndev);
 err_mr:
        mlx5_vdpa_destroy_mr(mvdev);
@@ -2106,7 +2503,9 @@ err_mtu:
 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
 {
        struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
+       struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
 
+       destroy_workqueue(mvdev->wq);
        _vdpa_unregister_device(dev);
        mgtdev->ndev = NULL;
 }
index 3fc4525..1dc121a 100644 (file)
@@ -69,6 +69,7 @@ static void vdpa_release_dev(struct device *d)
  * @config: the bus operations that is supported by this device
  * @size: size of the parent structure that contains private data
  * @name: name of the vdpa device; optional.
+ * @use_va: indicates whether virtual addresses must be used by this device
  *
  * Driver should use vdpa_alloc_device() wrapper macro instead of
  * using this directly.
@@ -78,7 +79,8 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
-                                       size_t size, const char *name)
+                                       size_t size, const char *name,
+                                       bool use_va)
 {
        struct vdpa_device *vdev;
        int err = -EINVAL;
@@ -89,6 +91,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
        if (!!config->dma_map != !!config->dma_unmap)
                goto err;
 
+       /* It should only work for devices that use an on-chip IOMMU */
+       if (use_va && !(config->dma_map || config->set_map))
+               goto err;
+
        err = -ENOMEM;
        vdev = kzalloc(size, GFP_KERNEL);
        if (!vdev)
@@ -104,6 +110,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
        vdev->index = err;
        vdev->config = config;
        vdev->features_valid = false;
+       vdev->use_va = use_va;
 
        if (name)
                err = dev_set_name(&vdev->dev, "%s", name);
index c621cf7..5f484ff 100644 (file)
@@ -92,7 +92,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim,
        vq->vring.notify = NULL;
 }
 
-static void vdpasim_reset(struct vdpasim *vdpasim)
+static void vdpasim_do_reset(struct vdpasim *vdpasim)
 {
        int i;
 
@@ -137,7 +137,8 @@ static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr,
        int ret;
 
        /* We set the limit_pfn to the maximum (ULONG_MAX - 1) */
-       iova = alloc_iova(&vdpasim->iova, size, ULONG_MAX - 1, true);
+       iova = alloc_iova(&vdpasim->iova, size >> iova_shift(&vdpasim->iova),
+                         ULONG_MAX - 1, true);
        if (!iova)
                return DMA_MAPPING_ERROR;
 
@@ -250,7 +251,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
                ops = &vdpasim_config_ops;
 
        vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
-                                   dev_attr->name);
+                                   dev_attr->name, false);
        if (IS_ERR(vdpasim)) {
                ret = PTR_ERR(vdpasim);
                goto err_alloc;
@@ -459,11 +460,21 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
 
        spin_lock(&vdpasim->lock);
        vdpasim->status = status;
-       if (status == 0)
-               vdpasim_reset(vdpasim);
        spin_unlock(&vdpasim->lock);
 }
 
+static int vdpasim_reset(struct vdpa_device *vdpa)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       spin_lock(&vdpasim->lock);
+       vdpasim->status = 0;
+       vdpasim_do_reset(vdpasim);
+       spin_unlock(&vdpasim->lock);
+
+       return 0;
+}
+
 static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
@@ -544,14 +555,14 @@ err:
 }
 
 static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size,
-                          u64 pa, u32 perm)
+                          u64 pa, u32 perm, void *opaque)
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
        int ret;
 
        spin_lock(&vdpasim->iommu_lock);
-       ret = vhost_iotlb_add_range(vdpasim->iommu, iova, iova + size - 1, pa,
-                                   perm);
+       ret = vhost_iotlb_add_range_ctx(vdpasim->iommu, iova, iova + size - 1,
+                                       pa, perm, opaque);
        spin_unlock(&vdpasim->iommu_lock);
 
        return ret;
@@ -607,6 +618,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
        .get_vendor_id          = vdpasim_get_vendor_id,
        .get_status             = vdpasim_get_status,
        .set_status             = vdpasim_set_status,
+       .reset                  = vdpasim_reset,
        .get_config_size        = vdpasim_get_config_size,
        .get_config             = vdpasim_get_config,
        .set_config             = vdpasim_set_config,
@@ -635,6 +647,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = {
        .get_vendor_id          = vdpasim_get_vendor_id,
        .get_status             = vdpasim_get_status,
        .set_status             = vdpasim_set_status,
+       .reset                  = vdpasim_reset,
        .get_config_size        = vdpasim_get_config_size,
        .get_config             = vdpasim_get_config,
        .set_config             = vdpasim_set_config,
diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile
new file mode 100644 (file)
index 0000000..260e0b2
--- /dev/null
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+vduse-y := vduse_dev.o iova_domain.o
+
+obj-$(CONFIG_VDPA_USER) += vduse.o
diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
new file mode 100644 (file)
index 0000000..1daae26
--- /dev/null
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MMU-based software IOTLB.
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/vdpa.h>
+
+#include "iova_domain.h"
+
+static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
+                                u64 start, u64 last,
+                                u64 addr, unsigned int perm,
+                                struct file *file, u64 offset)
+{
+       struct vdpa_map_file *map_file;
+       int ret;
+
+       map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
+       if (!map_file)
+               return -ENOMEM;
+
+       map_file->file = get_file(file);
+       map_file->offset = offset;
+
+       ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
+                                       addr, perm, map_file);
+       if (ret) {
+               fput(map_file->file);
+               kfree(map_file);
+               return ret;
+       }
+       return 0;
+}
+
+static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
+                                 u64 start, u64 last)
+{
+       struct vdpa_map_file *map_file;
+       struct vhost_iotlb_map *map;
+
+       while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
+               map_file = (struct vdpa_map_file *)map->opaque;
+               fput(map_file->file);
+               kfree(map_file);
+               vhost_iotlb_map_free(domain->iotlb, map);
+       }
+}
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+                        struct vhost_iotlb *iotlb)
+{
+       struct vdpa_map_file *map_file;
+       struct vhost_iotlb_map *map;
+       u64 start = 0ULL, last = ULLONG_MAX;
+       int ret;
+
+       spin_lock(&domain->iotlb_lock);
+       vduse_iotlb_del_range(domain, start, last);
+
+       for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+            map = vhost_iotlb_itree_next(map, start, last)) {
+               map_file = (struct vdpa_map_file *)map->opaque;
+               ret = vduse_iotlb_add_range(domain, map->start, map->last,
+                                           map->addr, map->perm,
+                                           map_file->file,
+                                           map_file->offset);
+               if (ret)
+                       goto err;
+       }
+       spin_unlock(&domain->iotlb_lock);
+
+       return 0;
+err:
+       vduse_iotlb_del_range(domain, start, last);
+       spin_unlock(&domain->iotlb_lock);
+       return ret;
+}
+
+void vduse_domain_clear_map(struct vduse_iova_domain *domain,
+                           struct vhost_iotlb *iotlb)
+{
+       struct vhost_iotlb_map *map;
+       u64 start = 0ULL, last = ULLONG_MAX;
+
+       spin_lock(&domain->iotlb_lock);
+       for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+            map = vhost_iotlb_itree_next(map, start, last)) {
+               vduse_iotlb_del_range(domain, map->start, map->last);
+       }
+       spin_unlock(&domain->iotlb_lock);
+}
+
+static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
+                                        u64 iova, u64 size, u64 paddr)
+{
+       struct vduse_bounce_map *map;
+       u64 last = iova + size - 1;
+
+       while (iova <= last) {
+               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+               if (!map->bounce_page) {
+                       map->bounce_page = alloc_page(GFP_ATOMIC);
+                       if (!map->bounce_page)
+                               return -ENOMEM;
+               }
+               map->orig_phys = paddr;
+               paddr += PAGE_SIZE;
+               iova += PAGE_SIZE;
+       }
+       return 0;
+}
+
+static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
+                                          u64 iova, u64 size)
+{
+       struct vduse_bounce_map *map;
+       u64 last = iova + size - 1;
+
+       while (iova <= last) {
+               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+               map->orig_phys = INVALID_PHYS_ADDR;
+               iova += PAGE_SIZE;
+       }
+}
+
+static void do_bounce(phys_addr_t orig, void *addr, size_t size,
+                     enum dma_data_direction dir)
+{
+       unsigned long pfn = PFN_DOWN(orig);
+       unsigned int offset = offset_in_page(orig);
+       char *buffer;
+       unsigned int sz = 0;
+
+       while (size) {
+               sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+               buffer = kmap_atomic(pfn_to_page(pfn));
+               if (dir == DMA_TO_DEVICE)
+                       memcpy(addr, buffer + offset, sz);
+               else
+                       memcpy(buffer + offset, addr, sz);
+               kunmap_atomic(buffer);
+
+               size -= sz;
+               pfn++;
+               addr += sz;
+               offset = 0;
+       }
+}
+
+static void vduse_domain_bounce(struct vduse_iova_domain *domain,
+                               dma_addr_t iova, size_t size,
+                               enum dma_data_direction dir)
+{
+       struct vduse_bounce_map *map;
+       unsigned int offset;
+       void *addr;
+       size_t sz;
+
+       if (iova >= domain->bounce_size)
+               return;
+
+       while (size) {
+               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+               offset = offset_in_page(iova);
+               sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+               if (WARN_ON(!map->bounce_page ||
+                           map->orig_phys == INVALID_PHYS_ADDR))
+                       return;
+
+               addr = page_address(map->bounce_page) + offset;
+               do_bounce(map->orig_phys + offset, addr, sz, dir);
+               size -= sz;
+               iova += sz;
+       }
+}
+
+static struct page *
+vduse_domain_get_coherent_page(struct vduse_iova_domain *domain, u64 iova)
+{
+       u64 start = iova & PAGE_MASK;
+       u64 last = start + PAGE_SIZE - 1;
+       struct vhost_iotlb_map *map;
+       struct page *page = NULL;
+
+       spin_lock(&domain->iotlb_lock);
+       map = vhost_iotlb_itree_first(domain->iotlb, start, last);
+       if (!map)
+               goto out;
+
+       page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
+       get_page(page);
+out:
+       spin_unlock(&domain->iotlb_lock);
+
+       return page;
+}
+
+static struct page *
+vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova)
+{
+       struct vduse_bounce_map *map;
+       struct page *page = NULL;
+
+       spin_lock(&domain->iotlb_lock);
+       map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+       if (!map->bounce_page)
+               goto out;
+
+       page = map->bounce_page;
+       get_page(page);
+out:
+       spin_unlock(&domain->iotlb_lock);
+
+       return page;
+}
+
+static void
+vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
+{
+       struct vduse_bounce_map *map;
+       unsigned long pfn, bounce_pfns;
+
+       bounce_pfns = domain->bounce_size >> PAGE_SHIFT;
+
+       for (pfn = 0; pfn < bounce_pfns; pfn++) {
+               map = &domain->bounce_maps[pfn];
+               if (WARN_ON(map->orig_phys != INVALID_PHYS_ADDR))
+                       continue;
+
+               if (!map->bounce_page)
+                       continue;
+
+               __free_page(map->bounce_page);
+               map->bounce_page = NULL;
+       }
+}
+
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
+{
+       if (!domain->bounce_map)
+               return;
+
+       spin_lock(&domain->iotlb_lock);
+       if (!domain->bounce_map)
+               goto unlock;
+
+       vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1);
+       domain->bounce_map = 0;
+unlock:
+       spin_unlock(&domain->iotlb_lock);
+}
+
+static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain)
+{
+       int ret = 0;
+
+       if (domain->bounce_map)
+               return 0;
+
+       spin_lock(&domain->iotlb_lock);
+       if (domain->bounce_map)
+               goto unlock;
+
+       ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1,
+                                   0, VHOST_MAP_RW, domain->file, 0);
+       if (ret)
+               goto unlock;
+
+       domain->bounce_map = 1;
+unlock:
+       spin_unlock(&domain->iotlb_lock);
+       return ret;
+}
+
+static dma_addr_t
+vduse_domain_alloc_iova(struct iova_domain *iovad,
+                       unsigned long size, unsigned long limit)
+{
+       unsigned long shift = iova_shift(iovad);
+       unsigned long iova_len = iova_align(iovad, size) >> shift;
+       unsigned long iova_pfn;
+
+       /*
+        * Freeing non-power-of-two-sized allocations back into the IOVA caches
+        * will come back to bite us badly, so we have to waste a bit of space
+        * rounding up anything cacheable to make sure that can't happen. The
+        * order of the unadjusted size will still match upon freeing.
+        */
+       if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+               iova_len = roundup_pow_of_two(iova_len);
+       iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true);
+
+       return iova_pfn << shift;
+}
+
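
A worked example of the rounding above, assuming a 4 KiB IOVA granule (shift == 12) and the in-tree IOVA_RANGE_CACHE_MAX_SIZE of 6:

	unsigned long size = 3 * SZ_4K;			/* 12 KiB streaming mapping  */
	unsigned long iova_len = size >> 12;		/* 3 granules after aligning */

	/* 3 < (1 << 5), so the allocation is cacheable and gets rounded up. */
	iova_len = roundup_pow_of_two(iova_len);	/* 4 granules are allocated  */
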
+static void vduse_domain_free_iova(struct iova_domain *iovad,
+                                  dma_addr_t iova, size_t size)
+{
+       unsigned long shift = iova_shift(iovad);
+       unsigned long iova_len = iova_align(iovad, size) >> shift;
+
+       free_iova_fast(iovad, iova >> shift, iova_len);
+}
+
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+                                struct page *page, unsigned long offset,
+                                size_t size, enum dma_data_direction dir,
+                                unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->stream_iovad;
+       unsigned long limit = domain->bounce_size - 1;
+       phys_addr_t pa = page_to_phys(page) + offset;
+       dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
+
+       if (!iova)
+               return DMA_MAPPING_ERROR;
+
+       if (vduse_domain_init_bounce_map(domain))
+               goto err;
+
+       if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa))
+               goto err;
+
+       if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+               vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
+
+       return iova;
+err:
+       vduse_domain_free_iova(iovad, iova, size);
+       return DMA_MAPPING_ERROR;
+}
+
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+                            dma_addr_t dma_addr, size_t size,
+                            enum dma_data_direction dir, unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->stream_iovad;
+
+       if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+               vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
+
+       vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
+       vduse_domain_free_iova(iovad, dma_addr, size);
+}
+
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+                                 size_t size, dma_addr_t *dma_addr,
+                                 gfp_t flag, unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->consistent_iovad;
+       unsigned long limit = domain->iova_limit;
+       dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
+       void *orig = alloc_pages_exact(size, flag);
+
+       if (!iova || !orig)
+               goto err;
+
+       spin_lock(&domain->iotlb_lock);
+       if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
+                                 virt_to_phys(orig), VHOST_MAP_RW,
+                                 domain->file, (u64)iova)) {
+               spin_unlock(&domain->iotlb_lock);
+               goto err;
+       }
+       spin_unlock(&domain->iotlb_lock);
+
+       *dma_addr = iova;
+
+       return orig;
+err:
+       *dma_addr = DMA_MAPPING_ERROR;
+       if (orig)
+               free_pages_exact(orig, size);
+       if (iova)
+               vduse_domain_free_iova(iovad, iova, size);
+
+       return NULL;
+}
+
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+                               void *vaddr, dma_addr_t dma_addr,
+                               unsigned long attrs)
+{
+       struct iova_domain *iovad = &domain->consistent_iovad;
+       struct vhost_iotlb_map *map;
+       struct vdpa_map_file *map_file;
+       phys_addr_t pa;
+
+       spin_lock(&domain->iotlb_lock);
+       map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
+                                     (u64)dma_addr + size - 1);
+       if (WARN_ON(!map)) {
+               spin_unlock(&domain->iotlb_lock);
+               return;
+       }
+       map_file = (struct vdpa_map_file *)map->opaque;
+       fput(map_file->file);
+       kfree(map_file);
+       pa = map->addr;
+       vhost_iotlb_map_free(domain->iotlb, map);
+       spin_unlock(&domain->iotlb_lock);
+
+       vduse_domain_free_iova(iovad, dma_addr, size);
+       free_pages_exact(phys_to_virt(pa), size);
+}
+
+static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)
+{
+       struct vduse_iova_domain *domain = vmf->vma->vm_private_data;
+       unsigned long iova = vmf->pgoff << PAGE_SHIFT;
+       struct page *page;
+
+       if (!domain)
+               return VM_FAULT_SIGBUS;
+
+       if (iova < domain->bounce_size)
+               page = vduse_domain_get_bounce_page(domain, iova);
+       else
+               page = vduse_domain_get_coherent_page(domain, iova);
+
+       if (!page)
+               return VM_FAULT_SIGBUS;
+
+       vmf->page = page;
+
+       return 0;
+}
+
+static const struct vm_operations_struct vduse_domain_mmap_ops = {
+       .fault = vduse_domain_mmap_fault,
+};
+
+static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct vduse_iova_domain *domain = file->private_data;
+
+       vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND;
+       vma->vm_private_data = domain;
+       vma->vm_ops = &vduse_domain_mmap_ops;
+
+       return 0;
+}
+
+static int vduse_domain_release(struct inode *inode, struct file *file)
+{
+       struct vduse_iova_domain *domain = file->private_data;
+
+       spin_lock(&domain->iotlb_lock);
+       vduse_iotlb_del_range(domain, 0, ULLONG_MAX);
+       vduse_domain_free_bounce_pages(domain);
+       spin_unlock(&domain->iotlb_lock);
+       put_iova_domain(&domain->stream_iovad);
+       put_iova_domain(&domain->consistent_iovad);
+       vhost_iotlb_free(domain->iotlb);
+       vfree(domain->bounce_maps);
+       kfree(domain);
+
+       return 0;
+}
+
+static const struct file_operations vduse_domain_fops = {
+       .owner = THIS_MODULE,
+       .mmap = vduse_domain_mmap,
+       .release = vduse_domain_release,
+};
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain)
+{
+       fput(domain->file);
+}
+
+struct vduse_iova_domain *
+vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
+{
+       struct vduse_iova_domain *domain;
+       struct file *file;
+       struct vduse_bounce_map *map;
+       unsigned long pfn, bounce_pfns;
+
+       bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
+       if (iova_limit <= bounce_size)
+               return NULL;
+
+       domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+       if (!domain)
+               return NULL;
+
+       domain->iotlb = vhost_iotlb_alloc(0, 0);
+       if (!domain->iotlb)
+               goto err_iotlb;
+
+       domain->iova_limit = iova_limit;
+       domain->bounce_size = PAGE_ALIGN(bounce_size);
+       domain->bounce_maps = vzalloc(bounce_pfns *
+                               sizeof(struct vduse_bounce_map));
+       if (!domain->bounce_maps)
+               goto err_map;
+
+       for (pfn = 0; pfn < bounce_pfns; pfn++) {
+               map = &domain->bounce_maps[pfn];
+               map->orig_phys = INVALID_PHYS_ADDR;
+       }
+       file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops,
+                               domain, O_RDWR);
+       if (IS_ERR(file))
+               goto err_file;
+
+       domain->file = file;
+       spin_lock_init(&domain->iotlb_lock);
+       init_iova_domain(&domain->stream_iovad,
+                       PAGE_SIZE, IOVA_START_PFN);
+       init_iova_domain(&domain->consistent_iovad,
+                       PAGE_SIZE, bounce_pfns);
+
+       return domain;
+err_file:
+       vfree(domain->bounce_maps);
+err_map:
+       vhost_iotlb_free(domain->iotlb);
+err_iotlb:
+       kfree(domain);
+       return NULL;
+}
+
+int vduse_domain_init(void)
+{
+       return iova_cache_get();
+}
+
+void vduse_domain_exit(void)
+{
+       iova_cache_put();
+}
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
new file mode 100644 (file)
index 0000000..2722d9b
--- /dev/null
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MMU-based software IOTLB.
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#ifndef _VDUSE_IOVA_DOMAIN_H
+#define _VDUSE_IOVA_DOMAIN_H
+
+#include <linux/iova.h>
+#include <linux/dma-mapping.h>
+#include <linux/vhost_iotlb.h>
+
+#define IOVA_START_PFN 1
+
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
+struct vduse_bounce_map {
+       struct page *bounce_page;
+       u64 orig_phys;
+};
+
+struct vduse_iova_domain {
+       struct iova_domain stream_iovad;
+       struct iova_domain consistent_iovad;
+       struct vduse_bounce_map *bounce_maps;
+       size_t bounce_size;
+       unsigned long iova_limit;
+       int bounce_map;
+       struct vhost_iotlb *iotlb;
+       spinlock_t iotlb_lock;
+       struct file *file;
+};
+
+int vduse_domain_set_map(struct vduse_iova_domain *domain,
+                        struct vhost_iotlb *iotlb);
+
+void vduse_domain_clear_map(struct vduse_iova_domain *domain,
+                           struct vhost_iotlb *iotlb);
+
+dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
+                                struct page *page, unsigned long offset,
+                                size_t size, enum dma_data_direction dir,
+                                unsigned long attrs);
+
+void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
+                            dma_addr_t dma_addr, size_t size,
+                            enum dma_data_direction dir, unsigned long attrs);
+
+void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
+                                 size_t size, dma_addr_t *dma_addr,
+                                 gfp_t flag, unsigned long attrs);
+
+void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
+                               void *vaddr, dma_addr_t dma_addr,
+                               unsigned long attrs);
+
+void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain);
+
+void vduse_domain_destroy(struct vduse_iova_domain *domain);
+
+struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit,
+                                             size_t bounce_size);
+
+int vduse_domain_init(void);
+
+void vduse_domain_exit(void);
+
+#endif /* _VDUSE_IOVA_DOMAIN_H */
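For orientation, a minimal sketch (not part of the patch) of how a caller such as the DMA ops in vduse_dev.c below might pair the streaming map/unmap helpers declared above; the helper name and the PAGE_SIZE/direction values are illustrative placeholders only:

	/* Hypothetical example: map one page for device access via the bounce buffer. */
	static int example_bounce_roundtrip(struct vduse_iova_domain *domain,
					    struct page *page)
	{
		dma_addr_t iova;

		/* Copy the page into a bounce page and hand back a device-visible IOVA. */
		iova = vduse_domain_map_page(domain, page, 0, PAGE_SIZE,
					     DMA_TO_DEVICE, 0);
		if (iova == DMA_MAPPING_ERROR)
			return -ENOMEM;

		/* ... userspace services the I/O through the IOTLB file ... */

		/* Release the IOVA (and copy bounce data back for DMA_FROM_DEVICE mappings). */
		vduse_domain_unmap_page(domain, iova, PAGE_SIZE, DMA_TO_DEVICE, 0);
		return 0;
	}
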
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
new file mode 100644 (file)
index 0000000..29a38ec
--- /dev/null
@@ -0,0 +1,1641 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDUSE: vDPA Device in Userspace
+ *
+ * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xie Yongji <xieyongji@bytedance.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/dma-map-ops.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <uapi/linux/vduse.h>
+#include <uapi/linux/vdpa.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/virtio_blk.h>
+#include <linux/mod_devicetable.h>
+
+#include "iova_domain.h"
+
+#define DRV_AUTHOR   "Yongji Xie <xieyongji@bytedance.com>"
+#define DRV_DESC     "vDPA Device in Userspace"
+#define DRV_LICENSE  "GPL v2"
+
+#define VDUSE_DEV_MAX (1U << MINORBITS)
+#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
+#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
+#define VDUSE_MSG_DEFAULT_TIMEOUT 30
+
+struct vduse_virtqueue {
+       u16 index;
+       u16 num_max;
+       u32 num;
+       u64 desc_addr;
+       u64 driver_addr;
+       u64 device_addr;
+       struct vdpa_vq_state state;
+       bool ready;
+       bool kicked;
+       spinlock_t kick_lock;
+       spinlock_t irq_lock;
+       struct eventfd_ctx *kickfd;
+       struct vdpa_callback cb;
+       struct work_struct inject;
+       struct work_struct kick;
+};
+
+struct vduse_dev;
+
+struct vduse_vdpa {
+       struct vdpa_device vdpa;
+       struct vduse_dev *dev;
+};
+
+struct vduse_dev {
+       struct vduse_vdpa *vdev;
+       struct device *dev;
+       struct vduse_virtqueue *vqs;
+       struct vduse_iova_domain *domain;
+       char *name;
+       struct mutex lock;
+       spinlock_t msg_lock;
+       u64 msg_unique;
+       u32 msg_timeout;
+       wait_queue_head_t waitq;
+       struct list_head send_list;
+       struct list_head recv_list;
+       struct vdpa_callback config_cb;
+       struct work_struct inject;
+       spinlock_t irq_lock;
+       int minor;
+       bool broken;
+       bool connected;
+       u64 api_version;
+       u64 device_features;
+       u64 driver_features;
+       u32 device_id;
+       u32 vendor_id;
+       u32 generation;
+       u32 config_size;
+       void *config;
+       u8 status;
+       u32 vq_num;
+       u32 vq_align;
+};
+
+struct vduse_dev_msg {
+       struct vduse_dev_request req;
+       struct vduse_dev_response resp;
+       struct list_head list;
+       wait_queue_head_t waitq;
+       bool completed;
+};
+
+struct vduse_control {
+       u64 api_version;
+};
+
+static DEFINE_MUTEX(vduse_lock);
+static DEFINE_IDR(vduse_idr);
+
+static dev_t vduse_major;
+static struct class *vduse_class;
+static struct cdev vduse_ctrl_cdev;
+static struct cdev vduse_cdev;
+static struct workqueue_struct *vduse_irq_wq;
+
+static u32 allowed_device_id[] = {
+       VIRTIO_ID_BLOCK,
+};
+
+static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
+{
+       struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa);
+
+       return vdev->dev;
+}
+
+static inline struct vduse_dev *dev_to_vduse(struct device *dev)
+{
+       struct vdpa_device *vdpa = dev_to_vdpa(dev);
+
+       return vdpa_to_vduse(vdpa);
+}
+
+static struct vduse_dev_msg *vduse_find_msg(struct list_head *head,
+                                           uint32_t request_id)
+{
+       struct vduse_dev_msg *msg;
+
+       list_for_each_entry(msg, head, list) {
+               if (msg->req.request_id == request_id) {
+                       list_del(&msg->list);
+                       return msg;
+               }
+       }
+
+       return NULL;
+}
+
+static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head)
+{
+       struct vduse_dev_msg *msg = NULL;
+
+       if (!list_empty(head)) {
+               msg = list_first_entry(head, struct vduse_dev_msg, list);
+               list_del(&msg->list);
+       }
+
+       return msg;
+}
+
+static void vduse_enqueue_msg(struct list_head *head,
+                             struct vduse_dev_msg *msg)
+{
+       list_add_tail(&msg->list, head);
+}
+
+static void vduse_dev_broken(struct vduse_dev *dev)
+{
+       struct vduse_dev_msg *msg, *tmp;
+
+       if (unlikely(dev->broken))
+               return;
+
+       list_splice_init(&dev->recv_list, &dev->send_list);
+       list_for_each_entry_safe(msg, tmp, &dev->send_list, list) {
+               list_del(&msg->list);
+               msg->completed = 1;
+               msg->resp.result = VDUSE_REQ_RESULT_FAILED;
+               wake_up(&msg->waitq);
+       }
+       dev->broken = true;
+       wake_up(&dev->waitq);
+}
+
+static int vduse_dev_msg_sync(struct vduse_dev *dev,
+                             struct vduse_dev_msg *msg)
+{
+       int ret;
+
+       if (unlikely(dev->broken))
+               return -EIO;
+
+       init_waitqueue_head(&msg->waitq);
+       spin_lock(&dev->msg_lock);
+       if (unlikely(dev->broken)) {
+               spin_unlock(&dev->msg_lock);
+               return -EIO;
+       }
+       msg->req.request_id = dev->msg_unique++;
+       vduse_enqueue_msg(&dev->send_list, msg);
+       wake_up(&dev->waitq);
+       spin_unlock(&dev->msg_lock);
+       if (dev->msg_timeout)
+               ret = wait_event_killable_timeout(msg->waitq, msg->completed,
+                                                 (long)dev->msg_timeout * HZ);
+       else
+               ret = wait_event_killable(msg->waitq, msg->completed);
+
+       spin_lock(&dev->msg_lock);
+       if (!msg->completed) {
+               list_del(&msg->list);
+               msg->resp.result = VDUSE_REQ_RESULT_FAILED;
+               /* Mark the device as malfunctioning if a timeout occurs */
+               if (!ret)
+                       vduse_dev_broken(dev);
+       }
+       ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
+       spin_unlock(&dev->msg_lock);
+
+       return ret;
+}
+
+static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev,
+                                        struct vduse_virtqueue *vq,
+                                        struct vdpa_vq_state_packed *packed)
+{
+       struct vduse_dev_msg msg = { 0 };
+       int ret;
+
+       msg.req.type = VDUSE_GET_VQ_STATE;
+       msg.req.vq_state.index = vq->index;
+
+       ret = vduse_dev_msg_sync(dev, &msg);
+       if (ret)
+               return ret;
+
+       packed->last_avail_counter =
+                       msg.resp.vq_state.packed.last_avail_counter & 0x0001;
+       packed->last_avail_idx =
+                       msg.resp.vq_state.packed.last_avail_idx & 0x7FFF;
+       packed->last_used_counter =
+                       msg.resp.vq_state.packed.last_used_counter & 0x0001;
+       packed->last_used_idx =
+                       msg.resp.vq_state.packed.last_used_idx & 0x7FFF;
+
+       return 0;
+}
+
+static int vduse_dev_get_vq_state_split(struct vduse_dev *dev,
+                                       struct vduse_virtqueue *vq,
+                                       struct vdpa_vq_state_split *split)
+{
+       struct vduse_dev_msg msg = { 0 };
+       int ret;
+
+       msg.req.type = VDUSE_GET_VQ_STATE;
+       msg.req.vq_state.index = vq->index;
+
+       ret = vduse_dev_msg_sync(dev, &msg);
+       if (ret)
+               return ret;
+
+       split->avail_index = msg.resp.vq_state.split.avail_index;
+
+       return 0;
+}
+
+static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
+{
+       struct vduse_dev_msg msg = { 0 };
+
+       msg.req.type = VDUSE_SET_STATUS;
+       msg.req.s.status = status;
+
+       return vduse_dev_msg_sync(dev, &msg);
+}
+
+static int vduse_dev_update_iotlb(struct vduse_dev *dev,
+                                 u64 start, u64 last)
+{
+       struct vduse_dev_msg msg = { 0 };
+
+       if (last < start)
+               return -EINVAL;
+
+       msg.req.type = VDUSE_UPDATE_IOTLB;
+       msg.req.iova.start = start;
+       msg.req.iova.last = last;
+
+       return vduse_dev_msg_sync(dev, &msg);
+}
+
+static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+       struct file *file = iocb->ki_filp;
+       struct vduse_dev *dev = file->private_data;
+       struct vduse_dev_msg *msg;
+       int size = sizeof(struct vduse_dev_request);
+       ssize_t ret;
+
+       if (iov_iter_count(to) < size)
+               return -EINVAL;
+
+       spin_lock(&dev->msg_lock);
+       while (1) {
+               msg = vduse_dequeue_msg(&dev->send_list);
+               if (msg)
+                       break;
+
+               ret = -EAGAIN;
+               if (file->f_flags & O_NONBLOCK)
+                       goto unlock;
+
+               spin_unlock(&dev->msg_lock);
+               ret = wait_event_interruptible_exclusive(dev->waitq,
+                                       !list_empty(&dev->send_list));
+               if (ret)
+                       return ret;
+
+               spin_lock(&dev->msg_lock);
+       }
+       spin_unlock(&dev->msg_lock);
+       ret = copy_to_iter(&msg->req, size, to);
+       spin_lock(&dev->msg_lock);
+       if (ret != size) {
+               ret = -EFAULT;
+               vduse_enqueue_msg(&dev->send_list, msg);
+               goto unlock;
+       }
+       vduse_enqueue_msg(&dev->recv_list, msg);
+unlock:
+       spin_unlock(&dev->msg_lock);
+
+       return ret;
+}
+
+static bool is_mem_zero(const char *ptr, int size)
+{
+       int i;
+
+       for (i = 0; i < size; i++) {
+               if (ptr[i])
+                       return false;
+       }
+       return true;
+}
+
+static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct vduse_dev *dev = file->private_data;
+       struct vduse_dev_response resp;
+       struct vduse_dev_msg *msg;
+       size_t ret;
+
+       ret = copy_from_iter(&resp, sizeof(resp), from);
+       if (ret != sizeof(resp))
+               return -EINVAL;
+
+       if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved)))
+               return -EINVAL;
+
+       spin_lock(&dev->msg_lock);
+       msg = vduse_find_msg(&dev->recv_list, resp.request_id);
+       if (!msg) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+
+       memcpy(&msg->resp, &resp, sizeof(resp));
+       msg->completed = 1;
+       wake_up(&msg->waitq);
+unlock:
+       spin_unlock(&dev->msg_lock);
+
+       return ret;
+}
+
+static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
+{
+       struct vduse_dev *dev = file->private_data;
+       __poll_t mask = 0;
+
+       poll_wait(file, &dev->waitq, wait);
+
+       spin_lock(&dev->msg_lock);
+
+       if (unlikely(dev->broken))
+               mask |= EPOLLERR;
+       if (!list_empty(&dev->send_list))
+               mask |= EPOLLIN | EPOLLRDNORM;
+       if (!list_empty(&dev->recv_list))
+               mask |= EPOLLOUT | EPOLLWRNORM;
+
+       spin_unlock(&dev->msg_lock);
+
+       return mask;
+}
+
+static void vduse_dev_reset(struct vduse_dev *dev)
+{
+       int i;
+       struct vduse_iova_domain *domain = dev->domain;
+
+       /* The coherent mappings are handled in vduse_dev_free_coherent() */
+       if (domain->bounce_map)
+               vduse_domain_reset_bounce_map(domain);
+
+       dev->status = 0;
+       dev->driver_features = 0;
+       dev->generation++;
+       spin_lock(&dev->irq_lock);
+       dev->config_cb.callback = NULL;
+       dev->config_cb.private = NULL;
+       spin_unlock(&dev->irq_lock);
+       flush_work(&dev->inject);
+
+       for (i = 0; i < dev->vq_num; i++) {
+               struct vduse_virtqueue *vq = &dev->vqs[i];
+
+               vq->ready = false;
+               vq->desc_addr = 0;
+               vq->driver_addr = 0;
+               vq->device_addr = 0;
+               vq->num = 0;
+               memset(&vq->state, 0, sizeof(vq->state));
+
+               spin_lock(&vq->kick_lock);
+               vq->kicked = false;
+               if (vq->kickfd)
+                       eventfd_ctx_put(vq->kickfd);
+               vq->kickfd = NULL;
+               spin_unlock(&vq->kick_lock);
+
+               spin_lock(&vq->irq_lock);
+               vq->cb.callback = NULL;
+               vq->cb.private = NULL;
+               spin_unlock(&vq->irq_lock);
+               flush_work(&vq->inject);
+               flush_work(&vq->kick);
+       }
+}
+
+static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+                               u64 desc_area, u64 driver_area,
+                               u64 device_area)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       vq->desc_addr = desc_area;
+       vq->driver_addr = driver_area;
+       vq->device_addr = device_area;
+
+       return 0;
+}
+
+static void vduse_vq_kick(struct vduse_virtqueue *vq)
+{
+       spin_lock(&vq->kick_lock);
+       if (!vq->ready)
+               goto unlock;
+
+       if (vq->kickfd)
+               eventfd_signal(vq->kickfd, 1);
+       else
+               vq->kicked = true;
+unlock:
+       spin_unlock(&vq->kick_lock);
+}
+
+static void vduse_vq_kick_work(struct work_struct *work)
+{
+       struct vduse_virtqueue *vq = container_of(work,
+                                       struct vduse_virtqueue, kick);
+
+       vduse_vq_kick(vq);
+}
+
+static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       if (!eventfd_signal_allowed()) {
+               schedule_work(&vq->kick);
+               return;
+       }
+       vduse_vq_kick(vq);
+}
+
+static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+                             struct vdpa_callback *cb)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       spin_lock(&vq->irq_lock);
+       vq->cb.callback = cb->callback;
+       vq->cb.private = cb->private;
+       spin_unlock(&vq->irq_lock);
+}
+
+static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       vq->num = num;
+}
+
+static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
+                                       u16 idx, bool ready)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       vq->ready = ready;
+}
+
+static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       return vq->ready;
+}
+
+static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
+                               const struct vdpa_vq_state *state)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+               vq->state.packed.last_avail_counter =
+                               state->packed.last_avail_counter;
+               vq->state.packed.last_avail_idx = state->packed.last_avail_idx;
+               vq->state.packed.last_used_counter =
+                               state->packed.last_used_counter;
+               vq->state.packed.last_used_idx = state->packed.last_used_idx;
+       } else {
+               vq->state.split.avail_index = state->split.avail_index;
+       }
+
+       return 0;
+}
+
+static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
+                               struct vdpa_vq_state *state)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       struct vduse_virtqueue *vq = &dev->vqs[idx];
+
+       if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
+               return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
+
+       return vduse_dev_get_vq_state_split(dev, vq, &state->split);
+}
+
+static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->vq_align;
+}
+
+static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->device_features;
+}
+
+static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       dev->driver_features = features;
+       return 0;
+}
+
+static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa,
+                                 struct vdpa_callback *cb)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       spin_lock(&dev->irq_lock);
+       dev->config_cb.callback = cb->callback;
+       dev->config_cb.private = cb->private;
+       spin_unlock(&dev->irq_lock);
+}
+
+static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       u16 num_max = 0;
+       int i;
+
+       for (i = 0; i < dev->vq_num; i++)
+               if (num_max < dev->vqs[i].num_max)
+                       num_max = dev->vqs[i].num_max;
+
+       return num_max;
+}
+
+static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->device_id;
+}
+
+static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->vendor_id;
+}
+
+static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->status;
+}
+
+static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       if (vduse_dev_set_status(dev, status))
+               return;
+
+       dev->status = status;
+}
+
+static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->config_size;
+}
+
+static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
+                                 void *buf, unsigned int len)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       if (offset > dev->config_size ||
+           len > dev->config_size - offset)
+               return;
+
+       memcpy(buf, dev->config + offset, len);
+}
+
+static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset,
+                       const void *buf, unsigned int len)
+{
+       /* Now we only support read-only configuration space */
+}
+
+static int vduse_vdpa_reset(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       if (vduse_dev_set_status(dev, 0))
+               return -EIO;
+
+       vduse_dev_reset(dev);
+
+       return 0;
+}
+
+static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       return dev->generation;
+}
+
+static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
+                               struct vhost_iotlb *iotlb)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+       int ret;
+
+       ret = vduse_domain_set_map(dev->domain, iotlb);
+       if (ret)
+               return ret;
+
+       ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
+       if (ret) {
+               vduse_domain_clear_map(dev->domain, iotlb);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void vduse_vdpa_free(struct vdpa_device *vdpa)
+{
+       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+       dev->vdev = NULL;
+}
+
+static const struct vdpa_config_ops vduse_vdpa_config_ops = {
+       .set_vq_address         = vduse_vdpa_set_vq_address,
+       .kick_vq                = vduse_vdpa_kick_vq,
+       .set_vq_cb              = vduse_vdpa_set_vq_cb,
+       .set_vq_num             = vduse_vdpa_set_vq_num,
+       .set_vq_ready           = vduse_vdpa_set_vq_ready,
+       .get_vq_ready           = vduse_vdpa_get_vq_ready,
+       .set_vq_state           = vduse_vdpa_set_vq_state,
+       .get_vq_state           = vduse_vdpa_get_vq_state,
+       .get_vq_align           = vduse_vdpa_get_vq_align,
+       .get_features           = vduse_vdpa_get_features,
+       .set_features           = vduse_vdpa_set_features,
+       .set_config_cb          = vduse_vdpa_set_config_cb,
+       .get_vq_num_max         = vduse_vdpa_get_vq_num_max,
+       .get_device_id          = vduse_vdpa_get_device_id,
+       .get_vendor_id          = vduse_vdpa_get_vendor_id,
+       .get_status             = vduse_vdpa_get_status,
+       .set_status             = vduse_vdpa_set_status,
+       .get_config_size        = vduse_vdpa_get_config_size,
+       .get_config             = vduse_vdpa_get_config,
+       .set_config             = vduse_vdpa_set_config,
+       .get_generation         = vduse_vdpa_get_generation,
+       .reset                  = vduse_vdpa_reset,
+       .set_map                = vduse_vdpa_set_map,
+       .free                   = vduse_vdpa_free,
+};
+
+static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page,
+                                    unsigned long offset, size_t size,
+                                    enum dma_data_direction dir,
+                                    unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
+}
+
+static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr,
+                               size_t size, enum dma_data_direction dir,
+                               unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
+}
+
+static void *vduse_dev_alloc_coherent(struct device *dev, size_t size,
+                                       dma_addr_t *dma_addr, gfp_t flag,
+                                       unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+       unsigned long iova;
+       void *addr;
+
+       *dma_addr = DMA_MAPPING_ERROR;
+       addr = vduse_domain_alloc_coherent(domain, size,
+                               (dma_addr_t *)&iova, flag, attrs);
+       if (!addr)
+               return NULL;
+
+       *dma_addr = (dma_addr_t)iova;
+
+       return addr;
+}
+
+static void vduse_dev_free_coherent(struct device *dev, size_t size,
+                                       void *vaddr, dma_addr_t dma_addr,
+                                       unsigned long attrs)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
+}
+
+static size_t vduse_dev_max_mapping_size(struct device *dev)
+{
+       struct vduse_dev *vdev = dev_to_vduse(dev);
+       struct vduse_iova_domain *domain = vdev->domain;
+
+       return domain->bounce_size;
+}
+
+static const struct dma_map_ops vduse_dev_dma_ops = {
+       .map_page = vduse_dev_map_page,
+       .unmap_page = vduse_dev_unmap_page,
+       .alloc = vduse_dev_alloc_coherent,
+       .free = vduse_dev_free_coherent,
+       .max_mapping_size = vduse_dev_max_mapping_size,
+};
+
+static unsigned int perm_to_file_flags(u8 perm)
+{
+       unsigned int flags = 0;
+
+       switch (perm) {
+       case VDUSE_ACCESS_WO:
+               flags |= O_WRONLY;
+               break;
+       case VDUSE_ACCESS_RO:
+               flags |= O_RDONLY;
+               break;
+       case VDUSE_ACCESS_RW:
+               flags |= O_RDWR;
+               break;
+       default:
+               WARN(1, "invalidate vhost IOTLB permission\n");
+               break;
+       }
+
+       return flags;
+}
+
+static int vduse_kickfd_setup(struct vduse_dev *dev,
+                       struct vduse_vq_eventfd *eventfd)
+{
+       struct eventfd_ctx *ctx = NULL;
+       struct vduse_virtqueue *vq;
+       u32 index;
+
+       if (eventfd->index >= dev->vq_num)
+               return -EINVAL;
+
+       index = array_index_nospec(eventfd->index, dev->vq_num);
+       vq = &dev->vqs[index];
+       if (eventfd->fd >= 0) {
+               ctx = eventfd_ctx_fdget(eventfd->fd);
+               if (IS_ERR(ctx))
+                       return PTR_ERR(ctx);
+       } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN)
+               return 0;
+
+       spin_lock(&vq->kick_lock);
+       if (vq->kickfd)
+               eventfd_ctx_put(vq->kickfd);
+       vq->kickfd = ctx;
+       if (vq->ready && vq->kicked && vq->kickfd) {
+               eventfd_signal(vq->kickfd, 1);
+               vq->kicked = false;
+       }
+       spin_unlock(&vq->kick_lock);
+
+       return 0;
+}
+
+static bool vduse_dev_is_ready(struct vduse_dev *dev)
+{
+       int i;
+
+       for (i = 0; i < dev->vq_num; i++)
+               if (!dev->vqs[i].num_max)
+                       return false;
+
+       return true;
+}
+
+static void vduse_dev_irq_inject(struct work_struct *work)
+{
+       struct vduse_dev *dev = container_of(work, struct vduse_dev, inject);
+
+       spin_lock_irq(&dev->irq_lock);
+       if (dev->config_cb.callback)
+               dev->config_cb.callback(dev->config_cb.private);
+       spin_unlock_irq(&dev->irq_lock);
+}
+
+static void vduse_vq_irq_inject(struct work_struct *work)
+{
+       struct vduse_virtqueue *vq = container_of(work,
+                                       struct vduse_virtqueue, inject);
+
+       spin_lock_irq(&vq->irq_lock);
+       if (vq->ready && vq->cb.callback)
+               vq->cb.callback(vq->cb.private);
+       spin_unlock_irq(&vq->irq_lock);
+}
+
+static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
+                           unsigned long arg)
+{
+       struct vduse_dev *dev = file->private_data;
+       void __user *argp = (void __user *)arg;
+       int ret;
+
+       if (unlikely(dev->broken))
+               return -EPERM;
+
+       switch (cmd) {
+       case VDUSE_IOTLB_GET_FD: {
+               struct vduse_iotlb_entry entry;
+               struct vhost_iotlb_map *map;
+               struct vdpa_map_file *map_file;
+               struct vduse_iova_domain *domain = dev->domain;
+               struct file *f = NULL;
+
+               ret = -EFAULT;
+               if (copy_from_user(&entry, argp, sizeof(entry)))
+                       break;
+
+               ret = -EINVAL;
+               if (entry.start > entry.last)
+                       break;
+
+               spin_lock(&domain->iotlb_lock);
+               map = vhost_iotlb_itree_first(domain->iotlb,
+                                             entry.start, entry.last);
+               if (map) {
+                       map_file = (struct vdpa_map_file *)map->opaque;
+                       f = get_file(map_file->file);
+                       entry.offset = map_file->offset;
+                       entry.start = map->start;
+                       entry.last = map->last;
+                       entry.perm = map->perm;
+               }
+               spin_unlock(&domain->iotlb_lock);
+               ret = -EINVAL;
+               if (!f)
+                       break;
+
+               ret = -EFAULT;
+               if (copy_to_user(argp, &entry, sizeof(entry))) {
+                       fput(f);
+                       break;
+               }
+               ret = receive_fd(f, perm_to_file_flags(entry.perm));
+               fput(f);
+               break;
+       }
+       case VDUSE_DEV_GET_FEATURES:
+               /*
+                * Just mirror what the driver wrote here.
+                * The driver is expected to check FEATURES_OK later.
+                */
+               ret = put_user(dev->driver_features, (u64 __user *)argp);
+               break;
+       case VDUSE_DEV_SET_CONFIG: {
+               struct vduse_config_data config;
+               unsigned long size = offsetof(struct vduse_config_data,
+                                             buffer);
+
+               ret = -EFAULT;
+               if (copy_from_user(&config, argp, size))
+                       break;
+
+               ret = -EINVAL;
+               if (config.offset > dev->config_size ||
+                   config.length == 0 ||
+                   config.length > dev->config_size - config.offset)
+                       break;
+
+               ret = -EFAULT;
+               if (copy_from_user(dev->config + config.offset, argp + size,
+                                  config.length))
+                       break;
+
+               ret = 0;
+               break;
+       }
+       case VDUSE_DEV_INJECT_CONFIG_IRQ:
+               ret = 0;
+               queue_work(vduse_irq_wq, &dev->inject);
+               break;
+       case VDUSE_VQ_SETUP: {
+               struct vduse_vq_config config;
+               u32 index;
+
+               ret = -EFAULT;
+               if (copy_from_user(&config, argp, sizeof(config)))
+                       break;
+
+               ret = -EINVAL;
+               if (config.index >= dev->vq_num)
+                       break;
+
+               if (!is_mem_zero((const char *)config.reserved,
+                                sizeof(config.reserved)))
+                       break;
+
+               index = array_index_nospec(config.index, dev->vq_num);
+               dev->vqs[index].num_max = config.max_size;
+               ret = 0;
+               break;
+       }
+       case VDUSE_VQ_GET_INFO: {
+               struct vduse_vq_info vq_info;
+               struct vduse_virtqueue *vq;
+               u32 index;
+
+               ret = -EFAULT;
+               if (copy_from_user(&vq_info, argp, sizeof(vq_info)))
+                       break;
+
+               ret = -EINVAL;
+               if (vq_info.index >= dev->vq_num)
+                       break;
+
+               index = array_index_nospec(vq_info.index, dev->vq_num);
+               vq = &dev->vqs[index];
+               vq_info.desc_addr = vq->desc_addr;
+               vq_info.driver_addr = vq->driver_addr;
+               vq_info.device_addr = vq->device_addr;
+               vq_info.num = vq->num;
+
+               if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
+                       vq_info.packed.last_avail_counter =
+                               vq->state.packed.last_avail_counter;
+                       vq_info.packed.last_avail_idx =
+                               vq->state.packed.last_avail_idx;
+                       vq_info.packed.last_used_counter =
+                               vq->state.packed.last_used_counter;
+                       vq_info.packed.last_used_idx =
+                               vq->state.packed.last_used_idx;
+               } else {
+                       vq_info.split.avail_index =
+                               vq->state.split.avail_index;
+               }
+
+               vq_info.ready = vq->ready;
+
+               ret = -EFAULT;
+               if (copy_to_user(argp, &vq_info, sizeof(vq_info)))
+                       break;
+
+               ret = 0;
+               break;
+       }
+       case VDUSE_VQ_SETUP_KICKFD: {
+               struct vduse_vq_eventfd eventfd;
+
+               ret = -EFAULT;
+               if (copy_from_user(&eventfd, argp, sizeof(eventfd)))
+                       break;
+
+               ret = vduse_kickfd_setup(dev, &eventfd);
+               break;
+       }
+       case VDUSE_VQ_INJECT_IRQ: {
+               u32 index;
+
+               ret = -EFAULT;
+               if (get_user(index, (u32 __user *)argp))
+                       break;
+
+               ret = -EINVAL;
+               if (index >= dev->vq_num)
+                       break;
+
+               ret = 0;
+               index = array_index_nospec(index, dev->vq_num);
+               queue_work(vduse_irq_wq, &dev->vqs[index].inject);
+               break;
+       }
+       default:
+               ret = -ENOIOCTLCMD;
+               break;
+       }
+
+       return ret;
+}
+
+static int vduse_dev_release(struct inode *inode, struct file *file)
+{
+       struct vduse_dev *dev = file->private_data;
+
+       spin_lock(&dev->msg_lock);
+       /* Make sure the inflight messages can be processed after reconnection */
+       list_splice_init(&dev->recv_list, &dev->send_list);
+       spin_unlock(&dev->msg_lock);
+       dev->connected = false;
+
+       return 0;
+}
+
+static struct vduse_dev *vduse_dev_get_from_minor(int minor)
+{
+       struct vduse_dev *dev;
+
+       mutex_lock(&vduse_lock);
+       dev = idr_find(&vduse_idr, minor);
+       mutex_unlock(&vduse_lock);
+
+       return dev;
+}
+
+static int vduse_dev_open(struct inode *inode, struct file *file)
+{
+       int ret;
+       struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
+
+       if (!dev)
+               return -ENODEV;
+
+       ret = -EBUSY;
+       mutex_lock(&dev->lock);
+       if (dev->connected)
+               goto unlock;
+
+       ret = 0;
+       dev->connected = true;
+       file->private_data = dev;
+unlock:
+       mutex_unlock(&dev->lock);
+
+       return ret;
+}
+
+static const struct file_operations vduse_dev_fops = {
+       .owner          = THIS_MODULE,
+       .open           = vduse_dev_open,
+       .release        = vduse_dev_release,
+       .read_iter      = vduse_dev_read_iter,
+       .write_iter     = vduse_dev_write_iter,
+       .poll           = vduse_dev_poll,
+       .unlocked_ioctl = vduse_dev_ioctl,
+       .compat_ioctl   = compat_ptr_ioctl,
+       .llseek         = noop_llseek,
+};
+
+static struct vduse_dev *vduse_dev_create(void)
+{
+       struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
+       if (!dev)
+               return NULL;
+
+       mutex_init(&dev->lock);
+       spin_lock_init(&dev->msg_lock);
+       INIT_LIST_HEAD(&dev->send_list);
+       INIT_LIST_HEAD(&dev->recv_list);
+       spin_lock_init(&dev->irq_lock);
+
+       INIT_WORK(&dev->inject, vduse_dev_irq_inject);
+       init_waitqueue_head(&dev->waitq);
+
+       return dev;
+}
+
+static void vduse_dev_destroy(struct vduse_dev *dev)
+{
+       kfree(dev);
+}
+
+static struct vduse_dev *vduse_find_dev(const char *name)
+{
+       struct vduse_dev *dev;
+       int id;
+
+       idr_for_each_entry(&vduse_idr, dev, id)
+               if (!strcmp(dev->name, name))
+                       return dev;
+
+       return NULL;
+}
+
+static int vduse_destroy_dev(char *name)
+{
+       struct vduse_dev *dev = vduse_find_dev(name);
+
+       if (!dev)
+               return -EINVAL;
+
+       mutex_lock(&dev->lock);
+       if (dev->vdev || dev->connected) {
+               mutex_unlock(&dev->lock);
+               return -EBUSY;
+       }
+       dev->connected = true;
+       mutex_unlock(&dev->lock);
+
+       vduse_dev_reset(dev);
+       device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
+       idr_remove(&vduse_idr, dev->minor);
+       kvfree(dev->config);
+       kfree(dev->vqs);
+       vduse_domain_destroy(dev->domain);
+       kfree(dev->name);
+       vduse_dev_destroy(dev);
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+static bool device_is_allowed(u32 device_id)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++)
+               if (allowed_device_id[i] == device_id)
+                       return true;
+
+       return false;
+}
+
+static bool features_is_valid(u64 features)
+{
+       if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
+               return false;
+
+       /* Now we only support read-only configuration space */
+       if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
+               return false;
+
+       return true;
+}
+
+static bool vduse_validate_config(struct vduse_dev_config *config)
+{
+       if (!is_mem_zero((const char *)config->reserved,
+                        sizeof(config->reserved)))
+               return false;
+
+       if (config->vq_align > PAGE_SIZE)
+               return false;
+
+       if (config->config_size > PAGE_SIZE)
+               return false;
+
+       if (!device_is_allowed(config->device_id))
+               return false;
+
+       if (!features_is_valid(config->features))
+               return false;
+
+       return true;
+}
+
+static ssize_t msg_timeout_show(struct device *device,
+                               struct device_attribute *attr, char *buf)
+{
+       struct vduse_dev *dev = dev_get_drvdata(device);
+
+       return sysfs_emit(buf, "%u\n", dev->msg_timeout);
+}
+
+static ssize_t msg_timeout_store(struct device *device,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+       struct vduse_dev *dev = dev_get_drvdata(device);
+       int ret;
+
+       ret = kstrtouint(buf, 10, &dev->msg_timeout);
+       if (ret < 0)
+               return ret;
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(msg_timeout);
+
+static struct attribute *vduse_dev_attrs[] = {
+       &dev_attr_msg_timeout.attr,
+       NULL
+};
+
+ATTRIBUTE_GROUPS(vduse_dev);
+
+static int vduse_create_dev(struct vduse_dev_config *config,
+                           void *config_buf, u64 api_version)
+{
+       int i, ret;
+       struct vduse_dev *dev;
+
+       ret = -EEXIST;
+       if (vduse_find_dev(config->name))
+               goto err;
+
+       ret = -ENOMEM;
+       dev = vduse_dev_create();
+       if (!dev)
+               goto err;
+
+       dev->api_version = api_version;
+       dev->device_features = config->features;
+       dev->device_id = config->device_id;
+       dev->vendor_id = config->vendor_id;
+       dev->name = kstrdup(config->name, GFP_KERNEL);
+       if (!dev->name)
+               goto err_str;
+
+       dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+                                         VDUSE_BOUNCE_SIZE);
+       if (!dev->domain)
+               goto err_domain;
+
+       dev->config = config_buf;
+       dev->config_size = config->config_size;
+       dev->vq_align = config->vq_align;
+       dev->vq_num = config->vq_num;
+       dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
+       if (!dev->vqs)
+               goto err_vqs;
+
+       for (i = 0; i < dev->vq_num; i++) {
+               dev->vqs[i].index = i;
+               INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
+               INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
+               spin_lock_init(&dev->vqs[i].kick_lock);
+               spin_lock_init(&dev->vqs[i].irq_lock);
+       }
+
+       ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
+       if (ret < 0)
+               goto err_idr;
+
+       dev->minor = ret;
+       dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT;
+       dev->dev = device_create(vduse_class, NULL,
+                                MKDEV(MAJOR(vduse_major), dev->minor),
+                                dev, "%s", config->name);
+       if (IS_ERR(dev->dev)) {
+               ret = PTR_ERR(dev->dev);
+               goto err_dev;
+       }
+       __module_get(THIS_MODULE);
+
+       return 0;
+err_dev:
+       idr_remove(&vduse_idr, dev->minor);
+err_idr:
+       kfree(dev->vqs);
+err_vqs:
+       vduse_domain_destroy(dev->domain);
+err_domain:
+       kfree(dev->name);
+err_str:
+       vduse_dev_destroy(dev);
+err:
+       kvfree(config_buf);
+       return ret;
+}
+
+static long vduse_ioctl(struct file *file, unsigned int cmd,
+                       unsigned long arg)
+{
+       int ret;
+       void __user *argp = (void __user *)arg;
+       struct vduse_control *control = file->private_data;
+
+       mutex_lock(&vduse_lock);
+       switch (cmd) {
+       case VDUSE_GET_API_VERSION:
+               ret = put_user(control->api_version, (u64 __user *)argp);
+               break;
+       case VDUSE_SET_API_VERSION: {
+               u64 api_version;
+
+               ret = -EFAULT;
+               if (get_user(api_version, (u64 __user *)argp))
+                       break;
+
+               ret = -EINVAL;
+               if (api_version > VDUSE_API_VERSION)
+                       break;
+
+               ret = 0;
+               control->api_version = api_version;
+               break;
+       }
+       case VDUSE_CREATE_DEV: {
+               struct vduse_dev_config config;
+               unsigned long size = offsetof(struct vduse_dev_config, config);
+               void *buf;
+
+               ret = -EFAULT;
+               if (copy_from_user(&config, argp, size))
+                       break;
+
+               ret = -EINVAL;
+               if (!vduse_validate_config(&config))
+                       break;
+
+               buf = vmemdup_user(argp + size, config.config_size);
+               if (IS_ERR(buf)) {
+                       ret = PTR_ERR(buf);
+                       break;
+               }
+               config.name[VDUSE_NAME_MAX - 1] = '\0';
+               ret = vduse_create_dev(&config, buf, control->api_version);
+               break;
+       }
+       case VDUSE_DESTROY_DEV: {
+               char name[VDUSE_NAME_MAX];
+
+               ret = -EFAULT;
+               if (copy_from_user(name, argp, VDUSE_NAME_MAX))
+                       break;
+
+               name[VDUSE_NAME_MAX - 1] = '\0';
+               ret = vduse_destroy_dev(name);
+               break;
+       }
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       mutex_unlock(&vduse_lock);
+
+       return ret;
+}
+
+static int vduse_release(struct inode *inode, struct file *file)
+{
+       struct vduse_control *control = file->private_data;
+
+       kfree(control);
+       return 0;
+}
+
+static int vduse_open(struct inode *inode, struct file *file)
+{
+       struct vduse_control *control;
+
+       control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL);
+       if (!control)
+               return -ENOMEM;
+
+       control->api_version = VDUSE_API_VERSION;
+       file->private_data = control;
+
+       return 0;
+}
+
+static const struct file_operations vduse_ctrl_fops = {
+       .owner          = THIS_MODULE,
+       .open           = vduse_open,
+       .release        = vduse_release,
+       .unlocked_ioctl = vduse_ioctl,
+       .compat_ioctl   = compat_ptr_ioctl,
+       .llseek         = noop_llseek,
+};
+
+static char *vduse_devnode(struct device *dev, umode_t *mode)
+{
+       return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev));
+}
+
+static void vduse_mgmtdev_release(struct device *dev)
+{
+}
+
+static struct device vduse_mgmtdev = {
+       .init_name = "vduse",
+       .release = vduse_mgmtdev_release,
+};
+
+static struct vdpa_mgmt_dev mgmt_dev;
+
+static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
+{
+       struct vduse_vdpa *vdev;
+       int ret;
+
+       if (dev->vdev)
+               return -EEXIST;
+
+       vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
+                                &vduse_vdpa_config_ops, name, true);
+       if (IS_ERR(vdev))
+               return PTR_ERR(vdev);
+
+       dev->vdev = vdev;
+       vdev->dev = dev;
+       vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask;
+       ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64));
+       if (ret) {
+               put_device(&vdev->vdpa.dev);
+               return ret;
+       }
+       set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops);
+       vdev->vdpa.dma_dev = &vdev->vdpa.dev;
+       vdev->vdpa.mdev = &mgmt_dev;
+
+       return 0;
+}
+
+static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+{
+       struct vduse_dev *dev;
+       int ret;
+
+       mutex_lock(&vduse_lock);
+       dev = vduse_find_dev(name);
+       if (!dev || !vduse_dev_is_ready(dev)) {
+               mutex_unlock(&vduse_lock);
+               return -EINVAL;
+       }
+       ret = vduse_dev_init_vdpa(dev, name);
+       mutex_unlock(&vduse_lock);
+       if (ret)
+               return ret;
+
+       ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
+       if (ret) {
+               put_device(&dev->vdev->vdpa.dev);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
+{
+       _vdpa_unregister_device(dev);
+}
+
+static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
+       .dev_add = vdpa_dev_add,
+       .dev_del = vdpa_dev_del,
+};
+
+static struct virtio_device_id id_table[] = {
+       { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+       { 0 },
+};
+
+static struct vdpa_mgmt_dev mgmt_dev = {
+       .device = &vduse_mgmtdev,
+       .id_table = id_table,
+       .ops = &vdpa_dev_mgmtdev_ops,
+};
+
+static int vduse_mgmtdev_init(void)
+{
+       int ret;
+
+       ret = device_register(&vduse_mgmtdev);
+       if (ret)
+               return ret;
+
+       ret = vdpa_mgmtdev_register(&mgmt_dev);
+       if (ret)
+               goto err;
+
+       return 0;
+err:
+       device_unregister(&vduse_mgmtdev);
+       return ret;
+}
+
+static void vduse_mgmtdev_exit(void)
+{
+       vdpa_mgmtdev_unregister(&mgmt_dev);
+       device_unregister(&vduse_mgmtdev);
+}
+
+static int vduse_init(void)
+{
+       int ret;
+       struct device *dev;
+
+       vduse_class = class_create(THIS_MODULE, "vduse");
+       if (IS_ERR(vduse_class))
+               return PTR_ERR(vduse_class);
+
+       vduse_class->devnode = vduse_devnode;
+       vduse_class->dev_groups = vduse_dev_groups;
+
+       ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse");
+       if (ret)
+               goto err_chardev_region;
+
+       /* /dev/vduse/control */
+       cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops);
+       vduse_ctrl_cdev.owner = THIS_MODULE;
+       ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1);
+       if (ret)
+               goto err_ctrl_cdev;
+
+       dev = device_create(vduse_class, NULL, vduse_major, NULL, "control");
+       if (IS_ERR(dev)) {
+               ret = PTR_ERR(dev);
+               goto err_device;
+       }
+
+       /* /dev/vduse/$DEVICE */
+       cdev_init(&vduse_cdev, &vduse_dev_fops);
+       vduse_cdev.owner = THIS_MODULE;
+       ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 1),
+                      VDUSE_DEV_MAX - 1);
+       if (ret)
+               goto err_cdev;
+
+       vduse_irq_wq = alloc_workqueue("vduse-irq",
+                               WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
+       if (!vduse_irq_wq) {
+               ret = -ENOMEM;
+               goto err_wq;
+       }
+
+       ret = vduse_domain_init();
+       if (ret)
+               goto err_domain;
+
+       ret = vduse_mgmtdev_init();
+       if (ret)
+               goto err_mgmtdev;
+
+       return 0;
+err_mgmtdev:
+       vduse_domain_exit();
+err_domain:
+       destroy_workqueue(vduse_irq_wq);
+err_wq:
+       cdev_del(&vduse_cdev);
+err_cdev:
+       device_destroy(vduse_class, vduse_major);
+err_device:
+       cdev_del(&vduse_ctrl_cdev);
+err_ctrl_cdev:
+       unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
+err_chardev_region:
+       class_destroy(vduse_class);
+       return ret;
+}
+module_init(vduse_init);
+
+static void vduse_exit(void)
+{
+       vduse_mgmtdev_exit();
+       vduse_domain_exit();
+       destroy_workqueue(vduse_irq_wq);
+       cdev_del(&vduse_cdev);
+       device_destroy(vduse_class, vduse_major);
+       cdev_del(&vduse_ctrl_cdev);
+       unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
+       class_destroy(vduse_class);
+}
+module_exit(vduse_exit);
+
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
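For readers tracing the character-device layout above: minor 0 of the reserved region backs /dev/vduse/control, while minors 1 through VDUSE_DEV_MAX - 1 back the per-device /dev/vduse/$DEVICE nodes. A minimal sketch making that split explicit, using a hypothetical helper name and only the symbols visible in vduse_init():

    /*
     * Hypothetical helper, not part of the patch: map a device slot to the
     * dev_t it would occupy in the chrdev region reserved by vduse_init().
     * Slot 0 is taken by /dev/vduse/control, so device slots start at 1.
     */
    static dev_t vduse_slot_to_devt(unsigned int slot)
    {
            if (slot == 0 || slot >= VDUSE_DEV_MAX)
                    return 0;

            return MKDEV(MAJOR(vduse_major), slot);
    }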
index fe05273..5bcd002 100644 (file)
@@ -189,10 +189,20 @@ static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
        }
 
        vp_modern_set_status(mdev, status);
+}
 
-       if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) &&
-           (s & VIRTIO_CONFIG_S_DRIVER_OK))
+static int vp_vdpa_reset(struct vdpa_device *vdpa)
+{
+       struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+       struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+       u8 s = vp_vdpa_get_status(vdpa);
+
+       vp_modern_set_status(mdev, 0);
+
+       if (s & VIRTIO_CONFIG_S_DRIVER_OK)
                vp_vdpa_free_irq(vp_vdpa);
+
+       return 0;
 }
 
 static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
@@ -398,6 +408,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = {
        .set_features   = vp_vdpa_set_features,
        .get_status     = vp_vdpa_get_status,
        .set_status     = vp_vdpa_set_status,
+       .reset          = vp_vdpa_reset,
        .get_vq_num_max = vp_vdpa_get_vq_num_max,
        .get_vq_state   = vp_vdpa_get_vq_state,
        .get_vq_notification = vp_vdpa_get_vq_notification,
@@ -435,7 +446,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                return ret;
 
        vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
-                                   dev, &vp_vdpa_ops, NULL);
+                                   dev, &vp_vdpa_ops, NULL, false);
        if (IS_ERR(vp_vdpa)) {
                dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
                return PTR_ERR(vp_vdpa);
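Two related API changes meet in this hunk: device reset moves out of .set_status into a dedicated .reset config op (the vhost-vdpa hunk further down routes a zero status write to it and can now propagate failure to userspace), and vdpa_alloc_device() gains a trailing flag that the vhost-vdpa changes below consume as vdpa->use_va; vp_vdpa passes false since it maps physical addresses. A minimal sketch of such a reset callback, with hypothetical names and assuming only the behaviour visible in this series:

    /*
     * Hypothetical parent driver, illustrative only: on .reset the device is
     * returned to status 0 and per-device resources (such as the IRQs that
     * vp_vdpa_reset() frees above) are released; 0 or a negative errno is
     * returned and propagated by vhost-vdpa.
     */
    static int foo_vdpa_reset(struct vdpa_device *vdpa)
    {
            struct foo_vdpa *foo = container_of(vdpa, struct foo_vdpa, vdpa);

            foo_hw_write_status(foo, 0);    /* hypothetical hardware helper */
            foo_free_irqs(foo);             /* hypothetical */
            return 0;
    }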
index 0582079..670d56c 100644 (file)
@@ -36,19 +36,21 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
 EXPORT_SYMBOL_GPL(vhost_iotlb_map_free);
 
 /**
- * vhost_iotlb_add_range - add a new range to vhost IOTLB
+ * vhost_iotlb_add_range_ctx - add a new range to vhost IOTLB
  * @iotlb: the IOTLB
  * @start: start of the IOVA range
  * @last: last of IOVA range
  * @addr: the address that is mapped to @start
  * @perm: access permission of this range
+ * @opaque: the opaque pointer for the new mapping
  *
  * Returns an error if @last is smaller than @start or if memory allocation
  * fails
  */
-int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
-                         u64 start, u64 last,
-                         u64 addr, unsigned int perm)
+int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb,
+                             u64 start, u64 last,
+                             u64 addr, unsigned int perm,
+                             void *opaque)
 {
        struct vhost_iotlb_map *map;
 
@@ -71,6 +73,7 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
        map->last = last;
        map->addr = addr;
        map->perm = perm;
+       map->opaque = opaque;
 
        iotlb->nmaps++;
        vhost_iotlb_itree_insert(map, &iotlb->root);
@@ -80,6 +83,15 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(vhost_iotlb_add_range_ctx);
+
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
+                         u64 start, u64 last,
+                         u64 addr, unsigned int perm)
+{
+       return vhost_iotlb_add_range_ctx(iotlb, start, last,
+                                        addr, perm, NULL);
+}
 EXPORT_SYMBOL_GPL(vhost_iotlb_add_range);
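As a usage sketch for the new entry point (assuming only the helpers already exported from this file), a caller can attach a per-mapping cookie through @opaque and recover it when the range is torn down; the vhost-vdpa hunks below use exactly this pattern with struct vdpa_map_file:

    /* Illustrative only; 'demo_cookie' is a hypothetical per-mapping context. */
    struct demo_cookie {
            void *private;
    };

    static int demo_map_with_cookie(struct vhost_iotlb *iotlb,
                                    u64 iova, u64 size, u64 addr)
    {
            struct demo_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
            int err;

            if (!cookie)
                    return -ENOMEM;

            err = vhost_iotlb_add_range_ctx(iotlb, iova, iova + size - 1,
                                            addr, VHOST_ACCESS_RW, cookie);
            if (err)
                    kfree(cookie);
            return err;
    }

    static void demo_unmap_with_cookie(struct vhost_iotlb *iotlb,
                                       u64 iova, u64 size)
    {
            struct vhost_iotlb_map *map;

            while ((map = vhost_iotlb_itree_first(iotlb, iova,
                                                  iova + size - 1)) != NULL) {
                    kfree(map->opaque);             /* the cookie stored above */
                    vhost_iotlb_map_free(iotlb, map);
            }
    }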
 
 /**
index 46f897e..532e204 100644 (file)
@@ -1,24 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*******************************************************************************
  * Vhost kernel TCM fabric driver for virtio SCSI initiators
  *
  * (C) Copyright 2010-2013 Datera, Inc.
  * (C) Copyright 2010-2012 IBM Corp.
  *
- * Licensed to the Linux Foundation under the General Public License (GPL) version 2.
- *
  * Authors: Nicholas A. Bellinger <nab@daterainc.com>
  *          Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
  ****************************************************************************/
 
 #include <linux/module.h>
index 9479f7f..f41d081 100644 (file)
@@ -116,12 +116,13 @@ static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
        irq_bypass_unregister_producer(&vq->call_ctx.producer);
 }
 
-static void vhost_vdpa_reset(struct vhost_vdpa *v)
+static int vhost_vdpa_reset(struct vhost_vdpa *v)
 {
        struct vdpa_device *vdpa = v->vdpa;
 
-       vdpa_reset(vdpa);
        v->in_batch = 0;
+
+       return vdpa_reset(vdpa);
 }
 
 static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
@@ -157,7 +158,7 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
        struct vdpa_device *vdpa = v->vdpa;
        const struct vdpa_config_ops *ops = vdpa->config;
        u8 status, status_old;
-       int nvqs = v->nvqs;
+       int ret, nvqs = v->nvqs;
        u16 i;
 
        if (copy_from_user(&status, statusp, sizeof(status)))
@@ -172,7 +173,12 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
        if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
                return -EINVAL;
 
-       ops->set_status(vdpa, status);
+       if (status == 0) {
+               ret = ops->reset(vdpa);
+               if (ret)
+                       return ret;
+       } else
+               ops->set_status(vdpa, status);
 
        if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
                for (i = 0; i < nvqs; i++)
@@ -498,7 +504,7 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
        return r;
 }
 
-static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
 {
        struct vhost_dev *dev = &v->vdev;
        struct vhost_iotlb *iotlb = dev->iotlb;
@@ -507,19 +513,44 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
        unsigned long pfn, pinned;
 
        while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
-               pinned = map->size >> PAGE_SHIFT;
-               for (pfn = map->addr >> PAGE_SHIFT;
+               pinned = PFN_DOWN(map->size);
+               for (pfn = PFN_DOWN(map->addr);
                     pinned > 0; pfn++, pinned--) {
                        page = pfn_to_page(pfn);
                        if (map->perm & VHOST_ACCESS_WO)
                                set_page_dirty_lock(page);
                        unpin_user_page(page);
                }
-               atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+               atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
                vhost_iotlb_map_free(iotlb, map);
        }
 }
 
+static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vhost_iotlb *iotlb = dev->iotlb;
+       struct vhost_iotlb_map *map;
+       struct vdpa_map_file *map_file;
+
+       while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
+               map_file = (struct vdpa_map_file *)map->opaque;
+               fput(map_file->file);
+               kfree(map_file);
+               vhost_iotlb_map_free(iotlb, map);
+       }
+}
+
+static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+
+       if (vdpa->use_va)
+               return vhost_vdpa_va_unmap(v, start, last);
+
+       return vhost_vdpa_pa_unmap(v, start, last);
+}
+
 static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
 {
        struct vhost_dev *dev = &v->vdev;
@@ -551,21 +582,21 @@ static int perm_to_iommu_flags(u32 perm)
        return flags | IOMMU_CACHE;
 }
 
-static int vhost_vdpa_map(struct vhost_vdpa *v,
-                         u64 iova, u64 size, u64 pa, u32 perm)
+static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
+                         u64 size, u64 pa, u32 perm, void *opaque)
 {
        struct vhost_dev *dev = &v->vdev;
        struct vdpa_device *vdpa = v->vdpa;
        const struct vdpa_config_ops *ops = vdpa->config;
        int r = 0;
 
-       r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,
-                                 pa, perm);
+       r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1,
+                                     pa, perm, opaque);
        if (r)
                return r;
 
        if (ops->dma_map) {
-               r = ops->dma_map(vdpa, iova, size, pa, perm);
+               r = ops->dma_map(vdpa, iova, size, pa, perm, opaque);
        } else if (ops->set_map) {
                if (!v->in_batch)
                        r = ops->set_map(vdpa, dev->iotlb);
@@ -573,13 +604,15 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
                r = iommu_map(v->domain, iova, pa, size,
                              perm_to_iommu_flags(perm));
        }
-
-       if (r)
+       if (r) {
                vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
-       else
-               atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+               return r;
+       }
 
-       return r;
+       if (!vdpa->use_va)
+               atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
+
+       return 0;
 }
 
 static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
@@ -600,38 +633,78 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
        }
 }
 
-static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
-                                          struct vhost_iotlb_msg *msg)
+static int vhost_vdpa_va_map(struct vhost_vdpa *v,
+                            u64 iova, u64 size, u64 uaddr, u32 perm)
+{
+       struct vhost_dev *dev = &v->vdev;
+       u64 offset, map_size, map_iova = iova;
+       struct vdpa_map_file *map_file;
+       struct vm_area_struct *vma;
+       int ret = 0;
+
+       mmap_read_lock(dev->mm);
+
+       while (size) {
+               vma = find_vma(dev->mm, uaddr);
+               if (!vma) {
+                       ret = -EINVAL;
+                       break;
+               }
+               map_size = min(size, vma->vm_end - uaddr);
+               if (!(vma->vm_file && (vma->vm_flags & VM_SHARED) &&
+                       !(vma->vm_flags & (VM_IO | VM_PFNMAP))))
+                       goto next;
+
+               map_file = kzalloc(sizeof(*map_file), GFP_KERNEL);
+               if (!map_file) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
+               map_file->offset = offset;
+               map_file->file = get_file(vma->vm_file);
+               ret = vhost_vdpa_map(v, map_iova, map_size, uaddr,
+                                    perm, map_file);
+               if (ret) {
+                       fput(map_file->file);
+                       kfree(map_file);
+                       break;
+               }
+next:
+               size -= map_size;
+               uaddr += map_size;
+               map_iova += map_size;
+       }
+       if (ret)
+               vhost_vdpa_unmap(v, iova, map_iova - iova);
+
+       mmap_read_unlock(dev->mm);
+
+       return ret;
+}
+
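To make the VMA filter in vhost_vdpa_va_map() concrete: only shared, file-backed mappings without VM_IO/VM_PFNMAP are translated on this path; anything else is skipped via the next: label. A minimal userspace sketch of a buffer that satisfies the filter, assuming a memfd-backed MAP_SHARED mapping (names and error handling kept to the essentials):

    /* Userspace sketch, illustrative only. */
    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <unistd.h>
    #include <stddef.h>

    static void *alloc_va_mappable(size_t size)
    {
            int fd = memfd_create("vduse-buf", 0);  /* hypothetical name */
            void *buf;

            if (fd < 0 || ftruncate(fd, size) < 0)
                    return NULL;

            /* MAP_SHARED + file backing passes the vm_file && VM_SHARED &&
             * !(VM_IO | VM_PFNMAP) test above; anonymous or MAP_PRIVATE
             * memory would be skipped by the kernel loop instead.
             */
            buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            return buf == MAP_FAILED ? NULL : buf;
    }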
+static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
+                            u64 iova, u64 size, u64 uaddr, u32 perm)
 {
        struct vhost_dev *dev = &v->vdev;
-       struct vhost_iotlb *iotlb = dev->iotlb;
        struct page **page_list;
        unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
        unsigned int gup_flags = FOLL_LONGTERM;
        unsigned long npages, cur_base, map_pfn, last_pfn = 0;
        unsigned long lock_limit, sz2pin, nchunks, i;
-       u64 iova = msg->iova;
+       u64 start = iova;
        long pinned;
        int ret = 0;
 
-       if (msg->iova < v->range.first || !msg->size ||
-           msg->iova > U64_MAX - msg->size + 1 ||
-           msg->iova + msg->size - 1 > v->range.last)
-               return -EINVAL;
-
-       if (vhost_iotlb_itree_first(iotlb, msg->iova,
-                                   msg->iova + msg->size - 1))
-               return -EEXIST;
-
        /* Limit the use of memory for bookkeeping */
        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list)
                return -ENOMEM;
 
-       if (msg->perm & VHOST_ACCESS_WO)
+       if (perm & VHOST_ACCESS_WO)
                gup_flags |= FOLL_WRITE;
 
-       npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
+       npages = PFN_UP(size + (iova & ~PAGE_MASK));
        if (!npages) {
                ret = -EINVAL;
                goto free;
@@ -639,13 +712,13 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
 
        mmap_read_lock(dev->mm);
 
-       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+       lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
        if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
                ret = -ENOMEM;
                goto unlock;
        }
 
-       cur_base = msg->uaddr & PAGE_MASK;
+       cur_base = uaddr & PAGE_MASK;
        iova &= PAGE_MASK;
        nchunks = 0;
 
@@ -673,10 +746,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
 
                        if (last_pfn && (this_pfn != last_pfn + 1)) {
                                /* Pin a contiguous chunk of memory */
-                               csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
+                               csize = PFN_PHYS(last_pfn - map_pfn + 1);
                                ret = vhost_vdpa_map(v, iova, csize,
-                                                    map_pfn << PAGE_SHIFT,
-                                                    msg->perm);
+                                                    PFN_PHYS(map_pfn),
+                                                    perm, NULL);
                                if (ret) {
                                        /*
                                         * Unpin the pages that are left unmapped
@@ -699,13 +772,13 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
                        last_pfn = this_pfn;
                }
 
-               cur_base += pinned << PAGE_SHIFT;
+               cur_base += PFN_PHYS(pinned);
                npages -= pinned;
        }
 
        /* Pin the rest chunk */
-       ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT,
-                            map_pfn << PAGE_SHIFT, msg->perm);
+       ret = vhost_vdpa_map(v, iova, PFN_PHYS(last_pfn - map_pfn + 1),
+                            PFN_PHYS(map_pfn), perm, NULL);
 out:
        if (ret) {
                if (nchunks) {
@@ -724,13 +797,38 @@ out:
                        for (pfn = map_pfn; pfn <= last_pfn; pfn++)
                                unpin_user_page(pfn_to_page(pfn));
                }
-               vhost_vdpa_unmap(v, msg->iova, msg->size);
+               vhost_vdpa_unmap(v, start, size);
        }
 unlock:
        mmap_read_unlock(dev->mm);
 free:
        free_page((unsigned long)page_list);
        return ret;
+}
+
+static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
+                                          struct vhost_iotlb_msg *msg)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vdpa_device *vdpa = v->vdpa;
+       struct vhost_iotlb *iotlb = dev->iotlb;
+
+       if (msg->iova < v->range.first || !msg->size ||
+           msg->iova > U64_MAX - msg->size + 1 ||
+           msg->iova + msg->size - 1 > v->range.last)
+               return -EINVAL;
+
+       if (vhost_iotlb_itree_first(iotlb, msg->iova,
+                                   msg->iova + msg->size - 1))
+               return -EEXIST;
+
+       if (vdpa->use_va)
+               return vhost_vdpa_va_map(v, msg->iova, msg->size,
+                                        msg->uaddr, msg->perm);
+
+       return vhost_vdpa_pa_map(v, msg->iova, msg->size, msg->uaddr,
+                                msg->perm);
 }
 
 static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
@@ -860,7 +958,9 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
                return -EBUSY;
 
        nvqs = v->nvqs;
-       vhost_vdpa_reset(v);
+       r = vhost_vdpa_reset(v);
+       if (r)
+               goto err;
 
        vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
        if (!vqs) {
@@ -945,7 +1045,7 @@ static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
 
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        if (remap_pfn_range(vma, vmf->address & PAGE_MASK,
-                           notify.addr >> PAGE_SHIFT, PAGE_SIZE,
+                           PFN_DOWN(notify.addr), PAGE_SIZE,
                            vma->vm_page_prot))
                return VM_FAULT_SIGBUS;
 
index f249622..938aefb 100644 (file)
@@ -114,7 +114,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                size_t nbytes;
                size_t iov_len, payload_len;
                int head;
-               bool restore_flag = false;
+               u32 flags_to_restore = 0;
 
                spin_lock_bh(&vsock->send_pkt_list_lock);
                if (list_empty(&vsock->send_pkt_list)) {
@@ -178,16 +178,21 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                         * small rx buffers, headers of packets in rx queue are
                         * created dynamically and are initialized with header
                         * of current packet(except length). But in case of
-                        * SOCK_SEQPACKET, we also must clear record delimeter
-                        * bit(VIRTIO_VSOCK_SEQ_EOR). Otherwise, instead of one
-                        * packet with delimeter(which marks end of record),
-                        * there will be sequence of packets with delimeter
-                        * bit set. After initialized header will be copied to
-                        * rx buffer, this bit will be restored.
+                        * SOCK_SEQPACKET, we also must clear the message delimiter
+                        * bit (VIRTIO_VSOCK_SEQ_EOM) and the MSG_EOR bit
+                        * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise, there will be
+                        * a sequence of packets with these bits set. After the
+                        * initialized header has been copied to the rx buffer,
+                        * these bits will be restored.
                         */
-                       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
-                               pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
-                               restore_flag = true;
+                       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
+                               pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+                               flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;
+
+                               if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+                                       pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                                       flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
+                               }
                        }
                }
 
@@ -224,8 +229,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                 * to send it with the next available buffer.
                 */
                if (pkt->off < pkt->len) {
-                       if (restore_flag)
-                               pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                       pkt->hdr.flags |= cpu_to_le32(flags_to_restore);
 
                        /* We are queueing the same virtio_vsock_pkt to handle
                         * the remaining bytes, and we want to deliver it
index 71fb710..7420d2c 100644 (file)
@@ -962,6 +962,7 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
        struct fb_var_screeninfo old_var;
        struct fb_videomode mode;
        struct fb_event event;
+       u32 unused;
 
        if (var->activate & FB_ACTIVATE_INV_MODE) {
                struct fb_videomode mode1, mode2;
@@ -1008,6 +1009,11 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
        if (var->xres < 8 || var->yres < 8)
                return -EINVAL;
 
+       /* Too large a resolution causes multiplication overflow. */
+       if (check_mul_overflow(var->xres, var->yres, &unused) ||
+           check_mul_overflow(var->xres_virtual, var->yres_virtual, &unused))
+               return -EINVAL;
+
        ret = info->fbops->fb_check_var(var, info);
 
        if (ret)
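check_mul_overflow() from <linux/overflow.h> returns true when the product does not fit the destination type, which is what makes the guard above sufficient; a minimal sketch of the case it rejects:

    /* Illustrative only: with u32 operands, 0x10000 * 0x10000 wraps, so
     * check_mul_overflow() returns true and fb_set_var() now fails with
     * -EINVAL instead of operating on a truncated pixel count.
     */
    u32 xres = 0x10000, yres = 0x10000, pixels;

    if (check_mul_overflow(xres, yres, &pixels))
            pr_debug("fb: resolution overflows u32, rejecting\n");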
index 1ea0c1f..588e02f 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/virtio_config.h>
 #include <linux/module.h>
 #include <linux/idr.h>
+#include <linux/of.h>
 #include <uapi/linux/virtio_ids.h>
 
 /* Unique numbering for virtio devices. */
@@ -292,6 +293,8 @@ static void virtio_dev_remove(struct device *_d)
 
        /* Acknowledge the device's existence again. */
        virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+       of_node_put(dev->dev.of_node);
 }
 
 static struct bus_type virtio_bus = {
@@ -318,6 +321,43 @@ void unregister_virtio_driver(struct virtio_driver *driver)
 }
 EXPORT_SYMBOL_GPL(unregister_virtio_driver);
 
+static int virtio_device_of_init(struct virtio_device *dev)
+{
+       struct device_node *np, *pnode = dev_of_node(dev->dev.parent);
+       char compat[] = "virtio,deviceXXXXXXXX";
+       int ret, count;
+
+       if (!pnode)
+               return 0;
+
+       count = of_get_available_child_count(pnode);
+       if (!count)
+               return 0;
+
+       /* There can be only 1 child node */
+       if (WARN_ON(count > 1))
+               return -EINVAL;
+
+       np = of_get_next_available_child(pnode, NULL);
+       if (WARN_ON(!np))
+               return -ENODEV;
+
+       ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device);
+       BUG_ON(ret >= sizeof(compat));
+
+       if (!of_device_is_compatible(np, compat)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       dev->dev.of_node = np;
+       return 0;
+
+out:
+       of_node_put(np);
+       return ret;
+}
+
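As an example of the string matched here: a virtio-rng device (device ID 4) yields "virtio,device4", and a virtio-i2c device (ID 34) yields "virtio,device22", since the ID is formatted as unpadded lower-case hex:

    /* Illustrative only: the same format string used above. */
    snprintf(compat, sizeof(compat), "virtio,device%x", 4);    /* "virtio,device4"  */
    snprintf(compat, sizeof(compat), "virtio,device%x", 34);   /* "virtio,device22" */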
 /**
  * register_virtio_device - register virtio device
  * @dev        : virtio device to be registered
@@ -342,6 +382,10 @@ int register_virtio_device(struct virtio_device *dev)
        dev->index = err;
        dev_set_name(&dev->dev, "virtio%u", dev->index);
 
+       err = virtio_device_of_init(dev);
+       if (err)
+               goto out_ida_remove;
+
        spin_lock_init(&dev->config_lock);
        dev->config_enabled = false;
        dev->config_change_pending = false;
@@ -362,10 +406,16 @@ int register_virtio_device(struct virtio_device *dev)
         */
        err = device_add(&dev->dev);
        if (err)
-               ida_simple_remove(&virtio_index_ida, dev->index);
+               goto out_of_node_put;
+
+       return 0;
+
+out_of_node_put:
+       of_node_put(dev->dev.of_node);
+out_ida_remove:
+       ida_simple_remove(&virtio_index_ida, dev->index);
 out:
-       if (err)
-               virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
+       virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
        return err;
 }
 EXPORT_SYMBOL_GPL(register_virtio_device);
index 47dce91..c22ff01 100644 (file)
@@ -531,8 +531,8 @@ static int init_vqs(struct virtio_balloon *vb)
                callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
        }
 
-       err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
-                                        vqs, callbacks, names, NULL, NULL);
+       err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
+                             callbacks, names, NULL);
        if (err)
                return err;
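The call above relies on the virtio_find_vqs() helper, assumed here (consistently with how it is used in this hunk) to be a thin inline that forwards to config->find_vqs() with a NULL per-queue ctx array, so the behaviour of init_vqs() is unchanged. A sketch of such a wrapper:

    /*
     * Sketch of the assumed wrapper; the real helper lives in
     * <linux/virtio_config.h>. Shown only to document the equivalence with
     * the open-coded config->find_vqs() call removed above.
     */
    static inline int virtio_find_vqs(struct virtio_device *vdev,
                                      unsigned int nvqs,
                                      struct virtqueue *vqs[],
                                      vq_callback_t *callbacks[],
                                      const char * const names[],
                                      struct irq_affinity *desc)
    {
            return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks,
                                          names, NULL, desc);
    }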
 
index 47af46a..a6313a9 100644 (file)
@@ -367,7 +367,7 @@ source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ksmbd/Kconfig"
 
-config CIFS_COMMON
+config SMBFS_COMMON
        tristate
        default y if CIFS=y
        default m if CIFS=m
index 2f21300..84c5e4c 100644 (file)
@@ -17,7 +17,7 @@ obj-y :=      open.o read_write.o file_table.o super.o \
                kernel_read_file.o remap_range.o
 
 ifeq ($(CONFIG_BLOCK),y)
-obj-y +=       buffer.o block_dev.o direct-io.o mpage.o
+obj-y +=       buffer.o direct-io.o mpage.o
 else
 obj-y +=       no-block.o
 endif
@@ -96,7 +96,7 @@ obj-$(CONFIG_LOCKD)           += lockd/
 obj-$(CONFIG_NLS)              += nls/
 obj-$(CONFIG_UNICODE)          += unicode/
 obj-$(CONFIG_SYSV_FS)          += sysv/
-obj-$(CONFIG_CIFS_COMMON)      += cifs_common/
+obj-$(CONFIG_SMBFS_COMMON)     += smbfs_common/
 obj-$(CONFIG_CIFS)             += cifs/
 obj-$(CONFIG_SMB_SERVER)       += ksmbd/
 obj-$(CONFIG_HPFS_FS)          += hpfs/
diff --git a/fs/block_dev.c b/fs/block_dev.c
deleted file mode 100644 (file)
index 45df6cb..0000000
+++ /dev/null
@@ -1,1695 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
- *  Copyright (C) 2016 - 2020 Christoph Hellwig
- */
-
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/fcntl.h>
-#include <linux/slab.h>
-#include <linux/kmod.h>
-#include <linux/major.h>
-#include <linux/device_cgroup.h>
-#include <linux/highmem.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/module.h>
-#include <linux/blkpg.h>
-#include <linux/magic.h>
-#include <linux/buffer_head.h>
-#include <linux/swap.h>
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-#include <linux/mpage.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
-#include <linux/uio.h>
-#include <linux/namei.h>
-#include <linux/log2.h>
-#include <linux/cleancache.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/falloc.h>
-#include <linux/part_stat.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-#include "internal.h"
-#include "../block/blk.h"
-
-struct bdev_inode {
-       struct block_device bdev;
-       struct inode vfs_inode;
-};
-
-static const struct address_space_operations def_blk_aops;
-
-static inline struct bdev_inode *BDEV_I(struct inode *inode)
-{
-       return container_of(inode, struct bdev_inode, vfs_inode);
-}
-
-struct block_device *I_BDEV(struct inode *inode)
-{
-       return &BDEV_I(inode)->bdev;
-}
-EXPORT_SYMBOL(I_BDEV);
-
-static void bdev_write_inode(struct block_device *bdev)
-{
-       struct inode *inode = bdev->bd_inode;
-       int ret;
-
-       spin_lock(&inode->i_lock);
-       while (inode->i_state & I_DIRTY) {
-               spin_unlock(&inode->i_lock);
-               ret = write_inode_now(inode, true);
-               if (ret) {
-                       char name[BDEVNAME_SIZE];
-                       pr_warn_ratelimited("VFS: Dirty inode writeback failed "
-                                           "for block device %s (err=%d).\n",
-                                           bdevname(bdev, name), ret);
-               }
-               spin_lock(&inode->i_lock);
-       }
-       spin_unlock(&inode->i_lock);
-}
-
-/* Kill _all_ buffers and pagecache , dirty or not.. */
-static void kill_bdev(struct block_device *bdev)
-{
-       struct address_space *mapping = bdev->bd_inode->i_mapping;
-
-       if (mapping_empty(mapping))
-               return;
-
-       invalidate_bh_lrus();
-       truncate_inode_pages(mapping, 0);
-}
-
-/* Invalidate clean unused buffers and pagecache. */
-void invalidate_bdev(struct block_device *bdev)
-{
-       struct address_space *mapping = bdev->bd_inode->i_mapping;
-
-       if (mapping->nrpages) {
-               invalidate_bh_lrus();
-               lru_add_drain_all();    /* make sure all lru add caches are flushed */
-               invalidate_mapping_pages(mapping, 0, -1);
-       }
-       /* 99% of the time, we don't need to flush the cleancache on the bdev.
-        * But, for the strange corners, lets be cautious
-        */
-       cleancache_invalidate_inode(mapping);
-}
-EXPORT_SYMBOL(invalidate_bdev);
-
-/*
- * Drop all buffers & page cache for given bdev range. This function bails
- * with error if bdev has other exclusive owner (such as filesystem).
- */
-int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
-                       loff_t lstart, loff_t lend)
-{
-       /*
-        * If we don't hold exclusive handle for the device, upgrade to it
-        * while we discard the buffer cache to avoid discarding buffers
-        * under live filesystem.
-        */
-       if (!(mode & FMODE_EXCL)) {
-               int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
-               if (err)
-                       goto invalidate;
-       }
-
-       truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
-       if (!(mode & FMODE_EXCL))
-               bd_abort_claiming(bdev, truncate_bdev_range);
-       return 0;
-
-invalidate:
-       /*
-        * Someone else has handle exclusively open. Try invalidating instead.
-        * The 'end' argument is inclusive so the rounding is safe.
-        */
-       return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
-                                            lstart >> PAGE_SHIFT,
-                                            lend >> PAGE_SHIFT);
-}
-
-static void set_init_blocksize(struct block_device *bdev)
-{
-       unsigned int bsize = bdev_logical_block_size(bdev);
-       loff_t size = i_size_read(bdev->bd_inode);
-
-       while (bsize < PAGE_SIZE) {
-               if (size & bsize)
-                       break;
-               bsize <<= 1;
-       }
-       bdev->bd_inode->i_blkbits = blksize_bits(bsize);
-}
-
-int set_blocksize(struct block_device *bdev, int size)
-{
-       /* Size must be a power of two, and between 512 and PAGE_SIZE */
-       if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
-               return -EINVAL;
-
-       /* Size cannot be smaller than the size supported by the device */
-       if (size < bdev_logical_block_size(bdev))
-               return -EINVAL;
-
-       /* Don't change the size if it is same as current */
-       if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
-               sync_blockdev(bdev);
-               bdev->bd_inode->i_blkbits = blksize_bits(size);
-               kill_bdev(bdev);
-       }
-       return 0;
-}
-
-EXPORT_SYMBOL(set_blocksize);
-
-int sb_set_blocksize(struct super_block *sb, int size)
-{
-       if (set_blocksize(sb->s_bdev, size))
-               return 0;
-       /* If we get here, we know size is power of two
-        * and it's value is between 512 and PAGE_SIZE */
-       sb->s_blocksize = size;
-       sb->s_blocksize_bits = blksize_bits(size);
-       return sb->s_blocksize;
-}
-
-EXPORT_SYMBOL(sb_set_blocksize);
-
-int sb_min_blocksize(struct super_block *sb, int size)
-{
-       int minsize = bdev_logical_block_size(sb->s_bdev);
-       if (size < minsize)
-               size = minsize;
-       return sb_set_blocksize(sb, size);
-}
-
-EXPORT_SYMBOL(sb_min_blocksize);
-
-static int
-blkdev_get_block(struct inode *inode, sector_t iblock,
-               struct buffer_head *bh, int create)
-{
-       bh->b_bdev = I_BDEV(inode);
-       bh->b_blocknr = iblock;
-       set_buffer_mapped(bh);
-       return 0;
-}
-
-static struct inode *bdev_file_inode(struct file *file)
-{
-       return file->f_mapping->host;
-}
-
-static unsigned int dio_bio_write_op(struct kiocb *iocb)
-{
-       unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-
-       /* avoid the need for a I/O completion work item */
-       if (iocb->ki_flags & IOCB_DSYNC)
-               op |= REQ_FUA;
-       return op;
-}
-
-#define DIO_INLINE_BIO_VECS 4
-
-static void blkdev_bio_end_io_simple(struct bio *bio)
-{
-       struct task_struct *waiter = bio->bi_private;
-
-       WRITE_ONCE(bio->bi_private, NULL);
-       blk_wake_io_task(waiter);
-}
-
-static ssize_t
-__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
-               unsigned int nr_pages)
-{
-       struct file *file = iocb->ki_filp;
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-       struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
-       loff_t pos = iocb->ki_pos;
-       bool should_dirty = false;
-       struct bio bio;
-       ssize_t ret;
-       blk_qc_t qc;
-
-       if ((pos | iov_iter_alignment(iter)) &
-           (bdev_logical_block_size(bdev) - 1))
-               return -EINVAL;
-
-       if (nr_pages <= DIO_INLINE_BIO_VECS)
-               vecs = inline_vecs;
-       else {
-               vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
-                                    GFP_KERNEL);
-               if (!vecs)
-                       return -ENOMEM;
-       }
-
-       bio_init(&bio, vecs, nr_pages);
-       bio_set_dev(&bio, bdev);
-       bio.bi_iter.bi_sector = pos >> 9;
-       bio.bi_write_hint = iocb->ki_hint;
-       bio.bi_private = current;
-       bio.bi_end_io = blkdev_bio_end_io_simple;
-       bio.bi_ioprio = iocb->ki_ioprio;
-
-       ret = bio_iov_iter_get_pages(&bio, iter);
-       if (unlikely(ret))
-               goto out;
-       ret = bio.bi_iter.bi_size;
-
-       if (iov_iter_rw(iter) == READ) {
-               bio.bi_opf = REQ_OP_READ;
-               if (iter_is_iovec(iter))
-                       should_dirty = true;
-       } else {
-               bio.bi_opf = dio_bio_write_op(iocb);
-               task_io_account_write(ret);
-       }
-       if (iocb->ki_flags & IOCB_NOWAIT)
-               bio.bi_opf |= REQ_NOWAIT;
-       if (iocb->ki_flags & IOCB_HIPRI)
-               bio_set_polled(&bio, iocb);
-
-       qc = submit_bio(&bio);
-       for (;;) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (!READ_ONCE(bio.bi_private))
-                       break;
-               if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(bdev_get_queue(bdev), qc, true))
-                       blk_io_schedule();
-       }
-       __set_current_state(TASK_RUNNING);
-
-       bio_release_pages(&bio, should_dirty);
-       if (unlikely(bio.bi_status))
-               ret = blk_status_to_errno(bio.bi_status);
-
-out:
-       if (vecs != inline_vecs)
-               kfree(vecs);
-
-       bio_uninit(&bio);
-
-       return ret;
-}
-
-struct blkdev_dio {
-       union {
-               struct kiocb            *iocb;
-               struct task_struct      *waiter;
-       };
-       size_t                  size;
-       atomic_t                ref;
-       bool                    multi_bio : 1;
-       bool                    should_dirty : 1;
-       bool                    is_sync : 1;
-       struct bio              bio;
-};
-
-static struct bio_set blkdev_dio_pool;
-
-static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
-{
-       struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
-       struct request_queue *q = bdev_get_queue(bdev);
-
-       return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
-}
-
-static void blkdev_bio_end_io(struct bio *bio)
-{
-       struct blkdev_dio *dio = bio->bi_private;
-       bool should_dirty = dio->should_dirty;
-
-       if (bio->bi_status && !dio->bio.bi_status)
-               dio->bio.bi_status = bio->bi_status;
-
-       if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
-               if (!dio->is_sync) {
-                       struct kiocb *iocb = dio->iocb;
-                       ssize_t ret;
-
-                       if (likely(!dio->bio.bi_status)) {
-                               ret = dio->size;
-                               iocb->ki_pos += ret;
-                       } else {
-                               ret = blk_status_to_errno(dio->bio.bi_status);
-                       }
-
-                       dio->iocb->ki_complete(iocb, ret, 0);
-                       if (dio->multi_bio)
-                               bio_put(&dio->bio);
-               } else {
-                       struct task_struct *waiter = dio->waiter;
-
-                       WRITE_ONCE(dio->waiter, NULL);
-                       blk_wake_io_task(waiter);
-               }
-       }
-
-       if (should_dirty) {
-               bio_check_pages_dirty(bio);
-       } else {
-               bio_release_pages(bio, false);
-               bio_put(bio);
-       }
-}
-
-static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-               unsigned int nr_pages)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = bdev_file_inode(file);
-       struct block_device *bdev = I_BDEV(inode);
-       struct blk_plug plug;
-       struct blkdev_dio *dio;
-       struct bio *bio;
-       bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
-       bool is_read = (iov_iter_rw(iter) == READ), is_sync;
-       loff_t pos = iocb->ki_pos;
-       blk_qc_t qc = BLK_QC_T_NONE;
-       int ret = 0;
-
-       if ((pos | iov_iter_alignment(iter)) &
-           (bdev_logical_block_size(bdev) - 1))
-               return -EINVAL;
-
-       bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
-
-       dio = container_of(bio, struct blkdev_dio, bio);
-       dio->is_sync = is_sync = is_sync_kiocb(iocb);
-       if (dio->is_sync) {
-               dio->waiter = current;
-               bio_get(bio);
-       } else {
-               dio->iocb = iocb;
-       }
-
-       dio->size = 0;
-       dio->multi_bio = false;
-       dio->should_dirty = is_read && iter_is_iovec(iter);
-
-       /*
-        * Don't plug for HIPRI/polled IO, as those should go straight
-        * to issue
-        */
-       if (!is_poll)
-               blk_start_plug(&plug);
-
-       for (;;) {
-               bio_set_dev(bio, bdev);
-               bio->bi_iter.bi_sector = pos >> 9;
-               bio->bi_write_hint = iocb->ki_hint;
-               bio->bi_private = dio;
-               bio->bi_end_io = blkdev_bio_end_io;
-               bio->bi_ioprio = iocb->ki_ioprio;
-
-               ret = bio_iov_iter_get_pages(bio, iter);
-               if (unlikely(ret)) {
-                       bio->bi_status = BLK_STS_IOERR;
-                       bio_endio(bio);
-                       break;
-               }
-
-               if (is_read) {
-                       bio->bi_opf = REQ_OP_READ;
-                       if (dio->should_dirty)
-                               bio_set_pages_dirty(bio);
-               } else {
-                       bio->bi_opf = dio_bio_write_op(iocb);
-                       task_io_account_write(bio->bi_iter.bi_size);
-               }
-               if (iocb->ki_flags & IOCB_NOWAIT)
-                       bio->bi_opf |= REQ_NOWAIT;
-
-               dio->size += bio->bi_iter.bi_size;
-               pos += bio->bi_iter.bi_size;
-
-               nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
-               if (!nr_pages) {
-                       bool polled = false;
-
-                       if (iocb->ki_flags & IOCB_HIPRI) {
-                               bio_set_polled(bio, iocb);
-                               polled = true;
-                       }
-
-                       qc = submit_bio(bio);
-
-                       if (polled)
-                               WRITE_ONCE(iocb->ki_cookie, qc);
-                       break;
-               }
-
-               if (!dio->multi_bio) {
-                       /*
-                        * AIO needs an extra reference to ensure the dio
-                        * structure which is embedded into the first bio
-                        * stays around.
-                        */
-                       if (!is_sync)
-                               bio_get(bio);
-                       dio->multi_bio = true;
-                       atomic_set(&dio->ref, 2);
-               } else {
-                       atomic_inc(&dio->ref);
-               }
-
-               submit_bio(bio);
-               bio = bio_alloc(GFP_KERNEL, nr_pages);
-       }
-
-       if (!is_poll)
-               blk_finish_plug(&plug);
-
-       if (!is_sync)
-               return -EIOCBQUEUED;
-
-       for (;;) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (!READ_ONCE(dio->waiter))
-                       break;
-
-               if (!(iocb->ki_flags & IOCB_HIPRI) ||
-                   !blk_poll(bdev_get_queue(bdev), qc, true))
-                       blk_io_schedule();
-       }
-       __set_current_state(TASK_RUNNING);
-
-       if (!ret)
-               ret = blk_status_to_errno(dio->bio.bi_status);
-       if (likely(!ret))
-               ret = dio->size;
-
-       bio_put(&dio->bio);
-       return ret;
-}
-
-static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-       unsigned int nr_pages;
-
-       if (!iov_iter_count(iter))
-               return 0;
-
-       nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
-       if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
-               return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-
-       return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
-}
-
-static __init int blkdev_init(void)
-{
-       return bioset_init(&blkdev_dio_pool, 4,
-                               offsetof(struct blkdev_dio, bio),
-                               BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
-}
-module_init(blkdev_init);
-
-int __sync_blockdev(struct block_device *bdev, int wait)
-{
-       if (!bdev)
-               return 0;
-       if (!wait)
-               return filemap_flush(bdev->bd_inode->i_mapping);
-       return filemap_write_and_wait(bdev->bd_inode->i_mapping);
-}
-
-/*
- * Write out and wait upon all the dirty data associated with a block
- * device via its mapping.  Does not take the superblock lock.
- */
-int sync_blockdev(struct block_device *bdev)
-{
-       return __sync_blockdev(bdev, 1);
-}
-EXPORT_SYMBOL(sync_blockdev);
-
-/*
- * Write out and wait upon all dirty data associated with this
- * device.   Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
-       struct super_block *sb = get_super(bdev);
-       if (sb) {
-               int res = sync_filesystem(sb);
-               drop_super(sb);
-               return res;
-       }
-       return sync_blockdev(bdev);
-}
-EXPORT_SYMBOL(fsync_bdev);
-
-/**
- * freeze_bdev  --  lock a filesystem and force it into a consistent state
- * @bdev:      blockdevice to lock
- *
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
- */
-int freeze_bdev(struct block_device *bdev)
-{
-       struct super_block *sb;
-       int error = 0;
-
-       mutex_lock(&bdev->bd_fsfreeze_mutex);
-       if (++bdev->bd_fsfreeze_count > 1)
-               goto done;
-
-       sb = get_active_super(bdev);
-       if (!sb)
-               goto sync;
-       if (sb->s_op->freeze_super)
-               error = sb->s_op->freeze_super(sb);
-       else
-               error = freeze_super(sb);
-       deactivate_super(sb);
-
-       if (error) {
-               bdev->bd_fsfreeze_count--;
-               goto done;
-       }
-       bdev->bd_fsfreeze_sb = sb;
-
-sync:
-       sync_blockdev(bdev);
-done:
-       mutex_unlock(&bdev->bd_fsfreeze_mutex);
-       return error;
-}
-EXPORT_SYMBOL(freeze_bdev);
-
-/**
- * thaw_bdev  -- unlock filesystem
- * @bdev:      blockdevice to unlock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
-int thaw_bdev(struct block_device *bdev)
-{
-       struct super_block *sb;
-       int error = -EINVAL;
-
-       mutex_lock(&bdev->bd_fsfreeze_mutex);
-       if (!bdev->bd_fsfreeze_count)
-               goto out;
-
-       error = 0;
-       if (--bdev->bd_fsfreeze_count > 0)
-               goto out;
-
-       sb = bdev->bd_fsfreeze_sb;
-       if (!sb)
-               goto out;
-
-       if (sb->s_op->thaw_super)
-               error = sb->s_op->thaw_super(sb);
-       else
-               error = thaw_super(sb);
-       if (error)
-               bdev->bd_fsfreeze_count++;
-       else
-               bdev->bd_fsfreeze_sb = NULL;
-out:
-       mutex_unlock(&bdev->bd_fsfreeze_mutex);
-       return error;
-}
-EXPORT_SYMBOL(thaw_bdev);
-
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
-{
-       return block_write_full_page(page, blkdev_get_block, wbc);
-}
-
-static int blkdev_readpage(struct file * file, struct page * page)
-{
-       return block_read_full_page(page, blkdev_get_block);
-}
-
-static void blkdev_readahead(struct readahead_control *rac)
-{
-       mpage_readahead(rac, blkdev_get_block);
-}
-
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
-                       loff_t pos, unsigned len, unsigned flags,
-                       struct page **pagep, void **fsdata)
-{
-       return block_write_begin(mapping, pos, len, flags, pagep,
-                                blkdev_get_block);
-}
-
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
-                       loff_t pos, unsigned len, unsigned copied,
-                       struct page *page, void *fsdata)
-{
-       int ret;
-       ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-       unlock_page(page);
-       put_page(page);
-
-       return ret;
-}
-
-/*
- * private llseek:
- * for a block special file file_inode(file)->i_size is zero
- * so we compute the size by hand (just as in block_read/write above)
- */
-static loff_t block_llseek(struct file *file, loff_t offset, int whence)
-{
-       struct inode *bd_inode = bdev_file_inode(file);
-       loff_t retval;
-
-       inode_lock(bd_inode);
-       retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
-       inode_unlock(bd_inode);
-       return retval;
-}
-       
-static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
-               int datasync)
-{
-       struct inode *bd_inode = bdev_file_inode(filp);
-       struct block_device *bdev = I_BDEV(bd_inode);
-       int error;
-       
-       error = file_write_and_wait_range(filp, start, end);
-       if (error)
-               return error;
-
-       /*
-        * There is no need to serialise calls to blkdev_issue_flush with
-        * i_mutex and doing so causes performance issues with concurrent
-        * O_SYNC writers to a block device.
-        */
-       error = blkdev_issue_flush(bdev);
-       if (error == -EOPNOTSUPP)
-               error = 0;
-
-       return error;
-}
-
-/**
- * bdev_read_page() - Start reading a page from a block device
- * @bdev: The device to read the page from
- * @sector: The offset on the device to read the page to (need not be aligned)
- * @page: The page to read
- *
- * On entry, the page should be locked.  It will be unlocked when the page
- * has been read.  If the block driver implements rw_page synchronously,
- * that will be true on exit from this function, but it need not be.
- *
- * Errors returned by this function are usually "soft", eg out of memory, or
- * queue full; callers should try a different route to read this page rather
- * than propagate an error back up the stack.
- *
- * Return: negative errno if an error occurs, 0 if submission was successful.
- */
-int bdev_read_page(struct block_device *bdev, sector_t sector,
-                       struct page *page)
-{
-       const struct block_device_operations *ops = bdev->bd_disk->fops;
-       int result = -EOPNOTSUPP;
-
-       if (!ops->rw_page || bdev_get_integrity(bdev))
-               return result;
-
-       result = blk_queue_enter(bdev->bd_disk->queue, 0);
-       if (result)
-               return result;
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
-                             REQ_OP_READ);
-       blk_queue_exit(bdev->bd_disk->queue);
-       return result;
-}
-
-/**
- * bdev_write_page() - Start writing a page to a block device
- * @bdev: The device to write the page to
- * @sector: The offset on the device to write the page to (need not be aligned)
- * @page: The page to write
- * @wbc: The writeback_control for the write
- *
- * On entry, the page should be locked and not currently under writeback.
- * On exit, if the write started successfully, the page will be unlocked and
- * under writeback.  If the write failed already (eg the driver failed to
- * queue the page to the device), the page will still be locked.  If the
- * caller is a ->writepage implementation, it will need to unlock the page.
- *
- * Errors returned by this function are usually "soft", eg out of memory, or
- * queue full; callers should try a different route to write this page rather
- * than propagate an error back up the stack.
- *
- * Return: negative errno if an error occurs, 0 if submission was successful.
- */
-int bdev_write_page(struct block_device *bdev, sector_t sector,
-                       struct page *page, struct writeback_control *wbc)
-{
-       int result;
-       const struct block_device_operations *ops = bdev->bd_disk->fops;
-
-       if (!ops->rw_page || bdev_get_integrity(bdev))
-               return -EOPNOTSUPP;
-       result = blk_queue_enter(bdev->bd_disk->queue, 0);
-       if (result)
-               return result;
-
-       set_page_writeback(page);
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
-                             REQ_OP_WRITE);
-       if (result) {
-               end_page_writeback(page);
-       } else {
-               clean_page_buffers(page);
-               unlock_page(page);
-       }
-       blk_queue_exit(bdev->bd_disk->queue);
-       return result;
-}
-
-/*
- * pseudo-fs
- */
-
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
-static struct kmem_cache * bdev_cachep __read_mostly;
-
-static struct inode *bdev_alloc_inode(struct super_block *sb)
-{
-       struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
-
-       if (!ei)
-               return NULL;
-       memset(&ei->bdev, 0, sizeof(ei->bdev));
-       return &ei->vfs_inode;
-}
-
-static void bdev_free_inode(struct inode *inode)
-{
-       struct block_device *bdev = I_BDEV(inode);
-
-       free_percpu(bdev->bd_stats);
-       kfree(bdev->bd_meta_info);
-
-       if (!bdev_is_partition(bdev)) {
-               if (bdev->bd_disk && bdev->bd_disk->bdi)
-                       bdi_put(bdev->bd_disk->bdi);
-               kfree(bdev->bd_disk);
-       }
-
-       if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
-               blk_free_ext_minor(MINOR(bdev->bd_dev));
-
-       kmem_cache_free(bdev_cachep, BDEV_I(inode));
-}
-
-static void init_once(void *data)
-{
-       struct bdev_inode *ei = data;
-
-       inode_init_once(&ei->vfs_inode);
-}
-
-static void bdev_evict_inode(struct inode *inode)
-{
-       truncate_inode_pages_final(&inode->i_data);
-       invalidate_inode_buffers(inode); /* is it needed here? */
-       clear_inode(inode);
-}
-
-static const struct super_operations bdev_sops = {
-       .statfs = simple_statfs,
-       .alloc_inode = bdev_alloc_inode,
-       .free_inode = bdev_free_inode,
-       .drop_inode = generic_delete_inode,
-       .evict_inode = bdev_evict_inode,
-};
-
-static int bd_init_fs_context(struct fs_context *fc)
-{
-       struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
-       if (!ctx)
-               return -ENOMEM;
-       fc->s_iflags |= SB_I_CGROUPWB;
-       ctx->ops = &bdev_sops;
-       return 0;
-}
-
-static struct file_system_type bd_type = {
-       .name           = "bdev",
-       .init_fs_context = bd_init_fs_context,
-       .kill_sb        = kill_anon_super,
-};
-
-struct super_block *blockdev_superblock __read_mostly;
-EXPORT_SYMBOL_GPL(blockdev_superblock);
-
-void __init bdev_cache_init(void)
-{
-       int err;
-       static struct vfsmount *bd_mnt;
-
-       bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
-                       0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-                               SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
-                       init_once);
-       err = register_filesystem(&bd_type);
-       if (err)
-               panic("Cannot register bdev pseudo-fs");
-       bd_mnt = kern_mount(&bd_type);
-       if (IS_ERR(bd_mnt))
-               panic("Cannot create bdev pseudo-fs");
-       blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
-}
-
-struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
-{
-       struct block_device *bdev;
-       struct inode *inode;
-
-       inode = new_inode(blockdev_superblock);
-       if (!inode)
-               return NULL;
-       inode->i_mode = S_IFBLK;
-       inode->i_rdev = 0;
-       inode->i_data.a_ops = &def_blk_aops;
-       mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-
-       bdev = I_BDEV(inode);
-       mutex_init(&bdev->bd_fsfreeze_mutex);
-       spin_lock_init(&bdev->bd_size_lock);
-       bdev->bd_disk = disk;
-       bdev->bd_partno = partno;
-       bdev->bd_inode = inode;
-       bdev->bd_stats = alloc_percpu(struct disk_stats);
-       if (!bdev->bd_stats) {
-               iput(inode);
-               return NULL;
-       }
-       return bdev;
-}
-
-void bdev_add(struct block_device *bdev, dev_t dev)
-{
-       bdev->bd_dev = dev;
-       bdev->bd_inode->i_rdev = dev;
-       bdev->bd_inode->i_ino = dev;
-       insert_inode_hash(bdev->bd_inode);
-}
-
-long nr_blockdev_pages(void)
-{
-       struct inode *inode;
-       long ret = 0;
-
-       spin_lock(&blockdev_superblock->s_inode_list_lock);
-       list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
-               ret += inode->i_mapping->nrpages;
-       spin_unlock(&blockdev_superblock->s_inode_list_lock);
-
-       return ret;
-}
-
-/**
- * bd_may_claim - test whether a block device can be claimed
- * @bdev: block device of interest
- * @whole: whole block device containing @bdev, may equal @bdev
- * @holder: holder trying to claim @bdev
- *
- * Test whether @bdev can be claimed by @holder.
- *
- * CONTEXT:
- * spin_lock(&bdev_lock).
- *
- * RETURNS:
- * %true if @bdev can be claimed, %false otherwise.
- */
-static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
-                        void *holder)
-{
-       if (bdev->bd_holder == holder)
-               return true;     /* already a holder */
-       else if (bdev->bd_holder != NULL)
-               return false;    /* held by someone else */
-       else if (whole == bdev)
-               return true;     /* is a whole device which isn't held */
-
-       else if (whole->bd_holder == bd_may_claim)
-               return true;     /* is a partition of a device that is being partitioned */
-       else if (whole->bd_holder != NULL)
-               return false;    /* is a partition of a held device */
-       else
-               return true;     /* is a partition of an un-held device */
-}
-
-/**
- * bd_prepare_to_claim - claim a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * Claim @bdev.  This function fails if @bdev is already claimed by another
- * holder and waits if another claiming is in progress. return, the caller
- * has ownership of bd_claiming and bd_holder[s].
- *
- * RETURNS:
- * 0 if @bdev can be claimed, -EBUSY otherwise.
- */
-int bd_prepare_to_claim(struct block_device *bdev, void *holder)
-{
-       struct block_device *whole = bdev_whole(bdev);
-
-       if (WARN_ON_ONCE(!holder))
-               return -EINVAL;
-retry:
-       spin_lock(&bdev_lock);
-       /* if someone else claimed, fail */
-       if (!bd_may_claim(bdev, whole, holder)) {
-               spin_unlock(&bdev_lock);
-               return -EBUSY;
-       }
-
-       /* if claiming is already in progress, wait for it to finish */
-       if (whole->bd_claiming) {
-               wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
-               DEFINE_WAIT(wait);
-
-               prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
-               spin_unlock(&bdev_lock);
-               schedule();
-               finish_wait(wq, &wait);
-               goto retry;
-       }
-
-       /* yay, all mine */
-       whole->bd_claiming = holder;
-       spin_unlock(&bdev_lock);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
-
-static void bd_clear_claiming(struct block_device *whole, void *holder)
-{
-       lockdep_assert_held(&bdev_lock);
-       /* tell others that we're done */
-       BUG_ON(whole->bd_claiming != holder);
-       whole->bd_claiming = NULL;
-       wake_up_bit(&whole->bd_claiming, 0);
-}
-
-/**
- * bd_finish_claiming - finish claiming of a block device
- * @bdev: block device of interest
- * @holder: holder that has claimed @bdev
- *
- * Finish exclusive open of a block device. Mark the device as exclusively
- * open by the holder and wake up all waiters for exclusive open to finish.
- */
-static void bd_finish_claiming(struct block_device *bdev, void *holder)
-{
-       struct block_device *whole = bdev_whole(bdev);
-
-       spin_lock(&bdev_lock);
-       BUG_ON(!bd_may_claim(bdev, whole, holder));
-       /*
-        * Note that for a whole device bd_holders will be incremented twice,
-        * and bd_holder will be set to bd_may_claim before being set to holder
-        */
-       whole->bd_holders++;
-       whole->bd_holder = bd_may_claim;
-       bdev->bd_holders++;
-       bdev->bd_holder = holder;
-       bd_clear_claiming(whole, holder);
-       spin_unlock(&bdev_lock);
-}
-
-/**
- * bd_abort_claiming - abort claiming of a block device
- * @bdev: block device of interest
- * @holder: holder that has claimed @bdev
- *
- * Abort claiming of a block device when the exclusive open failed. This can be
- * also used when exclusive open is not actually desired and we just needed
- * to block other exclusive openers for a while.
- */
-void bd_abort_claiming(struct block_device *bdev, void *holder)
-{
-       spin_lock(&bdev_lock);
-       bd_clear_claiming(bdev_whole(bdev), holder);
-       spin_unlock(&bdev_lock);
-}
-EXPORT_SYMBOL(bd_abort_claiming);
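
Taken together, bd_prepare_to_claim(), bd_finish_claiming() and bd_abort_claiming() implement the exclusive-claim handshake that blkdev_get_by_dev() below drives. A minimal sketch of that sequence, assuming a hypothetical my_open_device() step for the actual open work (bd_finish_claiming() is static, so only in-file callers such as blkdev_get_by_dev() can complete the handshake this way):

static int my_claim_and_open(struct block_device *bdev, void *holder)
{
        int ret;

        /* reserve the claim; -EBUSY if someone else already holds it */
        ret = bd_prepare_to_claim(bdev, holder);
        if (ret)
                return ret;

        ret = my_open_device(bdev);             /* hypothetical open step */
        if (ret) {
                /* drop the pending claim and wake any other claimers */
                bd_abort_claiming(bdev, holder);
                return ret;
        }

        /* record @holder in bd_holder/bd_holders and wake waiters */
        bd_finish_claiming(bdev, holder);
        return 0;
}
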
-
-static void blkdev_flush_mapping(struct block_device *bdev)
-{
-       WARN_ON_ONCE(bdev->bd_holders);
-       sync_blockdev(bdev);
-       kill_bdev(bdev);
-       bdev_write_inode(bdev);
-}
-
-static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
-{
-       struct gendisk *disk = bdev->bd_disk;
-       int ret = 0;
-
-       if (disk->fops->open) {
-               ret = disk->fops->open(bdev, mode);
-               if (ret) {
-                       /* avoid ghost partitions on a removed medium */
-                       if (ret == -ENOMEDIUM &&
-                            test_bit(GD_NEED_PART_SCAN, &disk->state))
-                               bdev_disk_changed(disk, true);
-                       return ret;
-               }
-       }
-
-       if (!bdev->bd_openers)
-               set_init_blocksize(bdev);
-       if (test_bit(GD_NEED_PART_SCAN, &disk->state))
-               bdev_disk_changed(disk, false);
-       bdev->bd_openers++;
-       return 0;
-}
-
-static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
-{
-       if (!--bdev->bd_openers)
-               blkdev_flush_mapping(bdev);
-       if (bdev->bd_disk->fops->release)
-               bdev->bd_disk->fops->release(bdev->bd_disk, mode);
-}
-
-static int blkdev_get_part(struct block_device *part, fmode_t mode)
-{
-       struct gendisk *disk = part->bd_disk;
-       int ret;
-
-       if (part->bd_openers)
-               goto done;
-
-       ret = blkdev_get_whole(bdev_whole(part), mode);
-       if (ret)
-               return ret;
-
-       ret = -ENXIO;
-       if (!bdev_nr_sectors(part))
-               goto out_blkdev_put;
-
-       disk->open_partitions++;
-       set_init_blocksize(part);
-done:
-       part->bd_openers++;
-       return 0;
-
-out_blkdev_put:
-       blkdev_put_whole(bdev_whole(part), mode);
-       return ret;
-}
-
-static void blkdev_put_part(struct block_device *part, fmode_t mode)
-{
-       struct block_device *whole = bdev_whole(part);
-
-       if (--part->bd_openers)
-               return;
-       blkdev_flush_mapping(part);
-       whole->bd_disk->open_partitions--;
-       blkdev_put_whole(whole, mode);
-}
-
-struct block_device *blkdev_get_no_open(dev_t dev)
-{
-       struct block_device *bdev;
-       struct inode *inode;
-
-       inode = ilookup(blockdev_superblock, dev);
-       if (!inode) {
-               blk_request_module(dev);
-               inode = ilookup(blockdev_superblock, dev);
-               if (!inode)
-                       return NULL;
-       }
-
-       /* switch from the inode reference to a device model one: */
-       bdev = &BDEV_I(inode)->bdev;
-       if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
-               bdev = NULL;
-       iput(inode);
-
-       if (!bdev)
-               return NULL;
-       if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
-           !try_module_get(bdev->bd_disk->fops->owner)) {
-               put_device(&bdev->bd_device);
-               return NULL;
-       }
-
-       return bdev;
-}
-
-void blkdev_put_no_open(struct block_device *bdev)
-{
-       module_put(bdev->bd_disk->fops->owner);
-       put_device(&bdev->bd_device);
-}
-
-/**
- * blkdev_get_by_dev - open a block device by device number
- * @dev: device number of block device to open
- * @mode: FMODE_* mask
- * @holder: exclusive holder identifier
- *
- * Open the block device described by device number @dev. If @mode includes
- * %FMODE_EXCL, the block device is opened with exclusive access.  Specifying
- * %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may nest for
- * the same @holder.
- *
- * Use this interface ONLY if you really do not have anything better - i.e. when
- * you are behind a truly sucky interface and all you are given is a device
- * number.  Everything else should use blkdev_get_by_path().
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
- */
-struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
-{
-       bool unblock_events = true;
-       struct block_device *bdev;
-       struct gendisk *disk;
-       int ret;
-
-       ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
-                       MAJOR(dev), MINOR(dev),
-                       ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
-                       ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
-       if (ret)
-               return ERR_PTR(ret);
-
-       bdev = blkdev_get_no_open(dev);
-       if (!bdev)
-               return ERR_PTR(-ENXIO);
-       disk = bdev->bd_disk;
-
-       if (mode & FMODE_EXCL) {
-               ret = bd_prepare_to_claim(bdev, holder);
-               if (ret)
-                       goto put_blkdev;
-       }
-
-       disk_block_events(disk);
-
-       mutex_lock(&disk->open_mutex);
-       ret = -ENXIO;
-       if (!disk_live(disk))
-               goto abort_claiming;
-       if (bdev_is_partition(bdev))
-               ret = blkdev_get_part(bdev, mode);
-       else
-               ret = blkdev_get_whole(bdev, mode);
-       if (ret)
-               goto abort_claiming;
-       if (mode & FMODE_EXCL) {
-               bd_finish_claiming(bdev, holder);
-
-               /*
-                * Block event polling for write claims if requested.  Any write
-                * holder makes the write_holder state stick until all are
-                * released.  This is good enough and tracking individual
-                * writeable reference is too fragile given the way @mode is
-                * used in blkdev_get/put().
-                */
-               if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-                   (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
-                       bdev->bd_write_holder = true;
-                       unblock_events = false;
-               }
-       }
-       mutex_unlock(&disk->open_mutex);
-
-       if (unblock_events)
-               disk_unblock_events(disk);
-       return bdev;
-
-abort_claiming:
-       if (mode & FMODE_EXCL)
-               bd_abort_claiming(bdev, holder);
-       mutex_unlock(&disk->open_mutex);
-       disk_unblock_events(disk);
-put_blkdev:
-       blkdev_put_no_open(bdev);
-       return ERR_PTR(ret);
-}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
-/**
- * blkdev_get_by_path - open a block device by name
- * @path: path to the block device to open
- * @mode: FMODE_* mask
- * @holder: exclusive holder identifier
- *
- * Open the block device described by the device file at @path.  If @mode
- * includes %FMODE_EXCL, the block device is opened with exclusive access.
- * Specifying %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may
- * nest for the same @holder.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
- */
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
-                                       void *holder)
-{
-       struct block_device *bdev;
-       dev_t dev;
-       int error;
-
-       error = lookup_bdev(path, &dev);
-       if (error)
-               return ERR_PTR(error);
-
-       bdev = blkdev_get_by_dev(dev, mode, holder);
-       if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
-               blkdev_put(bdev, mode);
-               return ERR_PTR(-EACCES);
-       }
-
-       return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_path);
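
Callers that start from a device node path typically pair blkdev_get_by_path() with a later blkdev_put() using the same mode mask; the holder is usually a stable pointer such as a superblock or a driver-private structure. A hedged sketch along those lines (my_open_backing_dev, my_ctx and the error handling are illustrative only):

static struct block_device *my_open_backing_dev(const char *path, void *my_ctx)
{
        const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
        struct block_device *bdev;

        bdev = blkdev_get_by_path(path, mode, my_ctx);
        if (IS_ERR(bdev))
                return bdev;            /* -ENOENT, -EACCES, -EBUSY, ... */

        /* ... use the device; release with blkdev_put(bdev, mode) ... */
        return bdev;
}
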
-
-static int blkdev_open(struct inode * inode, struct file * filp)
-{
-       struct block_device *bdev;
-
-       /*
-        * Preserve backwards compatibility and allow large file access
-        * even if userspace doesn't ask for it explicitly. Some mkfs
-        * binary needs it. We might want to drop this workaround
-        * during an unstable branch.
-        */
-       filp->f_flags |= O_LARGEFILE;
-
-       filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
-
-       if (filp->f_flags & O_NDELAY)
-               filp->f_mode |= FMODE_NDELAY;
-       if (filp->f_flags & O_EXCL)
-               filp->f_mode |= FMODE_EXCL;
-       if ((filp->f_flags & O_ACCMODE) == 3)
-               filp->f_mode |= FMODE_WRITE_IOCTL;
-
-       bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
-       if (IS_ERR(bdev))
-               return PTR_ERR(bdev);
-       filp->f_mapping = bdev->bd_inode->i_mapping;
-       filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
-       return 0;
-}
-
-void blkdev_put(struct block_device *bdev, fmode_t mode)
-{
-       struct gendisk *disk = bdev->bd_disk;
-
-       /*
-        * Sync early if it looks like we're the last one.  If someone else
-        * opens the block device between now and the decrement of bd_openers
-        * then we did a sync that we didn't need to, but that's not the end
-        * of the world and we want to avoid long (could be several minute)
-        * syncs while holding the mutex.
-        */
-       if (bdev->bd_openers == 1)
-               sync_blockdev(bdev);
-
-       mutex_lock(&disk->open_mutex);
-       if (mode & FMODE_EXCL) {
-               struct block_device *whole = bdev_whole(bdev);
-               bool bdev_free;
-
-               /*
-                * Release a claim on the device.  The holder fields
-                * are protected with bdev_lock.  open_mutex is to
-                * synchronize disk_holder unlinking.
-                */
-               spin_lock(&bdev_lock);
-
-               WARN_ON_ONCE(--bdev->bd_holders < 0);
-               WARN_ON_ONCE(--whole->bd_holders < 0);
-
-               if ((bdev_free = !bdev->bd_holders))
-                       bdev->bd_holder = NULL;
-               if (!whole->bd_holders)
-                       whole->bd_holder = NULL;
-
-               spin_unlock(&bdev_lock);
-
-               /*
-                * If this was the last claim, remove holder link and
-                * unblock event polling if it was a write holder.
-                */
-               if (bdev_free && bdev->bd_write_holder) {
-                       disk_unblock_events(disk);
-                       bdev->bd_write_holder = false;
-               }
-       }
-
-       /*
-        * Trigger event checking and tell drivers to flush MEDIA_CHANGE
-        * event.  This is to ensure detection of media removal commanded
-        * from userland - e.g. eject(1).
-        */
-       disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
-
-       if (bdev_is_partition(bdev))
-               blkdev_put_part(bdev, mode);
-       else
-               blkdev_put_whole(bdev, mode);
-       mutex_unlock(&disk->open_mutex);
-
-       blkdev_put_no_open(bdev);
-}
-EXPORT_SYMBOL(blkdev_put);
-
-static int blkdev_close(struct inode * inode, struct file * filp)
-{
-       struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
-       blkdev_put(bdev, filp->f_mode);
-       return 0;
-}
-
-static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-       fmode_t mode = file->f_mode;
-
-       /*
-        * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
-        * to update it before every ioctl.
-        */
-       if (file->f_flags & O_NDELAY)
-               mode |= FMODE_NDELAY;
-       else
-               mode &= ~FMODE_NDELAY;
-
-       return blkdev_ioctl(bdev, mode, cmd, arg);
-}
-
-/*
- * Write data to the block device.  Only intended for the block device itself
- * and the raw driver which basically is a fake block device.
- *
- * Does not take i_mutex for the write and thus is not for general purpose
- * use.
- */
-static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = bdev_file_inode(file);
-       loff_t size = i_size_read(bd_inode);
-       struct blk_plug plug;
-       size_t shorted = 0;
-       ssize_t ret;
-
-       if (bdev_read_only(I_BDEV(bd_inode)))
-               return -EPERM;
-
-       if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
-               return -ETXTBSY;
-
-       if (!iov_iter_count(from))
-               return 0;
-
-       if (iocb->ki_pos >= size)
-               return -ENOSPC;
-
-       if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
-               return -EOPNOTSUPP;
-
-       size -= iocb->ki_pos;
-       if (iov_iter_count(from) > size) {
-               shorted = iov_iter_count(from) - size;
-               iov_iter_truncate(from, size);
-       }
-
-       blk_start_plug(&plug);
-       ret = __generic_file_write_iter(iocb, from);
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
-       iov_iter_reexpand(from, iov_iter_count(from) + shorted);
-       blk_finish_plug(&plug);
-       return ret;
-}
-
-static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = bdev_file_inode(file);
-       loff_t size = i_size_read(bd_inode);
-       loff_t pos = iocb->ki_pos;
-       size_t shorted = 0;
-       ssize_t ret;
-
-       if (pos >= size)
-               return 0;
-
-       size -= pos;
-       if (iov_iter_count(to) > size) {
-               shorted = iov_iter_count(to) - size;
-               iov_iter_truncate(to, size);
-       }
-
-       ret = generic_file_read_iter(iocb, to);
-       iov_iter_reexpand(to, iov_iter_count(to) + shorted);
-       return ret;
-}
-
-static int blkdev_writepages(struct address_space *mapping,
-                            struct writeback_control *wbc)
-{
-       return generic_writepages(mapping, wbc);
-}
-
-static const struct address_space_operations def_blk_aops = {
-       .set_page_dirty = __set_page_dirty_buffers,
-       .readpage       = blkdev_readpage,
-       .readahead      = blkdev_readahead,
-       .writepage      = blkdev_writepage,
-       .write_begin    = blkdev_write_begin,
-       .write_end      = blkdev_write_end,
-       .writepages     = blkdev_writepages,
-       .direct_IO      = blkdev_direct_IO,
-       .migratepage    = buffer_migrate_page_norefs,
-       .is_dirty_writeback = buffer_check_dirty_writeback,
-};
-
-#define        BLKDEV_FALLOC_FL_SUPPORTED                                      \
-               (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
-                FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
-
-static long blkdev_fallocate(struct file *file, int mode, loff_t start,
-                            loff_t len)
-{
-       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-       loff_t end = start + len - 1;
-       loff_t isize;
-       int error;
-
-       /* Fail if we don't recognize the flags. */
-       if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
-               return -EOPNOTSUPP;
-
-       /* Don't go off the end of the device. */
-       isize = i_size_read(bdev->bd_inode);
-       if (start >= isize)
-               return -EINVAL;
-       if (end >= isize) {
-               if (mode & FALLOC_FL_KEEP_SIZE) {
-                       len = isize - start;
-                       end = start + len - 1;
-               } else
-                       return -EINVAL;
-       }
-
-       /*
-        * Don't allow IO that isn't aligned to logical block size.
-        */
-       if ((start | len) & (bdev_logical_block_size(bdev) - 1))
-               return -EINVAL;
-
-       /* Invalidate the page cache, including dirty pages. */
-       error = truncate_bdev_range(bdev, file->f_mode, start, end);
-       if (error)
-               return error;
-
-       switch (mode) {
-       case FALLOC_FL_ZERO_RANGE:
-       case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
-               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                           GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
-               break;
-       case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
-               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
-               break;
-       case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
-               error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, 0);
-               break;
-       default:
-               return -EOPNOTSUPP;
-       }
-       if (error)
-               return error;
-
-       /*
-        * Invalidate the page cache again; if someone wandered in and dirtied
-        * a page, we just discard it - userspace has no way of knowing whether
-        * the write happened before or after discard completing...
-        */
-       return truncate_bdev_range(bdev, file->f_mode, start, end);
-}
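
From userspace the supported cases above are reached through fallocate(2) on an open block device; offsets and lengths must be aligned to the logical block size, as checked above. An illustrative snippet (the device path, offset and length are placeholders):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

/* zero a range on a block device; FALLOC_FL_ZERO_RANGE maps to
 * blkdev_issue_zeroout(..., BLKDEV_ZERO_NOUNMAP) in the handler above */
static int zero_block_range(const char *dev, off_t offset, off_t length)
{
        int fd = open(dev, O_RDWR);
        int ret;

        if (fd < 0)
                return -1;
        ret = fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, length);
        close(fd);
        return ret;
}
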
-
-const struct file_operations def_blk_fops = {
-       .open           = blkdev_open,
-       .release        = blkdev_close,
-       .llseek         = block_llseek,
-       .read_iter      = blkdev_read_iter,
-       .write_iter     = blkdev_write_iter,
-       .iopoll         = blkdev_iopoll,
-       .mmap           = generic_file_mmap,
-       .fsync          = blkdev_fsync,
-       .unlocked_ioctl = block_ioctl,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = compat_blkdev_ioctl,
-#endif
-       .splice_read    = generic_file_splice_read,
-       .splice_write   = iter_file_splice_write,
-       .fallocate      = blkdev_fallocate,
-};
-
-/**
- * lookup_bdev  - lookup a struct block_device by name
- * @pathname:  special file representing the block device
- * @dev:       return value of the block device's dev_t
- *
- * Get a reference to the blockdevice at @pathname in the current
- * namespace if possible and return it.  Return ERR_PTR(error)
- * otherwise.
- */
-int lookup_bdev(const char *pathname, dev_t *dev)
-{
-       struct inode *inode;
-       struct path path;
-       int error;
-
-       if (!pathname || !*pathname)
-               return -EINVAL;
-
-       error = kern_path(pathname, LOOKUP_FOLLOW, &path);
-       if (error)
-               return error;
-
-       inode = d_backing_inode(path.dentry);
-       error = -ENOTBLK;
-       if (!S_ISBLK(inode->i_mode))
-               goto out_path_put;
-       error = -EACCES;
-       if (!may_open_dev(&path))
-               goto out_path_put;
-
-       *dev = inode->i_rdev;
-       error = 0;
-out_path_put:
-       path_put(&path);
-       return error;
-}
-EXPORT_SYMBOL(lookup_bdev);
-
-int __invalidate_device(struct block_device *bdev, bool kill_dirty)
-{
-       struct super_block *sb = get_super(bdev);
-       int res = 0;
-
-       if (sb) {
-               /*
-                * no need to lock the super, get_super holds the
-                * read mutex so the filesystem cannot go away
-                * under us (->put_super runs with the write lock
-                * held).
-                */
-               shrink_dcache_sb(sb);
-               res = invalidate_inodes(sb, kill_dirty);
-               drop_super(sb);
-       }
-       invalidate_bdev(bdev);
-       return res;
-}
-EXPORT_SYMBOL(__invalidate_device);
-
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
-{
-       struct inode *inode, *old_inode = NULL;
-
-       spin_lock(&blockdev_superblock->s_inode_list_lock);
-       list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
-               struct address_space *mapping = inode->i_mapping;
-               struct block_device *bdev;
-
-               spin_lock(&inode->i_lock);
-               if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
-                   mapping->nrpages == 0) {
-                       spin_unlock(&inode->i_lock);
-                       continue;
-               }
-               __iget(inode);
-               spin_unlock(&inode->i_lock);
-               spin_unlock(&blockdev_superblock->s_inode_list_lock);
-               /*
-                * We hold a reference to 'inode' so it couldn't have been
-                * removed from s_inodes list while we dropped the
-                * s_inode_list_lock. We cannot iput the inode now as we can
-                * be holding the last reference and we cannot iput it under
-                * s_inode_list_lock. So we keep the reference and iput it
-                * later.
-                */
-               iput(old_inode);
-               old_inode = inode;
-               bdev = I_BDEV(inode);
-
-               mutex_lock(&bdev->bd_disk->open_mutex);
-               if (bdev->bd_openers)
-                       func(bdev, arg);
-               mutex_unlock(&bdev->bd_disk->open_mutex);
-
-               spin_lock(&blockdev_superblock->s_inode_list_lock);
-       }
-       spin_unlock(&blockdev_superblock->s_inode_list_lock);
-       iput(old_inode);
-}
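
iterate_bdevs() above walks every open block-device inode and invokes the callback with the disk's open_mutex held; fs/sync.c uses this pattern to flush all block devices during sync(2). A hedged sketch of a callback in that style (my_flush_one is a made-up name):

static void my_flush_one(struct block_device *bdev, void *arg)
{
        /* start writeback on this device's page cache */
        filemap_fdatawrite(bdev->bd_inode->i_mapping);
}

        /* usage, e.g. from a sync-like path: */
        iterate_bdevs(my_flush_one, NULL);
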
index 6679e07..2e6f403 100644 (file)
@@ -22,7 +22,7 @@
 #include <linux/random.h>
 #include <linux/highmem.h>
 #include <linux/fips.h>
-#include "../cifs_common/arc4.h"
+#include "../smbfs_common/arc4.h"
 #include <crypto/aead.h>
 
 int __cifs_calc_signature(struct smb_rqst *rqst,
index dc920e2..98e8e5a 100644 (file)
@@ -12,7 +12,7 @@
 
 #include <net/sock.h>
 #include <asm/unaligned.h>
-#include "smbfsctl.h"
+#include "../smbfs_common/smbfsctl.h"
 
 #define CIFS_PROT   0
 #define POSIX_PROT  (CIFS_PROT+1)
index ddc0e8f..bda606d 100644 (file)
@@ -689,13 +689,19 @@ smb2_close_cached_fid(struct kref *ref)
                cifs_dbg(FYI, "clear cached root file handle\n");
                SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
                           cfid->fid->volatile_fid);
-               cfid->is_valid = false;
-               cfid->file_all_info_is_valid = false;
-               cfid->has_lease = false;
-               if (cfid->dentry) {
-                       dput(cfid->dentry);
-                       cfid->dentry = NULL;
-               }
+       }
+
+       /*
+        * We only check validity above to send SMB2_close,
+        * but we still need to invalidate these entries
+        * when this function is called
+        */
+       cfid->is_valid = false;
+       cfid->file_all_info_is_valid = false;
+       cfid->has_lease = false;
+       if (cfid->dentry) {
+               dput(cfid->dentry);
+               cfid->dentry = NULL;
        }
 }
 
index 10047cc..4a04877 100644 (file)
@@ -24,7 +24,7 @@
 #include "cifsglob.h"
 #include "cifs_debug.h"
 #include "cifsproto.h"
-#include "../cifs_common/md4.h"
+#include "../smbfs_common/md4.h"
 
 #ifndef false
 #define false 0
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
deleted file mode 100644 (file)
index d0fc420..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1 */
-/*
- *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
- *
- *   Copyright (c) International Business Machines  Corp., 2002,2013
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- */
-
-/* IOCTL information */
-/*
- * List of ioctl/fsctl function codes that are or could be useful in the
- * future to remote clients like cifs or SMB2/SMB3 client.  This is probably
- * a slightly larger set of fsctls that NTFS local filesystem could handle,
- * including the seven below that we do not have struct definitions for.
- * Even with protocol definitions for most of these now available, we still
- * need to do some experimentation to identify which are practical to do
- * remotely.  Some of the following, such as the encryption/compression ones
- * could be invoked from tools via a specialized hook into the VFS rather
- * than via the standard vfs entry points
- *
- * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
- * below). Additional detail on less common ones can be found in MS-FSCC
- * section 2.3.
- */
-
-/*
- * FSCTL values are 32 bits and are constructed as
- * <device 16bits> <access 2bits> <function 12bits> <method 2bits>
- */
-/* Device */
-#define FSCTL_DEVICE_DFS                 (0x0006 << 16)
-#define FSCTL_DEVICE_FILE_SYSTEM         (0x0009 << 16)
-#define FSCTL_DEVICE_NAMED_PIPE          (0x0011 << 16)
-#define FSCTL_DEVICE_NETWORK_FILE_SYSTEM (0x0014 << 16)
-#define FSCTL_DEVICE_MASK                0xffff0000
-/* Access */
-#define FSCTL_DEVICE_ACCESS_FILE_ANY_ACCESS        (0x00 << 14)
-#define FSCTL_DEVICE_ACCESS_FILE_READ_ACCESS       (0x01 << 14)
-#define FSCTL_DEVICE_ACCESS_FILE_WRITE_ACCESS      (0x02 << 14)
-#define FSCTL_DEVICE_ACCESS_FILE_READ_WRITE_ACCESS (0x03 << 14)
-#define FSCTL_DEVICE_ACCESS_MASK                   0x0000c000
-/* Function */
-#define FSCTL_DEVICE_FUNCTION_MASK       0x00003ffc
-/* Method */
-#define FSCTL_DEVICE_METHOD_BUFFERED   0x00
-#define FSCTL_DEVICE_METHOD_IN_DIRECT  0x01
-#define FSCTL_DEVICE_METHOD_OUT_DIRECT 0x02
-#define FSCTL_DEVICE_METHOD_NEITHER    0x03
-#define FSCTL_DEVICE_METHOD_MASK       0x00000003
-
-
-#define FSCTL_DFS_GET_REFERRALS      0x00060194
-#define FSCTL_DFS_GET_REFERRALS_EX   0x000601B0
-#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
-#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
-#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
-#define FSCTL_LOCK_VOLUME            0x00090018
-#define FSCTL_UNLOCK_VOLUME          0x0009001C
-#define FSCTL_IS_PATHNAME_VALID      0x0009002C /* BB add struct */
-#define FSCTL_GET_COMPRESSION        0x0009003C /* BB add struct */
-#define FSCTL_SET_COMPRESSION        0x0009C040 /* BB add struct */
-#define FSCTL_QUERY_FAT_BPB          0x00090058 /* BB add struct */
-/* Verify the next FSCTL number, we had it as 0x00090090 before */
-#define FSCTL_FILESYSTEM_GET_STATS   0x00090060 /* BB add struct */
-#define FSCTL_GET_NTFS_VOLUME_DATA   0x00090064 /* BB add struct */
-#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
-#define FSCTL_IS_VOLUME_DIRTY        0x00090078 /* BB add struct */
-#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
-#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
-#define FSCTL_FIND_FILES_BY_SID      0x0009008F /* BB add struct */
-#define FSCTL_SET_OBJECT_ID          0x00090098 /* BB add struct */
-#define FSCTL_GET_OBJECT_ID          0x0009009C /* BB add struct */
-#define FSCTL_DELETE_OBJECT_ID       0x000900A0 /* BB add struct */
-#define FSCTL_SET_REPARSE_POINT      0x000900A4 /* BB add struct */
-#define FSCTL_GET_REPARSE_POINT      0x000900A8 /* BB add struct */
-#define FSCTL_DELETE_REPARSE_POINT   0x000900AC /* BB add struct */
-#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
-#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
-#define FSCTL_SET_SPARSE             0x000900C4 /* BB add struct */
-#define FSCTL_SET_ZERO_DATA          0x000980C8
-#define FSCTL_SET_ENCRYPTION         0x000900D7 /* BB add struct */
-#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB /* BB add struct */
-#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF /* BB add struct */
-#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3 /* BB add struct */
-#define FSCTL_READ_FILE_USN_DATA     0x000900EB /* BB add struct */
-#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
-#define FSCTL_SIS_COPYFILE           0x00090100 /* BB add struct */
-#define FSCTL_RECALL_FILE            0x00090117 /* BB add struct */
-#define FSCTL_QUERY_SPARING_INFO     0x00090138 /* BB add struct */
-#define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
-#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
-#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
-#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
-#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
-#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
-#define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
-#define FSCTL_FILE_LEVEL_TRIM        0x00098208 /* BB add struct */
-#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
-#define FSCTL_SIS_LINK_FILES         0x0009C104
-#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280
-#define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
-#define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
-/* strange that the number for this op is not sequential with previous op */
-#define FSCTL_PIPE_WAIT              0x00110018 /* BB add struct */
-/* Enumerate previous versions of a file */
-#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
-/* Retrieve an opaque file reference for server-side data movement ie copy */
-#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
-#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
-#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
-#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
-#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
-/* Perform server-side data movement */
-#define FSCTL_SRV_COPYCHUNK 0x001440F2
-#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
-#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
-#define FSCTL_SRV_READ_HASH          0x001441BB /* BB add struct */
-
-/* See FSCC 2.1.2.5 */
-#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
-#define IO_REPARSE_TAG_HSM           0xC0000004
-#define IO_REPARSE_TAG_SIS           0x80000007
-#define IO_REPARSE_TAG_HSM2          0x80000006
-#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005
-/* Used by the DFS filter. See MS-DFSC */
-#define IO_REPARSE_TAG_DFS           0x8000000A
-/* Used by the DFS filter See MS-DFSC */
-#define IO_REPARSE_TAG_DFSR          0x80000012
-#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B
-/* See section MS-FSCC 2.1.2.4 */
-#define IO_REPARSE_TAG_SYMLINK       0xA000000C
-#define IO_REPARSE_TAG_DEDUP         0x80000013
-#define IO_REPARSE_APPXSTREAM       0xC0000014
-/* NFS symlinks, Win 8/SMB3 and later */
-#define IO_REPARSE_TAG_NFS           0x80000014
-/*
- * AzureFileSync - see
- * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering
- */
-#define IO_REPARSE_TAG_AZ_FILE_SYNC  0x8000001e
-/* WSL reparse tags */
-#define IO_REPARSE_TAG_LX_SYMLINK    0xA000001D
-#define IO_REPARSE_TAG_AF_UNIX      0x80000023
-#define IO_REPARSE_TAG_LX_FIFO      0x80000024
-#define IO_REPARSE_TAG_LX_CHR       0x80000025
-#define IO_REPARSE_TAG_LX_BLK       0x80000026
-
-/* fsctl flags */
-/* If Flags is set to this value, the request is an FSCTL not ioctl request */
-#define SMB2_0_IOCTL_IS_FSCTL          0x00000001
-
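
The header removed above encodes every FSCTL as <device 16 bits><access 2 bits><function 12 bits><method 2 bits>. As a worked example of that layout, using only the masks defined in the same header (decomposition only, nothing new is defined):

/*
 * FSCTL_VALIDATE_NEGOTIATE_INFO == 0x00140204
 *   device   : 0x00140204 & FSCTL_DEVICE_MASK          == FSCTL_DEVICE_NETWORK_FILE_SYSTEM
 *   access   : 0x00140204 & FSCTL_DEVICE_ACCESS_MASK   == FSCTL_DEVICE_ACCESS_FILE_ANY_ACCESS
 *   function : (0x00140204 & FSCTL_DEVICE_FUNCTION_MASK) >> 2 == 0x081
 *   method   : 0x00140204 & FSCTL_DEVICE_METHOD_MASK   == FSCTL_DEVICE_METHOD_BUFFERED
 */
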
diff --git a/fs/cifs_common/Makefile b/fs/cifs_common/Makefile
deleted file mode 100644 (file)
index 6fedd2f..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for Linux filesystem routines that are shared by client and server.
-#
-
-obj-$(CONFIG_CIFS_COMMON) += cifs_arc4.o
-obj-$(CONFIG_CIFS_COMMON) += cifs_md4.o
diff --git a/fs/cifs_common/arc4.h b/fs/cifs_common/arc4.h
deleted file mode 100644 (file)
index 12e71ec..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Common values for ARC4 Cipher Algorithm
- */
-
-#ifndef _CRYPTO_ARC4_H
-#define _CRYPTO_ARC4_H
-
-#include <linux/types.h>
-
-#define ARC4_MIN_KEY_SIZE      1
-#define ARC4_MAX_KEY_SIZE      256
-#define ARC4_BLOCK_SIZE                1
-
-struct arc4_ctx {
-       u32 S[256];
-       u32 x, y;
-};
-
-int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len);
-void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len);
-
-#endif /* _CRYPTO_ARC4_H */
diff --git a/fs/cifs_common/cifs_arc4.c b/fs/cifs_common/cifs_arc4.c
deleted file mode 100644 (file)
index b964cc6..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API
- *
- * ARC4 Cipher Algorithm
- *
- * Jon Oberheide <jon@oberheide.org>
- */
-
-#include <linux/module.h>
-#include "arc4.h"
-
-MODULE_LICENSE("GPL");
-
-int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
-{
-       int i, j = 0, k = 0;
-
-       ctx->x = 1;
-       ctx->y = 0;
-
-       for (i = 0; i < 256; i++)
-               ctx->S[i] = i;
-
-       for (i = 0; i < 256; i++) {
-               u32 a = ctx->S[i];
-
-               j = (j + in_key[k] + a) & 0xff;
-               ctx->S[i] = ctx->S[j];
-               ctx->S[j] = a;
-               if (++k >= key_len)
-                       k = 0;
-       }
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(cifs_arc4_setkey);
-
-void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
-{
-       u32 *const S = ctx->S;
-       u32 x, y, a, b;
-       u32 ty, ta, tb;
-
-       if (len == 0)
-               return;
-
-       x = ctx->x;
-       y = ctx->y;
-
-       a = S[x];
-       y = (y + a) & 0xff;
-       b = S[y];
-
-       do {
-               S[y] = a;
-               a = (a + b) & 0xff;
-               S[x] = b;
-               x = (x + 1) & 0xff;
-               ta = S[x];
-               ty = (y + ta) & 0xff;
-               tb = S[ty];
-               *out++ = *in++ ^ S[a];
-               if (--len == 0)
-                       break;
-               y = ty;
-               a = ta;
-               b = tb;
-       } while (true);
-
-       ctx->x = x;
-       ctx->y = y;
-}
-EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
-
-static int __init
-init_cifs_common(void)
-{
-       return 0;
-}
-static void __init
-exit_cifs_common(void)
-{
-}
-
-module_init(init_cifs_common)
-module_exit(exit_cifs_common)
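
Although this copy is deleted, the cifs_arc4_setkey()/cifs_arc4_crypt() pair lives on under fs/smbfs_common/ (see the include changes earlier in the series). A hedged usage sketch, with the key and buffers as placeholders:

#include <linux/string.h>
#include "../smbfs_common/arc4.h"

static void my_arc4_example(const u8 *key, unsigned int key_len,
                            const u8 *in, u8 *out, unsigned int len)
{
        struct arc4_ctx ctx;

        cifs_arc4_setkey(&ctx, key, key_len);   /* schedule the key */
        cifs_arc4_crypt(&ctx, out, in, len);    /* same call encrypts and decrypts */
        memzero_explicit(&ctx, sizeof(ctx));    /* don't leave key state behind */
}
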
diff --git a/fs/cifs_common/cifs_md4.c b/fs/cifs_common/cifs_md4.c
deleted file mode 100644 (file)
index 50f78cf..0000000
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Cryptographic API.
- *
- * MD4 Message Digest Algorithm (RFC1320).
- *
- * Implementation derived from Andrew Tridgell and Steve French's
- * CIFS MD4 implementation, and the cryptoapi implementation
- * originally based on the public domain implementation written
- * by Colin Plumb in 1993.
- *
- * Copyright (c) Andrew Tridgell 1997-1998.
- * Modified by Steve French (sfrench@us.ibm.com) 2002
- * Copyright (c) Cryptoapi developers.
- * Copyright (c) 2002 David S. Miller (davem@redhat.com)
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- *
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <asm/byteorder.h>
-#include "md4.h"
-
-MODULE_LICENSE("GPL");
-
-static inline u32 lshift(u32 x, unsigned int s)
-{
-       x &= 0xFFFFFFFF;
-       return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
-}
-
-static inline u32 F(u32 x, u32 y, u32 z)
-{
-       return (x & y) | ((~x) & z);
-}
-
-static inline u32 G(u32 x, u32 y, u32 z)
-{
-       return (x & y) | (x & z) | (y & z);
-}
-
-static inline u32 H(u32 x, u32 y, u32 z)
-{
-       return x ^ y ^ z;
-}
-
-#define ROUND1(a,b,c,d,k,s) (a = lshift(a + F(b,c,d) + k, s))
-#define ROUND2(a,b,c,d,k,s) (a = lshift(a + G(b,c,d) + k + (u32)0x5A827999,s))
-#define ROUND3(a,b,c,d,k,s) (a = lshift(a + H(b,c,d) + k + (u32)0x6ED9EBA1,s))
-
-static void md4_transform(u32 *hash, u32 const *in)
-{
-       u32 a, b, c, d;
-
-       a = hash[0];
-       b = hash[1];
-       c = hash[2];
-       d = hash[3];
-
-       ROUND1(a, b, c, d, in[0], 3);
-       ROUND1(d, a, b, c, in[1], 7);
-       ROUND1(c, d, a, b, in[2], 11);
-       ROUND1(b, c, d, a, in[3], 19);
-       ROUND1(a, b, c, d, in[4], 3);
-       ROUND1(d, a, b, c, in[5], 7);
-       ROUND1(c, d, a, b, in[6], 11);
-       ROUND1(b, c, d, a, in[7], 19);
-       ROUND1(a, b, c, d, in[8], 3);
-       ROUND1(d, a, b, c, in[9], 7);
-       ROUND1(c, d, a, b, in[10], 11);
-       ROUND1(b, c, d, a, in[11], 19);
-       ROUND1(a, b, c, d, in[12], 3);
-       ROUND1(d, a, b, c, in[13], 7);
-       ROUND1(c, d, a, b, in[14], 11);
-       ROUND1(b, c, d, a, in[15], 19);
-
-       ROUND2(a, b, c, d, in[0], 3);
-       ROUND2(d, a, b, c, in[4], 5);
-       ROUND2(c, d, a, b, in[8], 9);
-       ROUND2(b, c, d, a, in[12], 13);
-       ROUND2(a, b, c, d, in[1], 3);
-       ROUND2(d, a, b, c, in[5], 5);
-       ROUND2(c, d, a, b, in[9], 9);
-       ROUND2(b, c, d, a, in[13], 13);
-       ROUND2(a, b, c, d, in[2], 3);
-       ROUND2(d, a, b, c, in[6], 5);
-       ROUND2(c, d, a, b, in[10], 9);
-       ROUND2(b, c, d, a, in[14], 13);
-       ROUND2(a, b, c, d, in[3], 3);
-       ROUND2(d, a, b, c, in[7], 5);
-       ROUND2(c, d, a, b, in[11], 9);
-       ROUND2(b, c, d, a, in[15], 13);
-
-       ROUND3(a, b, c, d, in[0], 3);
-       ROUND3(d, a, b, c, in[8], 9);
-       ROUND3(c, d, a, b, in[4], 11);
-       ROUND3(b, c, d, a, in[12], 15);
-       ROUND3(a, b, c, d, in[2], 3);
-       ROUND3(d, a, b, c, in[10], 9);
-       ROUND3(c, d, a, b, in[6], 11);
-       ROUND3(b, c, d, a, in[14], 15);
-       ROUND3(a, b, c, d, in[1], 3);
-       ROUND3(d, a, b, c, in[9], 9);
-       ROUND3(c, d, a, b, in[5], 11);
-       ROUND3(b, c, d, a, in[13], 15);
-       ROUND3(a, b, c, d, in[3], 3);
-       ROUND3(d, a, b, c, in[11], 9);
-       ROUND3(c, d, a, b, in[7], 11);
-       ROUND3(b, c, d, a, in[15], 15);
-
-       hash[0] += a;
-       hash[1] += b;
-       hash[2] += c;
-       hash[3] += d;
-}
-
-static inline void md4_transform_helper(struct md4_ctx *ctx)
-{
-       le32_to_cpu_array(ctx->block, ARRAY_SIZE(ctx->block));
-       md4_transform(ctx->hash, ctx->block);
-}
-
-int cifs_md4_init(struct md4_ctx *mctx)
-{
-       memset(mctx, 0, sizeof(struct md4_ctx));
-       mctx->hash[0] = 0x67452301;
-       mctx->hash[1] = 0xefcdab89;
-       mctx->hash[2] = 0x98badcfe;
-       mctx->hash[3] = 0x10325476;
-       mctx->byte_count = 0;
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(cifs_md4_init);
-
-int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len)
-{
-       const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
-
-       mctx->byte_count += len;
-
-       if (avail > len) {
-               memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
-                      data, len);
-               return 0;
-       }
-
-       memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
-              data, avail);
-
-       md4_transform_helper(mctx);
-       data += avail;
-       len -= avail;
-
-       while (len >= sizeof(mctx->block)) {
-               memcpy(mctx->block, data, sizeof(mctx->block));
-               md4_transform_helper(mctx);
-               data += sizeof(mctx->block);
-               len -= sizeof(mctx->block);
-       }
-
-       memcpy(mctx->block, data, len);
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(cifs_md4_update);
-
-int cifs_md4_final(struct md4_ctx *mctx, u8 *out)
-{
-       const unsigned int offset = mctx->byte_count & 0x3f;
-       char *p = (char *)mctx->block + offset;
-       int padding = 56 - (offset + 1);
-
-       *p++ = 0x80;
-       if (padding < 0) {
-               memset(p, 0x00, padding + sizeof(u64));
-               md4_transform_helper(mctx);
-               p = (char *)mctx->block;
-               padding = 56;
-       }
-
-       memset(p, 0, padding);
-       mctx->block[14] = mctx->byte_count << 3;
-       mctx->block[15] = mctx->byte_count >> 29;
-       le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
-                         sizeof(u64)) / sizeof(u32));
-       md4_transform(mctx->hash, mctx->block);
-       cpu_to_le32_array(mctx->hash, ARRAY_SIZE(mctx->hash));
-       memcpy(out, mctx->hash, sizeof(mctx->hash));
-       memset(mctx, 0, sizeof(*mctx));
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(cifs_md4_final);
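
The MD4 helpers follow the usual init/update/final shape. A minimal sketch of producing a 16-byte digest (everything other than the cifs_md4_* calls is illustrative):

static void my_md4_digest(const u8 *data, unsigned int len,
                          u8 digest[MD4_DIGEST_SIZE])
{
        struct md4_ctx ctx;

        cifs_md4_init(&ctx);
        cifs_md4_update(&ctx, data, len);
        cifs_md4_final(&ctx, digest);   /* also wipes the context */
}
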
diff --git a/fs/cifs_common/md4.h b/fs/cifs_common/md4.h
deleted file mode 100644 (file)
index 5337bec..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Common values for the MD4 Message Digest Algorithm
- */
-
-#ifndef _CIFS_MD4_H
-#define _CIFS_MD4_H
-
-#include <linux/types.h>
-
-#define MD4_DIGEST_SIZE                16
-#define MD4_HMAC_BLOCK_SIZE    64
-#define MD4_BLOCK_WORDS                16
-#define MD4_HASH_WORDS         4
-
-struct md4_ctx {
-       u32 hash[MD4_HASH_WORDS];
-       u32 block[MD4_BLOCK_WORDS];
-       u64 byte_count;
-};
-
-
-int cifs_md4_init(struct md4_ctx *mctx);
-int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len);
-int cifs_md4_final(struct md4_ctx *mctx, u8 *out);
-
-#endif /* _CIFS_MD4_H */
index d8afa82..8627dac 100644 (file)
--- a/fs/file.c
+++ b/fs/file.c
@@ -1150,6 +1150,12 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
        return new_fd;
 }
 
+int receive_fd(struct file *file, unsigned int o_flags)
+{
+       return __receive_fd(file, NULL, o_flags);
+}
+EXPORT_SYMBOL_GPL(receive_fd);
+
 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
        int err = -EBADF;
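
receive_fd() is a thin wrapper around __receive_fd() without a user pointer: it allocates a descriptor in the current task, installs the file there and returns the new fd; the export makes it callable from modules. A hedged sketch of a caller (my_install_file is a made-up name and O_CLOEXEC just an example flag):

static int my_install_file(struct file *file)
{
        int fd = receive_fd(file, O_CLOEXEC);

        if (fd < 0)
                return fd;      /* e.g. -EMFILE or a security_file_receive() denial */

        /* fd now refers to the file in the caller's descriptor table */
        return fd;
}
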
index 980d44f..3df07c0 100644 (file)
@@ -165,7 +165,6 @@ int fs_lookup_param(struct fs_context *fc,
                return invalf(fc, "%s: not usable as path", param->key);
        }
 
-       f->refcnt++; /* filename_lookup() drops our ref. */
        ret = filename_lookup(param->dirfd, f, flags, _path, NULL);
        if (ret < 0) {
                errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name);
index 68a2ae0..3cd065c 100644 (file)
@@ -18,7 +18,7 @@ struct user_namespace;
 struct pipe_inode_info;
 
 /*
- * block_dev.c
+ * block/bdev.c
  */
 #ifdef CONFIG_BLOCK
 extern void __init bdev_cache_init(void);
index d80e4a7..6c55362 100644 (file)
@@ -709,6 +709,7 @@ static void create_worker_cont(struct callback_head *cb)
                }
                raw_spin_unlock(&wqe->lock);
                io_worker_ref_put(wqe->wq);
+               kfree(worker);
                return;
        }
 
@@ -725,6 +726,7 @@ static void io_workqueue_create(struct work_struct *work)
        if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
                clear_bit_unlock(0, &worker->create_state);
                io_worker_release(worker);
+               kfree(worker);
        }
 }
 
@@ -759,6 +761,7 @@ fail:
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wqe, worker, tsk);
        } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+               kfree(worker);
                goto fail;
        } else {
                INIT_WORK(&worker->work, io_workqueue_create);
@@ -832,6 +835,11 @@ append:
        wq_list_add_after(&work->list, &tail->list, &acct->work_list);
 }
 
+static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
+{
+       return work == data;
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -844,7 +852,6 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
         */
        if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
            (work->flags & IO_WQ_WORK_CANCEL)) {
-run_cancel:
                io_run_cancel(work, wqe);
                return;
        }
@@ -864,15 +871,22 @@ run_cancel:
                bool did_create;
 
                did_create = io_wqe_create_worker(wqe, acct);
-               if (unlikely(!did_create)) {
-                       raw_spin_lock(&wqe->lock);
-                       /* fatal condition, failed to create the first worker */
-                       if (!acct->nr_workers) {
-                               raw_spin_unlock(&wqe->lock);
-                               goto run_cancel;
-                       }
-                       raw_spin_unlock(&wqe->lock);
+               if (likely(did_create))
+                       return;
+
+               raw_spin_lock(&wqe->lock);
+               /* fatal condition, failed to create the first worker */
+               if (!acct->nr_workers) {
+                       struct io_cb_cancel_data match = {
+                               .fn             = io_wq_work_match_item,
+                               .data           = work,
+                               .cancel_all     = false,
+                       };
+
+                       if (io_acct_cancel_pending_work(wqe, acct, &match))
+                               raw_spin_lock(&wqe->lock);
                }
+               raw_spin_unlock(&wqe->lock);
        }
 }
 
@@ -1122,7 +1136,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data)
 {
        struct io_worker *worker;
 
-       if (cb->func != create_worker_cb || cb->func != create_worker_cont)
+       if (cb->func != create_worker_cb && cb->func != create_worker_cont)
                return false;
        worker = container_of(cb, struct io_worker, create_work);
        return worker->wqe->wq == data;
@@ -1143,9 +1157,14 @@ static void io_wq_exit_workers(struct io_wq *wq)
 
        while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
                struct io_worker *worker;
+               struct io_wqe_acct *acct;
 
                worker = container_of(cb, struct io_worker, create_work);
-               atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
+               acct = io_wqe_get_acct(worker);
+               atomic_dec(&acct->nr_running);
+               raw_spin_lock(&worker->wqe->lock);
+               acct->nr_workers--;
+               raw_spin_unlock(&worker->wqe->lock);
                io_worker_ref_put(wq);
                clear_bit_unlock(0, &worker->create_state);
                io_worker_release(worker);
index 855ea54..16fb743 100644 (file)
@@ -1482,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
        struct io_timeout_data *io = req->async_data;
 
        if (hrtimer_try_to_cancel(&io->timer) != -1) {
+               if (status)
+                       req_set_fail(req);
                atomic_set(&req->ctx->cq_timeouts,
                        atomic_read(&req->ctx->cq_timeouts) + 1);
                list_del_init(&req->timeout.list);
@@ -1619,8 +1621,11 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 {
+       /* see waitqueue_active() comment */
+       smp_mb();
+
        if (ctx->flags & IORING_SETUP_SQPOLL) {
-               if (wq_has_sleeper(&ctx->cq_wait))
+               if (waitqueue_active(&ctx->cq_wait))
                        wake_up_all(&ctx->cq_wait);
        }
        if (io_should_trigger_evfd(ctx))
@@ -10550,7 +10555,14 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
+                       /*
+                        * Observe the correct sqd->lock -> ctx->uring_lock
+                        * ordering. Fine to drop uring_lock here, we hold
+                        * a ref to the ctx.
+                        */
+                       mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
+                       mutex_lock(&ctx->uring_lock);
                        tctx = sqd->thread->io_uring;
                }
        } else {
@@ -10853,7 +10865,7 @@ static int __init io_uring_init(void)
        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
 
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
-       BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
+       BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
 
        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT);
index 95a881e..1946d96 100644 (file)
@@ -255,7 +255,7 @@ getname_kernel(const char * filename)
 
 void putname(struct filename *name)
 {
-       if (IS_ERR_OR_NULL(name))
+       if (IS_ERR(name))
                return;
 
        BUG_ON(name->refcnt <= 0);
@@ -2467,7 +2467,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
        return err;
 }
 
-static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root)
 {
        int retval;
@@ -2488,15 +2488,6 @@ static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
        return retval;
 }
 
-int filename_lookup(int dfd, struct filename *name, unsigned flags,
-                   struct path *path, struct path *root)
-{
-       int retval = __filename_lookup(dfd, name, flags, path, root);
-
-       putname(name);
-       return retval;
-}
-
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int path_parentat(struct nameidata *nd, unsigned flags,
                                struct path *parent)
@@ -2514,9 +2505,10 @@ static int path_parentat(struct nameidata *nd, unsigned flags,
        return err;
 }
 
-static int __filename_parentat(int dfd, struct filename *name,
-                               unsigned int flags, struct path *parent,
-                               struct qstr *last, int *type)
+/* Note: this does not consume "name" */
+static int filename_parentat(int dfd, struct filename *name,
+                            unsigned int flags, struct path *parent,
+                            struct qstr *last, int *type)
 {
        int retval;
        struct nameidata nd;
@@ -2538,25 +2530,14 @@ static int __filename_parentat(int dfd, struct filename *name,
        return retval;
 }
 
-static int filename_parentat(int dfd, struct filename *name,
-                               unsigned int flags, struct path *parent,
-                               struct qstr *last, int *type)
-{
-       int retval = __filename_parentat(dfd, name, flags, parent, last, type);
-
-       putname(name);
-       return retval;
-}
-
 /* does lookup, returns the object with parent locked */
-struct dentry *kern_path_locked(const char *name, struct path *path)
+static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
 {
        struct dentry *d;
        struct qstr last;
        int type, error;
 
-       error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
-                                   &last, &type);
+       error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
        if (error)
                return ERR_PTR(error);
        if (unlikely(type != LAST_NORM)) {
@@ -2572,10 +2553,23 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
        return d;
 }
 
+struct dentry *kern_path_locked(const char *name, struct path *path)
+{
+       struct filename *filename = getname_kernel(name);
+       struct dentry *res = __kern_path_locked(filename, path);
+
+       putname(filename);
+       return res;
+}
+
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
-       return filename_lookup(AT_FDCWD, getname_kernel(name),
-                              flags, path, NULL);
+       struct filename *filename = getname_kernel(name);
+       int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
+
+       putname(filename);
+       return ret;
+
 }
 EXPORT_SYMBOL(kern_path);
 
@@ -2591,10 +2585,15 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
 {
+       struct filename *filename;
        struct path root = {.mnt = mnt, .dentry = dentry};
+       int ret;
+
+       filename = getname_kernel(name);
        /* the first argument of filename_lookup() is ignored with root */
-       return filename_lookup(AT_FDCWD, getname_kernel(name),
-                              flags , path, &root);
+       ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
+       putname(filename);
+       return ret;
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
@@ -2798,8 +2797,11 @@ int path_pts(struct path *path)
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
 {
-       return filename_lookup(dfd, getname_flags(name, flags, empty),
-                              flags, path, NULL);
+       struct filename *filename = getname_flags(name, flags, empty);
+       int ret = filename_lookup(dfd, filename, flags, path, NULL);
+
+       putname(filename);
+       return ret;
 }
 EXPORT_SYMBOL(user_path_at_empty);
 
@@ -3618,8 +3620,8 @@ struct file *do_file_open_root(const struct path *root,
        return file;
 }
 
-static struct dentry *__filename_create(int dfd, struct filename *name,
-                               struct path *path, unsigned int lookup_flags)
+static struct dentry *filename_create(int dfd, struct filename *name,
+                                     struct path *path, unsigned int lookup_flags)
 {
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
@@ -3634,7 +3636,7 @@ static struct dentry *__filename_create(int dfd, struct filename *name,
         */
        lookup_flags &= LOOKUP_REVAL;
 
-       error = __filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+       error = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
        if (error)
                return ERR_PTR(error);
 
@@ -3687,21 +3689,15 @@ out:
        return dentry;
 }
 
-static inline struct dentry *filename_create(int dfd, struct filename *name,
+struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
 {
-       struct dentry *res = __filename_create(dfd, name, path, lookup_flags);
+       struct filename *filename = getname_kernel(pathname);
+       struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
 
-       putname(name);
+       putname(filename);
        return res;
 }
-
-struct dentry *kern_path_create(int dfd, const char *pathname,
-                               struct path *path, unsigned int lookup_flags)
-{
-       return filename_create(dfd, getname_kernel(pathname),
-                               path, lookup_flags);
-}
 EXPORT_SYMBOL(kern_path_create);
 
 void done_path_create(struct path *path, struct dentry *dentry)
@@ -3716,7 +3712,11 @@ EXPORT_SYMBOL(done_path_create);
 inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
 {
-       return filename_create(dfd, getname(pathname), path, lookup_flags);
+       struct filename *filename = getname(pathname);
+       struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
+
+       putname(filename);
+       return res;
 }
 EXPORT_SYMBOL(user_path_create);
 
@@ -3797,7 +3797,7 @@ static int do_mknodat(int dfd, struct filename *name, umode_t mode,
        if (error)
                goto out1;
 retry:
-       dentry = __filename_create(dfd, name, &path, lookup_flags);
+       dentry = filename_create(dfd, name, &path, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out1;
@@ -3897,7 +3897,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode)
        unsigned int lookup_flags = LOOKUP_DIRECTORY;
 
 retry:
-       dentry = __filename_create(dfd, name, &path, lookup_flags);
+       dentry = filename_create(dfd, name, &path, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out_putname;
@@ -3996,7 +3996,7 @@ int do_rmdir(int dfd, struct filename *name)
        int type;
        unsigned int lookup_flags = 0;
 retry:
-       error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+       error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;
 
@@ -4137,7 +4137,7 @@ int do_unlinkat(int dfd, struct filename *name)
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
 retry:
-       error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+       error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;
 
@@ -4266,7 +4266,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
                goto out_putnames;
        }
 retry:
-       dentry = __filename_create(newdfd, to, &path, lookup_flags);
+       dentry = filename_create(newdfd, to, &path, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out_putnames;
@@ -4426,11 +4426,11 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
 retry:
-       error = __filename_lookup(olddfd, old, how, &old_path, NULL);
+       error = filename_lookup(olddfd, old, how, &old_path, NULL);
        if (error)
                goto out_putnames;
 
-       new_dentry = __filename_create(newdfd, new, &new_path,
+       new_dentry = filename_create(newdfd, new, &new_path,
                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
@@ -4689,13 +4689,13 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
                target_flags = 0;
 
 retry:
-       error = __filename_parentat(olddfd, from, lookup_flags, &old_path,
-                                       &old_last, &old_type);
+       error = filename_parentat(olddfd, from, lookup_flags, &old_path,
+                                 &old_last, &old_type);
        if (error)
                goto put_names;
 
-       error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
-                               &new_type);
+       error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
+                                 &new_type);
        if (error)
                goto exit1;
 
index 95006d1..fa1d991 100644 (file)
@@ -531,6 +531,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
                /* Someone else created list structure for us */
                if (inode)
                        fsnotify_put_inode_ref(inode);
+               fsnotify_put_sb_connectors(conn);
                kmem_cache_free(fsnotify_mark_connector_cachep, conn);
        }
 
diff --git a/fs/smbfs_common/Makefile b/fs/smbfs_common/Makefile
new file mode 100644 (file)
index 0000000..cafc61a
--- /dev/null
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for Linux filesystem routines that are shared by client and server.
+#
+
+obj-$(CONFIG_SMBFS_COMMON) += cifs_arc4.o
+obj-$(CONFIG_SMBFS_COMMON) += cifs_md4.o
diff --git a/fs/smbfs_common/arc4.h b/fs/smbfs_common/arc4.h
new file mode 100644 (file)
index 0000000..12e71ec
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Common values for ARC4 Cipher Algorithm
+ */
+
+#ifndef _CRYPTO_ARC4_H
+#define _CRYPTO_ARC4_H
+
+#include <linux/types.h>
+
+#define ARC4_MIN_KEY_SIZE      1
+#define ARC4_MAX_KEY_SIZE      256
+#define ARC4_BLOCK_SIZE                1
+
+struct arc4_ctx {
+       u32 S[256];
+       u32 x, y;
+};
+
+int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len);
+void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len);
+
+#endif /* _CRYPTO_ARC4_H */
diff --git a/fs/smbfs_common/cifs_arc4.c b/fs/smbfs_common/cifs_arc4.c
new file mode 100644 (file)
index 0000000..85ba15a
--- /dev/null
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Cryptographic API
+ *
+ * ARC4 Cipher Algorithm
+ *
+ * Jon Oberheide <jon@oberheide.org>
+ */
+
+#include <linux/module.h>
+#include "arc4.h"
+
+MODULE_LICENSE("GPL");
+
+int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
+{
+       int i, j = 0, k = 0;
+
+       ctx->x = 1;
+       ctx->y = 0;
+
+       for (i = 0; i < 256; i++)
+               ctx->S[i] = i;
+
+       for (i = 0; i < 256; i++) {
+               u32 a = ctx->S[i];
+
+               j = (j + in_key[k] + a) & 0xff;
+               ctx->S[i] = ctx->S[j];
+               ctx->S[j] = a;
+               if (++k >= key_len)
+                       k = 0;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_arc4_setkey);
+
+void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
+{
+       u32 *const S = ctx->S;
+       u32 x, y, a, b;
+       u32 ty, ta, tb;
+
+       if (len == 0)
+               return;
+
+       x = ctx->x;
+       y = ctx->y;
+
+       a = S[x];
+       y = (y + a) & 0xff;
+       b = S[y];
+
+       do {
+               S[y] = a;
+               a = (a + b) & 0xff;
+               S[x] = b;
+               x = (x + 1) & 0xff;
+               ta = S[x];
+               ty = (y + ta) & 0xff;
+               tb = S[ty];
+               *out++ = *in++ ^ S[a];
+               if (--len == 0)
+                       break;
+               y = ty;
+               a = ta;
+               b = tb;
+       } while (true);
+
+       ctx->x = x;
+       ctx->y = y;
+}
+EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
+
+static int __init
+init_smbfs_common(void)
+{
+       return 0;
+}
+static void __init
+exit_smbfs_common(void)
+{
+}
+
+module_init(init_smbfs_common)
+module_exit(exit_smbfs_common)
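A hedged sketch of how a caller might drive the two exported helpers above; the wrapper name and buffers are made up for illustration, and because RC4 is a stream cipher the same call both encrypts and decrypts:

	#include <linux/string.h>
	#include "arc4.h"

	static void example_arc4(const u8 *key, unsigned int key_len,
				 u8 *buf, unsigned int len)
	{
		struct arc4_ctx ctx;

		cifs_arc4_setkey(&ctx, key, key_len);	/* key schedule */
		cifs_arc4_crypt(&ctx, buf, buf, len);	/* in-place encrypt/decrypt */
		memzero_explicit(&ctx, sizeof(ctx));	/* wipe key material */
	}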
diff --git a/fs/smbfs_common/cifs_md4.c b/fs/smbfs_common/cifs_md4.c
new file mode 100644 (file)
index 0000000..50f78cf
--- /dev/null
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cryptographic API.
+ *
+ * MD4 Message Digest Algorithm (RFC1320).
+ *
+ * Implementation derived from Andrew Tridgell and Steve French's
+ * CIFS MD4 implementation, and the cryptoapi implementation
+ * originally based on the public domain implementation written
+ * by Colin Plumb in 1993.
+ *
+ * Copyright (c) Andrew Tridgell 1997-1998.
+ * Modified by Steve French (sfrench@us.ibm.com) 2002
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include "md4.h"
+
+MODULE_LICENSE("GPL");
+
+static inline u32 lshift(u32 x, unsigned int s)
+{
+       x &= 0xFFFFFFFF;
+       return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
+}
+
+static inline u32 F(u32 x, u32 y, u32 z)
+{
+       return (x & y) | ((~x) & z);
+}
+
+static inline u32 G(u32 x, u32 y, u32 z)
+{
+       return (x & y) | (x & z) | (y & z);
+}
+
+static inline u32 H(u32 x, u32 y, u32 z)
+{
+       return x ^ y ^ z;
+}
+
+#define ROUND1(a,b,c,d,k,s) (a = lshift(a + F(b,c,d) + k, s))
+#define ROUND2(a,b,c,d,k,s) (a = lshift(a + G(b,c,d) + k + (u32)0x5A827999,s))
+#define ROUND3(a,b,c,d,k,s) (a = lshift(a + H(b,c,d) + k + (u32)0x6ED9EBA1,s))
+
+static void md4_transform(u32 *hash, u32 const *in)
+{
+       u32 a, b, c, d;
+
+       a = hash[0];
+       b = hash[1];
+       c = hash[2];
+       d = hash[3];
+
+       ROUND1(a, b, c, d, in[0], 3);
+       ROUND1(d, a, b, c, in[1], 7);
+       ROUND1(c, d, a, b, in[2], 11);
+       ROUND1(b, c, d, a, in[3], 19);
+       ROUND1(a, b, c, d, in[4], 3);
+       ROUND1(d, a, b, c, in[5], 7);
+       ROUND1(c, d, a, b, in[6], 11);
+       ROUND1(b, c, d, a, in[7], 19);
+       ROUND1(a, b, c, d, in[8], 3);
+       ROUND1(d, a, b, c, in[9], 7);
+       ROUND1(c, d, a, b, in[10], 11);
+       ROUND1(b, c, d, a, in[11], 19);
+       ROUND1(a, b, c, d, in[12], 3);
+       ROUND1(d, a, b, c, in[13], 7);
+       ROUND1(c, d, a, b, in[14], 11);
+       ROUND1(b, c, d, a, in[15], 19);
+
+       ROUND2(a, b, c, d, in[0], 3);
+       ROUND2(d, a, b, c, in[4], 5);
+       ROUND2(c, d, a, b, in[8], 9);
+       ROUND2(b, c, d, a, in[12], 13);
+       ROUND2(a, b, c, d, in[1], 3);
+       ROUND2(d, a, b, c, in[5], 5);
+       ROUND2(c, d, a, b, in[9], 9);
+       ROUND2(b, c, d, a, in[13], 13);
+       ROUND2(a, b, c, d, in[2], 3);
+       ROUND2(d, a, b, c, in[6], 5);
+       ROUND2(c, d, a, b, in[10], 9);
+       ROUND2(b, c, d, a, in[14], 13);
+       ROUND2(a, b, c, d, in[3], 3);
+       ROUND2(d, a, b, c, in[7], 5);
+       ROUND2(c, d, a, b, in[11], 9);
+       ROUND2(b, c, d, a, in[15], 13);
+
+       ROUND3(a, b, c, d, in[0], 3);
+       ROUND3(d, a, b, c, in[8], 9);
+       ROUND3(c, d, a, b, in[4], 11);
+       ROUND3(b, c, d, a, in[12], 15);
+       ROUND3(a, b, c, d, in[2], 3);
+       ROUND3(d, a, b, c, in[10], 9);
+       ROUND3(c, d, a, b, in[6], 11);
+       ROUND3(b, c, d, a, in[14], 15);
+       ROUND3(a, b, c, d, in[1], 3);
+       ROUND3(d, a, b, c, in[9], 9);
+       ROUND3(c, d, a, b, in[5], 11);
+       ROUND3(b, c, d, a, in[13], 15);
+       ROUND3(a, b, c, d, in[3], 3);
+       ROUND3(d, a, b, c, in[11], 9);
+       ROUND3(c, d, a, b, in[7], 11);
+       ROUND3(b, c, d, a, in[15], 15);
+
+       hash[0] += a;
+       hash[1] += b;
+       hash[2] += c;
+       hash[3] += d;
+}
+
+static inline void md4_transform_helper(struct md4_ctx *ctx)
+{
+       le32_to_cpu_array(ctx->block, ARRAY_SIZE(ctx->block));
+       md4_transform(ctx->hash, ctx->block);
+}
+
+int cifs_md4_init(struct md4_ctx *mctx)
+{
+       memset(mctx, 0, sizeof(struct md4_ctx));
+       mctx->hash[0] = 0x67452301;
+       mctx->hash[1] = 0xefcdab89;
+       mctx->hash[2] = 0x98badcfe;
+       mctx->hash[3] = 0x10325476;
+       mctx->byte_count = 0;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_md4_init);
+
+int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len)
+{
+       const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
+
+       mctx->byte_count += len;
+
+       if (avail > len) {
+               memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
+                      data, len);
+               return 0;
+       }
+
+       memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
+              data, avail);
+
+       md4_transform_helper(mctx);
+       data += avail;
+       len -= avail;
+
+       while (len >= sizeof(mctx->block)) {
+               memcpy(mctx->block, data, sizeof(mctx->block));
+               md4_transform_helper(mctx);
+               data += sizeof(mctx->block);
+               len -= sizeof(mctx->block);
+       }
+
+       memcpy(mctx->block, data, len);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_md4_update);
+
+int cifs_md4_final(struct md4_ctx *mctx, u8 *out)
+{
+       const unsigned int offset = mctx->byte_count & 0x3f;
+       char *p = (char *)mctx->block + offset;
+       int padding = 56 - (offset + 1);
+
+       *p++ = 0x80;
+       if (padding < 0) {
+               memset(p, 0x00, padding + sizeof(u64));
+               md4_transform_helper(mctx);
+               p = (char *)mctx->block;
+               padding = 56;
+       }
+
+       memset(p, 0, padding);
+       mctx->block[14] = mctx->byte_count << 3;
+       mctx->block[15] = mctx->byte_count >> 29;
+       le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
+                         sizeof(u64)) / sizeof(u32));
+       md4_transform(mctx->hash, mctx->block);
+       cpu_to_le32_array(mctx->hash, ARRAY_SIZE(mctx->hash));
+       memcpy(out, mctx->hash, sizeof(mctx->hash));
+       memset(mctx, 0, sizeof(*mctx));
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cifs_md4_final);
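A hedged sketch of a one-shot caller of the three MD4 helpers above, e.g. for NTLM-style hashing; the function name is made up for illustration:

	#include "md4.h"

	static int example_md4(const u8 *data, unsigned int len,
			       u8 digest[MD4_DIGEST_SIZE])
	{
		struct md4_ctx ctx;

		cifs_md4_init(&ctx);
		cifs_md4_update(&ctx, data, len);    /* may be called repeatedly for streamed input */
		return cifs_md4_final(&ctx, digest); /* emits 16 bytes and wipes the context */
	}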
diff --git a/fs/smbfs_common/md4.h b/fs/smbfs_common/md4.h
new file mode 100644 (file)
index 0000000..5337bec
--- /dev/null
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Common values for ARC4 Cipher Algorithm
+ */
+
+#ifndef _CIFS_MD4_H
+#define _CIFS_MD4_H
+
+#include <linux/types.h>
+
+#define MD4_DIGEST_SIZE                16
+#define MD4_HMAC_BLOCK_SIZE    64
+#define MD4_BLOCK_WORDS                16
+#define MD4_HASH_WORDS         4
+
+struct md4_ctx {
+       u32 hash[MD4_HASH_WORDS];
+       u32 block[MD4_BLOCK_WORDS];
+       u64 byte_count;
+};
+
+
+int cifs_md4_init(struct md4_ctx *mctx);
+int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len);
+int cifs_md4_final(struct md4_ctx *mctx, u8 *out);
+
+#endif /* _CIFS_MD4_H */
diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h
new file mode 100644 (file)
index 0000000..d01e8c9
--- /dev/null
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2013
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like cifs or SMB2/SMB3 client.  This is probably
+ * a slightly larger set of fsctls that NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely.  Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ *
+ * See MS-SMB2 Section 2.2.31 (last checked September 2021, all of that list are
+ * below). Additional detail on less common ones can be found in MS-FSCC
+ * section 2.3.
+ */
+
+#ifndef __SMBFSCTL_H
+#define __SMBFSCTL_H
+
+/*
+ * FSCTL values are 32 bits and are constructed as
+ * <device 16bits> <access 2bits> <function 12bits> <method 2bits>
+ */
+/* Device */
+#define FSCTL_DEVICE_DFS                 (0x0006 << 16)
+#define FSCTL_DEVICE_FILE_SYSTEM         (0x0009 << 16)
+#define FSCTL_DEVICE_NAMED_PIPE          (0x0011 << 16)
+#define FSCTL_DEVICE_NETWORK_FILE_SYSTEM (0x0014 << 16)
+#define FSCTL_DEVICE_MASK                0xffff0000
+/* Access */
+#define FSCTL_DEVICE_ACCESS_FILE_ANY_ACCESS        (0x00 << 14)
+#define FSCTL_DEVICE_ACCESS_FILE_READ_ACCESS       (0x01 << 14)
+#define FSCTL_DEVICE_ACCESS_FILE_WRITE_ACCESS      (0x02 << 14)
+#define FSCTL_DEVICE_ACCESS_FILE_READ_WRITE_ACCESS (0x03 << 14)
+#define FSCTL_DEVICE_ACCESS_MASK                   0x0000c000
+/* Function */
+#define FSCTL_DEVICE_FUNCTION_MASK       0x00003ffc
+/* Method */
+#define FSCTL_DEVICE_METHOD_BUFFERED   0x00
+#define FSCTL_DEVICE_METHOD_IN_DIRECT  0x01
+#define FSCTL_DEVICE_METHOD_OUT_DIRECT 0x02
+#define FSCTL_DEVICE_METHOD_NEITHER    0x03
+#define FSCTL_DEVICE_METHOD_MASK       0x00000003
+
+
+#define FSCTL_DFS_GET_REFERRALS      0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX   0x000601B0
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
+#define FSCTL_LOCK_VOLUME            0x00090018
+#define FSCTL_UNLOCK_VOLUME          0x0009001C
+#define FSCTL_IS_PATHNAME_VALID      0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION        0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION        0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB          0x00090058 /* BB add struct */
+/* Verify the next FSCTL number, we had it as 0x00090090 before */
+#define FSCTL_FILESYSTEM_GET_STATS   0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA   0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY        0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
+#define FSCTL_FIND_FILES_BY_SID      0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID          0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID          0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID       0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT      0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT      0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT   0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE             0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA          0x000980C8
+#define FSCTL_SET_ENCRYPTION         0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA     0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE           0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE            0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO     0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
+#define FSCTL_GET_REFS_VOLUME_DATA   0x000902D8 /* See MS-FSCC 2.3.24 */
+#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
+#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
+#define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
+#define FSCTL_FILE_LEVEL_TRIM        0x00098208 /* BB add struct */
+#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
+#define FSCTL_SIS_LINK_FILES         0x0009C104
+#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280
+#define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
+/* strange that the number for this op is not sequential with previous op */
+#define FSCTL_PIPE_WAIT              0x00110018 /* BB add struct */
+/* Enumerate previous versions of a file */
+#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
+/* Retrieve an opaque file reference for server-side data movement ie copy */
+#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
+/* Perform server-side data movement */
+#define FSCTL_SRV_COPYCHUNK 0x001440F2
+#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
+#define FSCTL_SRV_READ_HASH          0x001441BB /* BB add struct */
+
+/* See FSCC 2.1.2.5 */
+#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
+#define IO_REPARSE_TAG_HSM           0xC0000004
+#define IO_REPARSE_TAG_SIS           0x80000007
+#define IO_REPARSE_TAG_HSM2          0x80000006
+#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005
+/* Used by the DFS filter. See MS-DFSC */
+#define IO_REPARSE_TAG_DFS           0x8000000A
+/* Used by the DFS filter See MS-DFSC */
+#define IO_REPARSE_TAG_DFSR          0x80000012
+#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B
+/* See section MS-FSCC 2.1.2.4 */
+#define IO_REPARSE_TAG_SYMLINK       0xA000000C
+#define IO_REPARSE_TAG_DEDUP         0x80000013
+#define IO_REPARSE_APPXSTREAM       0xC0000014
+/* NFS symlinks, Win 8/SMB3 and later */
+#define IO_REPARSE_TAG_NFS           0x80000014
+/*
+ * AzureFileSync - see
+ * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering
+ */
+#define IO_REPARSE_TAG_AZ_FILE_SYNC  0x8000001e
+/* WSL reparse tags */
+#define IO_REPARSE_TAG_LX_SYMLINK    0xA000001D
+#define IO_REPARSE_TAG_AF_UNIX      0x80000023
+#define IO_REPARSE_TAG_LX_FIFO      0x80000024
+#define IO_REPARSE_TAG_LX_CHR       0x80000025
+#define IO_REPARSE_TAG_LX_BLK       0x80000026
+
+#define IO_REPARSE_TAG_LX_SYMLINK_LE   cpu_to_le32(0xA000001D)
+#define IO_REPARSE_TAG_AF_UNIX_LE      cpu_to_le32(0x80000023)
+#define IO_REPARSE_TAG_LX_FIFO_LE      cpu_to_le32(0x80000024)
+#define IO_REPARSE_TAG_LX_CHR_LE       cpu_to_le32(0x80000025)
+#define IO_REPARSE_TAG_LX_BLK_LE       cpu_to_le32(0x80000026)
+
+/* fsctl flags */
+/* If Flags is set to this value, the request is an FSCTL not ioctl request */
+#define SMB2_0_IOCTL_IS_FSCTL          0x00000001
+#endif /* __SMBFSCTL_H */
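A worked decomposition of one code from the table above, following the <device><access><function><method> layout documented near the top of this header; the helper is illustrative only and not part of the patch:

	/*
	 * FSCTL_DFS_GET_REFERRALS = 0x00060194
	 *   device   = 0x00060194 & FSCTL_DEVICE_MASK            = 0x0006 << 16  (FSCTL_DEVICE_DFS)
	 *   access   = 0x00060194 & FSCTL_DEVICE_ACCESS_MASK     = 0x00 << 14    (FILE_ANY_ACCESS)
	 *   function = (0x00060194 & FSCTL_DEVICE_FUNCTION_MASK) >> 2 = 101
	 *   method   = 0x00060194 & FSCTL_DEVICE_METHOD_MASK     = FSCTL_DEVICE_METHOD_BUFFERED
	 */
	static inline u32 fsctl_function(u32 fsctl)
	{
		return (fsctl & FSCTL_DEVICE_FUNCTION_MASK) >> 2;
	}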
index 9f4985b..bc159a9 100644 (file)
@@ -135,6 +135,7 @@ struct cppc_cpudata {
 
 #ifdef CONFIG_ACPI_CPPC_LIB
 extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf);
+extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf);
 extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs);
 extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls);
 extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps);
@@ -149,6 +150,10 @@ static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
        return -ENOTSUPP;
 }
+static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
+{
+       return -ENOTSUPP;
+}
 static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
 {
        return -ENOTSUPP;
index 818680c..b20e89d 100644 (file)
 #ifndef _TTM_TT_H_
 #define _TTM_TT_H_
 
+#include <linux/pagemap.h>
 #include <linux/types.h>
 #include <drm/ttm/ttm_caching.h>
 #include <drm/ttm/ttm_kmap_iter.h>
 
-struct ttm_bo_device;
+struct ttm_device;
 struct ttm_tt;
 struct ttm_resource;
 struct ttm_buffer_object;
index abe089c..537e1b9 100644 (file)
@@ -110,7 +110,7 @@ static inline __init bool xbc_node_is_leaf(struct xbc_node *node)
 }
 
 /* Tree-based key-value access APIs */
-struct xbc_node * __init xbc_node_find_child(struct xbc_node *parent,
+struct xbc_node * __init xbc_node_find_subkey(struct xbc_node *parent,
                                             const char *key);
 
 const char * __init xbc_node_find_value(struct xbc_node *parent,
@@ -148,7 +148,7 @@ xbc_find_value(const char *key, struct xbc_node **vnode)
  */
 static inline struct xbc_node * __init xbc_find_node(const char *key)
 {
-       return xbc_node_find_child(NULL, key);
+       return xbc_node_find_subkey(NULL, key);
 }
 
 /**
index 1834752..39dcadd 100644 (file)
@@ -11,7 +11,7 @@
 #include <linux/types.h>
 
 /**
- * em_perf_state - Performance state of a performance domain
+ * struct em_perf_state - Performance state of a performance domain
  * @frequency: The frequency in KHz, for consistency with CPUFreq
  * @power:     The power consumed at this level (by 1 CPU or by a registered
  *             device). It can be a total power: static and dynamic.
@@ -25,7 +25,7 @@ struct em_perf_state {
 };
 
 /**
- * em_perf_domain - Performance domain
+ * struct em_perf_domain - Performance domain
  * @table:             List of performance states, in ascending order
  * @nr_perf_states:    Number of performance states
  * @milliwatts:                Flag indicating the power values are in milli-Watts
@@ -103,12 +103,12 @@ void em_dev_unregister_perf_domain(struct device *dev);
 
 /**
  * em_cpu_energy() - Estimates the energy consumed by the CPUs of a
-              performance domain
+ *            performance domain
  * @pd         : performance domain for which energy has to be estimated
  * @max_util   : highest utilization among CPUs of the domain
  * @sum_util   : sum of the utilization of all CPUs in the domain
  * @allowed_cpu_cap    : maximum allowed CPU capacity for the @pd, which
-                        might reflect reduced frequency (due to thermal)
+ *                      might reflect reduced frequency (due to thermal)
  *
  * This function must be used only for CPU devices. There is no validation,
  * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
index 2de2e46..51e830b 100644 (file)
@@ -94,6 +94,9 @@ extern void fd_install(unsigned int fd, struct file *file);
 
 extern int __receive_fd(struct file *file, int __user *ufd,
                        unsigned int o_flags);
+
+extern int receive_fd(struct file *file, unsigned int o_flags);
+
 static inline int receive_fd_user(struct file *file, int __user *ufd,
                                  unsigned int o_flags)
 {
@@ -101,10 +104,6 @@ static inline int receive_fd_user(struct file *file, int __user *ufd,
                return -EFAULT;
        return __receive_fd(file, ufd, o_flags);
 }
-static inline int receive_fd(struct file *file, unsigned int o_flags)
-{
-       return __receive_fd(file, NULL, o_flags);
-}
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
 extern void flush_delayed_fput(void);
index a0b7e43..725c9b7 100644 (file)
@@ -404,7 +404,7 @@ int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
 int pwmchip_add(struct pwm_chip *chip);
-int pwmchip_remove(struct pwm_chip *chip);
+void pwmchip_remove(struct pwm_chip *chip);
 
 int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip);
 
index 0165824..c0475d1 100644 (file)
@@ -109,6 +109,12 @@ extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
                             u32 *resp);
 
 extern int qcom_scm_qsmmu500_wait_safe_toggle(bool en);
+
+extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
+                             u64 limit_node, u32 node_id, u64 version);
+extern int qcom_scm_lmh_profile_change(u32 profile_id);
+extern bool qcom_scm_lmh_dcvsh_available(void);
+
 #else
 
 #include <linux/errno.h>
@@ -170,5 +176,13 @@ static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
 
 static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en)
                { return -ENODEV; }
+
+static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
+                                    u64 limit_node, u32 node_id, u64 version)
+               { return -ENODEV; }
+
+static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; }
+
+static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; }
 #endif
 #endif
index 426e98e..352c612 100644 (file)
@@ -142,22 +142,14 @@ struct rw_semaphore {
 #define DECLARE_RWSEM(lockname) \
        struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
+extern void  __init_rwsem(struct rw_semaphore *rwsem, const char *name,
                          struct lock_class_key *key);
-#else
-static inline void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
-                                struct lock_class_key *key)
-{
-}
-#endif
 
 #define init_rwsem(sem)                                                \
 do {                                                           \
        static struct lock_class_key __key;                     \
                                                                \
-       init_rwbase_rt(&(sem)->rwbase);                 \
-       __rwsem_init((sem), #sem, &__key);                      \
+       __init_rwsem((sem), #sem, &__key);                      \
 } while (0)
 
 static __always_inline int rwsem_is_locked(struct rw_semaphore *sem)
index d296f3b..c314893 100644 (file)
@@ -285,7 +285,7 @@ struct thermal_zone_params {
 };
 
 /**
- * struct thermal_zone_of_device_ops - scallbacks for handling DT based zones
+ * struct thermal_zone_of_device_ops - callbacks for handling DT based zones
  *
  * Mandatory:
  * @get_temp: a pointer to a function that reads the sensor temperature.
@@ -404,12 +404,13 @@ static inline void thermal_zone_device_unregister(
        struct thermal_zone_device *tz)
 { }
 static inline struct thermal_cooling_device *
-thermal_cooling_device_register(char *type, void *devdata,
+thermal_cooling_device_register(const char *type, void *devdata,
        const struct thermal_cooling_device_ops *ops)
 { return ERR_PTR(-ENODEV); }
 static inline struct thermal_cooling_device *
 thermal_of_cooling_device_register(struct device_node *np,
-       char *type, void *devdata, const struct thermal_cooling_device_ops *ops)
+       const char *type, void *devdata,
+       const struct thermal_cooling_device_ops *ops)
 { return ERR_PTR(-ENODEV); }
 static inline struct thermal_cooling_device *
 devm_thermal_of_cooling_device_register(struct device *dev,
index 5117cb5..81b9686 100644 (file)
@@ -25,7 +25,9 @@ struct itimerspec64 {
 #define TIME64_MIN                     (-TIME64_MAX - 1)
 
 #define KTIME_MAX                      ((s64)~((u64)1 << 63))
+#define KTIME_MIN                      (-KTIME_MAX - 1)
 #define KTIME_SEC_MAX                  (KTIME_MAX / NSEC_PER_SEC)
+#define KTIME_SEC_MIN                  (KTIME_MIN / NSEC_PER_SEC)
 
 /*
  * Limits for settimeofday():
@@ -124,10 +126,13 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts)
  */
 static inline s64 timespec64_to_ns(const struct timespec64 *ts)
 {
-       /* Prevent multiplication overflow */
-       if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX)
+       /* Prevent multiplication overflow / underflow */
+       if (ts->tv_sec >= KTIME_SEC_MAX)
                return KTIME_MAX;
 
+       if (ts->tv_sec <= KTIME_SEC_MIN)
+               return KTIME_MIN;
+
        return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
 }
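The new KTIME_MIN/KTIME_SEC_MIN limits make the conversion saturate in both directions rather than wrapping through multiplication overflow. An illustrative check, assuming kernel context (not part of the patch):

	static void example_timespec64_saturation(void)
	{
		struct timespec64 big  = { .tv_sec = KTIME_SEC_MAX + 1 };
		struct timespec64 tiny = { .tv_sec = KTIME_SEC_MIN - 1 };

		WARN_ON(timespec64_to_ns(&big)  != KTIME_MAX);	/* clamps high, as before */
		WARN_ON(timespec64_to_ns(&tiny) != KTIME_MIN);	/* clamps low, new with this hunk */
	}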
 
index 8cfe49d..3972ab7 100644 (file)
@@ -43,17 +43,17 @@ struct vdpa_vq_state_split {
  * @last_used_idx: used index
  */
 struct vdpa_vq_state_packed {
-        u16    last_avail_counter:1;
-        u16    last_avail_idx:15;
-        u16    last_used_counter:1;
-        u16    last_used_idx:15;
+       u16     last_avail_counter:1;
+       u16     last_avail_idx:15;
+       u16     last_used_counter:1;
+       u16     last_used_idx:15;
 };
 
 struct vdpa_vq_state {
-     union {
-          struct vdpa_vq_state_split split;
-          struct vdpa_vq_state_packed packed;
-     };
+       union {
+               struct vdpa_vq_state_split split;
+               struct vdpa_vq_state_packed packed;
+       };
 };
 
 struct vdpa_mgmt_dev;
@@ -65,6 +65,7 @@ struct vdpa_mgmt_dev;
  * @config: the configuration ops for this device.
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
+ * @use_va: indicate whether virtual address must be used by this device
  * @nvqs: maximum number of supported virtqueues
  * @mdev: management device pointer; caller must setup when registering device as part
  *       of dev_add() mgmtdev ops callback before invoking _vdpa_register_device().
@@ -75,6 +76,7 @@ struct vdpa_device {
        const struct vdpa_config_ops *config;
        unsigned int index;
        bool features_valid;
+       bool use_va;
        int nvqs;
        struct vdpa_mgmt_dev *mdev;
 };
@@ -89,6 +91,16 @@ struct vdpa_iova_range {
        u64 last;
 };
 
+/**
+ * Corresponding file area for device memory mapping
+ * @file: vma->vm_file for the mapping
+ * @offset: mapping offset in the vm_file
+ */
+struct vdpa_map_file {
+       struct file *file;
+       u64 offset;
+};
+
 /**
  * struct vdpa_config_ops - operations for configuring a vDPA device.
  * Note: vDPA device drivers are required to implement all of the
@@ -131,7 +143,7 @@ struct vdpa_iova_range {
  *                             @vdev: vdpa device
  *                             @idx: virtqueue index
  *                             @state: pointer to returned state (last_avail_idx)
- * @get_vq_notification:       Get the notification area for a virtqueue
+ * @get_vq_notification:       Get the notification area for a virtqueue
  *                             @vdev: vdpa device
  *                             @idx: virtqueue index
  *                             Returns the notifcation area
@@ -171,6 +183,9 @@ struct vdpa_iova_range {
  * @set_status:                        Set the device status
  *                             @vdev: vdpa device
  *                             @status: virtio device status
+ * @reset:                     Reset device
+ *                             @vdev: vdpa device
+ *                             Returns integer: success (0) or error (< 0)
  * @get_config_size:           Get the size of the configuration space
  *                             @vdev: vdpa device
  *                             Returns size_t: configuration size
@@ -255,6 +270,7 @@ struct vdpa_config_ops {
        u32 (*get_vendor_id)(struct vdpa_device *vdev);
        u8 (*get_status)(struct vdpa_device *vdev);
        void (*set_status)(struct vdpa_device *vdev, u8 status);
+       int (*reset)(struct vdpa_device *vdev);
        size_t (*get_config_size)(struct vdpa_device *vdev);
        void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
                           void *buf, unsigned int len);
@@ -266,7 +282,7 @@ struct vdpa_config_ops {
        /* DMA ops */
        int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
        int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size,
-                      u64 pa, u32 perm);
+                      u64 pa, u32 perm, void *opaque);
        int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size);
 
        /* Free device resources */
@@ -275,7 +291,8 @@ struct vdpa_config_ops {
 
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
-                                       size_t size, const char *name);
+                                       size_t size, const char *name,
+                                       bool use_va);
 
 /**
  * vdpa_alloc_device - allocate and initilaize a vDPA device
@@ -285,15 +302,16 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
  * @parent: the parent device
  * @config: the bus operations that is supported by this device
  * @name: name of the vdpa device
+ * @use_va: indicate whether virtual address must be used by this device
  *
  * Return allocated data structure or ERR_PTR upon error
  */
-#define vdpa_alloc_device(dev_struct, member, parent, config, name)   \
+#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va)   \
                          container_of(__vdpa_alloc_device( \
                                       parent, config, \
                                       sizeof(dev_struct) + \
                                       BUILD_BUG_ON_ZERO(offsetof( \
-                                      dev_struct, member)), name), \
+                                      dev_struct, member)), name, use_va), \
                                       dev_struct, member)
 
 int vdpa_register_device(struct vdpa_device *vdev, int nvqs);
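A hedged sketch of a driver-side caller after the extra use_va argument; the struct, ops and device names are made up, and the embedded vdpa_device must stay the first member because the macro checks offsetof() == 0:

	struct my_vdpa {
		struct vdpa_device vdpa;	/* must be first: BUILD_BUG_ON_ZERO(offsetof(...)) */
		/* driver-private state ... */
	};

	static struct my_vdpa *my_vdpa_alloc(struct device *parent,
					     const struct vdpa_config_ops *ops)
	{
		struct my_vdpa *mv;

		/* last argument: true when the device consumes virtual addresses (e.g. VDUSE) */
		mv = vdpa_alloc_device(struct my_vdpa, vdpa, parent, ops, "my-vdpa", false);
		if (IS_ERR(mv))
			return NULL;
		return mv;
	}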
@@ -348,27 +366,27 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
        return vdev->dma_dev;
 }
 
-static inline void vdpa_reset(struct vdpa_device *vdev)
+static inline int vdpa_reset(struct vdpa_device *vdev)
 {
-        const struct vdpa_config_ops *ops = vdev->config;
+       const struct vdpa_config_ops *ops = vdev->config;
 
        vdev->features_valid = false;
-        ops->set_status(vdev, 0);
+       return ops->reset(vdev);
 }
 
 static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features)
 {
-        const struct vdpa_config_ops *ops = vdev->config;
+       const struct vdpa_config_ops *ops = vdev->config;
 
        vdev->features_valid = true;
-        return ops->set_features(vdev, features);
+       return ops->set_features(vdev, features);
 }
 
-
-static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
-                                  void *buf, unsigned int len)
+static inline void vdpa_get_config(struct vdpa_device *vdev,
+                                  unsigned int offset, void *buf,
+                                  unsigned int len)
 {
-        const struct vdpa_config_ops *ops = vdev->config;
+       const struct vdpa_config_ops *ops = vdev->config;
 
        /*
         * Config accesses aren't supposed to trigger before features are set.
index 6b09b78..2d0e2f5 100644 (file)
@@ -17,6 +17,7 @@ struct vhost_iotlb_map {
        u32 perm;
        u32 flags_padding;
        u64 __subtree_last;
+       void *opaque;
 };
 
 #define VHOST_IOTLB_FLAG_RETIRE 0x1
@@ -29,6 +30,8 @@ struct vhost_iotlb {
        unsigned int flags;
 };
 
+int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, u64 start, u64 last,
+                             u64 addr, unsigned int perm, void *opaque);
 int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last,
                          u64 addr, unsigned int perm);
 void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last);
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
new file mode 100644 (file)
index 0000000..7cfe1c1
--- /dev/null
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_VDUSE_H_
+#define _UAPI_VDUSE_H_
+
+#include <linux/types.h>
+
+#define VDUSE_BASE     0x81
+
+/* The ioctls for control device (/dev/vduse/control) */
+
+#define VDUSE_API_VERSION      0
+
+/*
+ * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION).
+ * This is used for future extension.
+ */
+#define VDUSE_GET_API_VERSION  _IOR(VDUSE_BASE, 0x00, __u64)
+
+/* Set the version of VDUSE API that userspace supported. */
+#define VDUSE_SET_API_VERSION  _IOW(VDUSE_BASE, 0x01, __u64)
+
+/**
+ * struct vduse_dev_config - basic configuration of a VDUSE device
+ * @name: VDUSE device name, needs to be NUL terminated
+ * @vendor_id: virtio vendor id
+ * @device_id: virtio device id
+ * @features: virtio features
+ * @vq_num: the number of virtqueues
+ * @vq_align: the allocation alignment of virtqueue's metadata
+ * @reserved: for future use, needs to be initialized to zero
+ * @config_size: the size of the configuration space
+ * @config: the buffer of the configuration space
+ *
+ * Structure used by VDUSE_CREATE_DEV ioctl to create VDUSE device.
+ */
+struct vduse_dev_config {
+#define VDUSE_NAME_MAX 256
+       char name[VDUSE_NAME_MAX];
+       __u32 vendor_id;
+       __u32 device_id;
+       __u64 features;
+       __u32 vq_num;
+       __u32 vq_align;
+       __u32 reserved[13];
+       __u32 config_size;
+       __u8 config[];
+};
+
+/* Create a VDUSE device which is represented by a char device (/dev/vduse/$NAME) */
+#define VDUSE_CREATE_DEV       _IOW(VDUSE_BASE, 0x02, struct vduse_dev_config)
+
+/*
+ * Destroy a VDUSE device. Make sure there are no more references
+ * to the char device (/dev/vduse/$NAME).
+ */
+#define VDUSE_DESTROY_DEV      _IOW(VDUSE_BASE, 0x03, char[VDUSE_NAME_MAX])
+
+/* The ioctls for VDUSE device (/dev/vduse/$NAME) */
+
+/**
+ * struct vduse_iotlb_entry - entry of IOTLB to describe one IOVA region [start, last]
+ * @offset: the mmap offset on returned file descriptor
+ * @start: start of the IOVA region
+ * @last: last of the IOVA region
+ * @perm: access permission of the IOVA region
+ *
+ * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region.
+ */
+struct vduse_iotlb_entry {
+       __u64 offset;
+       __u64 start;
+       __u64 last;
+#define VDUSE_ACCESS_RO 0x1
+#define VDUSE_ACCESS_WO 0x2
+#define VDUSE_ACCESS_RW 0x3
+       __u8 perm;
+};
+
+/*
+ * Find the first IOVA region that overlaps with the range [start, last]
+ * and return the corresponding file descriptor. Return -EINVAL means the
+ * IOVA region doesn't exist. Caller should set start and last fields.
+ */
+#define VDUSE_IOTLB_GET_FD     _IOWR(VDUSE_BASE, 0x10, struct vduse_iotlb_entry)
+
+/*
+ * Get the negotiated virtio features. It's a subset of the features in
+ * struct vduse_dev_config which can be accepted by virtio driver. It's
+ * only valid after FEATURES_OK status bit is set.
+ */
+#define VDUSE_DEV_GET_FEATURES _IOR(VDUSE_BASE, 0x11, __u64)
+
+/**
+ * struct vduse_config_data - data used to update configuration space
+ * @offset: the offset from the beginning of configuration space
+ * @length: the length to write to configuration space
+ * @buffer: the buffer used to write from
+ *
+ * Structure used by VDUSE_DEV_SET_CONFIG ioctl to update device
+ * configuration space.
+ */
+struct vduse_config_data {
+       __u32 offset;
+       __u32 length;
+       __u8 buffer[];
+};
+
+/* Set device configuration space */
+#define VDUSE_DEV_SET_CONFIG   _IOW(VDUSE_BASE, 0x12, struct vduse_config_data)
+
+/*
+ * Inject a config interrupt. It's usually used to notify virtio driver
+ * that device configuration space has changed.
+ */
+#define VDUSE_DEV_INJECT_CONFIG_IRQ    _IO(VDUSE_BASE, 0x13)
+
+/**
+ * struct vduse_vq_config - basic configuration of a virtqueue
+ * @index: virtqueue index
+ * @max_size: the max size of virtqueue
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
+ */
+struct vduse_vq_config {
+       __u32 index;
+       __u16 max_size;
+       __u16 reserved[13];
+};
+
+/*
+ * Setup the specified virtqueue. Make sure all virtqueues have been
+ * configured before the device is attached to vDPA bus.
+ */
+#define VDUSE_VQ_SETUP         _IOW(VDUSE_BASE, 0x14, struct vduse_vq_config)
+
+/**
+ * struct vduse_vq_state_split - split virtqueue state
+ * @avail_index: available index
+ */
+struct vduse_vq_state_split {
+       __u16 avail_index;
+};
+
+/**
+ * struct vduse_vq_state_packed - packed virtqueue state
+ * @last_avail_counter: last driver ring wrap counter observed by device
+ * @last_avail_idx: device available index
+ * @last_used_counter: device ring wrap counter
+ * @last_used_idx: used index
+ */
+struct vduse_vq_state_packed {
+       __u16 last_avail_counter;
+       __u16 last_avail_idx;
+       __u16 last_used_counter;
+       __u16 last_used_idx;
+};
+
+/**
+ * struct vduse_vq_info - information of a virtqueue
+ * @index: virtqueue index
+ * @num: the size of virtqueue
+ * @desc_addr: address of desc area
+ * @driver_addr: address of driver area
+ * @device_addr: address of device area
+ * @split: split virtqueue state
+ * @packed: packed virtqueue state
+ * @ready: ready status of virtqueue
+ *
+ * Structure used by VDUSE_VQ_GET_INFO ioctl to get virtqueue's information.
+ */
+struct vduse_vq_info {
+       __u32 index;
+       __u32 num;
+       __u64 desc_addr;
+       __u64 driver_addr;
+       __u64 device_addr;
+       union {
+               struct vduse_vq_state_split split;
+               struct vduse_vq_state_packed packed;
+       };
+       __u8 ready;
+};
+
+/* Get the specified virtqueue's information. Caller should set index field. */
+#define VDUSE_VQ_GET_INFO      _IOWR(VDUSE_BASE, 0x15, struct vduse_vq_info)
+
+/**
+ * struct vduse_vq_eventfd - eventfd configuration for a virtqueue
+ * @index: virtqueue index
+ * @fd: eventfd, -1 means de-assigning the eventfd
+ *
+ * Structure used by VDUSE_VQ_SETUP_KICKFD ioctl to setup kick eventfd.
+ */
+struct vduse_vq_eventfd {
+       __u32 index;
+#define VDUSE_EVENTFD_DEASSIGN -1
+       int fd;
+};
+
+/*
+ * Setup kick eventfd for specified virtqueue. The kick eventfd is used
+ * by VDUSE kernel module to notify userspace to consume the avail vring.
+ */
+#define VDUSE_VQ_SETUP_KICKFD  _IOW(VDUSE_BASE, 0x16, struct vduse_vq_eventfd)
+
+/*
+ * Inject an interrupt for specific virtqueue. It's used to notify virtio driver
+ * to consume the used vring.
+ */
+#define VDUSE_VQ_INJECT_IRQ    _IOW(VDUSE_BASE, 0x17, __u32)
+
+/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
+
+/**
+ * enum vduse_req_type - request type
+ * @VDUSE_GET_VQ_STATE: get the state for specified virtqueue from userspace
+ * @VDUSE_SET_STATUS: set the device status
+ * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for
+ *                      specified IOVA range via VDUSE_IOTLB_GET_FD ioctl
+ */
+enum vduse_req_type {
+       VDUSE_GET_VQ_STATE,
+       VDUSE_SET_STATUS,
+       VDUSE_UPDATE_IOTLB,
+};
+
+/**
+ * struct vduse_vq_state - virtqueue state
+ * @index: virtqueue index
+ * @split: split virtqueue state
+ * @packed: packed virtqueue state
+ */
+struct vduse_vq_state {
+       __u32 index;
+       union {
+               struct vduse_vq_state_split split;
+               struct vduse_vq_state_packed packed;
+       };
+};
+
+/**
+ * struct vduse_dev_status - device status
+ * @status: device status
+ */
+struct vduse_dev_status {
+       __u8 status;
+};
+
+/**
+ * struct vduse_iova_range - IOVA range [start, last]
+ * @start: start of the IOVA range
+ * @last: last of the IOVA range
+ */
+struct vduse_iova_range {
+       __u64 start;
+       __u64 last;
+};
+
+/**
+ * struct vduse_dev_request - control request
+ * @type: request type
+ * @request_id: request id
+ * @reserved: for future use
+ * @vq_state: virtqueue state, only index field is available
+ * @s: device status
+ * @iova: IOVA range for updating
+ * @padding: padding
+ *
+ * Structure used by read(2) on /dev/vduse/$NAME.
+ */
+struct vduse_dev_request {
+       __u32 type;
+       __u32 request_id;
+       __u32 reserved[4];
+       union {
+               struct vduse_vq_state vq_state;
+               struct vduse_dev_status s;
+               struct vduse_iova_range iova;
+               __u32 padding[32];
+       };
+};
+
+/**
+ * struct vduse_dev_response - response to control request
+ * @request_id: corresponding request id
+ * @result: the result of request
+ * @reserved: for future use, needs to be initialized to zero
+ * @vq_state: virtqueue state
+ * @padding: padding
+ *
+ * Structure used by write(2) on /dev/vduse/$NAME.
+ */
+struct vduse_dev_response {
+       __u32 request_id;
+#define VDUSE_REQ_RESULT_OK    0x00
+#define VDUSE_REQ_RESULT_FAILED        0x01
+       __u32 result;
+       __u32 reserved[4];
+       union {
+               struct vduse_vq_state vq_state;
+               __u32 padding[32];
+       };
+};
+
+#endif /* _UAPI_VDUSE_H_ */
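A hedged userspace sketch of driving the control-device ioctls defined above: negotiate the API version, then create a device. The values (virtio-blk device id 2, one queue) are illustrative and error handling is minimal; features, vendor id and the follow-up VDUSE_VQ_SETUP / read(2) request loop on /dev/vduse/<name> are omitted for brevity:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/vduse.h>

	static int create_vduse_dev(const char *name, const void *config, uint32_t config_size)
	{
		struct vduse_dev_config *cfg;
		uint64_t version = VDUSE_API_VERSION;
		int ret = -1;
		int ctrl = open("/dev/vduse/control", O_RDWR);

		if (ctrl < 0)
			return -1;
		if (ioctl(ctrl, VDUSE_SET_API_VERSION, &version))	/* negotiate API first */
			goto out_close;

		cfg = calloc(1, sizeof(*cfg) + config_size);
		if (!cfg)
			goto out_close;
		strncpy(cfg->name, name, VDUSE_NAME_MAX - 1);
		cfg->device_id = 2;			/* virtio-blk, for illustration */
		cfg->vq_num = 1;
		cfg->vq_align = 4096;
		cfg->config_size = config_size;
		memcpy(cfg->config, config, config_size);

		ret = ioctl(ctrl, VDUSE_CREATE_DEV, cfg);	/* /dev/vduse/<name> appears on success */
		free(cfg);
	out_close:
		close(ctrl);
		return ret;
	}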
index 50d352f..80d76b7 100644 (file)
 #define VIRTIO_ID_SOUND                        25 /* virtio sound */
 #define VIRTIO_ID_FS                   26 /* virtio filesystem */
 #define VIRTIO_ID_PMEM                 27 /* virtio pmem */
+#define VIRTIO_ID_RPMB                 28 /* virtio rpmb */
 #define VIRTIO_ID_MAC80211_HWSIM       29 /* virtio mac80211-hwsim */
+#define VIRTIO_ID_VIDEO_ENCODER                30 /* virtio video encoder */
+#define VIRTIO_ID_VIDEO_DECODER                31 /* virtio video decoder */
 #define VIRTIO_ID_SCMI                 32 /* virtio SCMI */
+#define VIRTIO_ID_NITRO_SEC_MOD                33 /* virtio nitro secure module*/
 #define VIRTIO_ID_I2C_ADAPTER          34 /* virtio i2c adapter */
+#define VIRTIO_ID_WATCHDOG             35 /* virtio watchdog */
+#define VIRTIO_ID_CAN                  36 /* virtio can */
+#define VIRTIO_ID_DMABUF               37 /* virtio dmabuf */
+#define VIRTIO_ID_PARAM_SERV           38 /* virtio parameter server */
+#define VIRTIO_ID_AUDIO_POLICY         39 /* virtio audio policy */
 #define VIRTIO_ID_BT                   40 /* virtio bluetooth */
 #define VIRTIO_ID_GPIO                 41 /* virtio gpio */
 
index 3dd3555..6473883 100644 (file)
@@ -97,7 +97,8 @@ enum virtio_vsock_shutdown {
 
 /* VIRTIO_VSOCK_OP_RW flags values */
 enum virtio_vsock_rw {
-       VIRTIO_VSOCK_SEQ_EOR = 1,
+       VIRTIO_VSOCK_SEQ_EOM = 1,
+       VIRTIO_VSOCK_SEQ_EOR = 2,
 };
 
 #endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */
index a47a731..7cc2a0f 100644 (file)
@@ -276,7 +276,17 @@ enum hl_device_status {
        HL_DEVICE_STATUS_OPERATIONAL,
        HL_DEVICE_STATUS_IN_RESET,
        HL_DEVICE_STATUS_MALFUNCTION,
-       HL_DEVICE_STATUS_NEEDS_RESET
+       HL_DEVICE_STATUS_NEEDS_RESET,
+       HL_DEVICE_STATUS_IN_DEVICE_CREATION,
+       HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_DEVICE_CREATION
+};
+
+enum hl_server_type {
+       HL_SERVER_TYPE_UNKNOWN = 0,
+       HL_SERVER_GAUDI_HLS1 = 1,
+       HL_SERVER_GAUDI_HLS1H = 2,
+       HL_SERVER_GAUDI_TYPE1 = 3,
+       HL_SERVER_GAUDI_TYPE2 = 4
 };
 
 /* Opcode for management ioctl
@@ -337,17 +347,49 @@ enum hl_device_status {
 #define HL_INFO_VERSION_MAX_LEN        128
 #define HL_INFO_CARD_NAME_MAX_LEN      16
 
+/**
+ * struct hl_info_hw_ip_info - hardware information on various IPs in the ASIC
+ * @sram_base_address: The first SRAM physical base address that is free to be
+ *                     used by the user.
+ * @dram_base_address: The first DRAM virtual or physical base address that is
+ *                     free to be used by the user.
+ * @dram_size: The DRAM size that is available to the user.
+ * @sram_size: The SRAM size that is available to the user.
+ * @num_of_events: The number of events that can be received from the f/w. This
+ *                 is needed so the user can know what is the size of the h/w events
+ *                 array he needs to pass to the kernel when he wants to fetch
+ *                 the event counters.
+ * @device_id: PCI device ID of the ASIC.
+ * @module_id: Module ID of the ASIC for mezzanine cards in servers
+ *             (From OCP spec).
+ * @first_available_interrupt_id: The first available interrupt ID for the user
+ *                                to be used when it works with user interrupts.
+ * @server_type: Server type that the Gaudi ASIC is currently installed in.
+ *               The value is according to enum hl_server_type
+ * @cpld_version: CPLD version on the board.
+ * @psoc_pci_pll_nr: PCI PLL NR value. Needed by the profiler in some ASICs.
+ * @psoc_pci_pll_nf: PCI PLL NF value. Needed by the profiler in some ASICs.
+ * @psoc_pci_pll_od: PCI PLL OD value. Needed by the profiler in some ASICs.
+ * @psoc_pci_pll_div_factor: PCI PLL DIV factor value. Needed by the profiler
+ *                           in some ASICs.
+ * @tpc_enabled_mask: Bit-mask that represents which TPCs are enabled. Relevant
+ *                    for Goya/Gaudi only.
+ * @dram_enabled: Whether the DRAM is enabled.
+ * @cpucp_version: The CPUCP f/w version.
+ * @card_name: The card name as passed by the f/w.
+ * @dram_page_size: The DRAM physical page size.
+ */
 struct hl_info_hw_ip_info {
        __u64 sram_base_address;
        __u64 dram_base_address;
        __u64 dram_size;
        __u32 sram_size;
        __u32 num_of_events;
-       __u32 device_id; /* PCI Device ID */
-       __u32 module_id; /* For mezzanine cards in servers (From OCP spec.) */
+       __u32 device_id;
+       __u32 module_id;
        __u32 reserved;
        __u16 first_available_interrupt_id;
-       __u16 reserved2;
+       __u16 server_type;
        __u32 cpld_version;
        __u32 psoc_pci_pll_nr;
        __u32 psoc_pci_pll_nf;
@@ -358,7 +400,7 @@ struct hl_info_hw_ip_info {
        __u8 pad[2];
        __u8 cpucp_version[HL_INFO_VERSION_MAX_LEN];
        __u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
-       __u64 reserved3;
+       __u64 reserved2;
        __u64 dram_page_size;
 };
 
@@ -628,12 +670,21 @@ struct hl_cs_chunk {
                __u64 cb_handle;
 
                /* Relevant only when HL_CS_FLAGS_WAIT or
-                * HL_CS_FLAGS_COLLECTIVE_WAIT is set.
+                * HL_CS_FLAGS_COLLECTIVE_WAIT is set
                 * This holds address of array of u64 values that contain
-                * signal CS sequence numbers. The wait described by this job
-                * will listen on all those signals (wait event per signal)
+                * signal CS sequence numbers. The wait described by
+                * this job will listen on all those signals
+                * (wait event per signal)
                 */
                __u64 signal_seq_arr;
+
+               /*
+                * Relevant only when HL_CS_FLAGS_WAIT or
+                * HL_CS_FLAGS_COLLECTIVE_WAIT is set
+                * along with HL_CS_FLAGS_ENCAP_SIGNALS.
+                * This is the CS sequence which has the encapsulated signals.
+                */
+               __u64 encaps_signal_seq;
        };
 
        /* Index of queue to put the CB on */
@@ -651,6 +702,17 @@ struct hl_cs_chunk {
                 * Number of entries in signal_seq_arr
                 */
                __u32 num_signal_seq_arr;
+
+               /* Relevant only when HL_CS_FLAGS_WAIT or
+                * HL_CS_FLAGS_COLLECTIVE_WAIT is set along
+                * with HL_CS_FLAGS_ENCAP_SIGNALS
+                * This set the signals range that the user want to wait for
+                * out of the whole reserved signals range.
+                * e.g if the signals range is 20, and user don't want
+                * to wait for signal 8, so he set this offset to 7, then
+                * he call the API again with 9 and so on till 20.
+                */
+               __u32 encaps_signal_offset;
        };
 
        /* HL_CS_CHUNK_FLAGS_* */
@@ -678,6 +740,28 @@ struct hl_cs_chunk {
 #define HL_CS_FLAGS_CUSTOM_TIMEOUT             0x200
 #define HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT      0x400
 
+/*
+ * The encapsulated signals CS is merged into the existing CS ioctls.
+ * In order to use this feature need to follow the below procedure:
+ * 1. Reserve signals, set the CS type to HL_CS_FLAGS_RESERVE_SIGNALS_ONLY
+ *    the output of this API will be the SOB offset from CFG_BASE.
+ *    this address will be used to patch CB cmds to do the signaling for this
+ *    SOB by incrementing it's value.
+ *    for reverting the reservation use HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY
+ *    CS type, note that this might fail if out-of-sync happened to the SOB
+ *    value, in case other signaling request to the same SOB occurred between
+ *    reserve-unreserve calls.
+ * 2. Use the staged CS to do the encapsulated signaling jobs.
+ *    use HL_CS_FLAGS_STAGED_SUBMISSION and HL_CS_FLAGS_STAGED_SUBMISSION_FIRST
+ *    along with HL_CS_FLAGS_ENCAP_SIGNALS flag, and set encaps_signal_offset
+ *    field. This offset allows app to wait on part of the reserved signals.
+ * 3. Use WAIT/COLLECTIVE WAIT CS along with HL_CS_FLAGS_ENCAP_SIGNALS flag
+ *    to wait for the encapsulated signals.
+ */
+#define HL_CS_FLAGS_ENCAP_SIGNALS              0x800
+#define HL_CS_FLAGS_RESERVE_SIGNALS_ONLY       0x1000
+#define HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY     0x2000
+
 #define HL_CS_STATUS_SUCCESS           0
 
 #define HL_MAX_JOBS_PER_CS             512
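A hedged sketch of step 1 of the reserve/signal/wait flow described in the comment above; submit_cs() stands in for issuing the driver's CS ioctl with a prepared union hl_cs_args, and the cs_flags field name is an assumption rather than something shown in this hunk:

	static int reserve_encaps_signals(int fd, __u32 q_idx, __u32 count,
					  __u32 *handle_id, __u32 *sob_offset)
	{
		union hl_cs_args args = {};

		args.in.encaps_signals_count = count;	/* how many signals to reserve */
		args.in.encaps_signals_q_idx = q_idx;	/* queue (stream) owning the SOB */
		args.in.cs_flags = HL_CS_FLAGS_RESERVE_SIGNALS_ONLY;	/* assumed field name */

		if (submit_cs(fd, &args))		/* hypothetical ioctl wrapper */
			return -1;

		*handle_id  = args.out.handle_id;		/* later passed as encaps_sig_handle_id */
		*sob_offset = args.out.sob_base_addr_offset;	/* SOB offset from CFG_BASE, patched into CBs */
		return 0;
	}

	/* Step 2: staged CS with HL_CS_FLAGS_STAGED_SUBMISSION(_FIRST) | HL_CS_FLAGS_ENCAP_SIGNALS,
	 *         encaps_sig_handle_id = *handle_id and per-chunk encaps_signal_offset.
	 * Step 3: WAIT / COLLECTIVE WAIT CS with HL_CS_FLAGS_ENCAP_SIGNALS and
	 *         chunk.encaps_signal_seq set to the signaling CS sequence. */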
@@ -690,10 +774,35 @@ struct hl_cs_in {
        /* holds address of array of hl_cs_chunk for execution phase */
        __u64 chunks_execute;
 
-       /* Sequence number of a staged submission CS
-        * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set
-        */
-       __u64 seq;
+       union {
+               /*
+                * Sequence number of a staged submission CS
+                * valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set and
+                * HL_CS_FLAGS_STAGED_SUBMISSION_FIRST is unset.
+                */
+               __u64 seq;
+
+               /*
+                * Encapsulated signals handle id
+                * Valid for two flows:
+                * 1. CS with encapsulated signals:
+                *    when HL_CS_FLAGS_STAGED_SUBMISSION and
+                *    HL_CS_FLAGS_STAGED_SUBMISSION_FIRST
+                *    and HL_CS_FLAGS_ENCAP_SIGNALS are set.
+                * 2. unreserve signals:
+                *    valid when HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY is set.
+                */
+               __u32 encaps_sig_handle_id;
+
+               /* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */
+               struct {
+                       /* Encapsulated signals number */
+                       __u32 encaps_signals_count;
+
+                       /* Encapsulated signals queue index (stream) */
+                       __u32 encaps_signals_q_idx;
+               };
+       };
 
        /* Number of chunks in restore phase array. Maximum number is
         * HL_MAX_JOBS_PER_CS
@@ -718,14 +827,31 @@ struct hl_cs_in {
 };
 
 struct hl_cs_out {
+       union {
+               /*
+                * seq holds the sequence number of the CS to pass to wait
+                * ioctl. All values are valid except for 0 and ULLONG_MAX
+                */
+               __u64 seq;
+
+               /* Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set */
+               struct {
+                       /* This is the reserved signal handle id */
+                       __u32 handle_id;
+
+                       /* This is the number of reserved signals */
+                       __u32 count;
+               };
+       };
+
+       /* HL_CS_STATUS_* */
+       __u32 status;
+
        /*
-        * seq holds the sequence number of the CS to pass to wait ioctl. All
-        * values are valid except for 0 and ULLONG_MAX
+        * SOB base address offset
+        * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set
         */
-       __u64 seq;
-       /* HL_CS_STATUS_* */
-       __u32 status;
-       __u32 pad;
+       __u32 sob_base_addr_offset;
 };
 
 union hl_cs_args {
@@ -735,11 +861,18 @@ union hl_cs_args {
 
 #define HL_WAIT_CS_FLAGS_INTERRUPT     0x2
 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
+#define HL_WAIT_CS_FLAGS_MULTI_CS      0x4
+
+#define HL_WAIT_MULTI_CS_LIST_MAX_LEN  32
 
 struct hl_wait_cs_in {
        union {
                struct {
-                       /* Command submission sequence number */
+                       /*
+                        * In case of wait_cs, this holds the CS sequence
+                        * number. In case of multi-CS wait, this holds a user
+                        * pointer to an array of CS sequence numbers.
+                        */
                        __u64 seq;
                        /* Absolute timeout to wait for command submission
                         * in microseconds
@@ -767,12 +900,17 @@ struct hl_wait_cs_in {
 
        /* Context ID - Currently not in use */
        __u32 ctx_id;
+
        /* HL_WAIT_CS_FLAGS_*
         * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
         * the interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK. In
         * order not to specify an interrupt id, set the mask to all 1s.
         */
        __u32 flags;
+
+       /* Multi-CS API info: number of valid entries in the multi-CS array */
+       __u8 seq_arr_len;
+       __u8 pad[7];
 };
 
 #define HL_WAIT_CS_STATUS_COMPLETED    0
@@ -789,8 +927,15 @@ struct hl_wait_cs_out {
        __u32 status;
        /* HL_WAIT_CS_STATUS_FLAG* */
        __u32 flags;
-       /* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set */
+       /*
+        * valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set
+        * for wait_cs: timestamp of CS completion
+        * for wait_multi_cs: timestamp of FIRST CS completion
+        */
        __s64 timestamp_nsec;
+       /* multi CS completion bitmap */
+       __u32 cs_completion_map;
+       __u32 pad;
 };
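
A hedged sketch of the new multi-CS wait path added above. HL_IOCTL_WAIT_CS, the
timeout_us field and the in/out member names of union hl_wait_cs_args are assumed from
the full uapi header, and the bit-per-entry reading of cs_completion_map is an
assumption based on the "multi CS completion bitmap" comment; purely illustrative.

#include <stdint.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>	/* uapi header touched by this patch (assumed path) */

/* Wait on up to HL_WAIT_MULTI_CS_LIST_MAX_LEN submissions at once and return
 * how many of them completed. No retry on HL_WAIT_CS_STATUS_* values and no
 * error handling.
 */
static int wait_multi_cs_sketch(int fd, const uint64_t *seq_arr,
				uint8_t seq_arr_len, uint64_t timeout_us)
{
	union hl_wait_cs_args wait = {0};
	int completed = 0;

	wait.in.seq = (uint64_t)(uintptr_t)seq_arr;	/* user pointer to the array */
	wait.in.seq_arr_len = seq_arr_len;
	wait.in.timeout_us = timeout_us;		/* field name assumed */
	wait.in.flags = HL_WAIT_CS_FLAGS_MULTI_CS;

	if (ioctl(fd, HL_IOCTL_WAIT_CS, &wait))
		return -1;

	/* Presumably bit i of cs_completion_map corresponds to seq_arr[i];
	 * timestamp_nsec holds the completion time of the FIRST finished CS.
	 */
	for (uint8_t i = 0; i < seq_arr_len; i++)
		if (wait.out.cs_completion_map & (1u << i))
			completed++;

	return completed;
}
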
 
 union hl_wait_cs_args {
@@ -813,6 +958,7 @@ union hl_wait_cs_args {
 #define HL_MEM_CONTIGUOUS      0x1
 #define HL_MEM_SHARED          0x2
 #define HL_MEM_USERPTR         0x4
+#define HL_MEM_FORCE_HINT      0x8
 
 struct hl_mem_in {
        union {
index e7b4c61..c15ad27 100644 (file)
@@ -1263,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
        return -ESRCH;
 }
 
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+                                struct futex_pi_state **ps)
+{
+       /*
+        * No existing pi state. First waiter. [2]
+        *
+        * This creates pi_state, we have hb->lock held, this means nothing can
+        * observe this state, wait_lock is irrelevant.
+        */
+       struct futex_pi_state *pi_state = alloc_pi_state();
+
+       /*
+        * Initialize the pi_mutex in locked state and make @p
+        * the owner of it:
+        */
+       rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+       /* Store the key for possible exit cleanups: */
+       pi_state->key = *key;
+
+       WARN_ON(!list_empty(&pi_state->list));
+       list_add(&pi_state->list, &p->pi_state_list);
+       /*
+        * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+        * because there is no concurrency as the object is not published yet.
+        */
+       pi_state->owner = p;
+
+       *ps = pi_state;
+}
+
 /*
  * Lookup the task for the TID provided from user space and attach to
  * it after doing proper sanity checks.
@@ -1272,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                              struct task_struct **exiting)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
-       struct futex_pi_state *pi_state;
        struct task_struct *p;
 
        /*
@@ -1324,36 +1353,11 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                return ret;
        }
 
-       /*
-        * No existing pi state. First waiter. [2]
-        *
-        * This creates pi_state, we have hb->lock held, this means nothing can
-        * observe this state, wait_lock is irrelevant.
-        */
-       pi_state = alloc_pi_state();
-
-       /*
-        * Initialize the pi_mutex in locked state and make @p
-        * the owner of it:
-        */
-       rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
-       /* Store the key for possible exit cleanups: */
-       pi_state->key = *key;
-
-       WARN_ON(!list_empty(&pi_state->list));
-       list_add(&pi_state->list, &p->pi_state_list);
-       /*
-        * Assignment without holding pi_state->pi_mutex.wait_lock is safe
-        * because there is no concurrency as the object is not published yet.
-        */
-       pi_state->owner = p;
+       __attach_to_pi_owner(p, key, ps);
        raw_spin_unlock_irq(&p->pi_lock);
 
        put_task_struct(p);
 
-       *ps = pi_state;
-
        return 0;
 }
 
@@ -1454,8 +1458,26 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                        newval |= FUTEX_WAITERS;
 
                ret = lock_pi_update_atomic(uaddr, uval, newval);
-               /* If the take over worked, return 1 */
-               return ret < 0 ? ret : 1;
+               if (ret)
+                       return ret;
+
+               /*
+                * If the waiter bit was requested the caller also needs PI
+                * state attached to the new owner of the user space futex.
+                *
+                * @task is guaranteed to be alive and it cannot be exiting
+                * because it is either sleeping or waiting in
+                * futex_requeue_pi_wakeup_sync().
+                *
+                * No need to do the full attach_to_pi_owner() exercise
+                * because @task is known and valid.
+                */
+               if (set_waiters) {
+                       raw_spin_lock_irq(&task->pi_lock);
+                       __attach_to_pi_owner(task, key, ps);
+                       raw_spin_unlock_irq(&task->pi_lock);
+               }
+               return 1;
        }
 
        /*
@@ -1939,12 +1961,26 @@ static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
  * @hb:                the hash_bucket of the requeue target futex
  *
  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal.  Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later.  Must be called
- * with both q->lock_ptr and hb->lock held.
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ *    the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ *    acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ *    the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ *    this point the waiter task can return from the syscall immediately in
+ *    case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
  */
 static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
@@ -1998,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
-       int ret, vpid;
+       int ret;
 
        if (get_futex_value_locked(&curval, pifutex))
                return -EFAULT;
@@ -2025,7 +2061,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
         * and waiting on the 'waitqueue' futex which is always !PI.
         */
        if (!top_waiter->rt_waiter || top_waiter->pi_state)
-               ret = -EINVAL;
+               return -EINVAL;
 
        /* Ensure we requeue to the expected futex. */
        if (!match_futex(top_waiter->requeue_pi_key, key2))
@@ -2036,17 +2072,23 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
                return -EAGAIN;
 
        /*
-        * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
-        * the contended case or if set_waiters is 1.  The pi_state is returned
-        * in ps in contended cases.
+        * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+        * in the contended case or if @set_waiters is true.
+        *
+        * In the contended case PI state is attached to the lock owner. If
+        * the user space lock can be acquired then PI state is attached to
+        * the new owner (@top_waiter->task) when @set_waiters is true.
         */
-       vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
                                   exiting, set_waiters);
        if (ret == 1) {
-               /* Dequeue, wake up and update top_waiter::requeue_state */
+               /*
+                * Lock was acquired in user space and PI state was
+                * attached to @top_waiter->task. That means state is fully
+                * consistent and the waiter can return to user space
+                * immediately after the wakeup.
+                */
                requeue_pi_wake_futex(top_waiter, key2, hb2);
-               return vpid;
        } else if (ret < 0) {
                /* Rewind top_waiter::requeue_state */
                futex_requeue_pi_complete(top_waiter, ret);
@@ -2208,19 +2250,26 @@ retry_private:
                                                 &exiting, nr_requeue);
 
                /*
-                * At this point the top_waiter has either taken uaddr2 or is
-                * waiting on it.  If the former, then the pi_state will not
-                * exist yet, look it up one more time to ensure we have a
-                * reference to it. If the lock was taken, @ret contains the
-                * VPID of the top waiter task.
-                * If the lock was not taken, we have pi_state and an initial
-                * refcount on it. In case of an error we have nothing.
+                * At this point the top_waiter has either taken uaddr2 or
+                * is waiting on it. In both cases pi_state has been
+                * established and an initial refcount has been taken on it.
+                * In case of an error there is nothing.
                 *
                 * The top waiter's requeue_state is up to date:
                 *
-                *  - If the lock was acquired atomically (ret > 0), then
+                *  - If the lock was acquired atomically (ret == 1), then
                 *    the state is Q_REQUEUE_PI_LOCKED.
                 *
+                *    The top waiter has been dequeued and woken up and can
+                *    return to user space immediately. The kernel/user
+                *    space state is consistent. If more waiters have to be
+                *    requeued, the WAITERS bit in the user space futex is
+                *    set, so the top waiter task has to go into the syscall
+                *    slowpath to unlock the futex. This will block until
+                *    this requeue operation has been completed and the hash
+                *    bucket locks have been dropped.
+                *
                 *  - If the trylock failed with an error (ret < 0) then
                 *    the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
                 *    happened", or Q_REQUEUE_PI_IGNORE when there was an
@@ -2234,36 +2283,20 @@ retry_private:
                 *    the same sanity checks for requeue_pi as the loop
                 *    below does.
                 */
-               if (ret > 0) {
-                       WARN_ON(pi_state);
-                       task_count++;
-                       /*
-                        * If futex_proxy_trylock_atomic() acquired the
-                        * user space futex, then the user space value
-                        * @uaddr2 has been set to the @hb1's top waiter
-                        * task VPID. This task is guaranteed to be alive
-                        * and cannot be exiting because it is either
-                        * sleeping or blocked on @hb2 lock.
-                        *
-                        * The @uaddr2 futex cannot have waiters either as
-                        * otherwise futex_proxy_trylock_atomic() would not
-                        * have succeeded.
-                        *
-                        * In order to requeue waiters to @hb2, pi state is
-                        * required. Hand in the VPID value (@ret) and
-                        * allocate PI state with an initial refcount on
-                        * it.
-                        */
-                       ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
-                                                &exiting);
-                       WARN_ON(ret);
-               }
-
                switch (ret) {
                case 0:
                        /* We hold a reference on the pi state. */
                        break;
 
+               case 1:
+                       /*
+                        * futex_proxy_trylock_atomic() acquired the user space
+                        * futex. Adjust task_count.
+                        */
+                       task_count++;
+                       ret = 0;
+                       break;
+
                /*
                 * If the above failed, then pi_state is NULL and
                 * waiter::requeue_state is correct.
@@ -2395,9 +2428,8 @@ retry_private:
        }
 
        /*
-        * We took an extra initial reference to the pi_state either in
-        * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need
-        * to drop it here again.
+        * We took an extra initial reference to the pi_state in
+        * futex_proxy_trylock_atomic(). We need to drop it here again.
         */
        put_pi_state(pi_state);
 
index 8eabdc7..6bb116c 100644 (file)
@@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
                 * other configuration and we fail to report; also, see
                 * lockdep.
                 */
-               if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx)
+               if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
                        ret = 0;
 
                raw_spin_unlock(&lock->wait_lock);
index 9215b4d..000e8d5 100644 (file)
@@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 
 #include "rwbase_rt.c"
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void __rwsem_init(struct rw_semaphore *sem, const char *name,
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
                  struct lock_class_key *key)
 {
+       init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
-}
-EXPORT_SYMBOL(__rwsem_init);
 #endif
+}
+EXPORT_SYMBOL(__init_rwsem);
 
 static inline void __down_read(struct rw_semaphore *sem)
 {
index c4462c4..1bba412 100644 (file)
@@ -8836,7 +8836,6 @@ static void balance_push(struct rq *rq)
        struct task_struct *push_task = rq->curr;
 
        lockdep_assert_rq_held(rq);
-       SCHED_WARN_ON(rq->cpu != smp_processor_id());
 
        /*
         * Ensure the thing is persistent until balance_push_set(.on = false);
@@ -8844,9 +8843,10 @@ static void balance_push(struct rq *rq)
        rq->balance_callback = &balance_push_callback;
 
        /*
-        * Only active while going offline.
+        * Only active while going offline and when invoked on the outgoing
+        * CPU.
         */
-       if (!cpu_dying(rq->cpu))
+       if (!cpu_dying(rq->cpu) || rq != this_rq())
                return;
 
        /*
index 912b47a..d17b0a5 100644 (file)
@@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
        cpuidle_use_deepest_state(latency_ns);
 
        it.done = 0;
-       hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        it.timer.function = idle_inject_timer_fn;
        hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
-                     HRTIMER_MODE_REL_PINNED);
+                     HRTIMER_MODE_REL_PINNED_HARD);
 
        while (!READ_ONCE(it.done))
                do_idle();
index 388e65d..8d252f6 100644 (file)
@@ -219,13 +219,12 @@ static int __init
 trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
                          char *end, const char *key)
 {
-       struct xbc_node *knode, *anode;
+       struct xbc_node *anode;
        const char *p;
        char sep;
 
-       knode = xbc_node_find_child(hnode, key);
-       if (knode) {
-               anode = xbc_node_get_child(knode);
+       p = xbc_node_find_value(hnode, key, &anode);
+       if (p) {
                if (!anode) {
                        pr_err("hist.%s requires value(s).\n", key);
                        return -EINVAL;
@@ -263,9 +262,9 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
        append_printf(bufp, end, ":%s(%s)", handler, p);
 
        /* Compose 'action' parameter */
-       knode = xbc_node_find_child(hnode, "trace");
+       knode = xbc_node_find_subkey(hnode, "trace");
        if (!knode)
-               knode = xbc_node_find_child(hnode, "save");
+               knode = xbc_node_find_subkey(hnode, "save");
 
        if (knode) {
                anode = xbc_node_get_child(knode);
@@ -284,7 +283,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
                                sep = ',';
                }
                append_printf(bufp, end, ")");
-       } else if (xbc_node_find_child(hnode, "snapshot")) {
+       } else if (xbc_node_find_subkey(hnode, "snapshot")) {
                append_printf(bufp, end, ".snapshot()");
        } else {
                pr_err("hist.%s requires an action.\n",
@@ -315,7 +314,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
                        break;
        }
 
-       if (xbc_node_find_child(hnode, param))
+       if (xbc_node_find_subkey(hnode, param))
                ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
 
        return ret;
@@ -375,7 +374,7 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
        if (p)
                append_printf(&buf, end, ":name=%s", p);
 
-       node = xbc_node_find_child(hnode, "var");
+       node = xbc_node_find_subkey(hnode, "var");
        if (node) {
                xbc_node_for_each_key_value(node, knode, p) {
                        /* Expression must not include spaces. */
@@ -386,21 +385,21 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
        }
 
        /* Histogram control attributes (mutual exclusive) */
-       if (xbc_node_find_child(hnode, "pause"))
+       if (xbc_node_find_value(hnode, "pause", NULL))
                append_printf(&buf, end, ":pause");
-       else if (xbc_node_find_child(hnode, "continue"))
+       else if (xbc_node_find_value(hnode, "continue", NULL))
                append_printf(&buf, end, ":continue");
-       else if (xbc_node_find_child(hnode, "clear"))
+       else if (xbc_node_find_value(hnode, "clear", NULL))
                append_printf(&buf, end, ":clear");
 
        /* Histogram handler and actions */
-       node = xbc_node_find_child(hnode, "onmax");
+       node = xbc_node_find_subkey(hnode, "onmax");
        if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
                return -EINVAL;
-       node = xbc_node_find_child(hnode, "onchange");
+       node = xbc_node_find_subkey(hnode, "onchange");
        if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
                return -EINVAL;
-       node = xbc_node_find_child(hnode, "onmatch");
+       node = xbc_node_find_subkey(hnode, "onmatch");
        if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
                return -EINVAL;
 
@@ -437,7 +436,7 @@ trace_boot_init_histograms(struct trace_event_file *file,
                }
        }
 
-       if (xbc_node_find_child(hnode, "keys")) {
+       if (xbc_node_find_subkey(hnode, "keys")) {
                if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
                        tmp = kstrdup(buf, GFP_KERNEL);
                        if (trigger_process_regex(file, buf) < 0)
@@ -496,7 +495,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
                        else if (trigger_process_regex(file, buf) < 0)
                                pr_err("Failed to apply an action: %s\n", p);
                }
-               anode = xbc_node_find_child(enode, "hist");
+               anode = xbc_node_find_subkey(enode, "hist");
                if (anode)
                        trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
        } else if (xbc_node_find_value(enode, "actions", NULL))
@@ -518,7 +517,7 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
        bool enable, enable_all = false;
        const char *data;
 
-       node = xbc_node_find_child(node, "event");
+       node = xbc_node_find_subkey(node, "event");
        if (!node)
                return;
        /* per-event key starts with "event.GROUP.EVENT" */
@@ -621,7 +620,7 @@ trace_boot_init_instances(struct xbc_node *node)
        struct trace_array *tr;
        const char *p;
 
-       node = xbc_node_find_child(node, "instance");
+       node = xbc_node_find_subkey(node, "instance");
        if (!node)
                return;
 
index 9270174..f8419cf 100644 (file)
@@ -142,16 +142,16 @@ xbc_node_match_prefix(struct xbc_node *node, const char **prefix)
 }
 
 /**
- * xbc_node_find_child() - Find a child node which matches given key
+ * xbc_node_find_subkey() - Find a subkey node which matches given key
  * @parent: An XBC node.
  * @key: A key string.
  *
- * Search a node under @parent which matches @key. The @key can contain
+ * Search a key node under @parent which matches @key. The @key can contain
  * several words joined with '.'. If @parent is NULL, this searches for
  * the node in the whole tree. Return NULL if no node is matched.
  */
 struct xbc_node * __init
-xbc_node_find_child(struct xbc_node *parent, const char *key)
+xbc_node_find_subkey(struct xbc_node *parent, const char *key)
 {
        struct xbc_node *node;
 
@@ -191,7 +191,7 @@ const char * __init
 xbc_node_find_value(struct xbc_node *parent, const char *key,
                    struct xbc_node **vnode)
 {
-       struct xbc_node *node = xbc_node_find_child(parent, key);
+       struct xbc_node *node = xbc_node_find_subkey(parent, key);
 
        if (!node || !xbc_node_is_key(node))
                return NULL;
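
For readers less familiar with bootconfig internals, a small hedged sketch of the
distinction this rename leans on: value-bearing keys such as "pause" or "keys" are
queried with xbc_node_find_value(), while keys that only group further subkeys, such
as "onmax" or "var", are walked with xbc_node_find_subkey(). The helper below is
hypothetical and merely exercises the two calls already used by this patch.

#include <linux/bootconfig.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/printk.h>

/* Hypothetical __init helper showing the two lookup flavours. */
static int __init hist_node_sketch(struct xbc_node *hnode)
{
	struct xbc_node *anode, *node;
	const char *p;

	/* "keys" is a key node carrying one or more values under the hist node */
	p = xbc_node_find_value(hnode, "keys", &anode);
	if (!p)
		return -ENOENT;

	/* "onmax" only groups further subkeys (handler parameters) */
	node = xbc_node_find_subkey(hnode, "onmax");
	if (node)
		xbc_node_for_each_key_value(node, anode, p)
			pr_info("onmax param: %s\n", p);

	return 0;
}
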
index 3e02cc3..e2c0cfb 100644 (file)
@@ -2014,7 +2014,7 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
 {
        const struct vsock_transport *transport;
        struct vsock_sock *vsk;
-       ssize_t record_len;
+       ssize_t msg_len;
        long timeout;
        int err = 0;
        DEFINE_WAIT(wait);
@@ -2028,9 +2028,9 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
        if (err <= 0)
                goto out;
 
-       record_len = transport->seqpacket_dequeue(vsk, msg, flags);
+       msg_len = transport->seqpacket_dequeue(vsk, msg, flags);
 
-       if (record_len < 0) {
+       if (msg_len < 0) {
                err = -ENOMEM;
                goto out;
        }
@@ -2044,14 +2044,14 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
                 * packet.
                 */
                if (flags & MSG_TRUNC)
-                       err = record_len;
+                       err = msg_len;
                else
                        err = len - msg_data_left(msg);
 
                /* Always set MSG_TRUNC if real length of packet is
                 * bigger than user's buffer.
                 */
-               if (record_len > len)
+               if (msg_len > len)
                        msg->msg_flags |= MSG_TRUNC;
        }
 
index 081e7ae..59ee1be 100644 (file)
@@ -76,8 +76,12 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
                        goto out;
 
                if (msg_data_left(info->msg) == 0 &&
-                   info->type == VIRTIO_VSOCK_TYPE_SEQPACKET)
-                       pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                   info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+                       pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+
+                       if (info->msg->msg_flags & MSG_EOR)
+                               pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+               }
        }
 
        trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -457,9 +461,12 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
                                dequeued_len += pkt_len;
                }
 
-               if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+               if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
                        msg_ready = true;
                        vvs->msg_count--;
+
+                       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
+                               msg->msg_flags |= MSG_EOR;
                }
 
                virtio_transport_dec_rx_pkt(vvs, pkt);
@@ -1029,7 +1036,7 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
                goto out;
        }
 
-       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
+       if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)
                vvs->msg_count++;
 
        /* Try to copy small packets into the buffer of last packet queued,
@@ -1044,12 +1051,12 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
 
                /* If there is space in the last packet queued, we copy the
                 * new packet in its buffer. We avoid this if the last packet
-                * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is
-                * delimiter of SEQPACKET record, so 'pkt' is the first packet
-                * of a new record.
+                * queued has VIRTIO_VSOCK_SEQ_EOM set, because it is the
+                * delimiter of a SEQPACKET message, so 'pkt' is the first
+                * packet of a new message.
                 */
                if ((pkt->len <= last_pkt->buf_len - last_pkt->len) &&
-                   !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) {
+                   !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) {
                        memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
                               pkt->len);
                        last_pkt->len += pkt->len;
index c30dab7..5ddcb76 100644 (file)
@@ -79,7 +79,7 @@ position p : script:python() { relevant(p) };
   } else {
     ... when != krealloc(E, ...)
         when any
-*   \(kfree\|kzfree\)(E)
+*   \(kfree\|kfree_sensitive\)(E)
     ...
   }
 
index 9be48b5..676edd5 100644 (file)
@@ -123,6 +123,8 @@ hlist_for_each_entry_safe(c,...) S
 |
 list_remove_head(x,c,...)
 |
+list_entry_is_head(c,...)
+|
 sizeof(<+...c...+>)
 |
  &c->member
index 0ef3abf..f355869 100644 (file)
@@ -349,6 +349,7 @@ static int do_file(char const *const fname, void *addr)
        case EM_ARM:
        case EM_MICROBLAZE:
        case EM_MIPS:
+       case EM_RISCV:
        case EM_XTENSA:
                break;
        default:
index 67766bf..2a3638c 100644 (file)
@@ -282,6 +282,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts)
 }
 
 #define MESSAGES_CNT 7
+#define MSG_EOR_IDX (MESSAGES_CNT / 2)
 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 {
        int fd;
@@ -294,7 +295,7 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 
        /* Send several messages, one with MSG_EOR flag */
        for (int i = 0; i < MESSAGES_CNT; i++)
-               send_byte(fd, 1, 0);
+               send_byte(fd, 1, (i == MSG_EOR_IDX) ? MSG_EOR : 0);
 
        control_writeln("SENDDONE");
        close(fd);
@@ -324,6 +325,11 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts)
                        perror("message bound violated");
                        exit(EXIT_FAILURE);
                }
+
+               if ((i == MSG_EOR_IDX) ^ !!(msg.msg_flags & MSG_EOR)) {
+                       perror("MSG_EOR");
+                       exit(EXIT_FAILURE);
+               }
        }
 
        close(fd);
index 9db867d..f9c52b7 100644 (file)
@@ -10,10 +10,9 @@ override CFLAGS+= $(call cc-option,-O3,-O1) ${WARNFLAGS}
 # Add "-fstack-protector" only if toolchain supports it.
 override CFLAGS+= $(call cc-option,-fstack-protector-strong)
 CC?= $(CROSS_COMPILE)gcc
-PKG_CONFIG?= pkg-config
+PKG_CONFIG?= $(CROSS_COMPILE)pkg-config
 
 override CFLAGS+=-D VERSION=\"$(VERSION)\"
-LDFLAGS+=
 TARGET=tmon
 
 INSTALL_PROGRAM=install -m 755 -p
@@ -33,7 +32,6 @@ override CFLAGS += $(shell $(PKG_CONFIG) --cflags $(STATIC) panelw ncursesw 2> /
                     $(PKG_CONFIG) --cflags $(STATIC) panel ncurses 2> /dev/null)
 
 OBJS = tmon.o tui.o sysfs.o pid.o
-OBJS +=
 
 tmon: $(OBJS) Makefile tmon.h
        $(CC) $(CFLAGS) $(LDFLAGS) $(OBJS)  -o $(TARGET) $(TMON_LIBS)
@@ -42,15 +40,13 @@ valgrind: tmon
         sudo valgrind -v --track-origins=yes --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes ./$(TARGET)  1> /dev/null
 
 install:
-       - mkdir -p $(INSTALL_ROOT)/$(BINDIR)
-       - $(INSTALL_PROGRAM) "$(TARGET)" "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)"
+       - $(INSTALL_PROGRAM) -D "$(TARGET)" "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)"
 
 uninstall:
        $(DEL_FILE) "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)"
 
 clean:
-       find . -name "*.o" | xargs $(DEL_FILE)
-       rm -f $(TARGET)
+       rm -f $(TARGET) $(OBJS)
 
 dist:
        git tag v$(VERSION)