Merge tag 'perf-tools-2020-12-24' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Dec 2020 19:07:34 +0000 (11:07 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Dec 2020 19:07:34 +0000 (11:07 -0800)
Pull more perf tools updates from Arnaldo Carvalho de Melo:

 - Refactor 'perf stat' per CPU/socket/die/thread aggregation, fixing
   use cases on ARM machines (see the aggregation-ID sketch below).

 - Fix a memory leak when synthesizing SDT probes in 'perf probe' (see
   the strbuf cleanup sketch below).

 - Update kernel header copies related to KVM, epoll_pwait, msr-index,
   and the powerpc and s390 syscall tables.
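
A note on the aggregation rework: it replaces the single packed integer
that 'perf stat' previously used to identify an aggregation unit with a
struct carrying one member per topology level, so systems whose
socket/die/core numbering is sparse or asymmetric (common on ARM) can no
longer alias two different units to the same ID. A minimal C sketch of
the idea, with member names taken from the commit titles below; the
exact in-tree layout and helpers may differ:

    #include <stdbool.h>

    /* Sketch only: one member per aggregation level, replacing the old
     * scheme that packed topology bits into a single int. */
    struct aggr_cpu_id {
            int thread;
            int node;
            int socket;
            int die;
            int core;
    };

    /* Map of aggregation IDs, one entry per aggregated CPU. */
    struct cpu_aggr_map {
            int nr;
            struct aggr_cpu_id map[];
    };

    /* Two IDs denote the same unit only when every level agrees. */
    static bool aggr_cpu_id_equal(const struct aggr_cpu_id *a,
                                  const struct aggr_cpu_id *b)
    {
            return a->thread == b->thread && a->node == b->node &&
                   a->socket == b->socket && a->die == b->die &&
                   a->core == b->core;
    }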
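
The SDT leak fix follows the usual tools/perf strbuf discipline: the
temporary buffer used while composing the probe command must be released
on every exit path. A hedged illustration of that pattern using the real
strbuf API but a hypothetical helper (the function and variable names
below are illustrative, not the actual patch):

    #include "strbuf.h"  /* tools/perf strbuf API */

    /* Hypothetical helper: the caller owns the detached string on
     * success; every path releases the temporary buffer, so nothing
     * leaks on error. */
    static char *synthesize_probe_cmd(const char *name, const char *loc)
    {
            struct strbuf buf;
            char *cmd = NULL;

            if (strbuf_init(&buf, 64) < 0)
                    return NULL;
            if (strbuf_addf(&buf, "p:sdt_%s %s", name, loc) < 0)
                    goto out;        /* error path releases buf below */
            cmd = strbuf_detach(&buf, NULL);
    out:
            strbuf_release(&buf);    /* no-op after a successful detach */
            return cmd;
    }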

* tag 'perf-tools-2020-12-24' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (24 commits)
  perf probe: Fix memory leak when synthesizing SDT probes
  perf stat aggregation: Add separate thread member
  perf stat aggregation: Add separate core member
  perf stat aggregation: Add separate die member
  perf stat aggregation: Add separate socket member
  perf stat aggregation: Add separate node member
  perf stat aggregation: Start using cpu_aggr_id in map
  perf cpumap: Drop in cpu_aggr_map struct
  perf cpumap: Add new map type for aggregation
  perf stat: Replace aggregation ID with a struct
  perf cpumap: Add new struct for cpu aggregation
  perf cpumap: Use existing allocator to avoid using malloc
  perf tests: Improve topology test to check all aggregation types
  perf tools: Update s390's syscall.tbl copy from the kernel sources
  perf tools: Update powerpc's syscall.tbl copy from the kernel sources
  perf s390: Move syscall.tbl check into check-headers.sh
  perf powerpc: Move syscall.tbl check to check-headers.sh
  tools headers UAPI: Synch KVM's svm.h header with the kernel
  tools kvm headers: Update KVM headers from the kernel sources
  tools headers UAPI: Sync KVM's vmx.h header with the kernel sources
  ...

262 files changed:
Documentation/admin-guide/sysctl/vm.rst
Documentation/devicetree/bindings/arm/idle-states.yaml
Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml
Documentation/devicetree/bindings/bus/baikal,bt1-axi.yaml
Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml
Documentation/devicetree/bindings/connector/usb-connector.yaml
Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
Documentation/devicetree/bindings/display/bridge/anx6345.yaml
Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml
Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml
Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml
Documentation/devicetree/bindings/display/bridge/ps8640.yaml
Documentation/devicetree/bindings/display/bridge/simple-bridge.yaml
Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.yaml
Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml
Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml
Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml
Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml
Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml
Documentation/devicetree/bindings/dma/dma-common.yaml
Documentation/devicetree/bindings/dma/dma-router.yaml
Documentation/devicetree/bindings/dma/ingenic,dma.yaml
Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml
Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml
Documentation/devicetree/bindings/eeprom/at24.yaml
Documentation/devicetree/bindings/eeprom/at25.yaml
Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml
Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml
Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml
Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml
Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml
Documentation/devicetree/bindings/iio/light/upisemi,us5182.yaml
Documentation/devicetree/bindings/iio/proximity/semtech,sx9310.yaml
Documentation/devicetree/bindings/input/fsl,mpr121-touchkey.yaml
Documentation/devicetree/bindings/input/gpio-keys.yaml
Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml
Documentation/devicetree/bindings/interrupt-controller/mti,gic.yaml
Documentation/devicetree/bindings/interrupt-controller/ti,pruss-intc.yaml
Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml
Documentation/devicetree/bindings/leds/backlight/common.yaml
Documentation/devicetree/bindings/leds/common.yaml
Documentation/devicetree/bindings/leds/leds-lp55xx.yaml
Documentation/devicetree/bindings/mailbox/arm,mhu.yaml
Documentation/devicetree/bindings/media/coda.yaml
Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml
Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml
Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml
Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml
Documentation/devicetree/bindings/mfd/st,stmfx.yaml
Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml
Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml
Documentation/devicetree/bindings/net/dsa/dsa.yaml
Documentation/devicetree/bindings/net/ethernet-controller.yaml
Documentation/devicetree/bindings/net/ethernet-phy.yaml
Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml
Documentation/devicetree/bindings/net/mdio.yaml
Documentation/devicetree/bindings/net/mediatek,star-emac.yaml
Documentation/devicetree/bindings/net/qcom,ipa.yaml
Documentation/devicetree/bindings/net/snps,dwmac.yaml
Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml
Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml
Documentation/devicetree/bindings/net/ti,dp83867.yaml
Documentation/devicetree/bindings/net/ti,dp83869.yaml
Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml
Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml
Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml
Documentation/devicetree/bindings/power/mediatek,power-controller.yaml
Documentation/devicetree/bindings/power/supply/cw2015_battery.yaml
Documentation/devicetree/bindings/powerpc/sleep.yaml
Documentation/devicetree/bindings/regulator/anatop-regulator.yaml
Documentation/devicetree/bindings/serial/8250.yaml
Documentation/devicetree/bindings/serial/litex,liteuart.yaml
Documentation/devicetree/bindings/soc/litex/litex,soc-controller.yaml
Documentation/devicetree/bindings/soc/mediatek/devapc.yaml
Documentation/devicetree/bindings/soc/ti/k3-ringacc.yaml
Documentation/devicetree/bindings/soc/xilinx/xlnx,vcu-settings.yaml
Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-codec.yaml
Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml
Documentation/devicetree/bindings/sound/st,stm32-sai.yaml
Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml
Documentation/devicetree/bindings/usb/renesas,usbhs.yaml
Documentation/filesystems/ext4/journal.rst
Documentation/process/submit-checklist.rst
Documentation/process/submitting-patches.rst
MAINTAINERS
arch/alpha/kernel/sys_jensen.c
arch/arm/include/asm/efi.h
arch/arm/kernel/smp.c
arch/arm64/Kconfig
arch/arm64/include/asm/efi.h
arch/arm64/kernel/smp.c
arch/parisc/kernel/irq.c
arch/powerpc/boot/Makefile
arch/powerpc/include/asm/ppc_asm.h
arch/powerpc/include/asm/vdso/timebase.h
arch/powerpc/kernel/head_32.h
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/vdso32/Makefile
arch/powerpc/kernel/vdso64/Makefile
arch/riscv/include/asm/efi.h
arch/riscv/mm/init.c
arch/s390/kernel/irq.c
arch/x86/boot/compressed/Makefile
arch/x86/include/asm/efi.h
arch/x86/kernel/Makefile
arch/x86/kernel/ima_arch.c [deleted file]
arch/x86/kernel/topology.c
arch/x86/xen/efi.c
block/blk-iocost.c
block/blk-mq.c
block/genhd.c
block/partitions/core.c
drivers/acpi/nfit/core.c
drivers/block/nbd.c
drivers/block/rnbd/rnbd-clt-sysfs.c
drivers/block/rnbd/rnbd-clt.c
drivers/block/rnbd/rnbd-clt.h
drivers/block/rnbd/rnbd-proto.h
drivers/block/rnbd/rnbd-srv.c
drivers/dax/bus.c
drivers/dax/pmem/core.c
drivers/dax/super.c
drivers/dma-buf/heaps/cma_heap.c
drivers/firmware/efi/Kconfig
drivers/firmware/efi/Makefile
drivers/firmware/efi/capsule.c
drivers/firmware/efi/libstub/efi-stub.c
drivers/firmware/efi/libstub/efistub.h
drivers/firmware/efi/libstub/fdt.c
drivers/firmware/efi/libstub/secureboot.c
drivers/firmware/efi/libstub/x86-stub.c
drivers/firmware/efi/test/efi_test.c
drivers/firmware/efi/test/efi_test.h
drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c
drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c
drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c
drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c
drivers/gpu/drm/amd/display/dc/core/dc.c
drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.h
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c
drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hubp.c
drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c
drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c
drivers/gpu/drm/amd/display/modules/power/power_helpers.c
drivers/gpu/drm/amd/display/modules/power/power_helpers.h
drivers/gpu/drm/amd/pm/inc/smu_v11_0.h
drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
drivers/gpu/drm/arm/display/komeda/komeda_dev.c
drivers/gpu/drm/arm/display/komeda/komeda_kms.c
drivers/gpu/drm/arm/display/komeda/komeda_pipeline.c
drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c
drivers/gpu/drm/i915/display/intel_lpe_audio.c
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/i915_pmu.c
drivers/gpu/drm/i915/i915_pmu.h
drivers/gpu/drm/ttm/ttm_pool.c
drivers/infiniband/ulp/rtrs/rtrs-clt.c
drivers/infiniband/ulp/rtrs/rtrs.h
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/mfd/ab8500-debugfs.c
drivers/net/ethernet/mellanox/mlx4/en_cq.c
drivers/net/ethernet/mellanox/mlx4/en_rx.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
drivers/net/virtio_net.c
drivers/ntb/msi.c
drivers/nvdimm/btt.h
drivers/nvdimm/claim.c
drivers/nvdimm/core.c
drivers/nvdimm/label.c
drivers/pci/controller/mobiveil/pcie-mobiveil-host.c
drivers/pci/controller/pcie-xilinx-nwl.c
drivers/pinctrl/nomadik/pinctrl-nomadik.c
drivers/s390/block/dasd_alias.c
drivers/vdpa/Kconfig
drivers/vdpa/ifcvf/ifcvf_main.c
drivers/vdpa/mlx5/net/mlx5_vnet.c
drivers/vdpa/vdpa.c
drivers/vdpa/vdpa_sim/Makefile
drivers/vdpa/vdpa_sim/vdpa_sim.c
drivers/vdpa/vdpa_sim/vdpa_sim.h [new file with mode: 0644]
drivers/vdpa/vdpa_sim/vdpa_sim_net.c [new file with mode: 0644]
drivers/vhost/scsi.c
drivers/vhost/vdpa.c
drivers/virtio/virtio_mem.c
drivers/virtio/virtio_ring.c
drivers/xen/events/events_base.c
drivers/xen/evtchn.c
fs/block_dev.c
fs/dcache.c
fs/ext4/balloc.c
fs/ext4/block_validity.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/fast_commit.c
fs/ext4/fast_commit.h
fs/ext4/fsync.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/hostfs/hostfs_kern.c
fs/inode.c
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
fs/jbd2/journal.c
fs/namei.c
fs/namespace.c
fs/pnode.h
include/linux/efi.h
include/linux/fs.h
include/linux/interrupt.h
include/linux/irq.h
include/linux/irqdesc.h
include/linux/jbd2.h
include/linux/kernel_stat.h
include/linux/vdpa.h
include/trace/events/iocost.h
include/uapi/linux/virtio_ids.h
kernel/irq/irqdesc.c
kernel/irq/manage.c
kernel/irq/proc.c
mm/memory_hotplug.c
scripts/coccicheck
scripts/coccinelle/api/ptr_ret.cocci [deleted file]
scripts/coccinelle/misc/boolinit.cocci [deleted file]
scripts/nsdeps
security/integrity/ima/Makefile
security/integrity/ima/ima_efi.c [new file with mode: 0644]
security/smack/smack_access.c
tools/virtio/asm/barrier.h
tools/virtio/linux/bug.h
tools/virtio/linux/kernel.h

index e972caa..e35a3f2 100644 (file)
@@ -428,7 +428,7 @@ While most applications need less than a thousand maps, certain
 programs, particularly malloc debuggers, may consume lots of them,
 e.g., up to one or two maps per allocation.
 
-The default value is 65536.
+The default value is 65530.
 
 
 memory_failure_early_kill:
index ea805c1..52bce5d 100644 (file)
@@ -313,7 +313,7 @@ patternProperties:
           wakeup-latency-us by this duration.
 
       idle-state-name:
-        $ref: /schemas/types.yaml#definitions/string
+        $ref: /schemas/types.yaml#/definitions/string
         description:
           A string used as a descriptive name for the idle state.
 
index 0503651..863a287 100644 (file)
@@ -34,7 +34,7 @@ properties:
     description:
       The SRAM that needs to be claimed to access the display engine
       bus.
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
     maxItems: 1
 
   ranges: true
index 0bee469..4ac78b4 100644 (file)
@@ -46,7 +46,7 @@ properties:
     const: 1
 
   syscon:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description: Phandle to the Baikal-T1 System Controller DT node
 
   interrupts:
index e709e53..940486e 100644 (file)
@@ -29,18 +29,18 @@ properties:
       - const: fsl,imx8qxp-lpcg
       - items:
           - enum:
-            - fsl,imx8qm-lpcg
+              - fsl,imx8qm-lpcg
           - const: fsl,imx8qxp-lpcg
       - enum:
-        - fsl,imx8qxp-lpcg-adma
-        - fsl,imx8qxp-lpcg-conn
-        - fsl,imx8qxp-lpcg-dc
-        - fsl,imx8qxp-lpcg-dsp
-        - fsl,imx8qxp-lpcg-gpu
-        - fsl,imx8qxp-lpcg-hsio
-        - fsl,imx8qxp-lpcg-img
-        - fsl,imx8qxp-lpcg-lsio
-        - fsl,imx8qxp-lpcg-vpu
+          - fsl,imx8qxp-lpcg-adma
+          - fsl,imx8qxp-lpcg-conn
+          - fsl,imx8qxp-lpcg-dc
+          - fsl,imx8qxp-lpcg-dsp
+          - fsl,imx8qxp-lpcg-gpu
+          - fsl,imx8qxp-lpcg-hsio
+          - fsl,imx8qxp-lpcg-img
+          - fsl,imx8qxp-lpcg-lsio
+          - fsl,imx8qxp-lpcg-vpu
         deprecated: true
   reg:
     maxItems: 1
index a84464b..4286ed7 100644 (file)
@@ -37,7 +37,7 @@ properties:
     description: Size of the connector, should be specified in case of
       non-fullsize 'usb-a-connector' or 'usb-b-connector' compatible
       connectors.
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
 
     enum:
       - mini
@@ -67,7 +67,7 @@ properties:
   power-role:
     description: Determines the power role that the Type C connector will
       support. "dual" refers to Dual Role Port (DRP).
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
 
     enum:
       - source
@@ -76,7 +76,7 @@ properties:
 
   try-power-role:
     description: Preferred power role.
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
 
     enum:
       - source
@@ -86,7 +86,7 @@ properties:
   data-role:
     description: Data role if Type C connector supports USB data. "dual" refers
       Dual Role Device (DRD).
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
 
     enum:
       - host
@@ -105,7 +105,7 @@ properties:
         Type-C Cable and Connector specification, when Power Delivery is not
         supported.
     allOf:
-      - $ref: /schemas/types.yaml#definitions/string
+      - $ref: /schemas/types.yaml#/definitions/string
     enum:
       - default
       - 1.5A
index 60585a4..9392b55 100644 (file)
@@ -49,8 +49,8 @@ properties:
           Video port for panel or connector.
 
     required:
-        - port@0
-        - port@1
+      - port@0
+      - port@1
 
 required:
   - compatible
index 8c0e4f2..fccd635 100644 (file)
@@ -26,11 +26,9 @@ properties:
     description: GPIO connected to active low reset
 
   dvdd12-supply:
-    maxItems: 1
     description: Regulator for 1.2V digital core power.
 
   dvdd25-supply:
-    maxItems: 1
     description: Regulator for 2.5V digital core power.
 
   ports:
index ab5be26..35c9dfd 100644 (file)
@@ -39,10 +39,10 @@ properties:
 
     properties:
       '#address-cells':
-       const: 1
+        const: 1
 
       '#size-cells':
-       const: 0
+        const: 0
 
       port@0:
         type: object
index efbb3d0..02cfc0a 100644 (file)
@@ -35,11 +35,9 @@ properties:
     maxItems: 1
 
   ovdd-supply:
-    maxItems: 1
     description: I/O voltage
 
   pwr18-supply:
-    maxItems: 1
     description: core voltage
 
   interrupts:
index e5e3c72..66a14d6 100644 (file)
@@ -79,8 +79,7 @@ properties:
       The GPIO used to control the power down line of this device.
     maxItems: 1
 
-  power-supply:
-    maxItems: 1
+  power-supply: true
 
 required:
   - compatible
index 7e27cfc..763c790 100644 (file)
@@ -35,11 +35,9 @@ properties:
     description: GPIO connected to active low reset.
 
   vdd12-supply:
-    maxItems: 1
     description: Regulator for 1.2V digital core power.
 
   vdd33-supply:
-    maxItems: 1
     description: Regulator for 3.3V digital core power.
 
   ports:
index 3ddb35f..64e8a1c 100644 (file)
@@ -60,7 +60,6 @@ properties:
     description: GPIO controlling bridge enable
 
   vdd-supply:
-    maxItems: 1
     description: Power supply for the bridge
 
 required:
index 469ac4a..3d5ce08 100644 (file)
@@ -74,7 +74,6 @@ properties:
     description: Power down GPIO signal, pin name "/PDWN", active low.
 
   vcc-supply:
-    maxItems: 1
     description:
       Power supply for the TTL output, TTL CLOCKOUT signal, LVDS input, PLL and
       digital circuitry.
index fd3113a..b5959cc 100644 (file)
@@ -28,11 +28,9 @@ properties:
     description: i2c address of the bridge, 0x0f
 
   vdd-supply:
-    maxItems: 1
     description: 1.2V LVDS Power Supply
 
   vddio-supply:
-    maxItems: 1
     description: 1.8V IO Power Supply
 
   stby-gpios:
index 40caa61..a222b52 100644 (file)
@@ -18,8 +18,8 @@ description: |
 properties:
   compatible:
     items:
-     - const: intel,keembay-msscam
-     - const: syscon
+      - const: intel,keembay-msscam
+      - const: syscon
 
   reg:
     maxItems: 1
index 91cb4c3..a108029 100644 (file)
@@ -32,7 +32,7 @@ required:
   - power-supply
   - reset-gpios
 
-additionalProperties: false
+unevaluatedProperties: false
 
 examples:
   - |
index d2170de..2f5df1d 100644 (file)
@@ -22,7 +22,7 @@ properties:
   compatible:
     items:
       - enum:
-         - tianma,fhd-video
+          - tianma,fhd-video
       - const: novatek,nt36672a
     description: This indicates the panel manufacturer of the panel that is
       in turn using the NT36672A panel driver. This compatible string
index 7b9d468..403d579 100644 (file)
@@ -98,7 +98,6 @@ properties:
     maxItems: 1
 
   dmas:
-    maxItems: 4
     items:
       - description: Video layer, plane 0 (RGB or luma)
       - description: Video layer, plane 1 (U/V or U)
index 307b499..ad06d36 100644 (file)
@@ -38,12 +38,12 @@ properties:
       maxItems: 255
 
   dma-channels:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Number of DMA channels supported by the controller.
 
   dma-requests:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Number of DMA request signals supported by the controller.
 
index 4cee566..e727484 100644 (file)
@@ -23,7 +23,7 @@ properties:
     pattern: "^dma-router(@.*)?$"
 
   dma-masters:
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
     description:
       Array of phandles to the DMA controllers the router can direct
       the signal to.
index 00f19b3..6a20437 100644 (file)
@@ -48,7 +48,7 @@ properties:
         ingenic,reserved-channels property.
 
   ingenic,reserved-channels:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: >
       Bitmask of channels to reserve for devices that need a specific
       channel. These channels will only be assigned when explicitely
index b548e47..c07eb6f 100644 (file)
@@ -73,7 +73,6 @@ properties:
     maxItems: 1
 
   clock-names:
-    maxItems: 1
     items:
       - const: fck
 
index ef1d687..6b35089 100644 (file)
@@ -54,7 +54,7 @@ properties:
     maximum: 16
 
   dma-masters:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Number of DMA masters supported by the controller. In case if
       not specified the driver will try to auto-detect this and
@@ -63,7 +63,7 @@ properties:
     maximum: 4
 
   chan_allocation_order:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       DMA channels allocation order specifier. Zero means ascending order
       (first free allocated), while one - descending (last free allocated).
@@ -71,7 +71,7 @@ properties:
     enum: [0, 1]
 
   chan_priority:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       DMA channels priority order. Zero means ascending channels priority
       so the very first channel has the highest priority. While 1 means
@@ -80,7 +80,7 @@ properties:
     enum: [0, 1]
 
   block_size:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: Maximum block size supported by the DMA controller.
     enum: [3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095]
 
@@ -139,7 +139,7 @@ properties:
         default: 256
 
   snps,dma-protection-control:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Bits one-to-one passed to the AHB HPROT[3:1] bus. Each bit setting
       indicates the following features: bit 0 - privileged mode,
index 6edfa70..d5117c6 100644 (file)
@@ -131,7 +131,7 @@ properties:
     default: 1
 
   read-only:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Disables writes to the eeprom.
 
@@ -141,7 +141,7 @@ properties:
       Total eeprom size in bytes.
 
   no-read-rollover:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Indicates that the multi-address eeprom does not automatically roll
       over reads to the next slave address. Please consult the manual of
index 7449736..121a601 100644 (file)
@@ -45,13 +45,13 @@ properties:
   spi-max-frequency: true
 
   pagesize:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     enum: [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
     description:
       Size of the eeprom page.
 
   size:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Total eeprom size in bytes.
 
index 6f3e3c0..b79f069 100644 (file)
@@ -32,7 +32,7 @@ properties:
       PVT controller has 5 VM (voltage monitor) sensors.
       vm-map defines CPU core to VM instance mapping. A
       value of 0xff means that VM sensor is unused.
-    $ref: /schemas/types.yaml#definitions/uint8-array
+    $ref: /schemas/types.yaml#/definitions/uint8-array
     maxItems: 5
 
   clocks:
index c523a1b..7d49478 100644 (file)
@@ -29,12 +29,12 @@ properties:
     const: 0x70
 
   sensirion,blocking-io:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       If set, the driver hold the i2c bus until measurement is finished.
 
   sensirion,low-precision:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       If set, the sensor aquire data with low precision (not recommended).
       The driver aquire data with high precision by default.
index c17e5d3..8020d73 100644 (file)
@@ -61,7 +61,7 @@ properties:
       Array of three(TMP513) or two(TMP512) n-Factor value for each remote
       temperature channel.
       See datasheet Table 11 for n-Factor range list and value interpretation.
-    $ref: /schemas/types.yaml#definitions/uint32-array
+    $ref: /schemas/types.yaml#/definitions/uint32-array
     minItems: 2
     maxItems: 3
     items:
index 6a991e9..2716d4e 100644 (file)
@@ -17,8 +17,7 @@ properties:
       - lltc,ltc2496
 
   vref-supply:
-    description: phandle to an external regulator providing the reference voltage
-    $ref: /schemas/types.yaml#/definitions/phandle
+    description: Power supply for the reference voltage
 
   reg:
     description: spi chipselect number according to the usual spi bindings
index 7037f82..88384b6 100644 (file)
@@ -22,8 +22,7 @@ properties:
       - ti,hdc2010
       - ti,hdc2080
 
-  vdd-supply:
-    maxItems: 1
+  vdd-supply: true
 
   reg:
     maxItems: 1
index 4a9b282..de5882c 100644 (file)
@@ -45,7 +45,7 @@ properties:
     default: 0x16
 
   upisemi,continuous:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description: |
       This chip has two power modes: one-shot (chip takes one measurement and
       then shuts itself down) and continuous (chip takes continuous
index ccfb163..5de0bb2 100644 (file)
@@ -72,7 +72,7 @@ properties:
       - finest
 
   semtech,startup-sensor:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     enum: [0, 1, 2, 3]
     default: 0
     description:
@@ -81,7 +81,7 @@ properties:
       compensation.
 
   semtech,proxraw-strength:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     enum: [0, 2, 4, 8]
     default: 2
     description:
@@ -89,7 +89,7 @@ properties:
       represent 1-1/N.
 
   semtech,avg-pos-strength:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     enum: [0, 16, 64, 128, 256, 512, 1024, 4294967295]
     default: 16
     description:
index 378a85c..878464f 100644 (file)
@@ -31,8 +31,7 @@ properties:
   interrupts:
     maxItems: 1
 
-  vdd-supply:
-    maxItems: 1
+  vdd-supply: true
 
   linux,keycodes:
     minItems: 1
index 6966ab0..060a309 100644 (file)
@@ -34,13 +34,13 @@ patternProperties:
 
         linux,code:
           description: Key / Axis code to emit.
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
 
         linux,input-type:
           description:
             Specify event type this button/key generates. If not specified defaults to
             <1> == EV_KEY.
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
 
           default: 1
 
@@ -56,12 +56,12 @@ patternProperties:
 
             linux,input-value = <0xffffffff>; /* -1 */
 
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
 
         debounce-interval:
           description:
             Debouncing interval time in milliseconds. If not specified defaults to 5.
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
 
           default: 5
 
@@ -79,7 +79,7 @@ patternProperties:
               EV_ACT_ANY        - both asserted and deasserted
               EV_ACT_ASSERTED   - asserted
               EV_ACT_DEASSERTED - deasserted
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
           enum: [0, 1, 2]
 
         linux,can-disable:
@@ -118,7 +118,7 @@ then:
     poll-interval:
       description:
         Poll interval time in milliseconds
-      $ref: /schemas/types.yaml#definitions/uint32
+      $ref: /schemas/types.yaml#/definitions/uint32
 
   required:
     - poll-interval
index 4ce1094..bfc3a8b 100644 (file)
@@ -55,8 +55,7 @@ properties:
 
   wakeup-source: true
 
-  vcc-supply:
-    maxItems: 1
+  vcc-supply: true
 
   gain:
     description: Allows setting the sensitivity in the range from 0 to 31.
index 039e08a..91bb3c2 100644 (file)
@@ -42,7 +42,7 @@ properties:
       Specifies the list of CPU interrupt vectors to which the GIC may not
       route interrupts. This property is ignored if the CPU is started in EIC
       mode.
-    $ref: /schemas/types.yaml#definitions/uint32-array
+    $ref: /schemas/types.yaml#/definitions/uint32-array
     minItems: 1
     maxItems: 6
     uniqueItems: true
@@ -56,7 +56,7 @@ properties:
       It accepts two values: the 1st is the starting interrupt and the 2nd is
       the size of the reserved range. If not specified, the driver will
       allocate the last (2 * number of VPEs in the system).
-    $ref: /schemas/types.yaml#definitions/uint32-array
+    $ref: /schemas/types.yaml#/definitions/uint32-array
     items:
       - minimum: 0
         maximum: 254
index 1c4c009..c2ce215 100644 (file)
@@ -80,7 +80,7 @@ properties:
       mapping is provided.
 
   ti,irqs-reserved:
-    $ref: /schemas/types.yaml#definitions/uint8
+    $ref: /schemas/types.yaml#/definitions/uint8
     description: |
       Bitmask of host interrupts between 0 and 7 (corresponding to PRUSS INTC
       output interrupts 2 through 9) that are not connected to the Arm interrupt
index b5af120..3d89668 100644 (file)
@@ -76,7 +76,7 @@ properties:
             "limit" specifies the limit for translation
 
   ti,unmapped-event-sources:
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
     description:
       Array of phandles to DMA controllers where the unmapped events originate.
 
index bc817f7..702ba35 100644 (file)
@@ -22,7 +22,7 @@ properties:
       The default brightness that should be applied to the LED by the operating
       system on start-up. The brightness should not exceed the brightness the
       LED can provide.
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
 
   max-brightness:
     description:
@@ -31,6 +31,6 @@ properties:
       on the brightness apart from what the driver says, as it could happen
       that a LED can be made so bright that it gets damaged or causes damage
       due to restrictions in a specific system, such as mounting conditions.
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
 
 additionalProperties: true
index f1211e7..b1f3637 100644 (file)
@@ -27,21 +27,21 @@ properties:
       List of device current outputs the LED is connected to. The outputs are
       identified by the numbers that must be defined in the LED device binding
       documentation.
-    $ref: /schemas/types.yaml#definitions/uint32-array
+    $ref: /schemas/types.yaml#/definitions/uint32-array
 
   function:
     description:
       LED function. Use one of the LED_FUNCTION_* prefixed definitions
       from the header include/dt-bindings/leds/common.h. If there is no
       matching LED_FUNCTION available, add a new one.
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
 
   color:
     description:
       Color of the LED. Use one of the LED_COLOR_ID_* prefixed definitions from
       the header include/dt-bindings/leds/common.h. If there is no matching
       LED_COLOR_ID available, add a new one.
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     minimum: 0
     maximum: 9
 
@@ -49,7 +49,7 @@ properties:
     description:
       Integer to be used when more than one instance of the same function is
       needed, differing only with an ordinal number.
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
 
   label:
     description:
@@ -66,7 +66,7 @@ properties:
       produced where the LED momentarily turns off (or on). The "keep" setting
       will keep the LED at whatever its current state is, without producing a
       glitch.
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
     enum:
       - on
       - off
@@ -77,7 +77,7 @@ properties:
     description:
       This parameter, if present, is a string defining the trigger assigned to
       the LED.
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
 
     enum:
         # LED will act as a back-light, controlled by the framebuffer system
@@ -109,7 +109,7 @@ properties:
           brightness and duration (in ms).  The exact format is
           described in:
           Documentation/devicetree/bindings/leds/leds-trigger-pattern.txt
-    $ref: /schemas/types.yaml#definitions/uint32-matrix
+    $ref: /schemas/types.yaml#/definitions/uint32-matrix
     items:
       minItems: 2
       maxItems: 2
@@ -143,7 +143,7 @@ properties:
       the device tree and be referenced by a phandle and a set of phandle
       arguments. A length of arguments should be specified by the
       #trigger-source-cells property in the source node.
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
 
   # Required properties for flash LED child nodes:
   flash-max-microamp:
index 58e9747..f552cd1 100644 (file)
@@ -35,7 +35,7 @@ properties:
     description: I2C slave address
 
   clock-mode:
-    $ref: /schemas/types.yaml#definitions/uint8
+    $ref: /schemas/types.yaml#/definitions/uint8
     description: |
       Input clock mode
     enum:
@@ -49,7 +49,7 @@ properties:
       GPIO attached to the chip's enable pin
 
   pwr-sel:
-    $ref: /schemas/types.yaml#definitions/uint8
+    $ref: /schemas/types.yaml#/definitions/uint8
     description: |
       LP8501 specific property. Power selection for output channels.
     enum:
@@ -70,14 +70,14 @@ patternProperties:
     $ref: common.yaml#
     properties:
       led-cur:
-        $ref: /schemas/types.yaml#definitions/uint8
+        $ref: /schemas/types.yaml#/definitions/uint8
         description: |
           Current setting at each LED channel (mA x10, 0 if LED is not connected)
         minimum: 0
         maximum: 255
 
       max-cur:
-        $ref: /schemas/types.yaml#definitions/uint8
+        $ref: /schemas/types.yaml#/definitions/uint8
         description: Maximun current at each LED channel.
 
       reg:
@@ -97,7 +97,7 @@ patternProperties:
           - 8 # LED output D9
 
       chan-name:
-        $ref: /schemas/types.yaml#definitions/string
+        $ref: /schemas/types.yaml#/definitions/string
         description: name of channel
 
 required:
index d43791a..d07eb00 100644 (file)
@@ -61,7 +61,6 @@ properties:
       - description: low-priority non-secure
       - description: high-priority non-secure
       - description: Secure
-    maxItems: 3
 
   clocks:
     maxItems: 1
index 7bac005..36781ee 100644 (file)
@@ -44,6 +44,21 @@ properties:
       - const: per
       - const: ahb
 
+  interrupts:
+    minItems: 1
+    items:
+      - description: BIT processor interrupt
+      - description: JPEG unit interrupt
+
+  interrupt-names:
+    minItems: 1
+    items:
+      - const: bit
+      - const: jpeg
+
+  power-domains:
+    maxItems: 1
+
   resets:
     maxItems: 1
 
@@ -59,6 +74,8 @@ required:
   - clocks
   - clock-names
 
+additionalProperties: false
+
 allOf:
   - if:
       properties:
@@ -68,34 +85,17 @@ allOf:
     then:
       properties:
         interrupts:
-          items:
-            - description: BIT processor interrupt
-            - description: JPEG unit interrupt
+          minItems: 2
 
         interrupt-names:
-          items:
-            - const: bit
-            - const: jpeg
+          minItems: 2
     else:
       properties:
         interrupts:
-          items:
-            - description: BIT processor interrupt
-
-  - if:
-      properties:
-        compatible:
-          contains:
-            enum:
-              - fsl,imx6dl-vpu
-              - fsl,imx6q-vpu
-    then:
-      properties:
-        power-domains:
-          $ref: /schemas/types.yaml#/definitions/phandle
-          description: phandle pointing to the PU power domain
           maxItems: 1
 
+        power-domains: false
+
 examples:
   - |
     vpu: video-codec@63ff4000 {
index 9ea8270..68ee8c7 100644 (file)
@@ -40,7 +40,6 @@ properties:
 
   poc-supply:
     description: Regulator providing Power over Coax to the cameras
-    maxItems: 1
 
   enable-gpios:
     description: GPIO connected to the \#PWDN pin with inverted polarity
index d94bd67..bb35283 100644 (file)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 # Copyright (C) 2014--2020 Intel Corporation
-
+%YAML 1.2
+---
 $id: http://devicetree.org/schemas/media/i2c/mipi-ccs.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
@@ -26,11 +27,11 @@ properties:
   compatible:
     oneOf:
       - items:
-        - const: mipi-ccs-1.1
-        - const: mipi-ccs
+          - const: mipi-ccs-1.1
+          - const: mipi-ccs
       - items:
-        - const: mipi-ccs-1.0
-        - const: mipi-ccs
+          - const: mipi-ccs-1.0
+          - const: mipi-ccs
       - const: nokia,smia
 
   reg:
@@ -38,15 +39,12 @@ properties:
 
   vana-supply:
     description: Analogue voltage supply (VANA), sensor dependent.
-    maxItems: 1
 
   vcore-supply:
     description: Core voltage supply (VCore), sensor dependent.
-    maxItems: 1
 
   vio-supply:
     description: I/O voltage supply (VIO), sensor dependent.
-    maxItems: 1
 
   clocks:
     description: External clock to the sensor.
index 1a3590d..eb12526 100644 (file)
@@ -37,15 +37,12 @@ properties:
 
   vdddo-supply:
     description: Chip digital IO regulator (1.8V).
-    maxItems: 1
 
   vdda-supply:
     description: Chip analog regulator (2.7V).
-    maxItems: 1
 
   vddd-supply:
     description: Chip digital core regulator (1.12V).
-    maxItems: 1
 
   flash-leds:
     description: See ../video-interfaces.txt
index f697e1a..a66acb2 100644 (file)
@@ -33,15 +33,12 @@ properties:
 
   vana-supply:
     description: Sensor 2.8 V analog supply.
-    maxItems: 1
 
   vdig-supply:
     description: Sensor 1.8 V digital core supply.
-    maxItems: 1
 
   vddl-supply:
     description: Sensor digital IO 1.2 V supply.
-    maxItems: 1
 
   port:
     type: object
index 888ab4b..19e9afb 100644 (file)
@@ -26,8 +26,7 @@ properties:
 
   drive-open-drain: true
 
-  vdd-supply:
-    maxItems: 1
+  vdd-supply: true
 
   pinctrl:
     type: object
index c7c9ad4..7f2578d 100644 (file)
@@ -38,7 +38,7 @@ properties:
     const: stmmaceth
 
   syscon:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Phandle to the device containing the EMAC or GMAC clock
       register
@@ -114,7 +114,7 @@ allOf:
     then:
       properties:
         allwinner,leds-active-low:
-          $ref: /schemas/types.yaml#definitions/flag
+          $ref: /schemas/types.yaml#/definitions/flag
           description:
             EPHY LEDs are active low.
 
@@ -126,7 +126,7 @@ allOf:
               const: allwinner,sun8i-h3-mdio-mux
 
             mdio-parent-bus:
-              $ref: /schemas/types.yaml#definitions/phandle
+              $ref: /schemas/types.yaml#/definitions/phandle
               description:
                 Phandle to EMAC MDIO.
 
index 6b057b1..1f133f4 100644 (file)
@@ -60,7 +60,7 @@ allOf:
             - const: timing-adjustment
 
         amlogic,tx-delay-ns:
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
           description:
             The internal RGMII TX clock delay (provided by this driver) in
             nanoseconds. Allowed values are 0ns, 2ns, 4ns, 6ns.
index 8e04463..8a3494d 100644 (file)
@@ -54,7 +54,7 @@ patternProperties:
             description:
               Describes the label associated with this port, which will become
               the netdev name
-            $ref: /schemas/types.yaml#definitions/string
+            $ref: /schemas/types.yaml#/definitions/string
 
           link:
             description:
@@ -62,13 +62,13 @@ patternProperties:
               port is used as the outgoing port towards the phandle ports. The
               full routing information must be given, not just the one hop
               routes to neighbouring switches
-            $ref: /schemas/types.yaml#definitions/phandle-array
+            $ref: /schemas/types.yaml#/definitions/phandle-array
 
           ethernet:
             description:
               Should be a phandle to a valid Ethernet device node.  This host
               device is what the switch port is connected to
-            $ref: /schemas/types.yaml#definitions/phandle
+            $ref: /schemas/types.yaml#/definitions/phandle
 
           phy-handle: true
 
index cc93063..0965f65 100644 (file)
@@ -16,7 +16,7 @@ properties:
   local-mac-address:
     description:
       Specifies the MAC address that was assigned to the network device.
-    $ref: /schemas/types.yaml#definitions/uint8-array
+    $ref: /schemas/types.yaml#/definitions/uint8-array
     items:
       - minItems: 6
         maxItems: 6
@@ -27,20 +27,20 @@ properties:
       program; should be used in cases where the MAC address assigned
       to the device by the boot program is different from the
       local-mac-address property.
-    $ref: /schemas/types.yaml#definitions/uint8-array
+    $ref: /schemas/types.yaml#/definitions/uint8-array
     items:
       - minItems: 6
         maxItems: 6
 
   max-frame-size:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Maximum transfer unit (IEEE defined MTU), rather than the
       maximum frame size (there\'s contradiction in the Devicetree
       Specification).
 
   max-speed:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Specifies maximum speed in Mbit/s supported by the device.
 
@@ -101,7 +101,7 @@ properties:
     $ref: "#/properties/phy-connection-type"
 
   phy-handle:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Specifies a reference to a node representing a PHY device.
 
@@ -114,7 +114,7 @@ properties:
     deprecated: true
 
   rx-fifo-depth:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       The size of the controller\'s receive fifo in bytes. This is used
       for components that can have configurable receive fifo sizes,
@@ -129,12 +129,12 @@ properties:
       If this property is present then the MAC applies the RX delay.
 
   sfp:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Specifies a reference to a node representing a SFP cage.
 
   tx-fifo-depth:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       The size of the controller\'s transmit fifo in bytes. This
       is used for components that can have configurable fifo sizes.
@@ -150,7 +150,7 @@ properties:
     description:
       Specifies the PHY management type. If auto is set and fixed-link
       is not specified, it uses MDIO for management.
-    $ref: /schemas/types.yaml#definitions/string
+    $ref: /schemas/types.yaml#/definitions/string
     default: auto
     enum:
       - auto
@@ -198,17 +198,17 @@ properties:
             speed:
               description:
                 Link speed.
-              $ref: /schemas/types.yaml#definitions/uint32
+              $ref: /schemas/types.yaml#/definitions/uint32
               enum: [10, 100, 1000]
 
             full-duplex:
-              $ref: /schemas/types.yaml#definitions/flag
+              $ref: /schemas/types.yaml#/definitions/flag
               description:
                 Indicates that full-duplex is used. When absent, half
                 duplex is assumed.
 
             asym-pause:
-              $ref: /schemas/types.yaml#definitions/flag
+              $ref: /schemas/types.yaml#/definitions/flag
               description:
                 Indicates that asym_pause should be enabled.
 
index 6dd72fa..2766fe4 100644 (file)
@@ -78,57 +78,57 @@ properties:
       Maximum PHY supported speed in Mbits / seconds.
 
   broken-turn-around:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       If set, indicates the PHY device does not correctly release
       the turn around line low at end of the control phase of the
       MDIO transaction.
 
   enet-phy-lane-swap:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       If set, indicates the PHY will swap the TX/RX lanes to
       compensate for the board being designed with the lanes
       swapped.
 
   eee-broken-100tx:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
   eee-broken-1000t:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
   eee-broken-10gt:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
   eee-broken-1000kx:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
   eee-broken-10gkx4:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
   eee-broken-10gkr:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
   phy-is-integrated:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       If set, indicates that the PHY is integrated into the same
       physical package as the Ethernet MAC. If needed, muxers
@@ -158,7 +158,7 @@ properties:
       this property is missing the delay will be skipped.
 
   sfp:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Specifies a reference to a node representing a SFP cage.
 
index 2159b7d..7f620a7 100644 (file)
@@ -31,7 +31,7 @@ properties:
   phy-mode: true
 
   pcs-handle:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       A reference to a node representing a PCS PHY device found on
       the internal MDIO bus.
index e811e0f..08e15fb 100644 (file)
@@ -70,7 +70,7 @@ patternProperties:
           The ID number for the device.
 
       broken-turn-around:
-        $ref: /schemas/types.yaml#definitions/flag
+        $ref: /schemas/types.yaml#/definitions/flag
         description:
           If set, indicates the MDIO device does not correctly release
           the turn around line low at end of the control phase of the
index 0bbd598..e6a5ff2 100644 (file)
@@ -42,7 +42,7 @@ properties:
       - const: trans
 
   mediatek,pericfg:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Phandle to the device containing the PERICFG register range. This is used
       to control the MII mode.
index 4d8464b..8a2d126 100644 (file)
@@ -114,14 +114,13 @@ properties:
       validating firwmare used by the GSI.
 
   modem-remoteproc:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       This defines the phandle to the remoteproc node representing
       the modem subsystem.  This is requied so the IPA driver can
       receive and act on notifications of modem up/down events.
 
   memory-region:
-    $ref: /schemas/types.yaml#/definitions/phandle-array
     maxItems: 1
     description:
       If present, a phandle for a reserved memory area that holds
index 11a6fdb..b2f6083 100644 (file)
@@ -126,7 +126,7 @@ properties:
       in a different mode than the PHY in order to function.
 
   snps,axi-config:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       AXI BUS Mode parameters. Phandle to a node that can contain the
       following properties
@@ -141,7 +141,7 @@ properties:
         * snps,rb, rebuild INCRx Burst
 
   snps,mtl-rx-config:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Multiple RX Queues parameters. Phandle to a node that can
       contain the following properties
@@ -164,7 +164,7 @@ properties:
           * snps,priority, RX queue priority (Range 0x0 to 0xF)
 
   snps,mtl-tx-config:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Multiple TX Queues parameters. Phandle to a node that can
       contain the following properties
@@ -198,7 +198,7 @@ properties:
 
   snps,reset-active-low:
     deprecated: true
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Indicates that the PHY Reset is active low
 
@@ -208,55 +208,55 @@ properties:
       Triplet of delays. The 1st cell is reset pre-delay in micro
       seconds. The 2nd cell is reset pulse in micro seconds. The 3rd
       cell is reset post-delay in micro seconds.
-    $ref: /schemas/types.yaml#definitions/uint32-array
+    $ref: /schemas/types.yaml#/definitions/uint32-array
     minItems: 3
     maxItems: 3
 
   snps,aal:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Use Address-Aligned Beats
 
   snps,fixed-burst:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Program the DMA to use the fixed burst mode
 
   snps,mixed-burst:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Program the DMA to use the mixed burst mode
 
   snps,force_thresh_dma_mode:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Force DMA to use the threshold mode for both tx and rx
 
   snps,force_sf_dma_mode:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Force DMA to use the Store and Forward mode for both tx and
       rx. This flag is ignored if force_thresh_dma_mode is set.
 
   snps,en-tx-lpi-clockgating:
-    $ref: /schemas/types.yaml#definitions/flag
+    $ref: /schemas/types.yaml#/definitions/flag
     description:
       Enable gating of the MAC TX clock during TX low-power mode
 
   snps,multicast-filter-bins:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Number of multicast filter hash bins supported by this device
       instance
 
   snps,perfect-filter-entries:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Number of perfect filter entries supported by this device
       instance
 
   snps,ps-speed:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description:
       Port selection speed that can be passed to the core when PCS
       is supported. For example, this is used in case of SGMII and
@@ -307,25 +307,25 @@ allOf:
         snps,pbl:
           description:
             Programmable Burst Length (tx and rx)
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
           enum: [2, 4, 8]
 
         snps,txpbl:
           description:
             Tx Programmable Burst Length. If set, DMA tx will use this
             value rather than snps,pbl.
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
           enum: [2, 4, 8]
 
         snps,rxpbl:
           description:
             Rx Programmable Burst Length. If set, DMA rx will use this
             value rather than snps,pbl.
-          $ref: /schemas/types.yaml#definitions/uint32
+          $ref: /schemas/types.yaml#/definitions/uint32
           enum: [2, 4, 8]
 
         snps,no-pbl-x8:
-          $ref: /schemas/types.yaml#definitions/flag
+          $ref: /schemas/types.yaml#/definitions/flag
           description:
             Don\'t multiply the pbl/txpbl/rxpbl values by 8. For core
             rev < 3.50, don\'t multiply the values by 4.
@@ -351,7 +351,7 @@ allOf:
     then:
       properties:
         snps,tso:
-          $ref: /schemas/types.yaml#definitions/flag
+          $ref: /schemas/types.yaml#/definitions/flag
           description:
             Enables the TSO feature otherwise it will be managed by
             MAC HW capability register.
index cbacc04..8a03a24 100644 (file)
@@ -64,7 +64,7 @@ properties:
       - const: ether    # for others
 
   socionext,syscon-phy-mode:
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
     description:
       A phandle to syscon with one argument that configures phy mode.
       The argument is the ID of MAC instance.
index dadeb8f..07a00f5 100644 (file)
@@ -70,7 +70,7 @@ properties:
   pinctrl-names: true
 
   syscon:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       Phandle to the system control device node which provides access to
       efuse IO range with MAC addresses
index 4050a36..047d757 100644 (file)
@@ -47,31 +47,31 @@ properties:
         takes precedence.
 
   tx-fifo-depth:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
        Transmitt FIFO depth see dt-bindings/net/ti-dp83867.h for values
 
   rx-fifo-depth:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
        Receive FIFO depth see dt-bindings/net/ti-dp83867.h for values
 
   ti,clk-output-sel:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Muxing option for CLK_OUT pin.  See dt-bindings/net/ti-dp83867.h
       for applicable values. The CLK_OUT pin can also be disabled by this
       property.  When omitted, the PHY's default will be left as is.
 
   ti,rx-internal-delay:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       RGMII Receive Clock Delay - see dt-bindings/net/ti-dp83867.h
       for applicable values. Required only if interface type is
       PHY_INTERFACE_MODE_RGMII_ID or PHY_INTERFACE_MODE_RGMII_RXID.
 
   ti,tx-internal-delay:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       RGMII Transmit Clock Delay - see dt-bindings/net/ti-dp83867.h
       for applicable values. Required only if interface type is
@@ -101,7 +101,7 @@ properties:
 
   ti,fifo-depth:
     deprecated: true
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
      Transmit FIFO depth - see dt-bindings/net/ti-dp83867.h for applicable
       values.
index c3235f0..70a1209 100644 (file)
@@ -44,22 +44,22 @@ properties:
        to a maximum value (70 ohms).
 
   tx-fifo-depth:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Transmit FIFO depth; see dt-bindings/net/ti-dp83869.h for values
 
   rx-fifo-depth:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Receive FIFO depth; see dt-bindings/net/ti-dp83869.h for values
 
   ti,clk-output-sel:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Muxing option for CLK_OUT pin; see dt-bindings/net/ti-dp83869.h for values.
 
   ti,op-mode:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
        Operational mode for the PHY.  If this is not set then the operational
       mode is set by the straps. See dt-bindings/net/ti-dp83869.h for values.
index 227270c..c47b58f 100644 (file)
@@ -119,12 +119,12 @@ properties:
             description: label associated with this port
 
           ti,mac-only:
-            $ref: /schemas/types.yaml#definitions/flag
+            $ref: /schemas/types.yaml#/definitions/flag
             description:
              Specifies that the port works in mac-only mode.
 
           ti,syscon-efuse:
-            $ref: /schemas/types.yaml#definitions/phandle-array
+            $ref: /schemas/types.yaml#/definitions/phandle-array
             description:
               Phandle to the system control device node which provides access
               to efuse IO range with MAC addresses
index 6af9991..85c2f69 100644 (file)
@@ -136,7 +136,7 @@ properties:
       - const: tcl2host-status-ring
 
   qcom,rproc:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       DT entry of q6v5-wcss remoteproc driver.
       Phandle to a node that can contain the following properties
index 83d5d0a..cbbf5e8 100644 (file)
@@ -44,13 +44,13 @@ properties:
       - const: refclk
 
   syscon-phy-power:
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
     description:
       phandle/offset pair. Phandle to the system control module and
       register offset to power on/off the PHY.
 
   ctrl-module:
-    $ref: /schemas/types.yaml#definitions/phandle
+    $ref: /schemas/types.yaml#/definitions/phandle
     description:
       (deprecated) phandle of the control module used by PHY driver
       to power on the PHY. Use syscon-phy-power instead.
index fd12baf..d14cb9b 100644 (file)
@@ -83,11 +83,11 @@ patternProperties:
           SUSBSYS clocks.
 
       mediatek,infracfg:
-        $ref: /schemas/types.yaml#definitions/phandle
+        $ref: /schemas/types.yaml#/definitions/phandle
         description: phandle to the device containing the INFRACFG register range.
 
       mediatek,smi:
-        $ref: /schemas/types.yaml#definitions/phandle
+        $ref: /schemas/types.yaml#/definitions/phandle
         description: phandle to the device containing the SMI register range.
 
     patternProperties:
@@ -131,11 +131,11 @@ patternProperties:
               SUSBSYS clocks.
 
           mediatek,infracfg:
-            $ref: /schemas/types.yaml#definitions/phandle
+            $ref: /schemas/types.yaml#/definitions/phandle
             description: phandle to the device containing the INFRACFG register range.
 
           mediatek,smi:
-            $ref: /schemas/types.yaml#definitions/phandle
+            $ref: /schemas/types.yaml#/definitions/phandle
             description: phandle to the device containing the SMI register range.
 
         patternProperties:
@@ -179,11 +179,11 @@ patternProperties:
                   SUSBSYS clocks.
 
               mediatek,infracfg:
-                $ref: /schemas/types.yaml#definitions/phandle
+                $ref: /schemas/types.yaml#/definitions/phandle
                 description: phandle to the device containing the INFRACFG register range.
 
               mediatek,smi:
-                $ref: /schemas/types.yaml#definitions/phandle
+                $ref: /schemas/types.yaml#/definitions/phandle
                 description: phandle to the device containing the SMI register range.
 
             required:
index ee92e6a..5fcdf58 100644 (file)
@@ -27,7 +27,7 @@ properties:
       of this binary blob is kept secret by CellWise. The only way to obtain
       it is to mail two batteries to a test facility of CellWise and receive
       back a test report with the binary blob.
-    $ref: /schemas/types.yaml#definitions/uint8-array
+    $ref: /schemas/types.yaml#/definitions/uint8-array
     minItems: 64
     maxItems: 64
 
index 6494c7d..1b0936a 100644 (file)
@@ -42,6 +42,6 @@ select: true
 
 properties:
   sleep:
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
 
 additionalProperties: true
index e7b3abe..0a66338 100644 (file)
@@ -59,7 +59,6 @@ properties:
     description: u32 value representing regulator enable bit offset.
 
   vin-supply:
-    $ref: '/schemas/types.yaml#/definitions/phandle'
     description: input supply phandle.
 
 required:
index c1d4c19..f54cae9 100644 (file)
@@ -126,7 +126,7 @@ properties:
     maxItems: 1
 
   current-speed:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: The current active speed of the UART.
 
   reg-offset:
@@ -154,7 +154,7 @@ properties:
       Set to indicate that the port does not implement loopback test mode.
 
   fifo-size:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: The fifo size of the UART.
 
   auto-flow-control:
@@ -165,7 +165,7 @@ properties:
       property.
 
   tx-threshold:
-    $ref: /schemas/types.yaml#definitions/uint32
+    $ref: /schemas/types.yaml#/definitions/uint32
     description: |
       Specify the TX FIFO low water indication for parts with programmable
       TX FIFO thresholds.
index bc79b3c..c4f1f48 100644 (file)
@@ -29,6 +29,8 @@ required:
   - compatible
   - reg
 
+additionalProperties: false
+
 examples:
   - |
     uart0: serial@e0001800 {
index e2b7887..c8b57c7 100644 (file)
@@ -28,6 +28,8 @@ required:
   - compatible
   - reg
 
+additionalProperties: false
+
 examples:
   - |
     soc_ctrl0: soc-controller@f0000000 {
index 6c763f8..31e4d3c 100644 (file)
@@ -44,6 +44,8 @@ required:
   - clocks
   - clock-names
 
+additionalProperties: false
+
 examples:
   - |
     #include <dt-bindings/interrupt-controller/arm-gic.h>
index c3c595e..ddea3d4 100644 (file)
@@ -55,7 +55,7 @@ properties:
     description: TI-SCI RM subtype for GP ring range
 
   ti,sci:
-    $ref: /schemas/types.yaml#definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/phandle-array
     description: phandle on TI-SCI compatible System controller node
 
   ti,sci-dev-id:
index 378d0ce..cb245f4 100644 (file)
@@ -26,9 +26,18 @@ required:
   - compatible
   - reg
 
+additionalProperties: false
+
 examples:
   - |
-    xlnx_vcu: vcu@a0041000 {
-          compatible = "xlnx,vcu-settings", "syscon";
-          reg = <0x0 0xa0041000 0x0 0x1000>;
+    fpga {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        xlnx_vcu: vcu@a0041000 {
+            compatible = "xlnx,vcu-settings", "syscon";
+            reg = <0x0 0xa0041000 0x0 0x1000>;
+        };
     };
+
+...
index be390ac..dd47fef 100644 (file)
@@ -57,7 +57,7 @@ properties:
       A list of the connections between audio components.  Each entry
       is a pair of strings, the first being the connection's sink, the
       second being the connection's source.
-    $ref: /schemas/types.yaml#definitions/non-unique-string-array
+    $ref: /schemas/types.yaml#/definitions/non-unique-string-array
     minItems: 2
     maxItems: 18
     items:
index e543a61..b55775e 100644 (file)
@@ -44,7 +44,6 @@ properties:
     maxItems: 3
 
   clock-names:
-    maxItems: 3
     items:
       - const: hda
       - const: hda2hdmi
@@ -54,7 +53,6 @@ properties:
     maxItems: 3
 
   reset-names:
-    maxItems: 3
     items:
       - const: hda
       - const: hda2hdmi
index 6ad48c7..f2443b6 100644 (file)
@@ -106,7 +106,7 @@ patternProperties:
           Must contain the phandle and index of the SAI sub-block providing
           the synchronization.
         allOf:
-          - $ref: /schemas/types.yaml#definitions/phandle-array
+          - $ref: /schemas/types.yaml#/definitions/phandle-array
           - maxItems: 1
 
       st,iec60958:
@@ -117,7 +117,7 @@ patternProperties:
           configured according to protocol defined in related DAI link node,
           such as i2s, left justified, right justified, dsp and pdm protocols.
         allOf:
-          - $ref: /schemas/types.yaml#definitions/flag
+          - $ref: /schemas/types.yaml#/definitions/flag
 
       "#clock-cells":
         description: Configure the SAI device as master clock provider.
index 0f078bd..2260325 100644 (file)
@@ -51,7 +51,6 @@ properties:
     maxItems: 1
 
   phy-names:
-    maxItems: 1
     items:
       - const: usb
 
index 737c1f4..54c361d 100644 (file)
@@ -74,11 +74,8 @@ properties:
 
   phys:
     maxItems: 1
-    items:
-      - description: phandle + phy specifier pair.
 
   phy-names:
-    maxItems: 1
     items:
       - const: usb
 
index 849d5b1..cdbfec4 100644 (file)
@@ -681,3 +681,53 @@ Here is the list of supported tags and their meanings:
      - Stores the TID of the commit, CRC of the fast commit of which this tag
        represents the end of
 
+Fast Commit Replay Idempotence
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Fast commit tags are idempotent, provided the recovery code follows
+certain rules. The guiding principle is that the commit path stores the
+result of a particular operation rather than the procedure itself.
+
+Consider the rename operation 'mv /a /b', and assume dirent '/a' was
+associated with inode 10. During fast commit, instead of storing this
+operation as a procedure "rename a to b", we store the resulting file system
+state as a "series" of outcomes:
+
+- Link dirent b to inode 10
+- Unlink dirent a
+- Inode 10 with valid refcount
+
+Now when the recovery code runs, it needs to "enforce" this state on the
+file system. This is what guarantees idempotence of fast commit replay.
+
+Let's take an example of a procedure that is not idempotent and see how fast
+commits make it idempotent. Consider the following sequence of operations:
+
+1) rm A
+2) mv B A
+3) read A
+
+If we store this sequence of operations as is, the replay is not idempotent.
+Let's say we crash after (2) during replay. During the second replay,
+file A (which was actually created as a result of the "mv B A" operation)
+would get deleted. Thus, the file named A would be absent when we try to
+read A. So, this sequence of operations is not idempotent. However, as
+mentioned above, instead of storing the procedure, fast commits store the
+outcome of each procedure. Thus the fast commit log for the above procedure
+would be as follows:
+
+(Let's assume dirent A was linked to inode 10 and dirent B was linked to
+inode 11 before the replay)
+
+1) Unlink A
+2) Link A to inode 11
+3) Unlink B
+4) Inode 11
+
+If we crash after (3), we will have file A linked to inode 11. During the
+second replay, we will remove file A (inode 11), but we will create it
+again and make it point to inode 11. We won't find B, so we'll just skip
+that step. At this point, the refcount for inode 11 is not reliable, but
+that gets fixed by the replay of the last inode 11 tag. Thus, by converting
+a non-idempotent procedure into a series of idempotent outcomes, fast
+commits ensure idempotence during replay.
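
A schematic illustration of why enforcing outcomes is idempotent: each
replay step below drives a dirent to a final state rather than performing
a transition, so running the whole sequence twice lands in the same state.
This is a toy sketch in plain C; the table and helpers are purely
illustrative and are not the real fs/ext4/fast_commit.c replay interfaces.

    #include <string.h>

    /* Toy model: one slot per dirent name; ino == 0 means "not linked". */
    struct dirent_slot { const char *name; int ino; };
    static struct dirent_slot table[] = { { "A", 10 }, { "B", 11 } };

    static struct dirent_slot *find_slot(const char *name)
    {
            for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                    if (strcmp(table[i].name, name) == 0)
                            return &table[i];
            return 0;
    }

    /* "Link A to inode 11": enforces a state, harmless if already done. */
    static void replay_link(const char *name, int ino)
    {
            struct dirent_slot *d = find_slot(name);
            if (d)
                    d->ino = ino;
    }

    /* "Unlink B": already unlinked? The slot still ends at 0. */
    static void replay_unlink(const char *name)
    {
            struct dirent_slot *d = find_slot(name);
            if (d)
                    d->ino = 0;
    }

Replaying "unlink A; link A to 11; unlink B" through these helpers any
number of times always converges on the same table contents.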
index 1879f88..230ee42 100644 (file)
@@ -75,44 +75,44 @@ and elsewhere regarding submitting Linux kernel patches.
 13) Has been build- and runtime tested with and without ``CONFIG_SMP`` and
     ``CONFIG_PREEMPT.``
 
-16) All codepaths have been exercised with all lockdep features enabled.
+14) All codepaths have been exercised with all lockdep features enabled.
 
-17) All new ``/proc`` entries are documented under ``Documentation/``
+15) All new ``/proc`` entries are documented under ``Documentation/``
 
-18) All new kernel boot parameters are documented in
+16) All new kernel boot parameters are documented in
     ``Documentation/admin-guide/kernel-parameters.rst``.
 
-19) All new module parameters are documented with ``MODULE_PARM_DESC()``
+17) All new module parameters are documented with ``MODULE_PARM_DESC()``
 
-20) All new userspace interfaces are documented in ``Documentation/ABI/``.
+18) All new userspace interfaces are documented in ``Documentation/ABI/``.
     See ``Documentation/ABI/README`` for more information.
     Patches that change userspace interfaces should be CCed to
     linux-api@vger.kernel.org.
 
-21) Check that it all passes ``make headers_check``.
+19) Check that it all passes ``make headers_check``.
 
-22) Has been checked with injection of at least slab and page-allocation
+20) Has been checked with injection of at least slab and page-allocation
     failures.  See ``Documentation/fault-injection/``.
 
     If the new code is substantial, addition of subsystem-specific fault
     injection might be appropriate.
 
-23) Newly-added code has been compiled with ``gcc -W`` (use
+21) Newly-added code has been compiled with ``gcc -W`` (use
     ``make EXTRA_CFLAGS=-W``).  This will generate lots of noise, but is good
     for finding bugs like "warning: comparison between signed and unsigned".
 
-24) Tested after it has been merged into the -mm patchset to make sure
+22) Tested after it has been merged into the -mm patchset to make sure
     that it still works with all of the other queued patches and various
     changes in the VM, VFS, and other subsystems.
 
-25) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a
+23) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a
     comment in the source code that explains the logic of what they are doing
     and why.
 
-26) If any ioctl's are added by the patch, then also update
+24) If any ioctl's are added by the patch, then also update
     ``Documentation/userspace-api/ioctl/ioctl-number.rst``.
 
-27) If your modified source code depends on or uses any of the kernel
+25) If your modified source code depends on or uses any of the kernel
     APIs or features that are related to the following ``Kconfig`` symbols,
     then test multiple builds with the related ``Kconfig`` symbols disabled
     and/or ``=m`` (if that option is available) [not all of these at the
index fb8261a..5ba5412 100644 (file)
@@ -411,6 +411,12 @@ Some people also put extra tags at the end.  They'll just be ignored for
 now, but you can do this to mark internal company procedures or just
 point out some special detail about the sign-off.
 
+Any further SoBs (Signed-off-by:'s) following the author's SoB are from
+people who handled and transported the patch but were not involved in its
+development. SoB chains should reflect the **real** route a patch took
+as it was propagated to the maintainers and ultimately to Linus, with
+the first SoB entry signalling primary authorship of a single author.
+
 
 When to use Acked-by:, Cc:, and Co-developed-by:
 ------------------------------------------------
@@ -446,7 +452,7 @@ patch.  This tag documents that potentially interested parties
 have been included in the discussion.
 
 Co-developed-by: states that the patch was co-created by multiple developers;
-it is used to give attribution to co-authors (in addition to the author
+it is used to give attribution to co-authors (in addition to the author
 attributed by the From: tag) when several people work on a single patch.  Since
 Co-developed-by: denotes authorship, every Co-developed-by: must be immediately
 followed by a Signed-off-by: of the associated co-author.  Standard sign-off
index ad0e34b..154c870 100644 (file)
@@ -3199,6 +3199,7 @@ S:        Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 F:     block/
 F:     drivers/block/
+F:     fs/block_dev.c
 F:     include/linux/blk*
 F:     kernel/trace/blktrace.c
 F:     lib/sbitmap.c
@@ -4354,8 +4355,8 @@ T:        git git://linuxtv.org/media_tree.git
 F:     drivers/media/pci/cobalt/
 
 COCCINELLE/Semantic Patches (SmPL)
-M:     Julia Lawall <Julia.Lawall@lip6.fr>
-M:     Gilles Muller <Gilles.Muller@lip6.fr>
+M:     Julia Lawall <Julia.Lawall@inria.fr>
+M:     Gilles Muller <Gilles.Muller@inria.fr>
 M:     Nicolas Palix <nicolas.palix@imag.fr>
 M:     Michal Marek <michal.lkml@markovi.net>
 L:     cocci@systeme.lip6.fr (moderated for non-subscribers)
index 0a2ab6c..e5d870f 100644 (file)
@@ -7,7 +7,7 @@
  *
  * Code supporting the Jensen.
  */
-
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/mm.h>
index 3ee4f43..9de7ab2 100644 (file)
@@ -66,24 +66,17 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
 #define MAX_UNCOMP_KERNEL_SIZE SZ_32M
 
 /*
- * phys-to-virt patching requires that the physical to virtual offset fits
- * into the immediate field of an add/sub instruction, which comes down to the
- * 24 least significant bits being zero, and so the offset should be a multiple
- * of 16 MB. Since PAGE_OFFSET itself is a multiple of 16 MB, the physical
- * base should be aligned to 16 MB as well.
+ * phys-to-virt patching requires that the physical to virtual offset is a
+ * multiple of 2 MiB. However, using an alignment smaller than TEXT_OFFSET
+ * here throws off the memory allocation logic, so let's use the lowest power
+ * of two greater than 2 MiB and greater than TEXT_OFFSET.
  */
-#define EFI_PHYS_ALIGN         SZ_16M
-
-/* on ARM, the FDT should be located in a lowmem region */
-static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr)
-{
-       return round_down(image_addr, EFI_PHYS_ALIGN) + SZ_512M;
-}
+#define EFI_PHYS_ALIGN         max(UL(SZ_2M), roundup_pow_of_two(TEXT_OFFSET))
 
 /* on ARM, the initrd should be loaded in a lowmem region */
 static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr)
 {
-       return round_down(image_addr, EFI_PHYS_ALIGN) + SZ_512M;
+       return round_down(image_addr, SZ_4M) + SZ_512M;
 }
 
 struct efi_arm_entry_state {
@@ -93,4 +86,9 @@ struct efi_arm_entry_state {
        u32     sctlr_after_ebs;
 };
 
+static inline void efi_capsule_flush_cache_range(void *addr, int size)
+{
+       __cpuc_flush_dcache_area(addr, size);
+}
+
 #endif /* _ASM_ARM_EFI_H */
index 6ab2b0a..5c48eb4 100644 (file)
@@ -549,7 +549,7 @@ void show_ipi_list(struct seq_file *p, int prec)
                seq_printf(p, "%*s%u: ", prec - 1, "IPI", i);
 
                for_each_online_cpu(cpu)
-                       seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+                       seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
 
                seq_printf(p, " %s\n", ipi_types[i]);
        }
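
The hunks above (and the matching ones for arm64, parisc and s390 below)
convert per-CPU interrupt statistics from the irq-number based
kstat_irqs_cpu() to the descriptor based irq_desc_kstat_cpu(). A minimal
sketch of the resulting pattern; show_one_ipi() is a hypothetical helper,
not code from this merge:

    #include <linux/cpumask.h>
    #include <linux/irqdesc.h>
    #include <linux/seq_file.h>

    /* Print one row of per-CPU counts straight from the irq descriptor,
     * avoiding the irq-number lookup that kstat_irqs_cpu() performed. */
    static void show_one_ipi(struct seq_file *p, struct irq_desc *desc,
                             const char *label)
    {
            unsigned int cpu;

            for_each_online_cpu(cpu)
                    seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu));
            seq_printf(p, " %s\n", label);
    }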
index d0d94f7..05e1735 100644 (file)
@@ -1877,6 +1877,7 @@ config EFI
        select EFI_RUNTIME_WRAPPERS
        select EFI_STUB
        select EFI_GENERIC_STUB
+       imply IMA_SECURE_AND_OR_TRUSTED_BOOT
        default y
        help
          This option provides support for runtime services provided
index 973b144..3578aba 100644 (file)
@@ -64,12 +64,6 @@ efi_status_t __efi_rt_asm_wrapper(void *, const char *, ...);
 #define EFI_KIMG_ALIGN \
        (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN)
 
-/* on arm64, the FDT may be located anywhere in system RAM */
-static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr)
-{
-       return ULONG_MAX;
-}
-
 /*
  * On arm64, we have to ensure that the initrd ends up in the linear region,
  * which is a 1 GB aligned region of size '1UL << (VA_BITS_MIN - 1)' that is
@@ -141,4 +135,9 @@ static inline void efi_set_pgd(struct mm_struct *mm)
 void efi_virtmap_load(void);
 void efi_virtmap_unload(void);
 
+static inline void efi_capsule_flush_cache_range(void *addr, int size)
+{
+       __flush_dcache_area(addr, size);
+}
+
 #endif /* _ASM_EFI_H */
index 19b1705..6bc3a36 100644 (file)
@@ -811,7 +811,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
                seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i,
                           prec >= 4 ? " " : "");
                for_each_online_cpu(cpu)
-                       seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+                       seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
                seq_printf(p, "      %s\n", ipi_types[i]);
        }
 
index e76c866..49cd6d2 100644 (file)
@@ -216,12 +216,9 @@ int show_interrupts(struct seq_file *p, void *v)
                if (!action)
                        goto skip;
                seq_printf(p, "%3d: ", i);
-#ifdef CONFIG_SMP
+
                for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#else
-               seq_printf(p, "%10u ", kstat_irqs(i));
-#endif
+                       seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, j));
 
                seq_printf(p, " %14s", irq_desc_get_chip(desc)->name);
 #ifndef PARISC_IRQ_CR16_COUNTS
index ec0b218..2b8da92 100644 (file)
@@ -373,6 +373,8 @@ initrd-y := $(filter-out $(image-y), $(initrd-y))
 targets        += $(image-y) $(initrd-y)
 targets += $(foreach x, dtbImage uImage cuImage simpleImage treeImage, \
                $(patsubst $(x).%, dts/%.dtb, $(filter $(x).%, $(image-y))))
+targets += $(foreach x, dtbImage uImage cuImage simpleImage treeImage, \
+               $(patsubst $(x).%, dts/fsl/%.dtb, $(filter $(x).%, $(image-y))))
 
 $(addprefix $(obj)/, $(initrd-y)): $(obj)/ramdisk.image.gz
 
index cfa8148..cc1bca5 100644 (file)
@@ -180,7 +180,12 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define VCPU_GPR(n)    __VCPU_GPR(__REG_##n)
 
 #ifdef __KERNEL__
-#ifdef CONFIG_PPC64
+
+/*
+ * We use __powerpc64__ here because we want the compat VDSO to use the 32-bit
+ * version below in the else case of the ifdef.
+ */
+#ifdef __powerpc64__
 
 #define STACKFRAMESIZE 256
 #define __STK_REG(i)   (112 + ((i)-14)*8)
index b558b07..881f655 100644 (file)
@@ -49,7 +49,7 @@ static inline unsigned long get_tbl(void)
        return mftb();
 }
 
-static inline u64 get_tb(void)
+static __always_inline u64 get_tb(void)
 {
        unsigned int tbhi, tblo, tbhi2;
 
index 541664d..a2f72c9 100644 (file)
 #ifdef CONFIG_VMAP_STACK
        mfspr   r11, SPRN_SRR0
        mtctr   r11
-#endif
        andi.   r11, r9, MSR_PR
-       lwz     r11,TASK_STACK-THREAD(r12)
+       mr      r11, r1
+       lwz     r1,TASK_STACK-THREAD(r12)
        beq-    99f
-       addi    r11, r11, THREAD_SIZE - INT_FRAME_SIZE
-#ifdef CONFIG_VMAP_STACK
+       addi    r1, r1, THREAD_SIZE - INT_FRAME_SIZE
        li      r10, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */
        mtmsr   r10
        isync
+       tovirt(r12, r12)
+       stw     r11,GPR1(r1)
+       stw     r11,0(r1)
+       mr      r11, r1
+#else
+       andi.   r11, r9, MSR_PR
+       lwz     r11,TASK_STACK-THREAD(r12)
+       beq-    99f
+       addi    r11, r11, THREAD_SIZE - INT_FRAME_SIZE
+       tophys(r11, r11)
+       stw     r1,GPR1(r11)
+       stw     r1,0(r11)
+       tovirt(r1, r11)         /* set new kernel sp */
 #endif
-       tovirt_vmstack r12, r12
-       tophys_novmstack r11, r11
        mflr    r10
        stw     r10, _LINK(r11)
 #ifdef CONFIG_VMAP_STACK
 #else
        mfspr   r10,SPRN_SRR0
 #endif
-       stw     r1,GPR1(r11)
-       stw     r1,0(r11)
-       tovirt_novmstack r1, r11        /* set new kernel sp */
        stw     r10,_NIP(r11)
        mfcr    r10
        rlwinm  r10,r10,0,4,2   /* Clear SO bit in CR */
index 2b9b1bb..9e2246e 100644 (file)
@@ -990,7 +990,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
        { NULL, },
 };
 
-static int init_big_cores(void)
+static int __init init_big_cores(void)
 {
        int cpu;
 
index 59aa294..9cb6f52 100644 (file)
@@ -2,7 +2,7 @@
 
 # List of files in the vdso, has to be asm only for now
 
-ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN
+ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
@@ -27,7 +27,7 @@ endif
 CC32FLAGS :=
 ifdef CONFIG_PPC64
 CC32FLAGS += -m32
-KBUILD_CFLAGS := $(filter-out -mcmodel=medium,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS := $(filter-out -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc,$(KBUILD_CFLAGS))
 endif
 
 targets := $(obj-vdso32) vdso32.so.dbg
index d365810..bf363ff 100644 (file)
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 # List of files in the vdso, has to be asm only for now
 
-ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN
+ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24
 include $(srctree)/lib/vdso/Makefile
 
 obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
index 7542282..6d98cd9 100644 (file)
@@ -27,12 +27,6 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
 
 #define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE)
 
-/* on RISC-V, the FDT may be located anywhere in system RAM */
-static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr)
-{
-       return ULONG_MAX;
-}
-
 /* Load initrd at enough distance from DRAM start */
 static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr)
 {
index 13ba533..bf53791 100644 (file)
@@ -176,7 +176,7 @@ void __init setup_bootmem(void)
         * Make sure that any memory beyond mem_start + (-PAGE_OFFSET) is removed
         * as it is unusable by the kernel.
         */
-       memblock_enforce_memory_limit(mem_start - PAGE_OFFSET);
+       memblock_enforce_memory_limit(-PAGE_OFFSET);
 
        /* Reserve from the start of the kernel to the end of the kernel */
        memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
index 3514420..f8a8b94 100644 (file)
@@ -124,7 +124,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
        raw_spin_lock_irqsave(&desc->lock, flags);
        seq_printf(p, "%3d: ", irq);
        for_each_online_cpu(cpu)
-               seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+               seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu));
 
        if (desc->irq_data.chip)
                seq_printf(p, " %8s", desc->irq_data.chip->name);
index 40b8fd3..e0bc398 100644 (file)
@@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386
 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
 KBUILD_CFLAGS += $(cflags-y)
 KBUILD_CFLAGS += -mno-mmx -mno-sse
-KBUILD_CFLAGS += -ffreestanding
+KBUILD_CFLAGS += -ffreestanding -fshort-wchar
 KBUILD_CFLAGS += -fno-stack-protector
 KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
 KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
index bc9758e..c98f783 100644 (file)
@@ -213,8 +213,6 @@ static inline bool efi_is_64bit(void)
 
 static inline bool efi_is_native(void)
 {
-       if (!IS_ENABLED(CONFIG_X86_64))
-               return true;
        return efi_is_64bit();
 }
 
@@ -382,4 +380,7 @@ static inline void efi_fake_memmap_early(void)
 }
 #endif
 
+#define arch_ima_efi_boot_mode \
+       ({ extern struct boot_params boot_params; boot_params.secure_boot; })
+
 #endif /* _ASM_X86_EFI_H */
index 68608bd..5eeb808 100644 (file)
@@ -161,5 +161,3 @@ ifeq ($(CONFIG_X86_64),y)
        obj-$(CONFIG_MMCONF_FAM10H)     += mmconf-fam10h_64.o
        obj-y                           += vsmp_64.o
 endif
-
-obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)   += ima_arch.o
diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
deleted file mode 100644 (file)
index 7dfb1e8..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright (C) 2018 IBM Corporation
- */
-#include <linux/efi.h>
-#include <linux/module.h>
-#include <linux/ima.h>
-
-extern struct boot_params boot_params;
-
-static enum efi_secureboot_mode get_sb_mode(void)
-{
-       efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
-       efi_status_t status;
-       unsigned long size;
-       u8 secboot, setupmode;
-
-       size = sizeof(secboot);
-
-       if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) {
-               pr_info("ima: secureboot mode unknown, no efi\n");
-               return efi_secureboot_mode_unknown;
-       }
-
-       /* Get variable contents into buffer */
-       status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
-                                 NULL, &size, &secboot);
-       if (status == EFI_NOT_FOUND) {
-               pr_info("ima: secureboot mode disabled\n");
-               return efi_secureboot_mode_disabled;
-       }
-
-       if (status != EFI_SUCCESS) {
-               pr_info("ima: secureboot mode unknown\n");
-               return efi_secureboot_mode_unknown;
-       }
-
-       size = sizeof(setupmode);
-       status = efi.get_variable(L"SetupMode", &efi_variable_guid,
-                                 NULL, &size, &setupmode);
-
-       if (status != EFI_SUCCESS)      /* ignore unknown SetupMode */
-               setupmode = 0;
-
-       if (secboot == 0 || setupmode == 1) {
-               pr_info("ima: secureboot mode disabled\n");
-               return efi_secureboot_mode_disabled;
-       }
-
-       pr_info("ima: secureboot mode enabled\n");
-       return efi_secureboot_mode_enabled;
-}
-
-bool arch_ima_get_secureboot(void)
-{
-       static enum efi_secureboot_mode sb_mode;
-       static bool initialized;
-
-       if (!initialized && efi_enabled(EFI_BOOT)) {
-               sb_mode = boot_params.secure_boot;
-
-               if (sb_mode == efi_secureboot_mode_unset)
-                       sb_mode = get_sb_mode();
-               initialized = true;
-       }
-
-       if (sb_mode == efi_secureboot_mode_enabled)
-               return true;
-       else
-               return false;
-}
-
-/* secureboot arch rules */
-static const char * const sb_arch_rules[] = {
-#if !IS_ENABLED(CONFIG_KEXEC_SIG)
-       "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
-#endif /* CONFIG_KEXEC_SIG */
-       "measure func=KEXEC_KERNEL_CHECK",
-#if !IS_ENABLED(CONFIG_MODULE_SIG)
-       "appraise func=MODULE_CHECK appraise_type=imasig",
-#endif
-       "measure func=MODULE_CHECK",
-       NULL
-};
-
-const char * const *arch_get_ima_policy(void)
-{
-       if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) {
-               if (IS_ENABLED(CONFIG_MODULE_SIG))
-                       set_module_sig_enforced();
-               return sb_arch_rules;
-       }
-       return NULL;
-}
index 0a2ec80..f5477ea 100644 (file)
@@ -25,6 +25,7 @@
  *
  * Send feedback to <colpatch@us.ibm.com>
  */
+#include <linux/interrupt.h>
 #include <linux/nodemask.h>
 #include <linux/export.h>
 #include <linux/mmzone.h>
index 205a9bc..7d7ffb9 100644 (file)
@@ -93,37 +93,22 @@ static efi_system_table_t __init *xen_efi_probe(void)
 
 /*
  * Determine whether we're in secure boot mode.
- *
- * Please keep the logic in sync with
- * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot().
  */
 static enum efi_secureboot_mode xen_efi_get_secureboot(void)
 {
-       static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
        static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+       enum efi_secureboot_mode mode;
        efi_status_t status;
-       u8 moksbstate, secboot, setupmode;
+       u8 moksbstate;
        unsigned long size;
 
-       size = sizeof(secboot);
-       status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
-                                 NULL, &size, &secboot);
-
-       if (status == EFI_NOT_FOUND)
-               return efi_secureboot_mode_disabled;
-
-       if (status != EFI_SUCCESS)
-               goto out_efi_err;
-
-       size = sizeof(setupmode);
-       status = efi.get_variable(L"SetupMode", &efi_variable_guid,
-                                 NULL, &size, &setupmode);
-
-       if (status != EFI_SUCCESS)
-               goto out_efi_err;
-
-       if (secboot == 0 || setupmode == 1)
-               return efi_secureboot_mode_disabled;
+       mode = efi_get_secureboot_mode(efi.get_variable);
+       if (mode == efi_secureboot_mode_unknown) {
+               pr_err("Could not determine UEFI Secure Boot status.\n");
+               return efi_secureboot_mode_unknown;
+       }
+       if (mode != efi_secureboot_mode_enabled)
+               return mode;
 
        /* See if a user has put the shim into insecure mode. */
        size = sizeof(moksbstate);
@@ -140,10 +125,6 @@ static enum efi_secureboot_mode xen_efi_get_secureboot(void)
  secure_boot_enabled:
        pr_info("UEFI Secure Boot is enabled.\n");
        return efi_secureboot_mode_enabled;
-
- out_efi_err:
-       pr_err("Could not determine UEFI Secure Boot status.\n");
-       return efi_secureboot_mode_unknown;
 }
 
 void __init xen_efi_init(struct boot_params *boot_params)
index ffa418c..ac6078a 100644 (file)
@@ -2185,6 +2185,9 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
                                                            WEIGHT_ONE);
                        }
 
+                       TRACE_IOCG_PATH(iocg_idle, iocg, now,
+                                       atomic64_read(&iocg->active_period),
+                                       atomic64_read(&ioc->cur_period), vtime);
                        __propagate_weights(iocg, 0, 0, false, now);
                        list_del_init(&iocg->active_list);
                }
index b09ce00..c338c9b 100644 (file)
@@ -650,6 +650,14 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
        if (!IS_ENABLED(CONFIG_SMP) ||
            !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
                return false;
+       /*
+        * With force threaded interrupts enabled, raising softirq from an SMP
+        * function call will always result in waking the ksoftirqd thread.
+        * This is probably worse than completing the request on a different
+        * cache domain.
+        */
+       if (force_irqthreads)
+               return false;
 
        /* same CPU or cache domain?  Complete locally */
        if (cpu == rq->mq_ctx->cpu ||
@@ -1495,31 +1503,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        int srcu_idx;
 
-       /*
-        * We should be running this queue from one of the CPUs that
-        * are mapped to it.
-        *
-        * There are at least two related races now between setting
-        * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
-        * __blk_mq_run_hw_queue():
-        *
-        * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
-        *   but later it becomes online, then this warning is harmless
-        *   at all
-        *
-        * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
-        *   but later it becomes offline, then the warning can't be
-        *   triggered, and we depend on blk-mq timeout handler to
-        *   handle dispatched requests to this hctx
-        */
-       if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-               cpu_online(hctx->next_cpu)) {
-               printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
-                       raw_smp_processor_id(),
-                       cpumask_empty(hctx->cpumask) ? "inactive": "active");
-               dump_stack();
-       }
-
        /*
         * We can't run the queue inline with ints disabled. Ensure that
         * we catch bad users of this early.
index b84b867..73faec4 100644 (file)
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  *  gendisk handling
+ *
+ * Portions Copyright (C) 2020 Christoph Hellwig
  */
 
 #include <linux/module.h>
index deca253..e7d776d 100644 (file)
@@ -2,6 +2,7 @@
 /*
  * Copyright (C) 1991-1998  Linus Torvalds
  * Re-organised Feb 1998 Russell King
+ * Copyright (C) 2020 Christoph Hellwig
  */
 #include <linux/fs.h>
 #include <linux/slab.h>
index 4426082..b11b08a 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/list_sort.h>
 #include <linux/libnvdimm.h>
 #include <linux/module.h>
+#include <linux/nospec.h>
 #include <linux/mutex.h>
 #include <linux/ndctl.h>
 #include <linux/sysfs.h>
@@ -282,18 +283,19 @@ err:
 
 static union acpi_object *int_to_buf(union acpi_object *integer)
 {
-       union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4);
+       union acpi_object *buf = NULL;
        void *dst = NULL;
 
-       if (!buf)
-               goto err;
-
        if (integer->type != ACPI_TYPE_INTEGER) {
                WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
                                integer->type);
                goto err;
        }
 
+       buf = ACPI_ALLOCATE(sizeof(*buf) + 4);
+       if (!buf)
+               goto err;
+
        dst = buf + 1;
        buf->type = ACPI_TYPE_BUFFER;
        buf->buffer.length = 4;
@@ -478,8 +480,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
                cmd_mask = nd_desc->cmd_mask;
                if (cmd == ND_CMD_CALL && call_pkg->nd_family) {
                        family = call_pkg->nd_family;
-                       if (!test_bit(family, &nd_desc->bus_family_mask))
+                       if (family > NVDIMM_BUS_FAMILY_MAX ||
+                           !test_bit(family, &nd_desc->bus_family_mask))
                                return -EINVAL;
+                       family = array_index_nospec(family,
+                                                   NVDIMM_BUS_FAMILY_MAX + 1);
                        dsm_mask = acpi_desc->family_dsm_mask[family];
                        guid = to_nfit_bus_uuid(family);
                } else {
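
The added check combines an explicit bounds test with array_index_nospec()
so that a user-controlled family index cannot be used speculatively to read
past the end of the DSM mask array. A hedged sketch of the pattern;
lookup_dsm_mask() is illustrative, not a function from this patch:

    #include <linux/ndctl.h>        /* NVDIMM_BUS_FAMILY_MAX */
    #include <linux/nospec.h>

    /* Validate the untrusted index first, then clamp it so speculative
     * execution cannot observe an out-of-range array access. */
    static unsigned long lookup_dsm_mask(const unsigned long *masks,
                                         unsigned int family)
    {
            if (family > NVDIMM_BUS_FAMILY_MAX)
                    return 0;       /* caller treats 0 as "no commands" */
            family = array_index_nospec(family, NVDIMM_BUS_FAMILY_MAX + 1);
            return masks[family];
    }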
index 92f84ed..6727358 100644 (file)
@@ -318,7 +318,8 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
        blk_queue_logical_block_size(nbd->disk->queue, blksize);
        blk_queue_physical_block_size(nbd->disk->queue, blksize);
 
-       set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
+       if (max_part)
+               set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
        if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
        return 0;
@@ -1476,9 +1477,11 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
                refcount_set(&nbd->config_refs, 1);
                refcount_inc(&nbd->refs);
                mutex_unlock(&nbd->config_lock);
-               set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+               if (max_part)
+                       set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
        } else if (nbd_disconnected(nbd->config)) {
-               set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+               if (max_part)
+                       set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
        }
 out:
        mutex_unlock(&nbd_index_mutex);
index a7caeed..d4aa6bf 100644 (file)
@@ -432,7 +432,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev)
         * i.e. rnbd_clt_unmap_dev_store() leading to a sysfs warning because
         * the sysfs link was already removed.
         */
-       if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) {
+       if (dev->blk_symlink_name && try_module_get(THIS_MODULE)) {
                sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name);
                kfree(dev->blk_symlink_name);
                module_put(THIS_MODULE);
@@ -521,7 +521,8 @@ static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev)
        return 0;
 
 out_err:
-       dev->blk_symlink_name[0] = '\0';
+       kfree(dev->blk_symlink_name);
+       dev->blk_symlink_name = NULL;
        return ret;
 }
 
index a199b19..96e3f9f 100644 (file)
@@ -88,6 +88,8 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
        dev->discard_alignment      = le32_to_cpu(rsp->discard_alignment);
        dev->secure_discard         = le16_to_cpu(rsp->secure_discard);
        dev->rotational             = rsp->rotational;
+       dev->wc                     = !!(rsp->cache_policy & RNBD_WRITEBACK);
+       dev->fua                    = !!(rsp->cache_policy & RNBD_FUA);
 
        dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
        dev->max_segments = BMAX_SEGMENTS;
@@ -347,19 +349,26 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
        struct rnbd_iu *iu;
        struct rtrs_permit *permit;
 
+       iu = kzalloc(sizeof(*iu), GFP_KERNEL);
+       if (!iu)
+               return NULL;
+
        permit = rnbd_get_permit(sess, con_type,
                                  wait ? RTRS_PERMIT_WAIT :
                                  RTRS_PERMIT_NOWAIT);
-       if (unlikely(!permit))
+       if (unlikely(!permit)) {
+               kfree(iu);
                return NULL;
-       iu = rtrs_permit_to_pdu(permit);
+       }
+
        iu->permit = permit;
        /*
         * 1st reference is dropped after finishing sending a "user" message,
         * 2nd reference is dropped after confirmation with the response is
         * returned.
         * 1st and 2nd can happen in any order, so the rnbd_iu should be
-        * released (rtrs_permit returned to ibbtrs) only leased after both
+        * released (rtrs_permit returned to rtrs) only after both
         * are finished.
         */
        atomic_set(&iu->refcount, 2);
@@ -371,8 +380,10 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
 
 static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu)
 {
-       if (atomic_dec_and_test(&iu->refcount))
+       if (atomic_dec_and_test(&iu->refcount)) {
                rnbd_put_permit(sess, iu->permit);
+               kfree(iu);
+       }
 }
 
 static void rnbd_softirq_done_fn(struct request *rq)
@@ -382,6 +393,7 @@ static void rnbd_softirq_done_fn(struct request *rq)
        struct rnbd_iu *iu;
 
        iu = blk_mq_rq_to_pdu(rq);
+       sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
        rnbd_put_permit(sess, iu->permit);
        blk_mq_end_request(rq, errno_to_blk_status(iu->errno));
 }
@@ -475,7 +487,7 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait)
        iu->buf = NULL;
        iu->dev = dev;
 
-       sg_mark_end(&iu->sglist[0]);
+       sg_alloc_table(&iu->sgt, 1, GFP_KERNEL);
 
        msg.hdr.type    = cpu_to_le16(RNBD_MSG_CLOSE);
        msg.device_id   = cpu_to_le32(device_id);
@@ -490,6 +502,7 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait)
                err = errno;
        }
 
+       sg_free_table(&iu->sgt);
        rnbd_put_iu(sess, iu);
        return err;
 }
@@ -562,7 +575,8 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait)
        iu->buf = rsp;
        iu->dev = dev;
 
-       sg_init_one(iu->sglist, rsp, sizeof(*rsp));
+       sg_alloc_table(&iu->sgt, 1, GFP_KERNEL);
+       sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
 
        msg.hdr.type    = cpu_to_le16(RNBD_MSG_OPEN);
        msg.access_mode = dev->access_mode;
@@ -570,7 +584,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait)
 
        WARN_ON(!rnbd_clt_get_dev(dev));
        err = send_usr_msg(sess->rtrs, READ, iu,
-                          &vec, sizeof(*rsp), iu->sglist, 1,
+                          &vec, sizeof(*rsp), iu->sgt.sgl, 1,
                           msg_open_conf, &errno, wait);
        if (err) {
                rnbd_clt_put_dev(dev);
@@ -580,6 +594,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait)
                err = errno;
        }
 
+       sg_free_table(&iu->sgt);
        rnbd_put_iu(sess, iu);
        return err;
 }
@@ -608,7 +623,8 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait)
        iu->buf = rsp;
        iu->sess = sess;
 
-       sg_init_one(iu->sglist, rsp, sizeof(*rsp));
+       sg_alloc_table(&iu->sgt, 1, GFP_KERNEL);
+       sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
 
        msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO);
        msg.ver      = RNBD_PROTO_VER_MAJOR;
@@ -624,7 +640,7 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait)
                goto put_iu;
        }
        err = send_usr_msg(sess->rtrs, READ, iu,
-                          &vec, sizeof(*rsp), iu->sglist, 1,
+                          &vec, sizeof(*rsp), iu->sgt.sgl, 1,
                           msg_sess_info_conf, &errno, wait);
        if (err) {
                rnbd_clt_put_sess(sess);
@@ -634,7 +650,7 @@ put_iu:
        } else {
                err = errno;
        }
-
+       sg_free_table(&iu->sgt);
        rnbd_put_iu(sess, iu);
        return err;
 }
@@ -803,7 +819,7 @@ static struct rnbd_clt_session *alloc_sess(const char *sessname)
        rnbd_init_cpu_qlists(sess->cpu_queues);
 
        /*
-        * That is simple percpu variable which stores cpu indeces, which are
+        * That is simple percpu variable which stores cpu indices, which are
         * incremented on each access.  We need that for the sake of fairness
         * to wake up queues in a round-robin manner.
         */
@@ -1014,11 +1030,10 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
         * See queue limits.
         */
        if (req_op(rq) != REQ_OP_DISCARD)
-               sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sglist);
+               sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl);
 
        if (sg_cnt == 0)
-               /* Do not forget to mark the end */
-               sg_mark_end(&iu->sglist[0]);
+               sg_mark_end(&iu->sgt.sgl[0]);
 
        msg.hdr.type    = cpu_to_le16(RNBD_MSG_IO);
        msg.device_id   = cpu_to_le32(dev->device_id);
@@ -1027,13 +1042,13 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
                .iov_base = &msg,
                .iov_len  = sizeof(msg)
        };
-       size = rnbd_clt_get_sg_size(iu->sglist, sg_cnt);
+       size = rnbd_clt_get_sg_size(iu->sgt.sgl, sg_cnt);
        req_ops = (struct rtrs_clt_req_ops) {
                .priv = iu,
                .conf_fn = msg_io_conf,
        };
        err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit,
-                              &vec, 1, size, iu->sglist, sg_cnt);
+                              &vec, 1, size, iu->sgt.sgl, sg_cnt);
        if (unlikely(err)) {
                rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n",
                                 err);
@@ -1120,6 +1135,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct rnbd_clt_dev *dev = rq->rq_disk->private_data;
        struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq);
        int err;
+       blk_status_t ret = BLK_STS_IOERR;
 
        if (unlikely(dev->dev_state != DEV_STATE_MAPPED))
                return BLK_STS_IOERR;
@@ -1131,32 +1147,35 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
                return BLK_STS_RESOURCE;
        }
 
+       iu->sgt.sgl = iu->first_sgl;
+       err = sg_alloc_table_chained(&iu->sgt,
+                                    /* Even-if the request has no segment,
+                                     * sglist must have one entry at least */
+                                    blk_rq_nr_phys_segments(rq) ? : 1,
+                                    iu->sgt.sgl,
+                                    RNBD_INLINE_SG_CNT);
+       if (err) {
+               rnbd_clt_err_rl(dev, "sg_alloc_table_chained ret=%d\n", err);
+               rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
+               rnbd_put_permit(dev->sess, iu->permit);
+               return BLK_STS_RESOURCE;
+       }
+
        blk_mq_start_request(rq);
        err = rnbd_client_xfer_request(dev, rq, iu);
        if (likely(err == 0))
                return BLK_STS_OK;
        if (unlikely(err == -EAGAIN || err == -ENOMEM)) {
                rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
-               rnbd_put_permit(dev->sess, iu->permit);
-               return BLK_STS_RESOURCE;
+               ret = BLK_STS_RESOURCE;
        }
-
+       sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
        rnbd_put_permit(dev->sess, iu->permit);
-       return BLK_STS_IOERR;
-}
-
-static int rnbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
-                             unsigned int hctx_idx, unsigned int numa_node)
-{
-       struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq);
-
-       sg_init_table(iu->sglist, BMAX_SEGMENTS);
-       return 0;
+       return ret;
 }
 
 static struct blk_mq_ops rnbd_mq_ops = {
        .queue_rq       = rnbd_queue_rq,
-       .init_request   = rnbd_init_request,
        .complete       = rnbd_softirq_done_fn,
 };
 
@@ -1170,7 +1189,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess)
        tag_set->numa_node              = NUMA_NO_NODE;
        tag_set->flags          = BLK_MQ_F_SHOULD_MERGE |
                                  BLK_MQ_F_TAG_QUEUE_SHARED;
-       tag_set->cmd_size               = sizeof(struct rnbd_iu);
+       tag_set->cmd_size       = sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE;
        tag_set->nr_hw_queues   = num_online_cpus();
 
        return blk_mq_alloc_tag_set(tag_set);
@@ -1208,7 +1227,7 @@ find_and_get_or_create_sess(const char *sessname,
         */
        sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname,
                                   paths, path_cnt, port_nr,
-                                  sizeof(struct rnbd_iu),
+                                  0, /* Do not use pdu of rtrs */
                                   RECONNECT_DELAY, BMAX_SEGMENTS,
                                   BLK_MAX_SEGMENT_SIZE,
                                   MAX_RECONNECTS);
@@ -1305,7 +1324,7 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
        blk_queue_max_segments(dev->queue, dev->max_segments);
        blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
        blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
-       blk_queue_write_cache(dev->queue, true, true);
+       blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
        dev->queue->queuedata = dev;
 }
 
@@ -1388,12 +1407,11 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
                goto out_queues;
        }
 
-       dev->pathname = kzalloc(strlen(pathname) + 1, GFP_KERNEL);
+       dev->pathname = kstrdup(pathname, GFP_KERNEL);
        if (!dev->pathname) {
                ret = -ENOMEM;
                goto out_queues;
        }
-       strlcpy(dev->pathname, pathname, strlen(pathname) + 1);
 
        dev->clt_device_id      = ret;
        dev->sess               = sess;
@@ -1529,13 +1547,13 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
        }
 
        rnbd_clt_info(dev,
-                      "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d)\n",
+                      "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d, wc: %d, fua: %d)\n",
                       dev->gd->disk_name, dev->nsectors,
                       dev->logical_block_size, dev->physical_block_size,
                       dev->max_write_same_sectors, dev->max_discard_sectors,
                       dev->discard_granularity, dev->discard_alignment,
                       dev->secure_discard, dev->max_segments,
-                      dev->max_hw_sectors, dev->rotational);
+                      dev->max_hw_sectors, dev->rotational, dev->wc, dev->fua);
 
        mutex_unlock(&dev->lock);
 
@@ -1667,7 +1685,7 @@ static void rnbd_destroy_sessions(void)
        /*
         * At this point there is no concurrent access to the sessions
         * list and devices list:
-        *   1. New session or device can'be be created - session sysfs files
+        *   1. New session or device can't be created - session sysfs files
         *      are removed.
         *   2. Device or session can't be removed - module reference is taken
         *      into account in unmap device sysfs callback.
index b193d59..537d499 100644 (file)
@@ -44,6 +44,13 @@ struct rnbd_iu_comp {
        int errno;
 };
 
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+#define RNBD_INLINE_SG_CNT 0
+#else
+#define RNBD_INLINE_SG_CNT 2
+#endif
+#define RNBD_RDMA_SGL_SIZE (sizeof(struct scatterlist) * RNBD_INLINE_SG_CNT)
+
 struct rnbd_iu {
        union {
                struct request *rq; /* for block io */
@@ -56,11 +63,12 @@ struct rnbd_iu {
                /* use to send msg associated with a sess */
                struct rnbd_clt_session *sess;
        };
-       struct scatterlist      sglist[BMAX_SEGMENTS];
+       struct sg_table         sgt;
        struct work_struct      work;
        int                     errno;
        struct rnbd_iu_comp     comp;
        atomic_t                refcount;
+       struct scatterlist      first_sgl[]; /* must be the last one */
 };
 
 struct rnbd_cpu_qlist {
@@ -112,6 +120,8 @@ struct rnbd_clt_dev {
        enum rnbd_access_mode   access_mode;
        bool                    read_only;
        bool                    rotational;
+       bool                    wc;
+       bool                    fua;
        u32                     max_hw_sectors;
        u32                     max_write_same_sectors;
        u32                     max_discard_sectors;
index ca16624..c1bc5c0 100644 (file)
@@ -108,6 +108,11 @@ struct rnbd_msg_close {
        __le32          device_id;
 };
 
+enum rnbd_cache_policy {
+       RNBD_FUA = 1 << 0,
+       RNBD_WRITEBACK = 1 << 1,
+};
+
 /**
  * struct rnbd_msg_open_rsp - response message to RNBD_MSG_OPEN
  * @hdr:               message header
@@ -124,6 +129,7 @@ struct rnbd_msg_close {
  * @max_segments:      max segments hardware support in one transfer
  * @secure_discard:    supports secure discard
  * @rotational:        is a rotational disc?
+ * @cache_policy:      support write-back caching or FUA?
  */
 struct rnbd_msg_open_rsp {
        struct rnbd_msg_hdr     hdr;
@@ -139,7 +145,8 @@ struct rnbd_msg_open_rsp {
        __le16                  max_segments;
        __le16                  secure_discard;
        u8                      rotational;
-       u8                      reserved[11];
+       u8                      cache_policy;
+       u8                      reserved[10];
 };
 
 /**
index d1ee72e..b8e4433 100644 (file)
@@ -338,9 +338,10 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs,
 
 void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev)
 {
+       mutex_lock(&sess_dev->sess->lock);
        rnbd_srv_destroy_dev_session_sysfs(sess_dev);
+       mutex_unlock(&sess_dev->sess->lock);
        sess_dev->keep_id = true;
-
 }
 
 static int process_msg_close(struct rtrs_srv *rtrs,
@@ -549,6 +550,7 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
                                        struct rnbd_srv_sess_dev *sess_dev)
 {
        struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev;
+       struct request_queue *q = bdev_get_queue(rnbd_dev->bdev);
 
        rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP);
        rsp->device_id =
@@ -573,8 +575,12 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
                cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev));
        rsp->secure_discard =
                cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev));
-       rsp->rotational =
-               !blk_queue_nonrot(bdev_get_queue(rnbd_dev->bdev));
+       rsp->rotational = !blk_queue_nonrot(q);
+       rsp->cache_policy = 0;
+       if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+               rsp->cache_policy |= RNBD_WRITEBACK;
+       if (blk_queue_fua(q))
+               rsp->cache_policy |= RNBD_FUA;
 }
 
 static struct rnbd_srv_sess_dev *
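
On the client side, the advertised cache_policy bits would be translated back into block layer queue flags when the device is mapped; a hedged sketch assuming the blk_queue_write_cache() helper (the actual client change is in hunks not shown here):

#include <linux/blkdev.h>

/* Apply the server-advertised cache_policy bits to the client queue. */
static void example_apply_cache_policy(struct request_queue *q, u8 cache_policy)
{
	blk_queue_write_cache(q,
			      cache_policy & RNBD_WRITEBACK,
			      cache_policy & RNBD_FUA);
}
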
index 27513d3..737b207 100644 (file)
@@ -367,19 +367,28 @@ void kill_dev_dax(struct dev_dax *dev_dax)
 }
 EXPORT_SYMBOL_GPL(kill_dev_dax);
 
-static void free_dev_dax_ranges(struct dev_dax *dev_dax)
+static void trim_dev_dax_range(struct dev_dax *dev_dax)
 {
+       int i = dev_dax->nr_range - 1;
+       struct range *range = &dev_dax->ranges[i].range;
        struct dax_region *dax_region = dev_dax->region;
-       int i;
 
        device_lock_assert(dax_region->dev);
-       for (i = 0; i < dev_dax->nr_range; i++) {
-               struct range *range = &dev_dax->ranges[i].range;
-
-               __release_region(&dax_region->res, range->start,
-                               range_len(range));
+       dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i,
+               (unsigned long long)range->start,
+               (unsigned long long)range->end);
+
+       __release_region(&dax_region->res, range->start, range_len(range));
+       if (--dev_dax->nr_range == 0) {
+               kfree(dev_dax->ranges);
+               dev_dax->ranges = NULL;
        }
-       dev_dax->nr_range = 0;
+}
+
+static void free_dev_dax_ranges(struct dev_dax *dev_dax)
+{
+       while (dev_dax->nr_range)
+               trim_dev_dax_range(dev_dax);
 }
 
 static void unregister_dev_dax(void *dev)
@@ -763,22 +772,14 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
                return 0;
        }
 
-       ranges = krealloc(dev_dax->ranges, sizeof(*ranges)
-                       * (dev_dax->nr_range + 1), GFP_KERNEL);
-       if (!ranges)
+       alloc = __request_region(res, start, size, dev_name(dev), 0);
+       if (!alloc)
                return -ENOMEM;
 
-       alloc = __request_region(res, start, size, dev_name(dev), 0);
-       if (!alloc) {
-               /*
-                * If this was an empty set of ranges nothing else
-                * will release @ranges, so do it now.
-                */
-               if (!dev_dax->nr_range) {
-                       kfree(ranges);
-                       ranges = NULL;
-               }
-               dev_dax->ranges = ranges;
+       ranges = krealloc(dev_dax->ranges, sizeof(*ranges)
+                       * (dev_dax->nr_range + 1), GFP_KERNEL);
+       if (!ranges) {
+               __release_region(res, alloc->start, resource_size(alloc));
                return -ENOMEM;
        }
 
@@ -804,15 +805,10 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
                return 0;
 
        rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1);
-       if (rc) {
-               dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1,
-                               &alloc->start, &alloc->end);
-               dev_dax->nr_range--;
-               __release_region(res, alloc->start, resource_size(alloc));
-               return rc;
-       }
+       if (rc)
+               trim_dev_dax_range(dev_dax);
 
-       return 0;
+       return rc;
 }
 
 static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size)
@@ -885,12 +881,7 @@ static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
                if (shrink >= range_len(range)) {
                        devm_release_action(dax_region->dev,
                                        unregister_dax_mapping, &mapping->dev);
-                       __release_region(&dax_region->res, range->start,
-                                       range_len(range));
-                       dev_dax->nr_range--;
-                       dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i,
-                                       (unsigned long long) range->start,
-                                       (unsigned long long) range->end);
+                       trim_dev_dax_range(dev_dax);
                        to_shrink -= shrink;
                        if (!to_shrink)
                                break;
@@ -1114,16 +1105,9 @@ static ssize_t align_show(struct device *dev,
 
 static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax)
 {
-       resource_size_t dev_size = dev_dax_size(dev_dax);
        struct device *dev = &dev_dax->dev;
        int i;
 
-       if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) {
-               dev_dbg(dev, "%s: align %u invalid for size %pa\n",
-                       __func__, dev_dax->align, &dev_size);
-               return -EINVAL;
-       }
-
        for (i = 0; i < dev_dax->nr_range; i++) {
                size_t len = range_len(&dev_dax->ranges[i].range);
 
@@ -1274,7 +1258,6 @@ static void dev_dax_release(struct device *dev)
        put_dax(dax_dev);
        free_dev_dax_id(dev_dax);
        dax_region_put(dax_region);
-       kfree(dev_dax->ranges);
        kfree(dev_dax->pgmap);
        kfree(dev_dax);
 }
index 62b26bf..062e8bc 100644 (file)
@@ -52,7 +52,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
 
        /* adjust the dax_region range to the start of data */
        range = pgmap.range;
-       range.start += offset,
+       range.start += offset;
        dax_region = alloc_dax_region(dev, region_id, &range,
                        nd_region->target_node, le32_to_cpu(pfn_sb->align),
                        IORESOURCE_DAX_STATIC);
index edc279b..cadbd0a 100644 (file)
@@ -752,6 +752,7 @@ err_chrdev:
 
 static void __exit dax_core_exit(void)
 {
+       dax_bus_exit();
        unregister_chrdev_region(dax_devt, MINORMASK+1);
        ida_destroy(&dax_minor_ida);
        dax_fs_exit();
index 5e7c343..3c4e343 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 
 struct cma_heap {
index d989549..2c3dac5 100644 (file)
@@ -122,7 +122,7 @@ config EFI_ARMSTUB_DTB_LOADER
 config EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER
        bool "Enable the command line initrd loader" if !X86
        depends on EFI_STUB && (EFI_GENERIC_STUB || X86)
-       default y
+       default y if X86
        depends on !RISCV
        help
          Select this config option to add support for the initrd= command
@@ -147,7 +147,7 @@ config EFI_BOOTLOADER_CONTROL
 
 config EFI_CAPSULE_LOADER
        tristate "EFI capsule loader"
-       depends on EFI
+       depends on EFI && !IA64
        help
          This option exposes a loader interface "/dev/efi_capsule_loader" for
          users to load EFI capsules. This driver requires working runtime
index d6ca2da..467e942 100644 (file)
@@ -12,7 +12,10 @@ KASAN_SANITIZE_runtime-wrappers.o    := n
 
 obj-$(CONFIG_ACPI_BGRT)                += efi-bgrt.o
 obj-$(CONFIG_EFI)                      += efi.o vars.o reboot.o memattr.o tpm.o
-obj-$(CONFIG_EFI)                      += capsule.o memmap.o
+obj-$(CONFIG_EFI)                      += memmap.o
+ifneq ($(CONFIG_EFI_CAPSULE_LOADER),)
+obj-$(CONFIG_EFI)                      += capsule.o
+endif
 obj-$(CONFIG_EFI_PARAMS_FROM_FDT)      += fdtparams.o
 obj-$(CONFIG_EFI_VARS)                 += efivars.o
 obj-$(CONFIG_EFI_ESRT)                 += esrt.o
index 598b780..7684302 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/highmem.h>
 #include <linux/efi.h>
 #include <linux/vmalloc.h>
+#include <asm/efi.h>
 #include <asm/io.h>
 
 typedef struct {
@@ -244,7 +245,7 @@ int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages)
        for (i = 0; i < sg_count; i++) {
                efi_capsule_block_desc_t *sglist;
 
-               sglist = kmap(sg_pages[i]);
+               sglist = kmap_atomic(sg_pages[i]);
 
                for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) {
                        u64 sz = min_t(u64, imagesize,
@@ -265,7 +266,18 @@ int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages)
                else
                        sglist[j].data = page_to_phys(sg_pages[i + 1]);
 
-               kunmap(sg_pages[i]);
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+               /*
+                * At runtime, the firmware has no way to find out where the
+                * sglist elements are mapped, if they are mapped in the first
+                * place. Therefore, on architectures that can only perform
+                * cache maintenance by virtual address, the firmware is unable
+                * to perform this maintenance, and so it is up to the OS to do
+                * it instead.
+                */
+               efi_capsule_flush_cache_range(sglist, PAGE_SIZE);
+#endif
+               kunmap_atomic(sglist);
        }
 
        mutex_lock(&capsule_mutex);
index 914a343..ec2f398 100644 (file)
@@ -273,7 +273,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
        install_memreserve_table();
 
        status = allocate_new_fdt_and_exit_boot(handle, &fdt_addr,
-                                               efi_get_max_fdt_addr(image_addr),
                                                initrd_addr, initrd_size,
                                                cmdline_ptr, fdt_addr, fdt_size);
        if (status != EFI_SUCCESS)
index 2d7abcd..b50a6c6 100644 (file)
@@ -750,7 +750,6 @@ efi_status_t efi_exit_boot_services(void *handle,
 
 efi_status_t allocate_new_fdt_and_exit_boot(void *handle,
                                            unsigned long *new_fdt_addr,
-                                           unsigned long max_addr,
                                            u64 initrd_addr, u64 initrd_size,
                                            char *cmdline_ptr,
                                            unsigned long fdt_addr,
@@ -848,4 +847,6 @@ asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint,
 
 void efi_handle_post_ebs_state(void);
 
+enum efi_secureboot_mode efi_get_secureboot(void);
+
 #endif
index 368cd60..365c3a4 100644 (file)
@@ -238,7 +238,6 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map,
 
 efi_status_t allocate_new_fdt_and_exit_boot(void *handle,
                                            unsigned long *new_fdt_addr,
-                                           unsigned long max_addr,
                                            u64 initrd_addr, u64 initrd_size,
                                            char *cmdline_ptr,
                                            unsigned long fdt_addr,
@@ -275,7 +274,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle,
        efi_info("Exiting boot services and installing virtual address map...\n");
 
        map.map = &memory_map;
-       status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, max_addr);
+       status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, ULONG_MAX);
        if (status != EFI_SUCCESS) {
                efi_err("Unable to allocate memory for new device tree.\n");
                goto fail;
index 5efc524..8a18930 100644 (file)
 
 #include "efistub.h"
 
-/* BIOS variables */
-static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
-static const efi_char16_t efi_SecureBoot_name[] = L"SecureBoot";
-static const efi_char16_t efi_SetupMode_name[] = L"SetupMode";
-
 /* SHIM variables */
 static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
 static const efi_char16_t shim_MokSBState_name[] = L"MokSBState";
 
+static efi_status_t get_var(efi_char16_t *name, efi_guid_t *vendor, u32 *attr,
+                           unsigned long *data_size, void *data)
+{
+       return get_efi_var(name, vendor, attr, data_size, data);
+}
+
 /*
  * Determine whether we're in secure boot mode.
- *
- * Please keep the logic in sync with
- * arch/x86/xen/efi.c:xen_efi_get_secureboot().
  */
 enum efi_secureboot_mode efi_get_secureboot(void)
 {
        u32 attr;
-       u8 secboot, setupmode, moksbstate;
        unsigned long size;
+       enum efi_secureboot_mode mode;
        efi_status_t status;
+       u8 moksbstate;
 
-       size = sizeof(secboot);
-       status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid,
-                            NULL, &size, &secboot);
-       if (status == EFI_NOT_FOUND)
-               return efi_secureboot_mode_disabled;
-       if (status != EFI_SUCCESS)
-               goto out_efi_err;
-
-       size = sizeof(setupmode);
-       status = get_efi_var(efi_SetupMode_name, &efi_variable_guid,
-                            NULL, &size, &setupmode);
-       if (status != EFI_SUCCESS)
-               goto out_efi_err;
-
-       if (secboot == 0 || setupmode == 1)
-               return efi_secureboot_mode_disabled;
+       mode = efi_get_secureboot_mode(get_var);
+       if (mode == efi_secureboot_mode_unknown) {
+               efi_err("Could not determine UEFI Secure Boot status.\n");
+               return efi_secureboot_mode_unknown;
+       }
+       if (mode != efi_secureboot_mode_enabled)
+               return mode;
 
        /*
         * See if a user has put the shim into insecure mode. If so, and if the
@@ -69,8 +59,4 @@ enum efi_secureboot_mode efi_get_secureboot(void)
 secure_boot_enabled:
        efi_info("UEFI Secure Boot is enabled.\n");
        return efi_secureboot_mode_enabled;
-
-out_efi_err:
-       efi_err("Could not determine UEFI Secure Boot status.\n");
-       return efi_secureboot_mode_unknown;
 }
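
The stub now delegates the variable inspection to efi_get_secureboot_mode(), passing the get_var wrapper above as the accessor. That helper is introduced elsewhere in this series; the sketch below only mirrors the SecureBoot/SetupMode checks deleted from this file and is an assumption about the helper's shape, not a copy of it:

typedef efi_status_t (*example_get_var_t)(efi_char16_t *name,
					  efi_guid_t *vendor, u32 *attr,
					  unsigned long *data_size, void *data);

static enum efi_secureboot_mode
example_get_secureboot_mode(example_get_var_t get_var)
{
	static efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID;
	unsigned long size;
	efi_status_t status;
	u8 secboot, setupmode;

	/* Absent SecureBoot variable means the feature is unsupported. */
	size = sizeof(secboot);
	status = get_var(L"SecureBoot", &guid, NULL, &size, &secboot);
	if (status == EFI_NOT_FOUND)
		return efi_secureboot_mode_disabled;
	if (status != EFI_SUCCESS)
		return efi_secureboot_mode_unknown;

	/* Setup mode also counts as disabled, as in the deleted logic. */
	size = sizeof(setupmode);
	status = get_var(L"SetupMode", &guid, NULL, &size, &setupmode);
	if (status != EFI_SUCCESS)
		return efi_secureboot_mode_unknown;

	if (secboot == 0 || setupmode == 1)
		return efi_secureboot_mode_disabled;

	return efi_secureboot_mode_enabled;
}
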
index 3672539..f14c4ff 100644 (file)
@@ -715,8 +715,11 @@ unsigned long efi_main(efi_handle_t handle,
            (IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE)    ||
            (IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) ||
            (image_offset == 0)) {
+               extern char _bss[];
+
                status = efi_relocate_kernel(&bzimage_addr,
-                                            hdr->init_size, hdr->init_size,
+                                            (unsigned long)_bss - bzimage_addr,
+                                            hdr->init_size,
                                             hdr->pref_address,
                                             hdr->kernel_alignment,
                                             LOAD_PHYSICAL_ADDR);
index ddf9eae..47d67bb 100644 (file)
@@ -663,6 +663,19 @@ out:
        return rv;
 }
 
+static long efi_runtime_get_supported_mask(unsigned long arg)
+{
+       unsigned int __user *supported_mask;
+       int rv = 0;
+
+       supported_mask = (unsigned int __user *)arg;
+
+       if (put_user(efi.runtime_supported_mask, supported_mask))
+               rv = -EFAULT;
+
+       return rv;
+}
+
 static long efi_test_ioctl(struct file *file, unsigned int cmd,
                                                        unsigned long arg)
 {
@@ -699,6 +712,9 @@ static long efi_test_ioctl(struct file *file, unsigned int cmd,
 
        case EFI_RUNTIME_RESET_SYSTEM:
                return efi_runtime_reset_system(arg);
+
+       case EFI_RUNTIME_GET_SUPPORTED_MASK:
+               return efi_runtime_get_supported_mask(arg);
        }
 
        return -ENOTTY;
index f2446aa..117349e 100644 (file)
@@ -118,4 +118,7 @@ struct efi_resetsystem {
 #define EFI_RUNTIME_RESET_SYSTEM \
        _IOW('p', 0x0B, struct efi_resetsystem)
 
+#define EFI_RUNTIME_GET_SUPPORTED_MASK \
+       _IOR('p', 0x0C, unsigned int)
+
 #endif /* _DRIVERS_FIRMWARE_EFI_TEST_H_ */
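
A hypothetical userspace probe for the new ioctl: the _IOR definition mirrors the efi_test.h hunk above, and the /dev/efi_test node name is assumed from the driver's misc device, so check your local header before relying on either.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define EFI_RUNTIME_GET_SUPPORTED_MASK _IOR('p', 0x0C, unsigned int)

int main(void)
{
	unsigned int mask = 0;
	int fd = open("/dev/efi_test", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, EFI_RUNTIME_GET_SUPPORTED_MASK, &mask) == 0)
		printf("runtime_supported_mask: 0x%x\n", mask);
	close(fd);
	return 0;
}
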
index 65d1b23..b9c11c2 100644 (file)
@@ -1414,10 +1414,12 @@ out:
                pm_runtime_put_autosuspend(connector->dev->dev);
        }
 
-       drm_dp_set_subconnector_property(&amdgpu_connector->base,
-                                        ret,
-                                        amdgpu_dig_connector->dpcd,
-                                        amdgpu_dig_connector->downstream_ports);
+       if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort ||
+           connector->connector_type == DRM_MODE_CONNECTOR_eDP)
+               drm_dp_set_subconnector_property(&amdgpu_connector->base,
+                                                ret,
+                                                amdgpu_dig_connector->dpcd,
+                                                amdgpu_dig_connector->downstream_ports);
        return ret;
 }
 
index 7d2f7a2..1cb7d73 100644 (file)
@@ -5069,8 +5069,7 @@ out:
  * @pdev: pointer to PCI device
  *
  * Called when the error recovery driver tells us that its
- * OK to resume normal operation. Use completion to allow
- * halted scsi ops to resume.
+ * OK to resume normal operation.
  */
 void amdgpu_pci_resume(struct pci_dev *pdev)
 {
index c2ced5b..6e679db 100644 (file)
@@ -496,7 +496,8 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev)
                break;
        }
 
-       if (!amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE)) {
+       if (amdgpu_sriov_vf(adev) ||
+           !amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE)) {
                size = 0;
        } else {
                size = amdgpu_gmc_get_vbios_fb_size(adev);
index fc9bb94..5f4805e 100644 (file)
@@ -1647,7 +1647,7 @@ static int gfx_v9_0_init_microcode(struct amdgpu_device *adev)
        }
 
        /* No CPG in Arcturus */
-       if (adev->asic_type != CHIP_ARCTURUS) {
+       if (adev->gfx.num_gfx_rings) {
                r = gfx_v9_0_init_cp_gfx_microcode(adev, chip_name);
                if (r)
                        return r;
@@ -2633,7 +2633,14 @@ static void gfx_v9_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
 static void gfx_v9_0_enable_gui_idle_interrupt(struct amdgpu_device *adev,
                                               bool enable)
 {
-       u32 tmp = RREG32_SOC15(GC, 0, mmCP_INT_CNTL_RING0);
+       u32 tmp;
+
+       /* don't toggle interrupts that are only applicable
+        * to me0 pipe0 on ASICs that have me0 removed */
+       if (!adev->gfx.num_gfx_rings)
+               return;
+
+       tmp = RREG32_SOC15(GC, 0, mmCP_INT_CNTL_RING0);
 
        tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_BUSY_INT_ENABLE, enable ? 1 : 0);
        tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_EMPTY_INT_ENABLE, enable ? 1 : 0);
@@ -3822,7 +3829,7 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev)
                gfx_v9_0_enable_gui_idle_interrupt(adev, false);
 
        if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
-               if (adev->asic_type != CHIP_ARCTURUS) {
+               if (adev->gfx.num_gfx_rings) {
                        /* legacy firmware loading */
                        r = gfx_v9_0_cp_gfx_load_microcode(adev);
                        if (r)
@@ -3838,7 +3845,7 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev)
        if (r)
                return r;
 
-       if (adev->asic_type != CHIP_ARCTURUS) {
+       if (adev->gfx.num_gfx_rings) {
                r = gfx_v9_0_cp_gfx_resume(adev);
                if (r)
                        return r;
@@ -3848,7 +3855,7 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev)
        if (r)
                return r;
 
-       if (adev->asic_type != CHIP_ARCTURUS) {
+       if (adev->gfx.num_gfx_rings) {
                ring = &adev->gfx.gfx_ring[0];
                r = amdgpu_ring_test_helper(ring);
                if (r)
@@ -3884,7 +3891,7 @@ static void gfx_v9_0_init_tcp_config(struct amdgpu_device *adev)
 
 static void gfx_v9_0_cp_enable(struct amdgpu_device *adev, bool enable)
 {
-       if (adev->asic_type != CHIP_ARCTURUS)
+       if (adev->gfx.num_gfx_rings)
                gfx_v9_0_cp_gfx_enable(adev, enable);
        gfx_v9_0_cp_compute_enable(adev, enable);
 }
@@ -4025,7 +4032,7 @@ static int gfx_v9_0_soft_reset(void *handle)
                /* stop the rlc */
                adev->gfx.rlc.funcs->stop(adev);
 
-               if (adev->asic_type != CHIP_ARCTURUS)
+               if (adev->gfx.num_gfx_rings)
                        /* Disable GFX parsing/prefetching */
                        gfx_v9_0_cp_gfx_enable(adev, false);
 
index e1531d9..e22268f 100644 (file)
@@ -1577,13 +1577,10 @@ static int gmc_v9_0_hw_init(void *handle)
        gmc_v9_0_init_golden_registers(adev);
 
        if (adev->mode_info.num_crtc) {
-               if (adev->asic_type != CHIP_ARCTURUS) {
-                       /* Lockout access through VGA aperture*/
-                       WREG32_FIELD15(DCE, 0, VGA_HDP_CONTROL, VGA_MEMORY_DISABLE, 1);
-
-                       /* disable VGA render */
-                       WREG32_FIELD15(DCE, 0, VGA_RENDER_CONTROL, VGA_VSTATUS_CNTL, 0);
-               }
+               /* Lockout access through VGA aperture*/
+               WREG32_FIELD15(DCE, 0, VGA_HDP_CONTROL, VGA_MEMORY_DISABLE, 1);
+               /* disable VGA render */
+               WREG32_FIELD15(DCE, 0, VGA_RENDER_CONTROL, VGA_VSTATUS_CNTL, 0);
        }
 
        amdgpu_device_program_register_sequence(adev,
index 50922ff..72c893f 100644 (file)
@@ -422,7 +422,7 @@ static const struct kfd_device_info navi10_device_info = {
        .mqd_size_aligned = MQD_SIZE_ALIGNED,
        .needs_iommu_device = false,
        .supports_cwsr = true,
-       .needs_pci_atomics = false,
+       .needs_pci_atomics = true,
        .num_sdma_engines = 2,
        .num_xgmi_sdma_engines = 0,
        .num_sdma_queues_per_engine = 8,
@@ -440,7 +440,7 @@ static const struct kfd_device_info navi12_device_info = {
        .mqd_size_aligned = MQD_SIZE_ALIGNED,
        .needs_iommu_device = false,
        .supports_cwsr = true,
-       .needs_pci_atomics = false,
+       .needs_pci_atomics = true,
        .num_sdma_engines = 2,
        .num_xgmi_sdma_engines = 0,
        .num_sdma_queues_per_engine = 8,
@@ -458,7 +458,7 @@ static const struct kfd_device_info navi14_device_info = {
        .mqd_size_aligned = MQD_SIZE_ALIGNED,
        .needs_iommu_device = false,
        .supports_cwsr = true,
-       .needs_pci_atomics = false,
+       .needs_pci_atomics = true,
        .num_sdma_engines = 2,
        .num_xgmi_sdma_engines = 0,
        .num_sdma_queues_per_engine = 8,
@@ -476,7 +476,7 @@ static const struct kfd_device_info sienna_cichlid_device_info = {
        .mqd_size_aligned = MQD_SIZE_ALIGNED,
        .needs_iommu_device = false,
        .supports_cwsr = true,
-       .needs_pci_atomics = false,
+       .needs_pci_atomics = true,
        .num_sdma_engines = 4,
        .num_xgmi_sdma_engines = 0,
        .num_sdma_queues_per_engine = 8,
@@ -494,7 +494,7 @@ static const struct kfd_device_info navy_flounder_device_info = {
        .mqd_size_aligned = MQD_SIZE_ALIGNED,
        .needs_iommu_device = false,
        .supports_cwsr = true,
-       .needs_pci_atomics = false,
+       .needs_pci_atomics = true,
        .num_sdma_engines = 2,
        .num_xgmi_sdma_engines = 0,
        .num_sdma_queues_per_engine = 8,
@@ -530,7 +530,7 @@ static const struct kfd_device_info dimgrey_cavefish_device_info = {
        .mqd_size_aligned = MQD_SIZE_ALIGNED,
        .needs_iommu_device = false,
        .supports_cwsr = true,
-       .needs_pci_atomics = false,
+       .needs_pci_atomics = true,
        .num_sdma_engines = 2,
        .num_xgmi_sdma_engines = 0,
        .num_sdma_queues_per_engine = 8,
index 2c4dbde..519080e 100644 (file)
@@ -2386,7 +2386,8 @@ void amdgpu_dm_update_connector_after_detect(
 
                        drm_connector_update_edid_property(connector,
                                                           aconnector->edid);
-                       drm_add_edid_modes(connector, aconnector->edid);
+                       aconnector->num_modes = drm_add_edid_modes(connector, aconnector->edid);
+                       drm_connector_list_update(connector);
 
                        if (aconnector->dc_link->aux_mode)
                                drm_dp_cec_set_edid(&aconnector->dm_dp_aux.aux,
@@ -9367,7 +9368,7 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
                if (ret)
                        goto fail;
 
-               if (dm_old_crtc_state->dsc_force_changed && new_crtc_state)
+               if (dm_old_crtc_state->dsc_force_changed)
                        new_crtc_state->mode_changed = true;
        }
 
index 3577785..26ed70e 100644 (file)
@@ -165,7 +165,10 @@ static struct list_head *remove_irq_handler(struct amdgpu_device *adev,
                handler = list_entry(entry, struct amdgpu_dm_irq_handler_data,
                                     list);
 
-               if (ih == handler) {
+               if (handler == NULL)
+                       continue;
+
+               if (ih == handler->handler) {
                        /* Found our handler. Remove it from the list. */
                        list_del(&handler->list);
                        handler_removed = true;
index d00b025..01b1853 100644 (file)
@@ -75,15 +75,8 @@ int rn_get_active_display_cnt_wa(
        for (i = 0; i < dc->link_count; i++) {
                const struct dc_link *link = dc->links[i];
 
-               /*
-                * Only notify active stream or virtual stream.
-                * Need to notify virtual stream to work around
-                * headless case. HPD does not fire when system is in
-                * S0i2.
-                */
                /* abusing the fact that the dig and phy are coupled to see if the phy is enabled */
-               if (link->connector_signal == SIGNAL_TYPE_VIRTUAL ||
-                               link->link_enc->funcs->is_dig_enabled(link->link_enc))
+               if (link->link_enc->funcs->is_dig_enabled(link->link_enc))
                        display_count++;
        }
 
@@ -234,12 +227,11 @@ void rn_update_clocks(struct clk_mgr *clk_mgr_base,
                                        rn_vbios_smu_set_dppclk(clk_mgr, clk_mgr_base->clks.dppclk_khz);
 
                // always update dtos unless clock is lowered and not safe to lower
-               if (new_clocks->dppclk_khz >= dc->current_state->bw_ctx.bw.dcn.clk.dppclk_khz)
-                       rn_update_clocks_update_dpp_dto(
-                                       clk_mgr,
-                                       context,
-                                       clk_mgr_base->clks.actual_dppclk_khz,
-                                       safe_to_lower);
+               rn_update_clocks_update_dpp_dto(
+                               clk_mgr,
+                               context,
+                               clk_mgr_base->clks.actual_dppclk_khz,
+                               safe_to_lower);
        }
 
        if (update_dispclk &&
@@ -738,32 +730,32 @@ static struct wm_table ddr4_wm_table_rn = {
                        .wm_inst = WM_A,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.72,
-                       .sr_exit_time_us = 9.09,
-                       .sr_enter_plus_exit_time_us = 10.14,
+                       .sr_exit_time_us = 11.90,
+                       .sr_enter_plus_exit_time_us = 12.80,
                        .valid = true,
                },
                {
                        .wm_inst = WM_B,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.72,
-                       .sr_exit_time_us = 11.12,
-                       .sr_enter_plus_exit_time_us = 12.48,
+                       .sr_exit_time_us = 13.18,
+                       .sr_enter_plus_exit_time_us = 14.30,
                        .valid = true,
                },
                {
                        .wm_inst = WM_C,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.72,
-                       .sr_exit_time_us = 11.12,
-                       .sr_enter_plus_exit_time_us = 12.48,
+                       .sr_exit_time_us = 13.18,
+                       .sr_enter_plus_exit_time_us = 14.30,
                        .valid = true,
                },
                {
                        .wm_inst = WM_D,
                        .wm_type = WM_TYPE_PSTATE_CHG,
                        .pstate_latency_us = 11.72,
-                       .sr_exit_time_us = 11.12,
-                       .sr_enter_plus_exit_time_us = 12.48,
+                       .sr_exit_time_us = 13.18,
+                       .sr_enter_plus_exit_time_us = 14.30,
                        .valid = true,
                },
        }
index 11a7b58..7deeec9 100644 (file)
@@ -99,7 +99,7 @@ int rn_vbios_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr, unsigned
        /* Trigger the message transaction by writing the message ID */
        REG_WRITE(MP1_SMN_C2PMSG_67, msg_id);
 
-       result = rn_smu_wait_for_response(clk_mgr, 10, 1000);
+       result = rn_smu_wait_for_response(clk_mgr, 10, 200000);
 
        ASSERT(result == VBIOSSMC_Result_OK || result == VBIOSSMC_Result_UnknownCmd);
 
index 9a8e66b..991b9c5 100644 (file)
@@ -74,15 +74,8 @@ int vg_get_active_display_cnt_wa(
        for (i = 0; i < dc->link_count; i++) {
                const struct dc_link *link = dc->links[i];
 
-               /*
-                * Only notify active stream or virtual stream.
-                * Need to notify virtual stream to work around
-                * headless case. HPD does not fire when system is in
-                * S0i2.
-                */
                /* abusing the fact that the dig and phy are coupled to see if the phy is enabled */
-               if (link->connector_signal == SIGNAL_TYPE_VIRTUAL ||
-                               link->link_enc->funcs->is_dig_enabled(link->link_enc))
+               if (link->link_enc->funcs->is_dig_enabled(link->link_enc))
                        display_count++;
        }
 
index 7339d98..58eb0d6 100644 (file)
@@ -2625,26 +2625,6 @@ static void commit_planes_for_stream(struct dc *dc,
                }
        }
 
-       if (update_type != UPDATE_TYPE_FAST) {
-               // If changing VTG FP2: wait until back in vactive to program FP2
-               // Need to ensure that pipe unlock happens soon after to minimize race condition
-               for (i = 0; i < dc->res_pool->pipe_count; i++) {
-                       struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i];
-
-                       if (pipe_ctx->top_pipe || pipe_ctx->stream != stream)
-                               continue;
-
-                       if (!pipe_ctx->update_flags.bits.global_sync)
-                               continue;
-
-                       pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VBLANK);
-                       pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE);
-
-                       pipe_ctx->stream_res.tg->funcs->set_vtg_params(
-                                       pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing, true);
-               }
-       }
-
        if ((update_type != UPDATE_TYPE_FAST) && dc->hwss.interdependent_update_lock)
                dc->hwss.interdependent_update_lock(dc, context, false);
        else
index 6b11d4a..2fc1223 100644 (file)
@@ -3173,13 +3173,7 @@ static void get_active_converter_info(
        }
 
        /* DPCD 0x5 bit 0 = 1, it indicate it's branch device */
-       if (ds_port.fields.PORT_TYPE == DOWNSTREAM_DP) {
-               link->dpcd_caps.is_branch_dev = false;
-       }
-
-       else {
-               link->dpcd_caps.is_branch_dev = ds_port.fields.PORT_PRESENT;
-       }
+       link->dpcd_caps.is_branch_dev = ds_port.fields.PORT_PRESENT;
 
        switch (ds_port.fields.PORT_TYPE) {
        case DOWNSTREAM_VGA:
index 41679ad..9e796df 100644 (file)
@@ -1241,6 +1241,22 @@ void hubp1_vtg_sel(struct hubp *hubp, uint32_t otg_inst)
        REG_UPDATE(DCHUBP_CNTL, HUBP_VTG_SEL, otg_inst);
 }
 
+bool hubp1_in_blank(struct hubp *hubp)
+{
+       uint32_t in_blank;
+       struct dcn10_hubp *hubp1 = TO_DCN10_HUBP(hubp);
+
+       REG_GET(DCHUBP_CNTL, HUBP_IN_BLANK, &in_blank);
+       return in_blank ? true : false;
+}
+
+void hubp1_soft_reset(struct hubp *hubp, bool reset)
+{
+       struct dcn10_hubp *hubp1 = TO_DCN10_HUBP(hubp);
+
+       REG_UPDATE(DCHUBP_CNTL, HUBP_DISABLE, reset ? 1 : 0);
+}
+
 void hubp1_init(struct hubp *hubp)
 {
        //do nothing
@@ -1272,6 +1288,8 @@ static const struct hubp_funcs dcn10_hubp_funcs = {
 
        .dmdata_set_attributes = NULL,
        .dmdata_load = NULL,
+       .hubp_soft_reset = hubp1_soft_reset,
+       .hubp_in_blank = hubp1_in_blank,
 };
 
 /*****************************************/
index 780af5b..a9a6ed7 100644 (file)
        HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_NO_OUTSTANDING_REQ, mask_sh),\
        HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_VTG_SEL, mask_sh),\
        HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_DISABLE, mask_sh),\
+       HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_IN_BLANK, mask_sh),\
        HUBP_SF(HUBP0_DCSURF_ADDR_CONFIG, NUM_PIPES, mask_sh),\
        HUBP_SF(HUBP0_DCSURF_ADDR_CONFIG, NUM_BANKS, mask_sh),\
        HUBP_SF(HUBP0_DCSURF_ADDR_CONFIG, PIPE_INTERLEAVE, mask_sh),\
        type HUBP_VTG_SEL;\
        type HUBP_UNDERFLOW_STATUS;\
        type HUBP_UNDERFLOW_CLEAR;\
+       type HUBP_IN_BLANK;\
        type NUM_PIPES;\
        type NUM_BANKS;\
        type PIPE_INTERLEAVE;\
@@ -772,5 +774,7 @@ void hubp1_vready_workaround(struct hubp *hubp,
 
 void hubp1_init(struct hubp *hubp);
 void hubp1_read_state_common(struct hubp *hubp);
+bool hubp1_in_blank(struct hubp *hubp);
+void hubp1_soft_reset(struct hubp *hubp, bool reset);
 
 #endif
index 3fcd408..a46cb20 100644 (file)
@@ -467,6 +467,17 @@ void mpc1_cursor_lock(struct mpc *mpc, int opp_id, bool lock)
        REG_SET(CUR[opp_id], 0, CUR_VUPDATE_LOCK_SET, lock ? 1 : 0);
 }
 
+unsigned int mpc1_get_mpc_out_mux(struct mpc *mpc, int opp_id)
+{
+       struct dcn10_mpc *mpc10 = TO_DCN10_MPC(mpc);
+       uint32_t val = 0;
+
+       if (opp_id < MAX_OPP && REG(MUX[opp_id]))
+               REG_GET(MUX[opp_id], MPC_OUT_MUX, &val);
+
+       return val;
+}
+
 static const struct mpc_funcs dcn10_mpc_funcs = {
        .read_mpcc_state = mpc1_read_mpcc_state,
        .insert_plane = mpc1_insert_plane,
@@ -483,6 +494,7 @@ static const struct mpc_funcs dcn10_mpc_funcs = {
        .set_denorm_clamp = NULL,
        .set_output_csc = NULL,
        .set_output_gamma = NULL,
+       .get_mpc_out_mux = mpc1_get_mpc_out_mux,
 };
 
 void dcn10_mpc_construct(struct dcn10_mpc *mpc10,
index 66a4719..dbfffc6 100644 (file)
@@ -200,4 +200,5 @@ void mpc1_read_mpcc_state(
 
 void mpc1_cursor_lock(struct mpc *mpc, int opp_id, bool lock);
 
+unsigned int mpc1_get_mpc_out_mux(struct mpc *mpc, int opp_id);
 #endif
index b7e44e5..0df0da2 100644 (file)
@@ -1595,6 +1595,8 @@ static struct hubp_funcs dcn20_hubp_funcs = {
        .hubp_set_flip_control_surface_gsl = hubp2_set_flip_control_surface_gsl,
        .hubp_init = hubp1_init,
        .validate_dml_output = hubp2_validate_dml_output,
+       .hubp_in_blank = hubp1_in_blank,
+       .hubp_soft_reset = hubp1_soft_reset,
 };
 
 
index 31a4771..cb822df 100644 (file)
@@ -1586,7 +1586,10 @@ static void dcn20_program_pipe(
                        && !pipe_ctx->top_pipe && !pipe_ctx->prev_odm_pipe)
                hws->funcs.blank_pixel_data(dc, pipe_ctx, !pipe_ctx->plane_state->visible);
 
-       if (pipe_ctx->update_flags.bits.global_sync) {
+       /* Only update TG on top pipe */
+       if (pipe_ctx->update_flags.bits.global_sync && !pipe_ctx->top_pipe
+                       && !pipe_ctx->prev_odm_pipe) {
+
                pipe_ctx->stream_res.tg->funcs->program_global_sync(
                                pipe_ctx->stream_res.tg,
                                pipe_ctx->pipe_dlg_param.vready_offset,
@@ -1594,8 +1597,11 @@ static void dcn20_program_pipe(
                                pipe_ctx->pipe_dlg_param.vupdate_offset,
                                pipe_ctx->pipe_dlg_param.vupdate_width);
 
+               pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VBLANK);
+               pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE);
+
                pipe_ctx->stream_res.tg->funcs->set_vtg_params(
-                               pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing, false);
+                               pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing, true);
 
                if (hws->funcs.setup_vupdate_interrupt)
                        hws->funcs.setup_vupdate_interrupt(dc, pipe_ctx);
@@ -2570,4 +2576,4 @@ void dcn20_set_disp_pattern_generator(const struct dc *dc,
 {
        pipe_ctx->stream_res.opp->funcs->opp_set_disp_pattern_generator(pipe_ctx->stream_res.opp, test_pattern,
                        color_space, color_depth, solid_color, width, height, offset);
-}
\ No newline at end of file
+}
index 99cc095..6a99fdd 100644 (file)
@@ -556,6 +556,7 @@ const struct mpc_funcs dcn20_mpc_funcs = {
        .set_ocsc_default = mpc2_set_ocsc_default,
        .set_output_gamma = mpc2_set_output_gamma,
        .power_on_mpc_mem_pwr = mpc20_power_on_ogam_lut,
+       .get_mpc_out_mux = mpc1_get_mpc_out_mux,
 };
 
 void dcn20_mpc_construct(struct dcn20_mpc *mpc20,
index ff36db5..e04ecf0 100644 (file)
@@ -1933,7 +1933,7 @@ bool dcn20_split_stream_for_odm(
                next_odm_pipe->stream_res.opp = pool->opps[next_odm_pipe->pipe_idx];
        else
                next_odm_pipe->stream_res.opp = next_odm_pipe->top_pipe->stream_res.opp;
-       if (next_odm_pipe->stream->timing.flags.DSC == 1) {
+       if (next_odm_pipe->stream->timing.flags.DSC == 1 && !next_odm_pipe->top_pipe) {
                dcn20_acquire_dsc(dc, res_ctx, &next_odm_pipe->stream_res.dsc, next_odm_pipe->pipe_idx);
                ASSERT(next_odm_pipe->stream_res.dsc);
                if (next_odm_pipe->stream_res.dsc == NULL)
index af462fe..88ffa9f 100644 (file)
@@ -509,6 +509,8 @@ static struct hubp_funcs dcn30_hubp_funcs = {
        .hubp_clear_underflow = hubp2_clear_underflow,
        .hubp_set_flip_control_surface_gsl = hubp2_set_flip_control_surface_gsl,
        .hubp_init = hubp3_init,
+       .hubp_in_blank = hubp1_in_blank,
+       .hubp_soft_reset = hubp1_soft_reset,
 };
 
 bool hubp3_construct(
index d7d053f..3e6f760 100644 (file)
@@ -1428,6 +1428,7 @@ const struct mpc_funcs dcn30_mpc_funcs = {
        .program_3dlut = mpc3_program_3dlut,
        .release_rmu = mpcc3_release_rmu,
        .power_on_mpc_mem_pwr = mpc20_power_on_ogam_lut,
+       .get_mpc_out_mux = mpc1_get_mpc_out_mux,
 
 };
 
index 315e306..22f3f64 100644 (file)
@@ -188,6 +188,8 @@ struct hubp_funcs {
        void (*set_unbounded_requesting)(
                struct hubp *hubp,
                bool enable);
+       bool (*hubp_in_blank)(struct hubp *hubp);
+       void (*hubp_soft_reset)(struct hubp *hubp, bool reset);
 
 };
 
index 879f502..75c77ad 100644 (file)
@@ -359,6 +359,10 @@ struct mpc_funcs {
 
        int (*release_rmu)(struct mpc *mpc, int mpcc_id);
 
+       unsigned int (*get_mpc_out_mux)(
+                       struct mpc *mpc,
+                       int opp_id);
+
 };
 
 #endif
index f512bda..249a076 100644 (file)
 
 /* Firmware versioning. */
 #ifdef DMUB_EXPOSE_VERSION
-#define DMUB_FW_VERSION_GIT_HASH 0xa18e25995
+#define DMUB_FW_VERSION_GIT_HASH 0xf51b86a
 #define DMUB_FW_VERSION_MAJOR 0
 #define DMUB_FW_VERSION_MINOR 0
-#define DMUB_FW_VERSION_REVISION 46
+#define DMUB_FW_VERSION_REVISION 47
 #define DMUB_FW_VERSION_TEST 0
 #define DMUB_FW_VERSION_VBIOS 0
 #define DMUB_FW_VERSION_HOTFIX 0
index f244b72..73ca49f 100644 (file)
@@ -128,8 +128,12 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp)
 
 static inline enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp)
 {
-       /* device count must be greater than or equal to tracked hdcp displays */
-       return (get_device_count(hdcp) < get_active_display_count(hdcp)) ?
+       /* Some MST displays may report the internal panel as an HDCP RX. Compare
+        * against 1 + get_device_count(hdcp), because the immediate repeater's
+        * internal panel is possibly not included in DEVICE_COUNT. The device
+        * count must be greater than or equal to tracked hdcp displays.
+        */
+       return ((1 + get_device_count(hdcp)) < get_active_display_count(hdcp)) ?
                        MOD_HDCP_STATUS_HDCP1_DEVICE_COUNT_MISMATCH_FAILURE :
                        MOD_HDCP_STATUS_SUCCESS;
 }
index 549c113..a0895a7 100644 (file)
@@ -207,8 +207,11 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp)
 
 static enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp)
 {
-       /* device count must be greater than or equal to tracked hdcp displays */
-       return (get_device_count(hdcp) < get_active_display_count(hdcp)) ?
+       /* Some MST displays may report the internal panel as an HDCP RX. Compare */
+       /* against 1 + get_device_count(hdcp), because the immediate repeater's   */
+       /* internal panel is possibly not included in DEVICE_COUNT. The device    */
+       /* count must be greater than or equal to tracked hdcp displays.          */
+       return ((1 + get_device_count(hdcp)) < get_active_display_count(hdcp)) ?
                        MOD_HDCP_STATUS_HDCP2_DEVICE_COUNT_MISMATCH_FAILURE :
                        MOD_HDCP_STATUS_SUCCESS;
 }
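
To make the relaxed check concrete (numbers hypothetical): a repeater that counts its internal panel on the RX side may report DEVICE_COUNT = 3 while the driver tracks 4 active displays. The old check failed because 3 < 4; the new one succeeds because 1 + 3 < 4 is false, so MOD_HDCP_STATUS_SUCCESS is returned.
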
index cc983f6..4fd8bce 100644 (file)
@@ -82,22 +82,24 @@ struct abm_parameters {
        unsigned char deviation_gain;
        unsigned char min_knee;
        unsigned char max_knee;
+       unsigned short blRampReduction;
+       unsigned short blRampStart;
 };
 
 static const struct abm_parameters abm_settings_config0[abm_defines_max_level] = {
-//  min_red  max_red  bright_pos  dark_pos  brightness_gain  contrast  deviation  min_knee  max_knee
-       {0xff,   0xbf,    0x20,       0x00,     0xff,            0x99,     0xb3,      0x40,     0xe0},
-       {0xde,   0x85,    0x20,       0x00,     0xff,            0x90,     0xa8,      0x40,     0xdf},
-       {0xb0,   0x50,    0x20,       0x00,     0xc0,            0x88,     0x78,      0x70,     0xa0},
-       {0x82,   0x40,    0x20,       0x00,     0x00,            0xff,     0xb3,      0x70,     0x70},
+//  min_red  max_red  bright_pos  dark_pos  bright_gain  contrast  dev   min_knee  max_knee  blRed    blStart
+       {0xff,   0xbf,    0x20,   0x00,     0xff,        0x99,     0xb3, 0x40,     0xe0,     0xCCCC,  0xCCCC},
+       {0xde,   0x85,    0x20,   0x00,     0xff,        0x90,     0xa8, 0x40,     0xdf,     0xCCCC,  0xCCCC},
+       {0xb0,   0x50,    0x20,   0x00,     0xc0,        0x88,     0x78, 0x70,     0xa0,     0xCCCC,  0xCCCC},
+       {0x82,   0x40,    0x20,   0x00,     0x00,        0xff,     0xb3, 0x70,     0x70,     0xCCCC,  0xCCCC},
 };
 
 static const struct abm_parameters abm_settings_config1[abm_defines_max_level] = {
-//  min_red  max_red  bright_pos  dark_pos  brightness_gain  contrast  deviation  min_knee  max_knee
-       {0xf0,   0xd9,    0x20,       0x00,     0x00,            0xff,     0xb3,      0x70,     0x70},
-       {0xcd,   0xa5,    0x20,       0x00,     0x00,            0xff,     0xb3,      0x70,     0x70},
-       {0x99,   0x65,    0x20,       0x00,     0x00,            0xff,     0xb3,      0x70,     0x70},
-       {0x82,   0x4d,    0x20,       0x00,     0x00,            0xff,     0xb3,      0x70,     0x70},
+//  min_red  max_red  bright_pos  dark_pos  bright_gain  contrast  dev   min_knee  max_knee  blRed    blStart
+       {0xf0,   0xd9,    0x20,   0x00,     0x00,        0xff,     0xb3, 0x70,     0x70,     0xCCCC,  0xCCCC},
+       {0xcd,   0xa5,    0x20,   0x00,     0x00,        0xff,     0xb3, 0x70,     0x70,     0xCCCC,  0xCCCC},
+       {0x99,   0x65,    0x20,   0x00,     0x00,        0xff,     0xb3, 0x70,     0x70,     0xCCCC,  0xCCCC},
+       {0x82,   0x4d,    0x20,   0x00,     0x00,        0xff,     0xb3, 0x70,     0x70,     0xCCCC,  0xCCCC},
 };
 
 static const struct abm_parameters * const abm_settings[] = {
@@ -662,6 +664,7 @@ bool dmub_init_abm_config(struct resource_pool *res_pool,
 {
        struct iram_table_v_2_2 ram_table;
        struct abm_config_table config;
+       unsigned int set = params.set;
        bool result = false;
        uint32_t i, j = 0;
 
@@ -710,6 +713,18 @@ bool dmub_init_abm_config(struct resource_pool *res_pool,
                config.max_knee[i] = ram_table.max_knee[i];
        }
 
+       if (params.backlight_ramping_override) {
+               for (i = 0; i < NUM_AGGR_LEVEL; i++) {
+                       config.blRampReduction[i] = params.backlight_ramping_reduction;
+                       config.blRampStart[i] = params.backlight_ramping_start;
+               }
+       } else {
+               for (i = 0; i < NUM_AGGR_LEVEL; i++) {
+                       config.blRampReduction[i] = abm_settings[set][i].blRampReduction;
+                       config.blRampStart[i] = abm_settings[set][i].blRampStart;
+               }
+       }
+
        config.min_abm_backlight = ram_table.min_abm_backlight;
 
 #if defined(CONFIG_DRM_AMD_DC_DCN)
index fa4728d..6f2eecc 100644 (file)
@@ -39,6 +39,7 @@ enum abm_defines {
 struct dmcu_iram_parameters {
        unsigned int *backlight_lut_array;
        unsigned int backlight_lut_array_size;
+       bool backlight_ramping_override;
        unsigned int backlight_ramping_reduction;
        unsigned int backlight_ramping_start;
        unsigned int min_abm_backlight;
index e5aa072..13de692 100644 (file)
@@ -30,7 +30,7 @@
 #define SMU11_DRIVER_IF_VERSION_NV10 0x36
 #define SMU11_DRIVER_IF_VERSION_NV12 0x36
 #define SMU11_DRIVER_IF_VERSION_NV14 0x36
-#define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x3B
+#define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x3D
 #define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0xC
 #define SMU11_DRIVER_IF_VERSION_VANGOGH 0x02
 #define SMU11_DRIVER_IF_VERSION_Dimgrey_Cavefish 0xF
index 9bccf2a..8cb4fce 100644 (file)
@@ -724,8 +724,13 @@ static int vangogh_set_fine_grain_gfx_freq_parameters(struct smu_context *smu)
 
 static int vangogh_system_features_control(struct smu_context *smu, bool en)
 {
-       return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_RlcPowerNotify,
-                                       en ? RLC_STATUS_NORMAL : RLC_STATUS_OFF, NULL);
+       struct amdgpu_device *adev = smu->adev;
+
+       if (adev->pm.fw_version >= 0x43f1700)
+               return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_RlcPowerNotify,
+                                               en ? RLC_STATUS_NORMAL : RLC_STATUS_OFF, NULL);
+       else
+               return 0;
 }
 
 static const struct pptable_funcs vangogh_ppt_funcs = {
index 1f8195b..ca891ae 100644 (file)
@@ -152,7 +152,6 @@ static int komeda_parse_dt(struct device *dev, struct komeda_dev *mdev)
        ret = of_reserved_mem_device_init(dev);
        if (ret && ret != -ENODEV)
                return ret;
-       ret = 0;
 
        for_each_available_child_of_node(np, child) {
                if (of_node_name_eq(child, "pipeline")) {
index 6b99df6..034ee08 100644 (file)
@@ -81,10 +81,10 @@ static void komeda_kms_commit_tail(struct drm_atomic_state *old_state)
 
        drm_atomic_helper_commit_modeset_enables(dev, old_state);
 
-       drm_atomic_helper_wait_for_flip_done(dev, old_state);
-
        drm_atomic_helper_commit_hw_done(old_state);
 
+       drm_atomic_helper_wait_for_flip_done(dev, old_state);
+
        drm_atomic_helper_cleanup_planes(dev, old_state);
 }
 
index 452e505..719a797 100644 (file)
@@ -137,9 +137,10 @@ komeda_pipeline_get_first_component(struct komeda_pipeline *pipe,
                                    u32 comp_mask)
 {
        struct komeda_component *c = NULL;
+       unsigned long comp_mask_local = (unsigned long)comp_mask;
        int id;
 
-       id = find_first_bit((unsigned long *)&comp_mask, 32);
+       id = find_first_bit(&comp_mask_local, 32);
        if (id < 32)
                c = komeda_pipeline_get_component(pipe, id);
 
index 8f32ae7..5c08511 100644 (file)
@@ -704,10 +704,10 @@ komeda_compiz_set_input(struct komeda_compiz *compiz,
        cin->layer_alpha = dflow->layer_alpha;
 
        old_st = komeda_component_get_old_state(&compiz->base, drm_st);
-       WARN_ON(!old_st);
 
        /* compare with old to check if this input has been changed */
-       if (memcmp(&(to_compiz_st(old_st)->cins[idx]), cin, sizeof(*cin)))
+       if (WARN_ON(!old_st) ||
+           memcmp(&(to_compiz_st(old_st)->cins[idx]), cin, sizeof(*cin)))
                c_st->changed_active_inputs |= BIT(idx);
 
        komeda_component_add_input(c_st, &dflow->input, idx);
index ad5cc13..1c939f9 100644 (file)
@@ -297,13 +297,9 @@ int intel_lpe_audio_init(struct drm_i915_private *dev_priv)
  */
 void intel_lpe_audio_teardown(struct drm_i915_private *dev_priv)
 {
-       struct irq_desc *desc;
-
        if (!HAS_LPE_AUDIO(dev_priv))
                return;
 
-       desc = irq_to_desc(dev_priv->lpe_audio.irq);
-
        lpe_audio_platdev_destroy(dev_priv);
 
        irq_free_desc(dev_priv->lpe_audio.irq);
index c80eeac..6cdb052 100644 (file)
  * and related files, but that will be described in separate chapters.
  */
 
+/*
+ * Interrupt statistics for the PMU. Increments the counter only if the
+ * interrupt originated from the GPU, so interrupts from a device which
+ * shares the interrupt line are not accounted.
+ */
+static inline void pmu_irq_stats(struct drm_i915_private *i915,
+                                irqreturn_t res)
+{
+       if (unlikely(res != IRQ_HANDLED))
+               return;
+
+       /*
+        * A clever compiler translates that into INC. A not so clever one
+        * should at least prevent store tearing.
+        */
+       WRITE_ONCE(i915->pmu.irq_count, i915->pmu.irq_count + 1);
+}
+
 typedef bool (*long_pulse_detect_func)(enum hpd_pin pin, u32 val);
 typedef u32 (*hotplug_enables_func)(struct drm_i915_private *i915,
                                    enum hpd_pin pin);
@@ -1668,6 +1686,8 @@ static irqreturn_t valleyview_irq_handler(int irq, void *arg)
                valleyview_pipestat_irq_handler(dev_priv, pipe_stats);
        } while (0);
 
+       pmu_irq_stats(dev_priv, ret);
+
        enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
        return ret;
@@ -1745,6 +1765,8 @@ static irqreturn_t cherryview_irq_handler(int irq, void *arg)
                valleyview_pipestat_irq_handler(dev_priv, pipe_stats);
        } while (0);
 
+       pmu_irq_stats(dev_priv, ret);
+
        enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
        return ret;
@@ -2155,6 +2177,8 @@ static irqreturn_t ilk_irq_handler(int irq, void *arg)
        if (sde_ier)
                raw_reg_write(regs, SDEIER, sde_ier);
 
+       pmu_irq_stats(i915, ret);
+
        /* IRQs are synced during runtime_suspend, we don't require a wakeref */
        enable_rpm_wakeref_asserts(&i915->runtime_pm);
 
@@ -2541,6 +2565,8 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 
        gen8_master_intr_enable(regs);
 
+       pmu_irq_stats(dev_priv, IRQ_HANDLED);
+
        return IRQ_HANDLED;
 }
 
@@ -2636,6 +2662,8 @@ __gen11_irq_handler(struct drm_i915_private * const i915,
 
        gen11_gu_misc_irq_handler(gt, gu_misc_iir);
 
+       pmu_irq_stats(i915, IRQ_HANDLED);
+
        return IRQ_HANDLED;
 }
 
@@ -3934,6 +3962,8 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
                i8xx_pipestat_irq_handler(dev_priv, iir, pipe_stats);
        } while (0);
 
+       pmu_irq_stats(dev_priv, ret);
+
        enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
        return ret;
@@ -4043,6 +4073,8 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
                i915_pipestat_irq_handler(dev_priv, iir, pipe_stats);
        } while (0);
 
+       pmu_irq_stats(dev_priv, ret);
+
        enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
        return ret;
@@ -4189,6 +4221,8 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
                i965_pipestat_irq_handler(dev_priv, iir, pipe_stats);
        } while (0);
 
+       pmu_irq_stats(dev_priv, IRQ_HANDLED);
+
        enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
        return ret;
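
The new counter is a plain unsigned long: the interrupt handler is the only writer, so WRITE_ONCE()/READ_ONCE() suffice to prevent store and load tearing without atomics. A minimal sketch of that single-writer pattern, with hypothetical names:

#include <linux/compiler.h>

struct example_pmu {
	unsigned long irq_count;
};

/* Writer: only ever called from the device's own interrupt handler. */
static void example_irq_tick(struct example_pmu *pmu)
{
	/* No atomic needed for a single writer; just prevent store tearing. */
	WRITE_ONCE(pmu->irq_count, pmu->irq_count + 1);
}

/* Reader: paired READ_ONCE keeps the load from being torn or cached. */
static unsigned long example_irq_read(struct example_pmu *pmu)
{
	return READ_ONCE(pmu->irq_count);
}
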
index cd786ad..d76685c 100644 (file)
@@ -4,7 +4,6 @@
  * Copyright Â© 2017-2018 Intel Corporation
  */
 
-#include <linux/irq.h>
 #include <linux/pm_runtime.h>
 
 #include "gt/intel_engine.h"
@@ -424,22 +423,6 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
        return HRTIMER_RESTART;
 }
 
-static u64 count_interrupts(struct drm_i915_private *i915)
-{
-       /* open-coded kstat_irqs() */
-       struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq);
-       u64 sum = 0;
-       int cpu;
-
-       if (!desc || !desc->kstat_irqs)
-               return 0;
-
-       for_each_possible_cpu(cpu)
-               sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
-
-       return sum;
-}
-
 static void i915_pmu_event_destroy(struct perf_event *event)
 {
        struct drm_i915_private *i915 =
@@ -590,7 +573,7 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
                                   USEC_PER_SEC /* to MHz */);
                        break;
                case I915_PMU_INTERRUPTS:
-                       val = count_interrupts(i915);
+                       val = READ_ONCE(pmu->irq_count);
                        break;
                case I915_PMU_RC6_RESIDENCY:
                        val = get_rc6(&i915->gt);
index a24885a..8405d6d 100644 (file)
@@ -111,6 +111,14 @@ struct i915_pmu {
         * @sleep_last: Last time GT parked for RC6 estimation.
         */
        ktime_t sleep_last;
+       /**
+        * @irq_count: Number of interrupts
+        *
+        * Intentionally unsigned long to avoid atomics or heuristics on 32bit.
+        * 4e9 interrupts are a lot and postprocessing can really deal with an
+        * occasional wraparound easily. It's 32bit after all.
+        */
+       unsigned long irq_count;
        /**
         * @events_attr_group: Device events attribute group.
         */
index 5455b20..7b2f606 100644 (file)
@@ -239,21 +239,6 @@ static struct page *ttm_pool_type_take(struct ttm_pool_type *pt)
        return p;
 }
 
-/* Count the number of pages available in a pool_type */
-static unsigned int ttm_pool_type_count(struct ttm_pool_type *pt)
-{
-       unsigned int count = 0;
-       struct page *p;
-
-       spin_lock(&pt->lock);
-       /* Only used for debugfs, the overhead doesn't matter */
-       list_for_each_entry(p, &pt->pages, lru)
-               ++count;
-       spin_unlock(&pt->lock);
-
-       return count;
-}
-
 /* Initialize and add a pool type to the global shrinker list */
 static void ttm_pool_type_init(struct ttm_pool_type *pt, struct ttm_pool *pool,
                               enum ttm_caching caching, unsigned int order)
@@ -543,6 +528,20 @@ void ttm_pool_fini(struct ttm_pool *pool)
 EXPORT_SYMBOL(ttm_pool_fini);
 
 #ifdef CONFIG_DEBUG_FS
+/* Count the number of pages available in a pool_type */
+static unsigned int ttm_pool_type_count(struct ttm_pool_type *pt)
+{
+       unsigned int count = 0;
+       struct page *p;
+
+       spin_lock(&pt->lock);
+       /* Only used for debugfs, the overhead doesn't matter */
+       list_for_each_entry(p, &pt->pages, lru)
+               ++count;
+       spin_unlock(&pt->lock);
+
+       return count;
+}
 
 /* Dump information about the different pool types */
 static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
index 560865f..67f86c4 100644 (file)
@@ -157,12 +157,6 @@ void rtrs_clt_put_permit(struct rtrs_clt *clt, struct rtrs_permit *permit)
 }
 EXPORT_SYMBOL(rtrs_clt_put_permit);
 
-void *rtrs_permit_to_pdu(struct rtrs_permit *permit)
-{
-       return permit + 1;
-}
-EXPORT_SYMBOL(rtrs_permit_to_pdu);
-
 /**
  * rtrs_permit_to_clt_con() - returns RDMA connection pointer by the permit
  * @sess: client session pointer
index 9af750f..8738e90 100644 (file)
@@ -63,13 +63,6 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops,
 
 void rtrs_clt_close(struct rtrs_clt *sess);
 
-/**
- * rtrs_permit_to_pdu() - converts rtrs_permit to opaque pdu pointer
- * @permit: RTRS permit pointer, it associates the memory allocation for future
- *          RDMA operation.
- */
-void *rtrs_permit_to_pdu(struct rtrs_permit *permit);
-
 enum {
        RTRS_PERMIT_NOWAIT = 0,
        RTRS_PERMIT_WAIT   = 1,
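
The removed helper encoded a layout convention rather than any logic: each permit is allocated with its PDU immediately behind it, so the opaque payload is simply "one past the struct". A sketch of that trailing-payload idiom, assuming a hypothetical my_permit allocated as sizeof(struct) + pdu_size:

        #include <linux/slab.h>
        #include <linux/types.h>

        struct my_permit {
                u16 mem_id;
                /* pdu_size bytes of payload follow this struct in the
                 * same allocation */
        };

        static struct my_permit *my_permit_alloc(size_t pdu_size)
        {
                return kzalloc(sizeof(struct my_permit) + pdu_size, GFP_KERNEL);
        }

        static void *my_permit_to_pdu(struct my_permit *permit)
        {
                return permit + 1;      /* first byte past the struct */
        }
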
index 0e06d72..a4752ac 100644 (file)
@@ -2535,8 +2535,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
                        else
                                err = "device busy";
                        mutex_unlock(&bch_register_lock);
-                       if (!IS_ERR(bdev))
-                               bdput(bdev);
                        if (attr == &ksysfs_register_quiet)
                                goto done;
                }
index 554e3af..00a520c 100644 (file)
@@ -404,7 +404,7 @@ STORE(__cached_dev)
                if (!env)
                        return -ENOMEM;
                add_uevent_var(env, "DRIVER=bcache");
-               add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
+               add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid);
                add_uevent_var(env, "CACHED_LABEL=%s", buf);
                kobject_uevent_env(&disk_to_dev(dc->disk.disk)->kobj,
                                   KOBJ_CHANGE,
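
The bcache hunk above fixes a stray comma operator. The old code still worked, because the comma merely sequences the two calls into one expression statement, but the construct is a latent trap. A self-contained illustration with placeholder functions:

        static void do_first(void)  { }
        static void do_second(void) { }

        static void example(int cond)
        {
                /* Both calls always run here; the comma operator just
                 * sequences them into a single expression statement: */
                do_first(),
                do_second();

                /* But the same typo under an unbraced if silently pulls
                 * do_second() into the conditional: */
                if (cond)
                        do_first(),
                        do_second();    /* runs only when cond != 0 */
        }
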
index 6d1bf7c..a320393 100644 (file)
@@ -1513,24 +1513,14 @@ static int ab8500_interrupts_show(struct seq_file *s, void *p)
 {
        int line;
 
-       seq_puts(s, "name: number:  number of: wake:\n");
+       seq_puts(s, "name: number: irq: number of: wake:\n");
 
        for (line = 0; line < num_interrupt_lines; line++) {
-               struct irq_desc *desc = irq_to_desc(line + irq_first);
-
-               seq_printf(s, "%3i:  %6i %4i",
+               seq_printf(s, "%3i:  %4i %6i %4i\n",
                           line,
+                          line + irq_first,
                           num_interrupts[line],
                           num_wake_interrupts[line]);
-
-               if (desc && desc->name)
-                       seq_printf(s, "-%-8s", desc->name);
-               if (desc && desc->action) {
-                       struct irqaction *action = desc->action;
-
-                       seq_printf(s, "  %s", action->name);
-                       while ((action = action->next) != NULL)
-                               seq_printf(s, ", %s", action->name);
-               }
-
-               seq_putc(s, '\n');
        }
index 74d4667..d5fc72b 100644 (file)
@@ -90,7 +90,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
                        int cq_idx)
 {
        struct mlx4_en_dev *mdev = priv->mdev;
-       int err = 0;
+       int irq, err = 0;
        int timestamp_en = 0;
        bool assigned_eq = false;
 
@@ -116,10 +116,8 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
 
                        assigned_eq = true;
                }
-
-               cq->irq_desc =
-                       irq_to_desc(mlx4_eq_get_irq(mdev->dev,
-                                                   cq->vector));
+               irq = mlx4_eq_get_irq(mdev->dev, cq->vector);
+               cq->aff_mask = irq_get_effective_affinity_mask(irq);
        } else {
                /* For TX we use the same irq per
                ring we assigned for the RX    */
index 7954c1d..c1c9118 100644 (file)
@@ -958,18 +958,14 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
 
        /* If we used up all the quota - we're probably not done yet... */
        if (done == budget || !clean_complete) {
-               const struct cpumask *aff;
-               struct irq_data *idata;
                int cpu_curr;
 
                /* in case we got here because of !clean_complete */
                done = budget;
 
                cpu_curr = smp_processor_id();
-               idata = irq_desc_get_irq_data(cq->irq_desc);
-               aff = irq_data_get_affinity_mask(idata);
 
-               if (likely(cpumask_test_cpu(cpu_curr, aff)))
+               if (likely(cpumask_test_cpu(cpu_curr, cq->aff_mask)))
                        return budget;
 
                /* Current cpu is not according to smp_irq_affinity -
index 17f2b19..e8ed231 100644 (file)
@@ -47,6 +47,7 @@
 #endif
 #include <linux/cpu_rmap.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/irq.h>
 #include <net/xdp.h>
 
 #include <linux/mlx4/device.h>
@@ -365,7 +366,7 @@ struct mlx4_en_cq {
        struct mlx4_cqe *buf;
 #define MLX4_EN_OPCODE_ERROR   0x1e
 
-       struct irq_desc *irq_desc;
+       const struct cpumask *aff_mask;
 };
 
 struct mlx4_en_port_profile {
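
Both the mlx4 change above and the analogous mlx5 hunks below follow the same recipe: resolve the effective affinity mask once at CQ/channel setup and keep only the cpumask pointer, so the NAPI hot path no longer reaches into irq_desc. A condensed sketch with hypothetical names (my_cq and friends):

        #include <linux/irq.h>
        #include <linux/smp.h>
        #include <linux/cpumask.h>

        struct my_cq {
                const struct cpumask *aff_mask;
        };

        static void my_cq_setup(struct my_cq *cq, int irq)
        {
                /* Resolved once at setup; the mask stays valid for this
                 * "are we on the right CPU?" heuristic. */
                cq->aff_mask = irq_get_effective_affinity_mask(irq);
        }

        /* Hot path: a single cpumask test, no irq_desc dereference. */
        static bool my_cq_on_mapped_cpu(struct my_cq *cq)
        {
                return cpumask_test_cpu(smp_processor_id(), cq->aff_mask);
        }
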
index a1a81cf..055baf3 100644 (file)
@@ -684,7 +684,7 @@ struct mlx5e_channel {
        spinlock_t                 async_icosq_lock;
 
        /* data path - accessed per napi poll */
-       struct irq_desc *irq_desc;
+       const struct cpumask      *aff_mask;
        struct mlx5e_ch_stats     *stats;
 
        /* control */
index 3511189..2a2bac3 100644 (file)
@@ -479,7 +479,6 @@ int mlx5e_port_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
        c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
        c->num_tc   = params->num_tc;
        c->stats    = &priv->port_ptp_stats.ch;
-       c->irq_desc = irq_to_desc(irq);
        c->lag_port = lag_port;
 
        netif_napi_add(netdev, &c->napi, mlx5e_ptp_napi_poll, 64);
index 28aa5ae..90c98ea 100644 (file)
@@ -28,7 +28,6 @@ struct mlx5e_port_ptp {
        u8                         lag_port;
 
        /* data path - accessed per napi poll */
-       struct irq_desc *irq_desc;
        struct mlx5e_ch_stats     *stats;
 
        /* control */
index 0383165..7a79d33 100644 (file)
@@ -1987,7 +1987,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
        c->num_tc   = params->num_tc;
        c->xdp      = !!params->xdp_prog;
        c->stats    = &priv->channel_stats[ix].ch;
-       c->irq_desc = irq_to_desc(irq);
+       c->aff_mask = irq_get_effective_affinity_mask(irq);
        c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);
 
        netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
index 1ec3d62..a3cfe06 100644 (file)
 static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 {
        int current_cpu = smp_processor_id();
-       const struct cpumask *aff;
-       struct irq_data *idata;
 
-       idata = irq_desc_get_irq_data(c->irq_desc);
-       aff = irq_data_get_affinity_mask(idata);
-       return cpumask_test_cpu(current_cpu, aff);
+       return cpumask_test_cpu(current_cpu, c->aff_mask);
 }
 
 static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
index 052975e..4c41df6 100644 (file)
@@ -3072,6 +3072,7 @@ static int virtnet_probe(struct virtio_device *vdev)
                        dev_err(&vdev->dev,
                                "device MTU appears to have changed it is now %d < %d",
                                mtu, dev->min_mtu);
+                       err = -EINVAL;
                        goto free;
                }
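
This one-liner fixes a classic error-path bug: jumping to the unwind label with err still holding 0, so a failed probe reports success. The defensive shape of the pattern, sketched with placeholder names:

        #include <linux/types.h>
        #include <linux/errno.h>

        struct my_device { int bad; };
        static int my_setup(struct my_device *dev)          { return 0; }
        static bool my_config_is_bad(struct my_device *dev) { return dev->bad; }
        static void my_teardown(struct my_device *dev)      { }

        static int my_probe(struct my_device *dev)
        {
                int err;

                err = my_setup(dev);
                if (err)
                        goto free;

                if (my_config_is_bad(dev)) {
                        err = -EINVAL;  /* must be set explicitly, or the
                                         * caller sees a "successful" 0 */
                        goto free;
                }

                return 0;

        free:
                my_teardown(dev);
                return err;
        }
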
 
index 0a5e884..3f05cfb 100644 (file)
@@ -282,15 +282,13 @@ int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
                                  struct ntb_msi_desc *msi_desc)
 {
        struct msi_desc *entry;
-       struct irq_desc *desc;
        int ret;
 
        if (!ntb->msi)
                return -EINVAL;
 
        for_each_pci_msi_entry(entry, ntb->pdev) {
-               desc = irq_to_desc(entry->irq);
-               if (desc->action)
+               if (irq_has_action(entry->irq))
                        continue;
 
                ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
index 2e258be..aa53e0b 100644 (file)
@@ -7,7 +7,6 @@
 #ifndef _LINUX_BTT_H
 #define _LINUX_BTT_H
 
-#include <linux/badblocks.h>
 #include <linux/types.h>
 
 #define BTT_SIG_LEN 16
@@ -197,6 +196,8 @@ struct arena_info {
        int log_index[2];
 };
 
+struct badblocks;
+
 /**
  * struct btt - handle for a BTT instance
  * @btt_disk:          Pointer to the gendisk for BTT device
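
Replacing the include with "struct badblocks;" works because btt.h only ever stores a pointer to the structure; the full definition is needed only where members are accessed or the size matters. A minimal sketch of the header-decoupling idiom, with a hypothetical my_arena:

        /* my_header.h: no heavy include needed. */
        struct badblocks;               /* forward declaration suffices */

        struct my_arena {
                struct badblocks *bb;   /* a pointer to an incomplete type
                                         * is perfectly legal here */
        };

        /* my_impl.c: only the code that dereferences bb pulls in the full
         * definition, e.g. via #include <linux/badblocks.h>. */
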
index 5a7c800..030dbde 100644 (file)
@@ -4,6 +4,7 @@
  */
 #include <linux/device.h>
 #include <linux/sizes.h>
+#include <linux/badblocks.h>
 #include "nd-core.h"
 #include "pmem.h"
 #include "pfn.h"
index c21ba06..7de592d 100644 (file)
@@ -3,7 +3,6 @@
  * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
  */
 #include <linux/libnvdimm.h>
-#include <linux/badblocks.h>
 #include <linux/suspend.h>
 #include <linux/export.h>
 #include <linux/module.h>
index 47a4828..9251441 100644 (file)
@@ -980,6 +980,15 @@ static int __blk_label_update(struct nd_region *nd_region,
                }
        }
 
+       /* release slots associated with any invalidated UUIDs */
+       mutex_lock(&nd_mapping->lock);
+       list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list)
+               if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)) {
+                       reap_victim(nd_mapping, label_ent);
+                       list_move(&label_ent->list, &list);
+               }
+       mutex_unlock(&nd_mapping->lock);
+
        /*
         * Find the resource associated with the first label in the set
         * per the v1.2 namespace specification.
@@ -999,8 +1008,10 @@ static int __blk_label_update(struct nd_region *nd_region,
                if (is_old_resource(res, old_res_list, old_num_resources))
                        continue; /* carry-over */
                slot = nd_label_alloc_slot(ndd);
-               if (slot == UINT_MAX)
+               if (slot == UINT_MAX) {
+                       rc = -ENXIO;
                        goto abort;
+               }
                dev_dbg(ndd->dev, "allocated: %d\n", slot);
 
                nd_label = to_label(ndd, slot);
index a2632d0..c637de3 100644 (file)
@@ -306,13 +306,11 @@ int mobiveil_host_init(struct mobiveil_pcie *pcie, bool reinit)
 
 static void mobiveil_mask_intx_irq(struct irq_data *data)
 {
-       struct irq_desc *desc = irq_to_desc(data->irq);
-       struct mobiveil_pcie *pcie;
+       struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data);
        struct mobiveil_root_port *rp;
        unsigned long flags;
        u32 mask, shifted_val;
 
-       pcie = irq_desc_get_chip_data(desc);
        rp = &pcie->rp;
        mask = 1 << ((data->hwirq + PAB_INTX_START) - 1);
        raw_spin_lock_irqsave(&rp->intx_mask_lock, flags);
@@ -324,13 +322,11 @@ static void mobiveil_mask_intx_irq(struct irq_data *data)
 
 static void mobiveil_unmask_intx_irq(struct irq_data *data)
 {
-       struct irq_desc *desc = irq_to_desc(data->irq);
-       struct mobiveil_pcie *pcie;
+       struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data);
        struct mobiveil_root_port *rp;
        unsigned long flags;
        u32 shifted_val, mask;
 
-       pcie = irq_desc_get_chip_data(desc);
        rp = &pcie->rp;
        mask = 1 << ((data->hwirq + PAB_INTX_START) - 1);
        raw_spin_lock_irqsave(&rp->intx_mask_lock, flags);
index 7f29c2f..07e3666 100644 (file)
@@ -374,13 +374,11 @@ static void nwl_pcie_msi_handler_low(struct irq_desc *desc)
 
 static void nwl_mask_leg_irq(struct irq_data *data)
 {
-       struct irq_desc *desc = irq_to_desc(data->irq);
-       struct nwl_pcie *pcie;
+       struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data);
        unsigned long flags;
        u32 mask;
        u32 val;
 
-       pcie = irq_desc_get_chip_data(desc);
        mask = 1 << (data->hwirq - 1);
        raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags);
        val = nwl_bridge_readl(pcie, MSGF_LEG_MASK);
@@ -390,13 +388,11 @@ static void nwl_mask_leg_irq(struct irq_data *data)
 
 static void nwl_unmask_leg_irq(struct irq_data *data)
 {
-       struct irq_desc *desc = irq_to_desc(data->irq);
-       struct nwl_pcie *pcie;
+       struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data);
        unsigned long flags;
        u32 mask;
        u32 val;
 
-       pcie = irq_desc_get_chip_data(desc);
        mask = 1 << (data->hwirq - 1);
        raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags);
        val = nwl_bridge_readl(pcie, MSGF_LEG_MASK);
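
Both the mobiveil and nwl hunks apply the same transformation: mask/unmask callbacks already receive an irq_data, and the driver pointer was stored as chip data when the IRQ was mapped, so irq_data_get_irq_chip_data() replaces the irq_to_desc() round-trip. A sketch, assuming a hypothetical INTx domain map function and my_* names:

        #include <linux/irq.h>
        #include <linux/irqdomain.h>

        struct my_pcie;

        static int my_intx_map(struct irq_domain *domain, unsigned int irq,
                               irq_hw_number_t hwirq)
        {
                /* Stash the driver context at mapping time... */
                irq_set_chip_data(irq, domain->host_data);
                irq_set_chip_and_handler(irq, &dummy_irq_chip,
                                         handle_simple_irq);
                return 0;
        }

        static void my_mask_intx(struct irq_data *data)
        {
                /* ...and fetch it straight from irq_data in the chip ops,
                 * with no irq_to_desc() lookup. */
                struct my_pcie *pcie = irq_data_get_irq_chip_data(data);

                (void)pcie;     /* the mask register write would go here */
        }
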
index 657e35a..d4ea108 100644 (file)
@@ -948,8 +948,8 @@ static void nmk_gpio_dbg_show_one(struct seq_file *s,
                           (mode < 0) ? "unknown" : modes[mode]);
        } else {
                int irq = chip->to_irq(chip, offset);
-               struct irq_desc *desc = irq_to_desc(irq);
                const int pullidx = pull ? 1 : 0;
+               bool wake;
                int val;
                static const char * const pulls[] = {
                        "none        ",
@@ -969,8 +969,9 @@ static void nmk_gpio_dbg_show_one(struct seq_file *s,
                 * This races with request_irq(), set_irq_type(),
                 * and set_irq_wake() ... but those are "rare".
                 */
-               if (irq > 0 && desc && desc->action) {
+               if (irq > 0 && irq_has_action(irq)) {
                        char *trigger;
+                       bool wake;
 
                        if (nmk_chip->edge_rising & BIT(offset))
                                trigger = "edge-rising";
@@ -979,10 +980,10 @@ static void nmk_gpio_dbg_show_one(struct seq_file *s,
                        else
                                trigger = "edge-undefined";
 
+                       wake = !!(nmk_chip->real_wake & BIT(offset));
+
                        seq_printf(s, " irq-%d %s%s",
-                                  irq, trigger,
-                                  irqd_is_wakeup_set(&desc->irq_data)
-                                  ? " wakeup" : "");
+                                  irq, trigger, wake ? " wakeup" : "");
                }
        }
        clk_disable(nmk_chip->clk);
index 99f8661..dc78a52 100644 (file)
@@ -256,7 +256,6 @@ void dasd_alias_disconnect_device_from_lcu(struct dasd_device *device)
                return;
        device->discipline->get_uid(device, &uid);
        spin_lock_irqsave(&lcu->lock, flags);
-       list_del_init(&device->alias_list);
        /* make sure that the workers don't use this device */
        if (device == lcu->suc_data.device) {
                spin_unlock_irqrestore(&lcu->lock, flags);
@@ -283,6 +282,7 @@ void dasd_alias_disconnect_device_from_lcu(struct dasd_device *device)
 
        spin_lock_irqsave(&aliastree.lock, flags);
        spin_lock(&lcu->lock);
+       list_del_init(&device->alias_list);
        if (list_empty(&lcu->grouplist) &&
            list_empty(&lcu->active_devices) &&
            list_empty(&lcu->inactive_devices)) {
@@ -462,11 +462,19 @@ static int read_unit_address_configuration(struct dasd_device *device,
        spin_unlock_irqrestore(&lcu->lock, flags);
 
        rc = dasd_sleep_on(cqr);
-       if (rc && !suborder_not_supported(cqr)) {
+       if (!rc)
+               goto out;
+
+       if (suborder_not_supported(cqr)) {
+               /* suborder not supported or device unusable for IO */
+               rc = -EOPNOTSUPP;
+       } else {
+               /* IO failed but should be retried */
                spin_lock_irqsave(&lcu->lock, flags);
                lcu->flags |= NEED_UAC_UPDATE;
                spin_unlock_irqrestore(&lcu->lock, flags);
        }
+out:
        dasd_sfree_request(cqr, cqr->memdev);
        return rc;
 }
@@ -503,6 +511,14 @@ static int _lcu_update(struct dasd_device *refdev, struct alias_lcu *lcu)
                return rc;
 
        spin_lock_irqsave(&lcu->lock, flags);
+       /*
+        * If another update is needed, skip the remaining handling; the
+        * data might already be outdated. In particular, do not add the
+        * device to an LCU with a pending update.
+        */
+       if (lcu->flags & NEED_UAC_UPDATE)
+               goto out;
        lcu->pav = NO_PAV;
        for (i = 0; i < MAX_DEVICES_PER_LCU; ++i) {
                switch (lcu->uac->unit[i].ua_type) {
@@ -521,6 +537,7 @@ static int _lcu_update(struct dasd_device *refdev, struct alias_lcu *lcu)
                                 alias_list) {
                _add_device_to_lcu(lcu, device, refdev);
        }
+out:
        spin_unlock_irqrestore(&lcu->lock, flags);
        return 0;
 }
@@ -625,6 +642,7 @@ int dasd_alias_add_device(struct dasd_device *device)
        }
        if (lcu->flags & UPDATE_PENDING) {
                list_move(&device->alias_list, &lcu->active_devices);
+               private->pavgroup = NULL;
                _schedule_lcu_update(lcu, device);
        }
        spin_unlock_irqrestore(&lcu->lock, flags);
index 6caf539..92a6396 100644 (file)
@@ -9,21 +9,24 @@ menuconfig VDPA
 if VDPA
 
 config VDPA_SIM
-       tristate "vDPA device simulator"
+       tristate "vDPA device simulator core"
        depends on RUNTIME_TESTING_MENU && HAS_DMA
        select DMA_OPS
        select VHOST_RING
+       help
+         Enable this module to support vDPA device simulators. These devices
+         are used for testing, prototyping and development of vDPA.
+
+config VDPA_SIM_NET
+       tristate "vDPA simulator for networking device"
+       depends on VDPA_SIM
        select GENERIC_NET_UTILS
-       default n
        help
-         vDPA networking device simulator which loop TX traffic back
-         to RX. This device is used for testing, prototyping and
-         development of vDPA.
+         vDPA networking device simulator which loops TX traffic back to RX.
 
 config IFCVF
        tristate "Intel IFC VF vDPA driver"
        depends on PCI_MSI
-       default n
        help
          This kernel module can drive Intel IFC VF NIC to offload
          virtio dataplane traffic to hardware.
@@ -42,7 +45,6 @@ config MLX5_VDPA_NET
        tristate "vDPA driver for ConnectX devices"
        select MLX5_VDPA
        depends on MLX5_CORE
-       default n
        help
          VDPA network driver for ConnectX6 and newer. Provides offloading
          of virtio net datapath such that descriptors put on the ring will
index 8b40285..fa1af30 100644 (file)
@@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                return ret;
        }
 
-       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
        if (ret) {
-               IFCVF_ERR(pdev, "No usable DMA confiugration\n");
-               return ret;
-       }
-
-       ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (ret) {
-               IFCVF_ERR(pdev,
-                         "No usable coherent DMA confiugration\n");
+               IFCVF_ERR(pdev, "No usable DMA configuration\n");
                return ret;
        }
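
Besides fixing the "confiugration" typo, this hunk collapses the two legacy PCI DMA-mask calls into one modern helper. The equivalent shape in a generic probe path, as a sketch:

        #include <linux/dma-mapping.h>
        #include <linux/pci.h>

        static int my_pci_probe_dma(struct pci_dev *pdev)
        {
                /* Sets both the streaming and the coherent mask in one go,
                 * replacing pci_set_dma_mask() +
                 * pci_set_consistent_dma_mask(). */
                int ret = dma_set_mask_and_coherent(&pdev->dev,
                                                    DMA_BIT_MASK(64));

                if (ret)
                        dev_err(&pdev->dev, "No usable DMA configuration\n");
                return ret;
        }
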
 
index f1d5481..88dde34 100644 (file)
@@ -479,6 +479,11 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
 {
        mlx5_cq_set_ci(&mvq->cq.mcq);
+
+       /* make sure the CQ consumer update is visible to the hardware
+        * before updating the RX doorbell record.
+        */
+       dma_wmb();
        rx_post(&mvq->vqqp, num);
        if (mvq->event_cb.callback)
                mvq->event_cb.callback(mvq->event_cb.private);
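
The ordering constraint the new dma_wmb() enforces: both the CQ consumer index and the RX doorbell record live in coherent memory that the device reads via DMA, and the device must never observe the doorbell advance before the consumer-index update. Reduced to its essentials, with a hypothetical doorbell layout:

        #include <linux/types.h>
        #include <asm/byteorder.h>
        #include <asm/barrier.h>

        static void my_arm_rx(__be32 *cq_db, __be32 *rx_db, u32 ci, u32 pi)
        {
                *cq_db = cpu_to_be32(ci);  /* 1: CQ consumer update        */
                dma_wmb();                 /* device must see 1 before 2   */
                *rx_db = cpu_to_be32(pi);  /* 2: RX doorbell record        */
        }
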
index a69ffc9..c082565 100644 (file)
@@ -89,7 +89,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
        if (!vdev)
                goto err;
 
-       err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
+       err = ida_alloc(&vdpa_index_ida, GFP_KERNEL);
        if (err < 0)
                goto err_ida;
 
index b40278f..79d4536 100644 (file)
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
+obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o
index 6a90fdb..b3fcc67 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * VDPA networking device simulator.
+ * VDPA device simulator core.
  *
  * Copyright (c) 2020, Red Hat Inc. All rights reserved.
  *     Author: Jason Wang <jasowang@redhat.com>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
-#include <linux/wait.h>
-#include <linux/uuid.h>
-#include <linux/iommu.h>
 #include <linux/dma-map-ops.h>
-#include <linux/sysfs.h>
-#include <linux/file.h>
-#include <linux/etherdevice.h>
 #include <linux/vringh.h>
 #include <linux/vdpa.h>
-#include <linux/virtio_byteorder.h>
 #include <linux/vhost_iotlb.h>
-#include <uapi/linux/virtio_config.h>
-#include <uapi/linux/virtio_net.h>
+
+#include "vdpa_sim.h"
 
 #define DRV_VERSION  "0.1"
 #define DRV_AUTHOR   "Jason Wang <jasowang@redhat.com>"
-#define DRV_DESC     "vDPA Device Simulator"
+#define DRV_DESC     "vDPA Device Simulator core"
 #define DRV_LICENSE  "GPL v2"
 
 static int batch_mapping = 1;
 module_param(batch_mapping, int, 0444);
 MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable");
 
-static char *macaddr;
-module_param(macaddr, charp, 0);
-MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
-
-struct vdpasim_virtqueue {
-       struct vringh vring;
-       struct vringh_kiov iov;
-       unsigned short head;
-       bool ready;
-       u64 desc_addr;
-       u64 device_addr;
-       u64 driver_addr;
-       u32 num;
-       void *private;
-       irqreturn_t (*cb)(void *data);
-};
+static int max_iotlb_entries = 2048;
+module_param(max_iotlb_entries, int, 0444);
+MODULE_PARM_DESC(max_iotlb_entries,
+                "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)");
 
 #define VDPASIM_QUEUE_ALIGN PAGE_SIZE
 #define VDPASIM_QUEUE_MAX 256
-#define VDPASIM_DEVICE_ID 0x1
 #define VDPASIM_VENDOR_ID 0
-#define VDPASIM_VQ_NUM 0x2
-#define VDPASIM_NAME "vdpasim-netdev"
-
-static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
-                             (1ULL << VIRTIO_F_VERSION_1)  |
-                             (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
-                             (1ULL << VIRTIO_NET_F_MAC);
-
-/* State of each vdpasim device */
-struct vdpasim {
-       struct vdpa_device vdpa;
-       struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM];
-       struct work_struct work;
-       /* spinlock to synchronize virtqueue state */
-       spinlock_t lock;
-       struct virtio_net_config config;
-       struct vhost_iotlb *iommu;
-       void *buffer;
-       u32 status;
-       u32 generation;
-       u64 features;
-       /* spinlock to synchronize iommu table */
-       spinlock_t iommu_lock;
-};
-
-/* TODO: cross-endian support */
-static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
-{
-       return virtio_legacy_is_little_endian() ||
-               (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
-}
-
-static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
-{
-       return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
-}
-
-static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
-{
-       return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
-}
-
-static struct vdpasim *vdpasim_dev;
 
 static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
 {
@@ -115,20 +50,34 @@ static struct vdpasim *dev_to_sim(struct device *dev)
        return vdpa_to_sim(vdpa);
 }
 
+static void vdpasim_vq_notify(struct vringh *vring)
+{
+       struct vdpasim_virtqueue *vq =
+               container_of(vring, struct vdpasim_virtqueue, vring);
+
+       if (!vq->cb)
+               return;
+
+       vq->cb(vq->private);
+}
+
 static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
 {
        struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
 
-       vringh_init_iotlb(&vq->vring, vdpasim_features,
+       vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
                          VDPASIM_QUEUE_MAX, false,
                          (struct vring_desc *)(uintptr_t)vq->desc_addr,
                          (struct vring_avail *)
                          (uintptr_t)vq->driver_addr,
                          (struct vring_used *)
                          (uintptr_t)vq->device_addr);
+
+       vq->vring.notify = vdpasim_vq_notify;
 }
 
-static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
+static void vdpasim_vq_reset(struct vdpasim *vdpasim,
+                            struct vdpasim_virtqueue *vq)
 {
        vq->ready = false;
        vq->desc_addr = 0;
@@ -136,16 +85,18 @@ static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
        vq->device_addr = 0;
        vq->cb = NULL;
        vq->private = NULL;
-       vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
-                         false, NULL, NULL, NULL);
+       vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
+                         VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
+
+       vq->vring.notify = NULL;
 }
 
 static void vdpasim_reset(struct vdpasim *vdpasim)
 {
        int i;
 
-       for (i = 0; i < VDPASIM_VQ_NUM; i++)
-               vdpasim_vq_reset(&vdpasim->vqs[i]);
+       for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
+               vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]);
 
        spin_lock(&vdpasim->iommu_lock);
        vhost_iotlb_reset(vdpasim->iommu);
@@ -156,80 +107,6 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
        ++vdpasim->generation;
 }
 
-static void vdpasim_work(struct work_struct *work)
-{
-       struct vdpasim *vdpasim = container_of(work, struct
-                                                vdpasim, work);
-       struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
-       struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
-       ssize_t read, write;
-       size_t total_write;
-       int pkts = 0;
-       int err;
-
-       spin_lock(&vdpasim->lock);
-
-       if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
-               goto out;
-
-       if (!txq->ready || !rxq->ready)
-               goto out;
-
-       while (true) {
-               total_write = 0;
-               err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL,
-                                          &txq->head, GFP_ATOMIC);
-               if (err <= 0)
-                       break;
-
-               err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov,
-                                          &rxq->head, GFP_ATOMIC);
-               if (err <= 0) {
-                       vringh_complete_iotlb(&txq->vring, txq->head, 0);
-                       break;
-               }
-
-               while (true) {
-                       read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov,
-                                                    vdpasim->buffer,
-                                                    PAGE_SIZE);
-                       if (read <= 0)
-                               break;
-
-                       write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov,
-                                                     vdpasim->buffer, read);
-                       if (write <= 0)
-                               break;
-
-                       total_write += write;
-               }
-
-               /* Make sure data is wrote before advancing index */
-               smp_wmb();
-
-               vringh_complete_iotlb(&txq->vring, txq->head, 0);
-               vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
-
-               /* Make sure used is visible before rasing the interrupt. */
-               smp_wmb();
-
-               local_bh_disable();
-               if (txq->cb)
-                       txq->cb(txq->private);
-               if (rxq->cb)
-                       rxq->cb(rxq->private);
-               local_bh_enable();
-
-               if (++pkts > 4) {
-                       schedule_work(&vdpasim->work);
-                       goto out;
-               }
-       }
-
-out:
-       spin_unlock(&vdpasim->lock);
-}
-
 static int dir_to_perm(enum dma_data_direction dir)
 {
        int perm = -EFAULT;
@@ -342,26 +219,28 @@ static const struct dma_map_ops vdpasim_dma_ops = {
        .free = vdpasim_free_coherent,
 };
 
-static const struct vdpa_config_ops vdpasim_net_config_ops;
-static const struct vdpa_config_ops vdpasim_net_batch_config_ops;
+static const struct vdpa_config_ops vdpasim_config_ops;
+static const struct vdpa_config_ops vdpasim_batch_config_ops;
 
-static struct vdpasim *vdpasim_create(void)
+struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 {
        const struct vdpa_config_ops *ops;
        struct vdpasim *vdpasim;
        struct device *dev;
-       int ret = -ENOMEM;
+       int i, ret = -ENOMEM;
 
        if (batch_mapping)
-               ops = &vdpasim_net_batch_config_ops;
+               ops = &vdpasim_batch_config_ops;
        else
-               ops = &vdpasim_net_config_ops;
+               ops = &vdpasim_config_ops;
 
-       vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM);
+       vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
+                                   dev_attr->nvqs);
        if (!vdpasim)
                goto err_alloc;
 
-       INIT_WORK(&vdpasim->work, vdpasim_work);
+       vdpasim->dev_attr = *dev_attr;
+       INIT_WORK(&vdpasim->work, dev_attr->work_fn);
        spin_lock_init(&vdpasim->lock);
        spin_lock_init(&vdpasim->iommu_lock);
 
@@ -371,31 +250,27 @@ static struct vdpasim *vdpasim_create(void)
                goto err_iommu;
        set_dma_ops(dev, &vdpasim_dma_ops);
 
-       vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
+       vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL);
+       if (!vdpasim->config)
+               goto err_iommu;
+
+       vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue),
+                              GFP_KERNEL);
+       if (!vdpasim->vqs)
+               goto err_iommu;
+
+       vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0);
        if (!vdpasim->iommu)
                goto err_iommu;
 
-       vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL);
        if (!vdpasim->buffer)
                goto err_iommu;
 
-       if (macaddr) {
-               mac_pton(macaddr, vdpasim->config.mac);
-               if (!is_valid_ether_addr(vdpasim->config.mac)) {
-                       ret = -EADDRNOTAVAIL;
-                       goto err_iommu;
-               }
-       } else {
-               eth_random_addr(vdpasim->config.mac);
-       }
-
-       vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
-       vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
+       for (i = 0; i < dev_attr->nvqs; i++)
+               vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
 
        vdpasim->vdpa.dma_dev = dev;
-       ret = vdpa_register_device(&vdpasim->vdpa);
-       if (ret)
-               goto err_iommu;
 
        return vdpasim;
 
@@ -404,6 +279,7 @@ err_iommu:
 err_alloc:
        return ERR_PTR(ret);
 }
+EXPORT_SYMBOL_GPL(vdpasim_create);
 
 static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
                                  u64 desc_area, u64 driver_area,
@@ -498,28 +374,21 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa)
 
 static u64 vdpasim_get_features(struct vdpa_device *vdpa)
 {
-       return vdpasim_features;
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       return vdpasim->dev_attr.supported_features;
 }
 
 static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
-       struct virtio_net_config *config = &vdpasim->config;
 
        /* DMA mapping must be done by driver */
        if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
                return -EINVAL;
 
-       vdpasim->features = features & vdpasim_features;
-
-       /* We generally only know whether guest is using the legacy interface
-        * here, so generally that's the earliest we can set config fields.
-        * Note: We actually require VIRTIO_F_ACCESS_PLATFORM above which
-        * implies VIRTIO_F_VERSION_1, but let's not try to be clever here.
-        */
+       vdpasim->features = features & vdpasim->dev_attr.supported_features;
 
-       config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
-       config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
        return 0;
 }
 
@@ -536,7 +405,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
 
 static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
 {
-       return VDPASIM_DEVICE_ID;
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       return vdpasim->dev_attr.id;
 }
 
 static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
@@ -572,14 +443,27 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
 {
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
 
-       if (offset + len < sizeof(struct virtio_net_config))
-               memcpy(buf, (u8 *)&vdpasim->config + offset, len);
+       if (offset + len > vdpasim->dev_attr.config_size)
+               return;
+
+       if (vdpasim->dev_attr.get_config)
+               vdpasim->dev_attr.get_config(vdpasim, vdpasim->config);
+
+       memcpy(buf, vdpasim->config + offset, len);
 }
 
 static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
                             const void *buf, unsigned int len)
 {
-       /* No writable config supportted by vdpasim */
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       if (offset + len > vdpasim->dev_attr.config_size)
+               return;
+
+       memcpy(vdpasim->config + offset, buf, len);
+
+       if (vdpasim->dev_attr.set_config)
+               vdpasim->dev_attr.set_config(vdpasim, vdpasim->config);
 }
 
 static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
@@ -656,12 +540,14 @@ static void vdpasim_free(struct vdpa_device *vdpa)
        struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
 
        cancel_work_sync(&vdpasim->work);
-       kfree(vdpasim->buffer);
+       kvfree(vdpasim->buffer);
        if (vdpasim->iommu)
                vhost_iotlb_free(vdpasim->iommu);
+       kfree(vdpasim->vqs);
+       kfree(vdpasim->config);
 }
 
-static const struct vdpa_config_ops vdpasim_net_config_ops = {
+static const struct vdpa_config_ops vdpasim_config_ops = {
        .set_vq_address         = vdpasim_set_vq_address,
        .set_vq_num             = vdpasim_set_vq_num,
        .kick_vq                = vdpasim_kick_vq,
@@ -688,7 +574,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = {
        .free                   = vdpasim_free,
 };
 
-static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
+static const struct vdpa_config_ops vdpasim_batch_config_ops = {
        .set_vq_address         = vdpasim_set_vq_address,
        .set_vq_num             = vdpasim_set_vq_num,
        .kick_vq                = vdpasim_kick_vq,
@@ -714,26 +600,6 @@ static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
        .free                   = vdpasim_free,
 };
 
-static int __init vdpasim_dev_init(void)
-{
-       vdpasim_dev = vdpasim_create();
-
-       if (!IS_ERR(vdpasim_dev))
-               return 0;
-
-       return PTR_ERR(vdpasim_dev);
-}
-
-static void __exit vdpasim_dev_exit(void)
-{
-       struct vdpa_device *vdpa = &vdpasim_dev->vdpa;
-
-       vdpa_unregister_device(vdpa);
-}
-
-module_init(vdpasim_dev_init)
-module_exit(vdpasim_dev_exit)
-
 MODULE_VERSION(DRV_VERSION);
 MODULE_LICENSE(DRV_LICENSE);
 MODULE_AUTHOR(DRV_AUTHOR);
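
After this split, the core exports vdpasim_create() and a device flavour is just a filled-in vdpasim_dev_attr plus a registration call; vdpa_sim_net.c below is the first user. A sketch of how a hypothetical second flavour (a console device, say) would hook in; everything except the dev_attr fields, vdpasim_create() and vdpa_register_device() is invented:

        #include "vdpa_sim.h"

        static void my_console_work(struct work_struct *work)
        {
                /* per-device datapath would go here */
        }

        static void my_console_get_config(struct vdpasim *vdpasim, void *config)
        {
                /* fill the virtio config space for this device type */
        }

        static int __init my_sim_init(void)
        {
                struct vdpasim_dev_attr dev_attr = {
                        .id                 = 3,        /* VIRTIO_ID_CONSOLE */
                        .supported_features = VDPASIM_FEATURES,
                        .nvqs               = 2,
                        .config_size        = 8,        /* device-specific  */
                        .buffer_size        = PAGE_SIZE,
                        .work_fn            = my_console_work,
                        .get_config         = my_console_get_config,
                };
                struct vdpasim *sim = vdpasim_create(&dev_attr);
                int err;

                if (IS_ERR(sim))
                        return PTR_ERR(sim);

                err = vdpa_register_device(&sim->vdpa);
                if (err)
                        put_device(&sim->vdpa.dev); /* frees via vdpasim_free */
                return err;
        }
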
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
new file mode 100644 (file)
index 0000000..b021422
--- /dev/null
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ */
+
+#ifndef _VDPA_SIM_H
+#define _VDPA_SIM_H
+
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_byteorder.h>
+#include <linux/vhost_iotlb.h>
+#include <uapi/linux/virtio_config.h>
+
+#define VDPASIM_FEATURES       ((1ULL << VIRTIO_F_ANY_LAYOUT) | \
+                                (1ULL << VIRTIO_F_VERSION_1)  | \
+                                (1ULL << VIRTIO_F_ACCESS_PLATFORM))
+
+struct vdpasim;
+
+struct vdpasim_virtqueue {
+       struct vringh vring;
+       struct vringh_kiov in_iov;
+       struct vringh_kiov out_iov;
+       unsigned short head;
+       bool ready;
+       u64 desc_addr;
+       u64 device_addr;
+       u64 driver_addr;
+       u32 num;
+       void *private;
+       irqreturn_t (*cb)(void *data);
+};
+
+struct vdpasim_dev_attr {
+       u64 supported_features;
+       size_t config_size;
+       size_t buffer_size;
+       int nvqs;
+       u32 id;
+
+       work_func_t work_fn;
+       void (*get_config)(struct vdpasim *vdpasim, void *config);
+       void (*set_config)(struct vdpasim *vdpasim, const void *config);
+};
+
+/* State of each vdpasim device */
+struct vdpasim {
+       struct vdpa_device vdpa;
+       struct vdpasim_virtqueue *vqs;
+       struct work_struct work;
+       struct vdpasim_dev_attr dev_attr;
+       /* spinlock to synchronize virtqueue state */
+       spinlock_t lock;
+       /* virtio config according to device type */
+       void *config;
+       struct vhost_iotlb *iommu;
+       void *buffer;
+       u32 status;
+       u32 generation;
+       u64 features;
+       /* spinlock to synchronize iommu table */
+       spinlock_t iommu_lock;
+};
+
+struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr);
+
+/* TODO: cross-endian support */
+static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
+{
+       return virtio_legacy_is_little_endian() ||
+               (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
+}
+
+static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
+{
+       return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
+{
+       return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val)
+{
+       return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val)
+{
+       return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val)
+{
+       return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val)
+{
+       return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val);
+}
+
+#endif
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
new file mode 100644 (file)
index 0000000..c10b698
--- /dev/null
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA simulator for networking device.
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/etherdevice.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/virtio_net.h>
+
+#include "vdpa_sim.h"
+
+#define DRV_VERSION  "0.1"
+#define DRV_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define DRV_DESC     "vDPA Device Simulator for networking device"
+#define DRV_LICENSE  "GPL v2"
+
+#define VDPASIM_NET_FEATURES   (VDPASIM_FEATURES | \
+                                (1ULL << VIRTIO_NET_F_MAC))
+
+#define VDPASIM_NET_VQ_NUM     2
+
+static char *macaddr;
+module_param(macaddr, charp, 0);
+MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
+
+static u8 macaddr_buf[ETH_ALEN];
+
+static struct vdpasim *vdpasim_net_dev;
+
+static void vdpasim_net_work(struct work_struct *work)
+{
+       struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
+       struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
+       struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
+       ssize_t read, write;
+       size_t total_write;
+       int pkts = 0;
+       int err;
+
+       spin_lock(&vdpasim->lock);
+
+       if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+               goto out;
+
+       if (!txq->ready || !rxq->ready)
+               goto out;
+
+       while (true) {
+               total_write = 0;
+               err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL,
+                                          &txq->head, GFP_ATOMIC);
+               if (err <= 0)
+                       break;
+
+               err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov,
+                                          &rxq->head, GFP_ATOMIC);
+               if (err <= 0) {
+                       vringh_complete_iotlb(&txq->vring, txq->head, 0);
+                       break;
+               }
+
+               while (true) {
+                       read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov,
+                                                    vdpasim->buffer,
+                                                    PAGE_SIZE);
+                       if (read <= 0)
+                               break;
+
+                       write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov,
+                                                     vdpasim->buffer, read);
+                       if (write <= 0)
+                               break;
+
+                       total_write += write;
+               }
+
+               /* Make sure data is written before advancing the index */
+               smp_wmb();
+
+               vringh_complete_iotlb(&txq->vring, txq->head, 0);
+               vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
+
+               /* Make sure used is visible before raising the interrupt. */
+               smp_wmb();
+
+               local_bh_disable();
+               if (vringh_need_notify_iotlb(&txq->vring) > 0)
+                       vringh_notify(&txq->vring);
+               if (vringh_need_notify_iotlb(&rxq->vring) > 0)
+                       vringh_notify(&rxq->vring);
+               local_bh_enable();
+
+               if (++pkts > 4) {
+                       schedule_work(&vdpasim->work);
+                       goto out;
+               }
+       }
+
+out:
+       spin_unlock(&vdpasim->lock);
+}
+
+static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
+{
+       struct virtio_net_config *net_config =
+               (struct virtio_net_config *)config;
+
+       net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
+       net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
+       memcpy(net_config->mac, macaddr_buf, ETH_ALEN);
+}
+
+static int __init vdpasim_net_init(void)
+{
+       struct vdpasim_dev_attr dev_attr = {};
+       int ret;
+
+       if (macaddr) {
+               mac_pton(macaddr, macaddr_buf);
+               if (!is_valid_ether_addr(macaddr_buf)) {
+                       ret = -EADDRNOTAVAIL;
+                       goto out;
+               }
+       } else {
+               eth_random_addr(macaddr_buf);
+       }
+
+       dev_attr.id = VIRTIO_ID_NET;
+       dev_attr.supported_features = VDPASIM_NET_FEATURES;
+       dev_attr.nvqs = VDPASIM_NET_VQ_NUM;
+       dev_attr.config_size = sizeof(struct virtio_net_config);
+       dev_attr.get_config = vdpasim_net_get_config;
+       dev_attr.work_fn = vdpasim_net_work;
+       dev_attr.buffer_size = PAGE_SIZE;
+
+       vdpasim_net_dev = vdpasim_create(&dev_attr);
+       if (IS_ERR(vdpasim_net_dev)) {
+               ret = PTR_ERR(vdpasim_net_dev);
+               goto out;
+       }
+
+       ret = vdpa_register_device(&vdpasim_net_dev->vdpa);
+       if (ret)
+               goto put_dev;
+
+       return 0;
+
+put_dev:
+       put_device(&vdpasim_net_dev->vdpa.dev);
+out:
+       return ret;
+}
+
+static void __exit vdpasim_net_exit(void)
+{
+       struct vdpa_device *vdpa = &vdpasim_net_dev->vdpa;
+
+       vdpa_unregister_device(vdpa);
+}
+
+module_init(vdpasim_net_init);
+module_exit(vdpasim_net_exit);
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
index 6ff8a50..4ce9f00 100644 (file)
@@ -1643,7 +1643,8 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
                        if (!vhost_vq_is_setup(vq))
                                continue;
 
-                       if (vhost_scsi_setup_vq_cmds(vq, vq->num))
+                       ret = vhost_scsi_setup_vq_cmds(vq, vq->num);
+                       if (ret)
                                goto destroy_vq_cmds;
                }
 
index 29ed417..ef688c8 100644 (file)
@@ -245,14 +245,10 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
                return -EFAULT;
        if (vhost_vdpa_config_validate(v, &config))
                return -EINVAL;
-       buf = kvzalloc(config.len, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
 
-       if (copy_from_user(buf, c->buf, config.len)) {
-               kvfree(buf);
-               return -EFAULT;
-       }
+       buf = vmemdup_user(c->buf, config.len);
+       if (IS_ERR(buf))
+               return PTR_ERR(buf);
 
        ops->set_config(vdpa, config.off, buf, config.len);
 
index 181e2f1..9fc9ec4 100644 (file)
@@ -27,20 +27,74 @@ static bool unplug_online = true;
 module_param(unplug_online, bool, 0644);
 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
 
-enum virtio_mem_mb_state {
+static bool force_bbm;
+module_param(force_bbm, bool, 0444);
+MODULE_PARM_DESC(force_bbm,
+               "Force Big Block Mode. Default is 0 (auto-selection)");
+
+static unsigned long bbm_block_size;
+module_param(bbm_block_size, ulong, 0444);
+MODULE_PARM_DESC(bbm_block_size,
+                "Big Block size in bytes. Default is 0 (auto-detection).");
+
+static bool bbm_safe_unplug = true;
+module_param(bbm_safe_unplug, bool, 0444);
+MODULE_PARM_DESC(bbm_safe_unplug,
+            "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
+
+/*
+ * virtio-mem currently supports the following modes of operation:
+ *
+ * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
+ *   size of a Sub Block (SB) is determined based on the device block size, the
+ *   pageblock size, and the maximum allocation granularity of the buddy.
+ *   Subblocks within a Linux memory block might either be plugged or unplugged.
+ *   Memory is added/removed to Linux MM in Linux memory block granularity.
+ *
+ * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
+ *   Memory is added/removed to Linux MM in Big Block granularity.
+ *
+ * The mode is determined automatically based on the Linux memory block size
+ * and the device block size.
+ *
+ * User space / core MM (auto onlining) is responsible for onlining added
+ * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
+ * always onlined separately, and all memory within a Linux memory block is
+ * onlined to the same zone - virtio-mem relies on this behavior.
+ */
+
+/*
+ * State of a Linux memory block in SBM.
+ */
+enum virtio_mem_sbm_mb_state {
        /* Unplugged, not added to Linux. Can be reused later. */
-       VIRTIO_MEM_MB_STATE_UNUSED = 0,
+       VIRTIO_MEM_SBM_MB_UNUSED = 0,
        /* (Partially) plugged, not added to Linux. Error on add_memory(). */
-       VIRTIO_MEM_MB_STATE_PLUGGED,
+       VIRTIO_MEM_SBM_MB_PLUGGED,
        /* Fully plugged, fully added to Linux, offline. */
-       VIRTIO_MEM_MB_STATE_OFFLINE,
+       VIRTIO_MEM_SBM_MB_OFFLINE,
        /* Partially plugged, fully added to Linux, offline. */
-       VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
+       VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
        /* Fully plugged, fully added to Linux, online. */
-       VIRTIO_MEM_MB_STATE_ONLINE,
+       VIRTIO_MEM_SBM_MB_ONLINE,
        /* Partially plugged, fully added to Linux, online. */
-       VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
-       VIRTIO_MEM_MB_STATE_COUNT
+       VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL,
+       VIRTIO_MEM_SBM_MB_COUNT
+};
+
+/*
+ * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
+ */
+enum virtio_mem_bbm_bb_state {
+       /* Unplugged, not added to Linux. Can be reused later. */
+       VIRTIO_MEM_BBM_BB_UNUSED = 0,
+       /* Plugged, not added to Linux. Error on add_memory(). */
+       VIRTIO_MEM_BBM_BB_PLUGGED,
+       /* Plugged and added to Linux. */
+       VIRTIO_MEM_BBM_BB_ADDED,
+       /* All online parts are fake-offline, ready to remove. */
+       VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
+       VIRTIO_MEM_BBM_BB_COUNT
 };
 
 struct virtio_mem {
@@ -51,6 +105,7 @@ struct virtio_mem {
 
        /* Workqueue that processes the plug/unplug requests. */
        struct work_struct wq;
+       atomic_t wq_active;
        atomic_t config_changed;
 
        /* Virtqueue for guest->host requests. */
@@ -70,27 +125,13 @@ struct virtio_mem {
 
        /* The device block size (for communicating with the device). */
        uint64_t device_block_size;
-       /* The translated node id. NUMA_NO_NODE in case not specified. */
+       /* The determined node id for all memory of the device. */
        int nid;
        /* Physical start address of the memory region. */
        uint64_t addr;
        /* Maximum region size in bytes. */
        uint64_t region_size;
 
-       /* The subblock size. */
-       uint64_t subblock_size;
-       /* The number of subblocks per memory block. */
-       uint32_t nb_sb_per_mb;
-
-       /* Id of the first memory block of this device. */
-       unsigned long first_mb_id;
-       /* Id of the last memory block of this device. */
-       unsigned long last_mb_id;
-       /* Id of the last usable memory block of this device. */
-       unsigned long last_usable_mb_id;
-       /* Id of the next memory bock to prepare when needed. */
-       unsigned long next_mb_id;
-
        /* The parent resource for all memory added via this device. */
        struct resource *parent_resource;
        /*
@@ -99,31 +140,79 @@ struct virtio_mem {
         */
        const char *resource_name;
 
-       /* Summary of all memory block states. */
-       unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
-#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD                10
-
-       /*
-        * One byte state per memory block.
-        *
-        * Allocated via vmalloc(). When preparing new blocks, resized
-        * (alloc+copy+free) when needed (crossing pages with the next mb).
-        * (when crossing pages).
-        *
-        * With 128MB memory blocks, we have states for 512GB of memory in one
-        * page.
-        */
-       uint8_t *mb_state;
-
        /*
-        * $nb_sb_per_mb bit per memory block. Handled similar to mb_state.
-        *
-        * With 4MB subblocks, we manage 128GB of memory in one page.
+        * We don't want to add too much memory if it's not getting onlined,
+        * to avoid running into OOM. On top of this threshold, we always
+        * allow at least two offline blocks at a time (whichever is bigger).
         */
-       unsigned long *sb_bitmap;
+#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD           (1024 * 1024 * 1024)
+       atomic64_t offline_size;
+       uint64_t offline_threshold;
+
+       /* If set, the driver is in SBM, otherwise in BBM. */
+       bool in_sbm;
+
+       union {
+               struct {
+                       /* Id of the first memory block of this device. */
+                       unsigned long first_mb_id;
+                       /* Id of the last usable memory block of this device. */
+                       unsigned long last_usable_mb_id;
+                       /* Id of the next memory block to prepare when needed. */
+                       unsigned long next_mb_id;
+
+                       /* The subblock size. */
+                       uint64_t sb_size;
+                       /* The number of subblocks per Linux memory block. */
+                       uint32_t sbs_per_mb;
+
+                       /* Summary of all memory block states. */
+                       unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
+
+                       /*
+                        * One byte state per memory block. Allocated via
+                        * vmalloc(). Resized (alloc+copy+free) on demand.
+                        *
+                        * With 128 MiB memory blocks, we have states for 512
+                        * GiB of memory in one 4 KiB page.
+                        */
+                       uint8_t *mb_states;
+
+                       /*
+                        * Bitmap: one bit per subblock. Allocated similar to
+                        * sbm.mb_states.
+                        *
+                        * A set bit means the corresponding subblock is
+                        * plugged, otherwise it's unplugged.
+                        *
+                        * With 4 MiB subblocks, we manage 128 GiB of memory
+                        * in one 4 KiB page.
+                        */
+                       unsigned long *sb_states;
+               } sbm;
+
+               struct {
+                       /* Id of the first big block of this device. */
+                       unsigned long first_bb_id;
+                       /* Id of the last usable big block of this device. */
+                       unsigned long last_usable_bb_id;
+                       /* Id of the next big block to prepare when needed. */
+                       unsigned long next_bb_id;
+
+                       /* Summary of all big block states. */
+                       unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
+
+                       /* One byte state per big block. See sbm.mb_states. */
+                       uint8_t *bb_states;
+
+                       /* The block size used for plugging/adding/removing. */
+                       uint64_t bb_size;
+               } bbm;
+       };
 
        /*
-        * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
+        * Mutex that protects the sbm.mb_count, sbm.mb_states,
+        * sbm.sb_states, bbm.bb_count, and bbm.bb_states
         *
         * When this lock is held the pointers can't change, ONLINE and
         * OFFLINE blocks can't change the state and no subblocks will get
@@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex);
 static LIST_HEAD(virtio_mem_devices);
 
 static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
+                                                 unsigned long nr_pages);
+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
+                                                  unsigned long nr_pages);
+static void virtio_mem_retry(struct virtio_mem *vm);
 
 /*
  * Register a virtio-mem device so it will be considered for the online_page
@@ -212,6 +306,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
        return mb_id * memory_block_size_bytes();
 }
 
+/*
+ * Calculate the big block id of a given address.
+ */
+static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
+                                             uint64_t addr)
+{
+       return addr / vm->bbm.bb_size;
+}
+
+/*
+ * Calculate the physical start address of a given big block id.
+ */
+static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
+                                        unsigned long bb_id)
+{
+       return bb_id * vm->bbm.bb_size;
+}
+
 /*
  * Calculate the subblock id of a given address.
  */
@@ -221,89 +333,164 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
        const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
        const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
 
-       return (addr - mb_addr) / vm->subblock_size;
+       return (addr - mb_addr) / vm->sbm.sb_size;
 }
 
+/*
+ * Set the state of a big block, taking care of the state counter.
+ */
+static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
+                                       unsigned long bb_id,
+                                       enum virtio_mem_bbm_bb_state state)
+{
+       const unsigned long idx = bb_id - vm->bbm.first_bb_id;
+       enum virtio_mem_bbm_bb_state old_state;
+
+       old_state = vm->bbm.bb_states[idx];
+       vm->bbm.bb_states[idx] = state;
+
+       BUG_ON(vm->bbm.bb_count[old_state] == 0);
+       vm->bbm.bb_count[old_state]--;
+       vm->bbm.bb_count[state]++;
+}
+
+/*
+ * Get the state of a big block.
+ */
+static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
+                                                               unsigned long bb_id)
+{
+       return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
+}
+
+/*
+ * Prepare the big block state array for the next big block.
+ */
+static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
+{
+       unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
+       unsigned long new_bytes = old_bytes + 1;
+       int old_pages = PFN_UP(old_bytes);
+       int new_pages = PFN_UP(new_bytes);
+       uint8_t *new_array;
+
+       if (vm->bbm.bb_states && old_pages == new_pages)
+               return 0;
+
+       new_array = vzalloc(new_pages * PAGE_SIZE);
+       if (!new_array)
+               return -ENOMEM;
+
+       mutex_lock(&vm->hotplug_mutex);
+       if (vm->bbm.bb_states)
+               memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
+       vfree(vm->bbm.bb_states);
+       vm->bbm.bb_states = new_array;
+       mutex_unlock(&vm->hotplug_mutex);
+
+       return 0;
+}
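
The reallocation above is page-granular: a new array is only allocated
when the per-id byte count crosses a page boundary. A minimal sketch of
that guard (userspace, assuming 4 KiB pages):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PFN_UP(x) (((x) + PAGE_SIZE - 1) / PAGE_SIZE)

int main(void)
{
	/* One byte of state per big block. */
	const unsigned long old_bytes = 4096, new_bytes = 4097;

	if (PFN_UP(old_bytes) == PFN_UP(new_bytes))
		printf("reuse the existing pages\n");
	else /* 1 page -> 2 pages: vzalloc() + memcpy() in the driver */
		printf("grow: %lu -> %lu pages\n",
		       PFN_UP(old_bytes), PFN_UP(new_bytes));
	return 0;
}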
+
+#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
+       for (_bb_id = _vm->bbm.first_bb_id; \
+            _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
+            _bb_id++) \
+               if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
+
+#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
+       for (_bb_id = _vm->bbm.next_bb_id - 1; \
+            _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
+            _bb_id--) \
+               if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
+
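A hypothetical usage sketch for the iterators above (not part of the
patch; VIRTIO_MEM_BBM_BB_PLUGGED is assumed from the state enum this
patch introduces). The _vm->bbm.bb_count[_state] term in the loop
condition ends the walk early once no big block of the requested state
remains:

static unsigned long virtio_mem_bbm_count_plugged(struct virtio_mem *vm)
{
	unsigned long bb_id, count = 0;

	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_PLUGGED)
		count++;
	return count;
}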
 /*
  * Set the state of a memory block, taking care of the state counter.
  */
-static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id,
-                                   enum virtio_mem_mb_state state)
+static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
+                                       unsigned long mb_id, uint8_t state)
 {
-       const unsigned long idx = mb_id - vm->first_mb_id;
-       enum virtio_mem_mb_state old_state;
+       const unsigned long idx = mb_id - vm->sbm.first_mb_id;
+       uint8_t old_state;
 
-       old_state = vm->mb_state[idx];
-       vm->mb_state[idx] = state;
+       old_state = vm->sbm.mb_states[idx];
+       vm->sbm.mb_states[idx] = state;
 
-       BUG_ON(vm->nb_mb_state[old_state] == 0);
-       vm->nb_mb_state[old_state]--;
-       vm->nb_mb_state[state]++;
+       BUG_ON(vm->sbm.mb_count[old_state] == 0);
+       vm->sbm.mb_count[old_state]--;
+       vm->sbm.mb_count[state]++;
 }
 
 /*
  * Get the state of a memory block.
  */
-static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm,
-                                                       unsigned long mb_id)
+static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
+                                          unsigned long mb_id)
 {
-       const unsigned long idx = mb_id - vm->first_mb_id;
+       const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 
-       return vm->mb_state[idx];
+       return vm->sbm.mb_states[idx];
 }
 
 /*
  * Prepare the state array for the next memory block.
  */
-static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
+static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
 {
-       unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
-       unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
-       int old_pages = PFN_UP(old_bytes);
-       int new_pages = PFN_UP(new_bytes);
-       uint8_t *new_mb_state;
+       int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
+       int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
+       uint8_t *new_array;
 
-       if (vm->mb_state && old_pages == new_pages)
+       if (vm->sbm.mb_states && old_pages == new_pages)
                return 0;
 
-       new_mb_state = vzalloc(new_pages * PAGE_SIZE);
-       if (!new_mb_state)
+       new_array = vzalloc(new_pages * PAGE_SIZE);
+       if (!new_array)
                return -ENOMEM;
 
        mutex_lock(&vm->hotplug_mutex);
-       if (vm->mb_state)
-               memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE);
-       vfree(vm->mb_state);
-       vm->mb_state = new_mb_state;
+       if (vm->sbm.mb_states)
+               memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
+       vfree(vm->sbm.mb_states);
+       vm->sbm.mb_states = new_array;
        mutex_unlock(&vm->hotplug_mutex);
 
        return 0;
 }
 
-#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \
-       for (_mb_id = _vm->first_mb_id; \
-            _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \
+#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
+       for (_mb_id = _vm->sbm.first_mb_id; \
+            _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
             _mb_id++) \
-               if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
+               if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
 
-#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \
-       for (_mb_id = _vm->next_mb_id - 1; \
-            _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \
+#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
+       for (_mb_id = _vm->sbm.next_mb_id - 1; \
+            _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
             _mb_id--) \
-               if (virtio_mem_mb_get_state(_vm, _mb_id) == _state)
+               if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
+
+/*
+ * Calculate the bit number in the subblock bitmap for the given subblock
+ * inside the given memory block.
+ */
+static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
+                                         unsigned long mb_id, int sb_id)
+{
+       return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
+}
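
A worked example of the bit numbering (userspace sketch, with an assumed
32 subblocks per memory block):

#include <assert.h>

int main(void)
{
	const int sbs_per_mb = 32;            /* assumed */
	const unsigned long first_mb_id = 10; /* arbitrary */
	const unsigned long mb_id = 12;       /* third tracked block */
	const int sb_id = 5;

	/* Blocks 10 and 11 use bits 0..63, so block 12 starts at bit 64. */
	assert((mb_id - first_mb_id) * sbs_per_mb + sb_id == 69);
	return 0;
}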
 
 /*
  * Mark all selected subblocks plugged.
  *
  * Will not modify the state of the memory block.
  */
-static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
-                                        unsigned long mb_id, int sb_id,
-                                        int count)
+static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
+                                         unsigned long mb_id, int sb_id,
+                                         int count)
 {
-       const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+       const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 
-       __bitmap_set(vm->sb_bitmap, bit, count);
+       __bitmap_set(vm->sbm.sb_states, bit, count);
 }
 
 /*
@@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
  *
  * Will not modify the state of the memory block.
  */
-static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
-                                          unsigned long mb_id, int sb_id,
-                                          int count)
+static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
+                                           unsigned long mb_id, int sb_id,
+                                           int count)
 {
-       const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+       const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 
-       __bitmap_clear(vm->sb_bitmap, bit, count);
+       __bitmap_clear(vm->sbm.sb_states, bit, count);
 }
 
 /*
  * Test if all selected subblocks are plugged.
  */
-static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
-                                         unsigned long mb_id, int sb_id,
-                                         int count)
+static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
+                                          unsigned long mb_id, int sb_id,
+                                          int count)
 {
-       const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+       const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 
        if (count == 1)
-               return test_bit(bit, vm->sb_bitmap);
+               return test_bit(bit, vm->sbm.sb_states);
 
        /* TODO: Helper similar to bitmap_set() */
-       return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
+       return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
               bit + count;
 }
 
 /*
  * Test if all selected subblocks are unplugged.
  */
-static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
-                                           unsigned long mb_id, int sb_id,
-                                           int count)
+static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
+                                            unsigned long mb_id, int sb_id,
+                                            int count)
 {
-       const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
+       const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 
        /* TODO: Helper similar to bitmap_set() */
-       return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count;
+       return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
+              bit + count;
 }
 
 /*
- * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is
+ * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
  * none.
  */
-static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm,
+static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
                                            unsigned long mb_id)
 {
-       const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;
+       const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
 
-       return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) -
-              bit;
+       return find_next_zero_bit(vm->sbm.sb_states,
+                                 bit + vm->sbm.sbs_per_mb, bit) - bit;
 }
 
 /*
  * Prepare the subblock bitmap for the next memory block.
  */
-static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm)
+static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
 {
-       const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
-       const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
-       const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
+       const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
+       const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
+       const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
        int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
        int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
-       unsigned long *new_sb_bitmap, *old_sb_bitmap;
+       unsigned long *new_bitmap, *old_bitmap;
 
-       if (vm->sb_bitmap && old_pages == new_pages)
+       if (vm->sbm.sb_states && old_pages == new_pages)
                return 0;
 
-       new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE);
-       if (!new_sb_bitmap)
+       new_bitmap = vzalloc(new_pages * PAGE_SIZE);
+       if (!new_bitmap)
                return -ENOMEM;
 
        mutex_lock(&vm->hotplug_mutex);
-       if (new_sb_bitmap)
-               memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE);
+       if (vm->sbm.sb_states)
+               memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
 
-       old_sb_bitmap = vm->sb_bitmap;
-       vm->sb_bitmap = new_sb_bitmap;
+       old_bitmap = vm->sbm.sb_states;
+       vm->sbm.sb_states = new_bitmap;
        mutex_unlock(&vm->hotplug_mutex);
 
-       vfree(old_sb_bitmap);
+       vfree(old_bitmap);
        return 0;
 }
 
 /*
- * Try to add a memory block to Linux. This will usually only fail
- * if out of memory.
+ * Test if we could add memory without creating too much offline memory,
+ * to avoid running OOM if memory is onlined in a deferred fashion.
+ */
+static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
+{
+       if (WARN_ON_ONCE(size > vm->offline_threshold))
+               return false;
+
+       return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
+}
+
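A userspace model of the accounting this check relies on (not from the
patch): vm->offline_size grows in virtio_mem_add_memory() and on
MEM_OFFLINE, and shrinks on MEM_ONLINE and on memory removal; the
threshold value below is an arbitrary stand-in:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t offline_size;                         /* vm->offline_size */
static const uint64_t offline_threshold = 2ULL << 30; /* assumed 2 GiB */

static bool could_add_memory(uint64_t size)
{
	/* A single request larger than the threshold can never fit. */
	if (size > offline_threshold)
		return false;
	return offline_size + size <= offline_threshold;
}

int main(void)
{
	offline_size = 3ULL << 29;  /* 1.5 GiB added but not yet online */
	/* 1.5 GiB + 1 GiB > 2 GiB: hold off instead of risking OOM. */
	printf("%s\n", could_add_memory(1ULL << 30) ? "add" : "defer");
	return 0;
}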
+/*
+ * Try adding memory to Linux. Will usually only fail if out of memory.
  *
  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
  * onlining code).
  *
- * Will not modify the state of the memory block.
+ * Will not modify the state of memory blocks in virtio-mem.
  */
-static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
+static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
+                                uint64_t size)
 {
-       const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
-       int nid = vm->nid;
-
-       if (nid == NUMA_NO_NODE)
-               nid = memory_add_physaddr_to_nid(addr);
+       int rc;
 
        /*
         * When force-unloading the driver and we still have memory added to
@@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
                        return -ENOMEM;
        }
 
-       dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
-       return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
-                                        vm->resource_name,
-                                        MEMHP_MERGE_RESOURCE);
+       dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
+               addr + size - 1);
+       /* Memory might get onlined immediately. */
+       atomic64_add(size, &vm->offline_size);
+       rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
+                                      MEMHP_MERGE_RESOURCE);
+       if (rc) {
+               atomic64_sub(size, &vm->offline_size);
+               dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
+               /*
+                * TODO: Linux MM does not properly clean up yet in all cases
+                * where adding of memory failed - especially on -ENOMEM.
+                */
+       }
+       return rc;
+}
+
+/*
+ * See virtio_mem_add_memory(): Try adding a single Linux memory block.
+ */
+static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
+{
+       const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
+       const uint64_t size = memory_block_size_bytes();
+
+       return virtio_mem_add_memory(vm, addr, size);
+}
+
+/*
+ * See virtio_mem_add_memory(): Try adding a big block.
+ */
+static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+       const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
+       const uint64_t size = vm->bbm.bb_size;
+
+       return virtio_mem_add_memory(vm, addr, size);
 }
 
 /*
- * Try to remove a memory block from Linux. Will only fail if the memory block
- * is not offline.
+ * Try removing memory from Linux. Will only fail if memory blocks aren't
+ * offline.
  *
  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
  * onlining code).
  *
- * Will not modify the state of the memory block.
+ * Will not modify the state of memory blocks in virtio-mem.
+ */
+static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
+                                   uint64_t size)
+{
+       int rc;
+
+       dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
+               addr + size - 1);
+       rc = remove_memory(vm->nid, addr, size);
+       if (!rc) {
+               atomic64_sub(size, &vm->offline_size);
+               /*
+                * We might have freed up memory we can now unplug, retry
+                * immediately instead of waiting.
+                */
+               virtio_mem_retry(vm);
+       } else {
+               dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
+       }
+       return rc;
+}
+
+/*
+ * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
  */
-static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
+static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
 {
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
-       int nid = vm->nid;
+       const uint64_t size = memory_block_size_bytes();
 
-       if (nid == NUMA_NO_NODE)
-               nid = memory_add_physaddr_to_nid(addr);
+       return virtio_mem_remove_memory(vm, addr, size);
+}
+
+/*
+ * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered
+ * by the big block.
+ */
+static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+       const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
+       const uint64_t size = vm->bbm.bb_size;
 
-       dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
-       return remove_memory(nid, addr, memory_block_size_bytes());
+       return virtio_mem_remove_memory(vm, addr, size);
 }
 
 /*
- * Try to offline and remove a memory block from Linux.
+ * Try offlining and removing memory from Linux.
  *
  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
  * onlining code).
  *
- * Will not modify the state of the memory block.
+ * Will not modify the state of memory blocks in virtio-mem.
  */
-static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm,
-                                           unsigned long mb_id)
+static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
+                                               uint64_t addr,
+                                               uint64_t size)
+{
+       int rc;
+
+       dev_dbg(&vm->vdev->dev,
+               "offlining and removing memory: 0x%llx - 0x%llx\n", addr,
+               addr + size - 1);
+
+       rc = offline_and_remove_memory(vm->nid, addr, size);
+       if (!rc) {
+               atomic64_sub(size, &vm->offline_size);
+               /*
+                * We might have freed up memory we can now unplug, retry
+                * immediately instead of waiting.
+                */
+               virtio_mem_retry(vm);
+       } else {
+               dev_dbg(&vm->vdev->dev,
+                       "offlining and removing memory failed: %d\n", rc);
+       }
+       return rc;
+}
+
+/*
+ * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
+ * a single Linux memory block.
+ */
+static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
+                                               unsigned long mb_id)
 {
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
-       int nid = vm->nid;
+       const uint64_t size = memory_block_size_bytes();
 
-       if (nid == NUMA_NO_NODE)
-               nid = memory_add_physaddr_to_nid(addr);
+       return virtio_mem_offline_and_remove_memory(vm, addr, size);
+}
+
+/*
+ * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
+ * all Linux memory blocks covered by the big block.
+ */
+static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
+                                               unsigned long bb_id)
+{
+       const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
+       const uint64_t size = vm->bbm.bb_size;
 
-       dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
-               mb_id);
-       return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
+       return virtio_mem_offline_and_remove_memory(vm, addr, size);
 }
 
 /*
@@ -499,31 +797,28 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
  * Test if a virtio-mem device overlaps with the given range. Can be called
  * from (notifier) callbacks lockless.
  */
-static bool virtio_mem_overlaps_range(struct virtio_mem *vm,
-                                     unsigned long start, unsigned long size)
+static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
+                                     uint64_t size)
 {
-       unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id);
-       unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) +
-                               memory_block_size_bytes();
-
-       return start < dev_end && dev_start < start + size;
+       return start < vm->addr + vm->region_size && vm->addr < start + size;
 }
 
 /*
- * Test if a virtio-mem device owns a memory block. Can be called from
+ * Test if a virtio-mem device contains a given range. Can be called from
  * (notifier) callbacks lockless.
  */
-static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
+static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
+                                     uint64_t size)
 {
-       return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
+       return start >= vm->addr && start + size <= vm->addr + vm->region_size;
 }
 
-static int virtio_mem_notify_going_online(struct virtio_mem *vm,
-                                         unsigned long mb_id)
+static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
+                                             unsigned long mb_id)
 {
-       switch (virtio_mem_mb_get_state(vm, mb_id)) {
-       case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
-       case VIRTIO_MEM_MB_STATE_OFFLINE:
+       switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+       case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
+       case VIRTIO_MEM_SBM_MB_OFFLINE:
                return NOTIFY_OK;
        default:
                break;
@@ -533,108 +828,100 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm,
        return NOTIFY_BAD;
 }
 
-static void virtio_mem_notify_offline(struct virtio_mem *vm,
-                                     unsigned long mb_id)
+static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
+                                         unsigned long mb_id)
 {
-       switch (virtio_mem_mb_get_state(vm, mb_id)) {
-       case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL:
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
+       switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+       case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL:
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
                break;
-       case VIRTIO_MEM_MB_STATE_ONLINE:
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_OFFLINE);
+       case VIRTIO_MEM_SBM_MB_ONLINE:
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_OFFLINE);
                break;
        default:
                BUG();
                break;
        }
-
-       /*
-        * Trigger the workqueue, maybe we can now unplug memory. Also,
-        * when we offline and remove a memory block, this will re-trigger
-        * us immediately - which is often nice because the removal of
-        * the memory block (e.g., memmap) might have freed up memory
-        * on other memory blocks we manage.
-        */
-       virtio_mem_retry(vm);
 }
 
-static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id)
+static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
+                                        unsigned long mb_id)
 {
-       unsigned long nb_offline;
-
-       switch (virtio_mem_mb_get_state(vm, mb_id)) {
-       case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL:
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
+       switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
+       case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                       VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
                break;
-       case VIRTIO_MEM_MB_STATE_OFFLINE:
-               virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE);
+       case VIRTIO_MEM_SBM_MB_OFFLINE:
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_ONLINE);
                break;
        default:
                BUG();
                break;
        }
-       nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
-                    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
-
-       /* see if we can add new blocks now that we onlined one block */
-       if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1)
-               virtio_mem_retry(vm);
 }
 
-static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
-                                           unsigned long mb_id)
+static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
+                                               unsigned long mb_id)
 {
-       const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
-       struct page *page;
+       const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
        unsigned long pfn;
-       int sb_id, i;
+       int sb_id;
 
-       for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
-               if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
+       for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
+               if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
                        continue;
-               /*
-                * Drop our reference to the pages so the memory can get
-                * offlined and add the unplugged pages to the managed
-                * page counters (so offlining code can correctly subtract
-                * them again).
-                */
                pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
-                              sb_id * vm->subblock_size);
-               adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
-               for (i = 0; i < nr_pages; i++) {
-                       page = pfn_to_page(pfn + i);
-                       if (WARN_ON(!page_ref_dec_and_test(page)))
-                               dump_page(page, "unplugged page referenced");
-               }
+                              sb_id * vm->sbm.sb_size);
+               virtio_mem_fake_offline_going_offline(pfn, nr_pages);
        }
 }
 
-static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm,
-                                            unsigned long mb_id)
+static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
+                                                unsigned long mb_id)
 {
-       const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
+       const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
        unsigned long pfn;
-       int sb_id, i;
+       int sb_id;
 
-       for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
-               if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
+       for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
+               if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
                        continue;
-               /*
-                * Get the reference we dropped when going offline and
-                * subtract the unplugged pages from the managed page
-                * counters.
-                */
                pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
-                              sb_id * vm->subblock_size);
-               adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
-               for (i = 0; i < nr_pages; i++)
-                       page_ref_inc(pfn_to_page(pfn + i));
+                              sb_id * vm->sbm.sb_size);
+               virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
        }
 }
 
+static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
+                                               unsigned long bb_id,
+                                               unsigned long pfn,
+                                               unsigned long nr_pages)
+{
+       /*
+        * When marked as "fake-offline", all online memory of this big block
+        * is allocated by us. Otherwise, we don't have any memory allocated.
+        */
+       if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+           VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
+               return;
+       virtio_mem_fake_offline_going_offline(pfn, nr_pages);
+}
+
+static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
+                                                unsigned long bb_id,
+                                                unsigned long pfn,
+                                                unsigned long nr_pages)
+{
+       if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+           VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
+               return;
+       virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
+}
+
 /*
  * This callback will either be called synchronously from add_memory() or
  * asynchronously (e.g., triggered via user space). We have to be careful
@@ -648,20 +935,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
        struct memory_notify *mhp = arg;
        const unsigned long start = PFN_PHYS(mhp->start_pfn);
        const unsigned long size = PFN_PHYS(mhp->nr_pages);
-       const unsigned long mb_id = virtio_mem_phys_to_mb_id(start);
        int rc = NOTIFY_OK;
+       unsigned long id;
 
        if (!virtio_mem_overlaps_range(vm, start, size))
                return NOTIFY_DONE;
 
-       /*
-        * Memory is onlined/offlined in memory block granularity. We cannot
-        * cross virtio-mem device boundaries and memory block boundaries. Bail
-        * out if this ever changes.
-        */
-       if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
-                        !IS_ALIGNED(start, memory_block_size_bytes())))
-               return NOTIFY_BAD;
+       if (vm->in_sbm) {
+               id = virtio_mem_phys_to_mb_id(start);
+               /*
+                * In SBM, we add memory in separate memory blocks - we expect
+                * it to be onlined/offlined in the same granularity. Bail out
+                * if this ever changes.
+                */
+               if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
+                                !IS_ALIGNED(start, memory_block_size_bytes())))
+                       return NOTIFY_BAD;
+       } else {
+               id = virtio_mem_phys_to_bb_id(vm, start);
+               /*
+                * In BBM, we only care about onlining/offlining happening
+                * within a single big block, we don't care about the
+                * actual granularity as we don't track individual Linux
+                * memory blocks.
+                */
+               if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
+                       return NOTIFY_BAD;
+       }
 
        /*
         * Avoid circular locking lockdep warnings. We lock the mutex
@@ -680,7 +980,12 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
                        break;
                }
                vm->hotplug_active = true;
-               virtio_mem_notify_going_offline(vm, mb_id);
+               if (vm->in_sbm)
+                       virtio_mem_sbm_notify_going_offline(vm, id);
+               else
+                       virtio_mem_bbm_notify_going_offline(vm, id,
+                                                           mhp->start_pfn,
+                                                           mhp->nr_pages);
                break;
        case MEM_GOING_ONLINE:
                mutex_lock(&vm->hotplug_mutex);
@@ -690,22 +995,51 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
                        break;
                }
                vm->hotplug_active = true;
-               rc = virtio_mem_notify_going_online(vm, mb_id);
+               if (vm->in_sbm)
+                       rc = virtio_mem_sbm_notify_going_online(vm, id);
                break;
        case MEM_OFFLINE:
-               virtio_mem_notify_offline(vm, mb_id);
+               if (vm->in_sbm)
+                       virtio_mem_sbm_notify_offline(vm, id);
+
+               atomic64_add(size, &vm->offline_size);
+               /*
+                * Trigger the workqueue. Now that we have some offline memory,
+                * maybe we can handle pending unplug requests.
+                */
+               if (!unplug_online)
+                       virtio_mem_retry(vm);
+
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
        case MEM_ONLINE:
-               virtio_mem_notify_online(vm, mb_id);
+               if (vm->in_sbm)
+                       virtio_mem_sbm_notify_online(vm, id);
+
+               atomic64_sub(size, &vm->offline_size);
+               /*
+                * Start adding more memory once we onlined half of our
+                * threshold. Don't trigger if it's possibly due to our action
+                * (e.g., us adding memory which gets onlined immediately from
+                * the core).
+                */
+               if (!atomic_read(&vm->wq_active) &&
+                   virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
+                       virtio_mem_retry(vm);
+
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
        case MEM_CANCEL_OFFLINE:
                if (!vm->hotplug_active)
                        break;
-               virtio_mem_notify_cancel_offline(vm, mb_id);
+               if (vm->in_sbm)
+                       virtio_mem_sbm_notify_cancel_offline(vm, id);
+               else
+                       virtio_mem_bbm_notify_cancel_offline(vm, id,
+                                                            mhp->start_pfn,
+                                                            mhp->nr_pages);
                vm->hotplug_active = false;
                mutex_unlock(&vm->hotplug_mutex);
                break;
@@ -729,7 +1063,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
  * (via generic_online_page()) using PageDirty().
  */
 static void virtio_mem_set_fake_offline(unsigned long pfn,
-                                       unsigned int nr_pages, bool onlined)
+                                       unsigned long nr_pages, bool onlined)
 {
        for (; nr_pages--; pfn++) {
                struct page *page = pfn_to_page(pfn);
@@ -748,7 +1082,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
  * (via generic_online_page()), clear PageDirty().
  */
 static void virtio_mem_clear_fake_offline(unsigned long pfn,
-                                         unsigned int nr_pages, bool onlined)
+                                         unsigned long nr_pages, bool onlined)
 {
        for (; nr_pages--; pfn++) {
                struct page *page = pfn_to_page(pfn);
@@ -763,16 +1097,17 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
  * Release a range of fake-offline pages to the buddy, effectively
  * fake-onlining them.
  */
-static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
+static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
 {
-       const int order = MAX_ORDER - 1;
-       int i;
+       const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
+       unsigned long i;
 
        /*
-        * We are always called with subblock granularity, which is at least
-        * aligned to MAX_ORDER - 1.
+        * We are always called at least with MAX_ORDER_NR_PAGES
+        * granularity/alignment (e.g., the way subblocks work). All pages
+        * inside such a block are alike.
         */
-       for (i = 0; i < nr_pages; i += 1 << order) {
+       for (i = 0; i < nr_pages; i += max_nr_pages) {
                struct page *page = pfn_to_page(pfn + i);
 
                /*
@@ -782,42 +1117,128 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
                 * alike.
                 */
                if (PageDirty(page)) {
-                       virtio_mem_clear_fake_offline(pfn + i, 1 << order,
+                       virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
                                                      false);
-                       generic_online_page(page, order);
+                       generic_online_page(page, MAX_ORDER - 1);
                } else {
-                       virtio_mem_clear_fake_offline(pfn + i, 1 << order,
+                       virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
                                                      true);
-                       free_contig_range(pfn + i, 1 << order);
-                       adjust_managed_page_count(page, 1 << order);
+                       free_contig_range(pfn + i, max_nr_pages);
+                       adjust_managed_page_count(page, max_nr_pages);
                }
        }
 }
 
+/*
+ * Try to allocate a range, marking pages fake-offline, effectively
+ * fake-offlining them.
+ */
+static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
+{
+       const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) ==
+                               ZONE_MOVABLE;
+       int rc, retry_count;
+
+       /*
+        * TODO: We want an alloc_contig_range() mode that tries to allocate
+        * harder (e.g., dealing with temporarily pinned pages, PCP), especially
+        * with ZONE_MOVABLE. So for now, retry a couple of times with
+        * ZONE_MOVABLE before giving up - because that zone is supposed to give
+        * some guarantees.
+        */
+       for (retry_count = 0; retry_count < 5; retry_count++) {
+               rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
+                                       GFP_KERNEL);
+               if (rc == -ENOMEM)
+                       /* whoops, out of memory */
+                       return rc;
+               else if (rc && !is_movable)
+                       break;
+               else if (rc)
+                       continue;
+
+               virtio_mem_set_fake_offline(pfn, nr_pages, true);
+               adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
+               return 0;
+       }
+
+       return -EBUSY;
+}
+
+/*
+ * Handle fake-offline pages when memory is going offline - such that the
+ * pages can be skipped by mm-core when offlining.
+ */
+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
+                                                 unsigned long nr_pages)
+{
+       struct page *page;
+       unsigned long i;
+
+       /*
+        * Add the unplugged pages to the managed page counters, so the
+        * offlining code can correctly subtract them again.
+        */
+       adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
+       /* Drop our reference to the pages so the memory can get offlined. */
+       for (i = 0; i < nr_pages; i++) {
+               page = pfn_to_page(pfn + i);
+               if (WARN_ON(!page_ref_dec_and_test(page)))
+                       dump_page(page, "fake-offline page referenced");
+       }
+}
+
+/*
+ * Handle fake-offline pages when memory offlining is canceled - to undo
+ * what we did in virtio_mem_fake_offline_going_offline().
+ */
+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
+                                                  unsigned long nr_pages)
+{
+       unsigned long i;
+
+       /*
+        * Get the reference we dropped when going offline and subtract the
+        * unplugged pages from the managed page counters.
+        */
+       adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
+       for (i = 0; i < nr_pages; i++)
+               page_ref_inc(pfn_to_page(pfn + i));
+}
+
 static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
 {
        const unsigned long addr = page_to_phys(page);
-       const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
+       unsigned long id, sb_id;
        struct virtio_mem *vm;
-       int sb_id;
+       bool do_online;
 
-       /*
-        * We exploit here that subblocks have at least MAX_ORDER - 1
-        * size/alignment and that this callback is is called with such a
-        * size/alignment. So we cannot cross subblocks and therefore
-        * also not memory blocks.
-        */
        rcu_read_lock();
        list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
-               if (!virtio_mem_owned_mb(vm, mb_id))
+               if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
                        continue;
 
-               sb_id = virtio_mem_phys_to_sb_id(vm, addr);
-               /*
-                * If plugged, online the pages, otherwise, set them fake
-                * offline (PageOffline).
-                */
-               if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
+               if (vm->in_sbm) {
+                       /*
+                        * We exploit here that subblocks have at least
+                        * MAX_ORDER_NR_PAGES size/alignment - so we cannot
+                        * cross subblocks within one call.
+                        */
+                       id = virtio_mem_phys_to_mb_id(addr);
+                       sb_id = virtio_mem_phys_to_sb_id(vm, addr);
+                       do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
+                                                                  sb_id, 1);
+               } else {
+                       /*
+                        * If the whole block is marked fake offline, keep
+                        * everything that way.
+                        */
+                       id = virtio_mem_phys_to_bb_id(vm, addr);
+                       do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
+                                   VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
+               }
+               if (do_online)
                        generic_online_page(page, order);
                else
                        virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
@@ -870,23 +1291,33 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
                .u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
                .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
        };
+       int rc = -ENOMEM;
 
        if (atomic_read(&vm->config_changed))
                return -EAGAIN;
 
+       dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
+               addr + size - 1);
+
        switch (virtio_mem_send_request(vm, &req)) {
        case VIRTIO_MEM_RESP_ACK:
                vm->plugged_size += size;
                return 0;
        case VIRTIO_MEM_RESP_NACK:
-               return -EAGAIN;
+               rc = -EAGAIN;
+               break;
        case VIRTIO_MEM_RESP_BUSY:
-               return -ETXTBSY;
+               rc = -ETXTBSY;
+               break;
        case VIRTIO_MEM_RESP_ERROR:
-               return -EINVAL;
+               rc = -EINVAL;
+               break;
        default:
-               return -ENOMEM;
+               break;
        }
+
+       dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
+       return rc;
 }
 
 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
@@ -898,21 +1329,30 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
                .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
                .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
        };
+       int rc = -ENOMEM;
 
        if (atomic_read(&vm->config_changed))
                return -EAGAIN;
 
+       dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
+               addr + size - 1);
+
        switch (virtio_mem_send_request(vm, &req)) {
        case VIRTIO_MEM_RESP_ACK:
                vm->plugged_size -= size;
                return 0;
        case VIRTIO_MEM_RESP_BUSY:
-               return -ETXTBSY;
+               rc = -ETXTBSY;
+               break;
        case VIRTIO_MEM_RESP_ERROR:
-               return -EINVAL;
+               rc = -EINVAL;
+               break;
        default:
-               return -ENOMEM;
+               break;
        }
+
+       dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
+       return rc;
 }
 
 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
@@ -920,6 +1360,9 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
        const struct virtio_mem_req req = {
                .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
        };
+       int rc = -ENOMEM;
+
+       dev_dbg(&vm->vdev->dev, "unplugging all memory\n");
 
        switch (virtio_mem_send_request(vm, &req)) {
        case VIRTIO_MEM_RESP_ACK:
@@ -929,30 +1372,31 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
                atomic_set(&vm->config_changed, 1);
                return 0;
        case VIRTIO_MEM_RESP_BUSY:
-               return -ETXTBSY;
+               rc = -ETXTBSY;
+               break;
        default:
-               return -ENOMEM;
+               break;
        }
+
+       dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
+       return rc;
 }
 
 /*
  * Plug selected subblocks. Updates the plugged state, but not the state
  * of the memory block.
  */
-static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
-                                int sb_id, int count)
+static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
+                                 int sb_id, int count)
 {
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
-                             sb_id * vm->subblock_size;
-       const uint64_t size = count * vm->subblock_size;
+                             sb_id * vm->sbm.sb_size;
+       const uint64_t size = count * vm->sbm.sb_size;
        int rc;
 
-       dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id,
-               sb_id, sb_id + count - 1);
-
        rc = virtio_mem_send_plug_request(vm, addr, size);
        if (!rc)
-               virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count);
+               virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
        return rc;
 }
 
@@ -960,23 +1404,46 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
  * Unplug selected subblocks. Updates the plugged state, but not the state
  * of the memory block.
  */
-static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
-                                  int sb_id, int count)
+static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
+                                   int sb_id, int count)
 {
        const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
-                             sb_id * vm->subblock_size;
-       const uint64_t size = count * vm->subblock_size;
+                             sb_id * vm->sbm.sb_size;
+       const uint64_t size = count * vm->sbm.sb_size;
        int rc;
 
-       dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n",
-               mb_id, sb_id, sb_id + count - 1);
-
        rc = virtio_mem_send_unplug_request(vm, addr, size);
        if (!rc)
-               virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count);
+               virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
        return rc;
 }
 
+/*
+ * Request to unplug a big block.
+ *
+ * Will not modify the state of the big block.
+ */
+static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+       const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
+       const uint64_t size = vm->bbm.bb_size;
+
+       return virtio_mem_send_unplug_request(vm, addr, size);
+}
+
+/*
+ * Request to plug a big block.
+ *
+ * Will not modify the state of the big block.
+ */
+static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
+{
+       const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
+       const uint64_t size = vm->bbm.bb_size;
+
+       return virtio_mem_send_plug_request(vm, addr, size);
+}
+
 /*
  * Unplug the desired number of plugged subblocks of an offline or not-added
  * memory block. Will fail if any subblock cannot get unplugged (instead of
@@ -986,29 +1453,29 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
  *
  * Note: can fail after some subblocks were unplugged.
  */
-static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
-                                      unsigned long mb_id, uint64_t *nb_sb)
+static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
+                                       unsigned long mb_id, uint64_t *nb_sb)
 {
        int sb_id, count;
        int rc;
 
-       sb_id = vm->nb_sb_per_mb - 1;
+       sb_id = vm->sbm.sbs_per_mb - 1;
        while (*nb_sb) {
                /* Find the next candidate subblock */
                while (sb_id >= 0 &&
-                      virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1))
+                      virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
                        sb_id--;
                if (sb_id < 0)
                        break;
                /* Try to unplug multiple subblocks at a time */
                count = 1;
                while (count < *nb_sb && sb_id > 0 &&
-                      virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
+                      virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
                        count++;
                        sb_id--;
                }
 
-               rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
+               rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
                if (rc)
                        return rc;
                *nb_sb -= count;
@@ -1025,63 +1492,50 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
  *
  * Note: can fail after some subblocks were unplugged.
  */
-static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id)
+static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
 {
-       uint64_t nb_sb = vm->nb_sb_per_mb;
+       uint64_t nb_sb = vm->sbm.sbs_per_mb;
 
-       return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb);
+       return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
 }
 
 /*
  * Prepare tracking data for the next memory block.
  */
-static int virtio_mem_prepare_next_mb(struct virtio_mem *vm,
-                                     unsigned long *mb_id)
+static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
+                                         unsigned long *mb_id)
 {
        int rc;
 
-       if (vm->next_mb_id > vm->last_usable_mb_id)
+       if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
                return -ENOSPC;
 
        /* Resize the state array if required. */
-       rc = virtio_mem_mb_state_prepare_next_mb(vm);
+       rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
        if (rc)
                return rc;
 
        /* Resize the subblock bitmap if required. */
-       rc = virtio_mem_sb_bitmap_prepare_next_mb(vm);
+       rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
        if (rc)
                return rc;
 
-       vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++;
-       *mb_id = vm->next_mb_id++;
+       vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
+       *mb_id = vm->sbm.next_mb_id++;
        return 0;
 }
 
-/*
- * Don't add too many blocks that are not onlined yet to avoid running OOM.
- */
-static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm)
-{
-       unsigned long nb_offline;
-
-       nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] +
-                    vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL];
-       return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD;
-}
-
 /*
  * Try to plug the desired number of subblocks and add the memory block
  * to Linux.
  *
  * Will modify the state of the memory block.
  */
-static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
-                                     unsigned long mb_id,
-                                     uint64_t *nb_sb)
+static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
+                                         unsigned long mb_id, uint64_t *nb_sb)
 {
-       const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb);
-       int rc, rc2;
+       const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
+       int rc;
 
        if (WARN_ON_ONCE(!count))
                return -EINVAL;
@@ -1090,7 +1544,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
         * Plug the requested number of subblocks before adding it to linux,
         * so that onlining will directly online all plugged subblocks.
         */
-       rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count);
+       rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
        if (rc)
                return rc;
 
@@ -1098,29 +1552,21 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
         * Mark the block properly offline before adding it to Linux,
         * so the memory notifiers will find the block in the right state.
         */
-       if (count == vm->nb_sb_per_mb)
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_OFFLINE);
+       if (count == vm->sbm.sbs_per_mb)
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_OFFLINE);
        else
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
 
        /* Add the memory block to linux - if that fails, try to unplug. */
-       rc = virtio_mem_mb_add(vm, mb_id);
+       rc = virtio_mem_sbm_add_mb(vm, mb_id);
        if (rc) {
-               enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED;
-
-               dev_err(&vm->vdev->dev,
-                       "adding memory block %lu failed with %d\n", mb_id, rc);
-               rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count);
+               int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
 
-               /*
-                * TODO: Linux MM does not properly clean up yet in all cases
-                * where adding of memory failed - especially on -ENOMEM.
-                */
-               if (rc2)
-                       new_state = VIRTIO_MEM_MB_STATE_PLUGGED;
-               virtio_mem_mb_set_state(vm, mb_id, new_state);
+               if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
+                       new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
+               virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
                return rc;
        }
 
@@ -1136,8 +1582,9 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
  *
  * Note: Can fail after some subblocks were successfully plugged.
  */
-static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
-                                    uint64_t *nb_sb, bool online)
+static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
+                                     unsigned long mb_id, uint64_t *nb_sb,
+                                     bool online)
 {
        unsigned long pfn, nr_pages;
        int sb_id, count;
@@ -1147,17 +1594,16 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
                return -EINVAL;
 
        while (*nb_sb) {
-               sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id);
-               if (sb_id >= vm->nb_sb_per_mb)
+               sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
+               if (sb_id >= vm->sbm.sbs_per_mb)
                        break;
                count = 1;
                while (count < *nb_sb &&
-                      sb_id + count < vm->nb_sb_per_mb &&
-                      !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count,
-                                                     1))
+                      sb_id + count < vm->sbm.sbs_per_mb &&
+                      !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
                        count++;
 
-               rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count);
+               rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
                if (rc)
                        return rc;
                *nb_sb -= count;
@@ -1166,29 +1612,26 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id,
 
                /* fake-online the pages if the memory block is online */
                pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
-                              sb_id * vm->subblock_size);
-               nr_pages = PFN_DOWN(count * vm->subblock_size);
+                              sb_id * vm->sbm.sb_size);
+               nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
                virtio_mem_fake_online(pfn, nr_pages);
        }
 
-       if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
+       if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
                if (online)
-                       virtio_mem_mb_set_state(vm, mb_id,
-                                               VIRTIO_MEM_MB_STATE_ONLINE);
+                       virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                                   VIRTIO_MEM_SBM_MB_ONLINE);
                else
-                       virtio_mem_mb_set_state(vm, mb_id,
-                                               VIRTIO_MEM_MB_STATE_OFFLINE);
+                       virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                                   VIRTIO_MEM_SBM_MB_OFFLINE);
        }
 
        return 0;
 }
 
-/*
- * Try to plug the requested amount of memory.
- */
-static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
+static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
 {
-       uint64_t nb_sb = diff / vm->subblock_size;
+       uint64_t nb_sb = diff / vm->sbm.sb_size;
        unsigned long mb_id;
        int rc;
 
@@ -1199,18 +1642,18 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
        mutex_lock(&vm->hotplug_mutex);
 
        /* Try to plug subblocks of partially plugged online blocks. */
-       virtio_mem_for_each_mb_state(vm, mb_id,
-                                    VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
-               rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true);
+       virtio_mem_sbm_for_each_mb(vm, mb_id,
+                                  VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
+               rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
        }
 
        /* Try to plug subblocks of partially plugged offline blocks. */
-       virtio_mem_for_each_mb_state(vm, mb_id,
-                                    VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
-               rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false);
+       virtio_mem_sbm_for_each_mb(vm, mb_id,
+                                  VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
+               rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
@@ -1223,11 +1666,11 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
        mutex_unlock(&vm->hotplug_mutex);
 
        /* Try to plug and add unused blocks */
-       virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) {
-               if (virtio_mem_too_many_mb_offline(vm))
+       virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
+               if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
                        return -ENOSPC;
 
-               rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
+               rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
                if (rc || !nb_sb)
                        return rc;
                cond_resched();
@@ -1235,13 +1678,13 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
 
        /* Try to prepare, plug and add new blocks */
        while (nb_sb) {
-               if (virtio_mem_too_many_mb_offline(vm))
+               if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
                        return -ENOSPC;
 
-               rc = virtio_mem_prepare_next_mb(vm, &mb_id);
+               rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
                if (rc)
                        return rc;
-               rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb);
+               rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
                if (rc)
                        return rc;
                cond_resched();
@@ -1253,6 +1696,112 @@ out_unlock:
        return rc;
 }
 
+/*
+ * Plug a big block and add it to Linux.
+ *
+ * Will modify the state of the big block.
+ */
+static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
+                                         unsigned long bb_id)
+{
+       int rc;
+
+       if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+                        VIRTIO_MEM_BBM_BB_UNUSED))
+               return -EINVAL;
+
+       rc = virtio_mem_bbm_plug_bb(vm, bb_id);
+       if (rc)
+               return rc;
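+       /* The block is now plugged; track it as ADDED while adding it to Linux. */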
+       virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
+
+       rc = virtio_mem_bbm_add_bb(vm, bb_id);
+       if (rc) {
+               if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
+                       virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                                   VIRTIO_MEM_BBM_BB_UNUSED);
+               else
+                       /* Retry from the main loop. */
+                       virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                                   VIRTIO_MEM_BBM_BB_PLUGGED);
+               return rc;
+       }
+       return 0;
+}
+
+/*
+ * Prepare tracking data for the next big block.
+ */
+static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
+                                         unsigned long *bb_id)
+{
+       int rc;
+
+       if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
+               return -ENOSPC;
+
+       /* Resize the big block state array if required. */
+       rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
+       if (rc)
+               return rc;
+
+       vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
+       *bb_id = vm->bbm.next_bb_id;
+       vm->bbm.next_bb_id++;
+       return 0;
+}
+
+static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
+{
+       uint64_t nb_bb = diff / vm->bbm.bb_size;
+       unsigned long bb_id;
+       int rc;
+
+       if (!nb_bb)
+               return 0;
+
+       /* Try to plug and add unused big blocks */
+       virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
+               if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
+                       return -ENOSPC;
+
+               rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
+               if (!rc)
+                       nb_bb--;
+               if (rc || !nb_bb)
+                       return rc;
+               cond_resched();
+       }
+
+       /* Try to prepare, plug and add new big blocks */
+       while (nb_bb) {
+               if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
+                       return -ENOSPC;
+
+               rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
+               if (rc)
+                       return rc;
+               rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
+               if (!rc)
+                       nb_bb--;
+               if (rc)
+                       return rc;
+               cond_resched();
+       }
+
+       return 0;
+}
+
+/*
+ * Try to plug the requested amount of memory.
+ */
+static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
+{
+       if (vm->in_sbm)
+               return virtio_mem_sbm_plug_request(vm, diff);
+       return virtio_mem_bbm_plug_request(vm, diff);
+}
+
 /*
  * Unplug the desired number of plugged subblocks of an offline memory block.
  * Will fail if any subblock cannot get unplugged (instead of skipping it).
@@ -1262,33 +1811,33 @@ out_unlock:
  *
  * Note: Can fail after some subblocks were successfully unplugged.
  */
-static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
-                                              unsigned long mb_id,
-                                              uint64_t *nb_sb)
+static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
+                                               unsigned long mb_id,
+                                               uint64_t *nb_sb)
 {
        int rc;
 
-       rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb);
+       rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb);
 
        /* some subblocks might have been unplugged even on failure */
-       if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb))
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL);
+       if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
        if (rc)
                return rc;
 
-       if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
+       if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
                /*
                 * Remove the block from Linux - this should never fail.
                 * Hinder the block from getting onlined by marking it
                 * unplugged. Temporarily drop the mutex, so
                 * any pending GOING_ONLINE requests can be serviced/rejected.
                 */
-               virtio_mem_mb_set_state(vm, mb_id,
-                                       VIRTIO_MEM_MB_STATE_UNUSED);
+               virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                           VIRTIO_MEM_SBM_MB_UNUSED);
 
                mutex_unlock(&vm->hotplug_mutex);
-               rc = virtio_mem_mb_remove(vm, mb_id);
+               rc = virtio_mem_sbm_remove_mb(vm, mb_id);
                BUG_ON(rc);
                mutex_lock(&vm->hotplug_mutex);
        }
@@ -1300,38 +1849,31 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
  *
  * Will modify the state of the memory block.
  */
-static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
-                                         unsigned long mb_id, int sb_id,
-                                         int count)
+static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
+                                          unsigned long mb_id, int sb_id,
+                                          int count)
 {
-       const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count;
+       const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
        unsigned long start_pfn;
        int rc;
 
        start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
-                            sb_id * vm->subblock_size);
-       rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
-                               MIGRATE_MOVABLE, GFP_KERNEL);
-       if (rc == -ENOMEM)
-               /* whoops, out of memory */
-               return rc;
-       if (rc)
-               return -EBUSY;
+                            sb_id * vm->sbm.sb_size);
 
-       /* Mark it as fake-offline before unplugging it */
-       virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
-       adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+       rc = virtio_mem_fake_offline(start_pfn, nr_pages);
+       if (rc)
+               return rc;
 
        /* Try to unplug the allocated memory */
-       rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
+       rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
        if (rc) {
                /* Return the memory to the buddy. */
                virtio_mem_fake_online(start_pfn, nr_pages);
                return rc;
        }
 
-       virtio_mem_mb_set_state(vm, mb_id,
-                               VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
+       virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                   VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
        return 0;
 }
 
@@ -1345,34 +1887,34 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm,
  * Note: Can fail after some subblocks were successfully unplugged. Can
  *       return 0 even if subblocks were busy and could not get unplugged.
  */
-static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
-                                             unsigned long mb_id,
-                                             uint64_t *nb_sb)
+static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
+                                              unsigned long mb_id,
+                                              uint64_t *nb_sb)
 {
        int rc, sb_id;
 
        /* If possible, try to unplug the complete block in one shot. */
-       if (*nb_sb >= vm->nb_sb_per_mb &&
-           virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
-               rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0,
-                                                   vm->nb_sb_per_mb);
+       if (*nb_sb >= vm->sbm.sbs_per_mb &&
+           virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
+               rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
+                                                    vm->sbm.sbs_per_mb);
                if (!rc) {
-                       *nb_sb -= vm->nb_sb_per_mb;
+                       *nb_sb -= vm->sbm.sbs_per_mb;
                        goto unplugged;
                } else if (rc != -EBUSY)
                        return rc;
        }
 
        /* Fallback to single subblocks. */
-       for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
+       for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
                /* Find the next candidate subblock */
                while (sb_id >= 0 &&
-                      !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
+                      !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
                        sb_id--;
                if (sb_id < 0)
                        break;
 
-               rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1);
+               rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
                if (rc == -EBUSY)
                        continue;
                else if (rc)
@@ -1386,24 +1928,21 @@ unplugged:
         * remove it. This will usually not fail, as no memory is in use
         * anymore - however some other notifiers might NACK the request.
         */
-       if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) {
+       if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
                mutex_unlock(&vm->hotplug_mutex);
-               rc = virtio_mem_mb_offline_and_remove(vm, mb_id);
+               rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
                mutex_lock(&vm->hotplug_mutex);
                if (!rc)
-                       virtio_mem_mb_set_state(vm, mb_id,
-                                               VIRTIO_MEM_MB_STATE_UNUSED);
+                       virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                                   VIRTIO_MEM_SBM_MB_UNUSED);
        }
 
        return 0;
 }
 
-/*
- * Try to unplug the requested amount of memory.
- */
-static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
+static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
 {
-       uint64_t nb_sb = diff / vm->subblock_size;
+       uint64_t nb_sb = diff / vm->sbm.sb_size;
        unsigned long mb_id;
        int rc;
 
@@ -1418,20 +1957,17 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
        mutex_lock(&vm->hotplug_mutex);
 
        /* Try to unplug subblocks of partially plugged offline blocks. */
-       virtio_mem_for_each_mb_state_rev(vm, mb_id,
-                                        VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
-               rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
-                                                        &nb_sb);
+       virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
+                                      VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
+               rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
        }
 
        /* Try to unplug subblocks of plugged offline blocks. */
-       virtio_mem_for_each_mb_state_rev(vm, mb_id,
-                                        VIRTIO_MEM_MB_STATE_OFFLINE) {
-               rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
-                                                        &nb_sb);
+       virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) {
+               rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                cond_resched();
@@ -1443,10 +1979,9 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
        }
 
        /* Try to unplug subblocks of partially plugged online blocks. */
-       virtio_mem_for_each_mb_state_rev(vm, mb_id,
-                                        VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
-               rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
-                                                       &nb_sb);
+       virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
+                                      VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
+               rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                mutex_unlock(&vm->hotplug_mutex);
@@ -1455,10 +1990,8 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
        }
 
        /* Try to unplug subblocks of plugged online blocks. */
-       virtio_mem_for_each_mb_state_rev(vm, mb_id,
-                                        VIRTIO_MEM_MB_STATE_ONLINE) {
-               rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
-                                                       &nb_sb);
+       virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) {
+               rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
                if (rc || !nb_sb)
                        goto out_unlock;
                mutex_unlock(&vm->hotplug_mutex);
@@ -1473,20 +2006,212 @@ out_unlock:
        return rc;
 }
 
+/*
+ * Try to offline and remove a big block from Linux and unplug it. Will fail
+ * with -EBUSY if some memory is busy and cannot get unplugged.
+ *
+ * Will modify the state of the big block. Might temporarily drop the
+ * hotplug_mutex.
+ */
+static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
+                                                      unsigned long bb_id)
+{
+       const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
+       const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
+       unsigned long end_pfn = start_pfn + nr_pages;
+       unsigned long pfn;
+       struct page *page;
+       int rc;
+
+       if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+                        VIRTIO_MEM_BBM_BB_ADDED))
+               return -EINVAL;
+
+       if (bbm_safe_unplug) {
+               /*
+                * Start by fake-offlining all memory. Once we marked the device
+                * block as fake-offline, all newly onlined memory will
+                * automatically be kept fake-offline. Protect from concurrent
+                * onlining/offlining until we have a consistent state.
+                */
+               mutex_lock(&vm->hotplug_mutex);
+               virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                           VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
+
+               for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+                       page = pfn_to_online_page(pfn);
+                       if (!page)
+                               continue;
+
+                       rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
+                       if (rc) {
+                               end_pfn = pfn;
+                               goto rollback_safe_unplug;
+                       }
+               }
+               mutex_unlock(&vm->hotplug_mutex);
+       }
+
+       rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
+       if (rc) {
+               if (bbm_safe_unplug) {
+                       mutex_lock(&vm->hotplug_mutex);
+                       goto rollback_safe_unplug;
+               }
+               return rc;
+       }
+
+       rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
+       if (rc)
+               virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                           VIRTIO_MEM_BBM_BB_PLUGGED);
+       else
+               virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                           VIRTIO_MEM_BBM_BB_UNUSED);
+       return rc;
+
+rollback_safe_unplug:
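+       /* Fake-online everything we fake-offlined and restore the old state. */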
+       for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+               page = pfn_to_online_page(pfn);
+               if (!page)
+                       continue;
+               virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
+       }
+       virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
+       mutex_unlock(&vm->hotplug_mutex);
+       return rc;
+}
+
+/*
+ * Try to remove a big block from Linux and unplug it. Will fail with
+ * -EBUSY if some memory is online.
+ *
+ * Will modify the state of the big block.
+ */
+static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm,
+                                              unsigned long bb_id)
+{
+       int rc;
+
+       if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
+                        VIRTIO_MEM_BBM_BB_ADDED))
+               return -EINVAL;
+
+       rc = virtio_mem_bbm_remove_bb(vm, bb_id);
+       if (rc)
+               return -EBUSY;
+
+       rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
+       if (rc)
+               virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                           VIRTIO_MEM_BBM_BB_PLUGGED);
+       else
+               virtio_mem_bbm_set_bb_state(vm, bb_id,
+                                           VIRTIO_MEM_BBM_BB_UNUSED);
+       return rc;
+}
+
+/*
+ * Test if a big block is completely offline.
+ */
+static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
+                                        unsigned long bb_id)
+{
+       const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
+       const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
+       unsigned long pfn;
+
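+       /*
+        * Checking one pfn per section is sufficient: the online state is
+        * tracked at section granularity.
+        */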
+       for (pfn = start_pfn; pfn < start_pfn + nr_pages;
+            pfn += PAGES_PER_SECTION) {
+               if (pfn_to_online_page(pfn))
+                       return false;
+       }
+
+       return true;
+}
+
+static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
+{
+       uint64_t nb_bb = diff / vm->bbm.bb_size;
+       uint64_t bb_id;
+       int rc;
+
+       if (!nb_bb)
+               return 0;
+
+       /* Try to unplug completely offline big blocks first. */
+       virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
+               cond_resched();
+               /*
+                * As we're holding no locks, this check is racy as memory
+                * can get onlined in the meantime - but we'll fail gracefully.
+                */
+               if (!virtio_mem_bbm_bb_is_offline(vm, bb_id))
+                       continue;
+               rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id);
+               if (rc == -EBUSY)
+                       continue;
+               if (!rc)
+                       nb_bb--;
+               if (rc || !nb_bb)
+                       return rc;
+       }
+
+       if (!unplug_online)
+               return 0;
+
+       /* Try to unplug any big blocks. */
+       virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
+               cond_resched();
+               rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
+               if (rc == -EBUSY)
+                       continue;
+               if (!rc)
+                       nb_bb--;
+               if (rc || !nb_bb)
+                       return rc;
+       }
+
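+       /* Could not unplug everything requested: report -EBUSY to retry later. */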
+       return nb_bb ? -EBUSY : 0;
+}
+
+/*
+ * Try to unplug the requested amount of memory.
+ */
+static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
+{
+       if (vm->in_sbm)
+               return virtio_mem_sbm_unplug_request(vm, diff);
+       return virtio_mem_bbm_unplug_request(vm, diff);
+}
+
 /*
  * Try to unplug all blocks that couldn't be unplugged before, for example,
  * because the hypervisor was busy.
  */
 static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
 {
-       unsigned long mb_id;
+       unsigned long id;
        int rc;
 
-       virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
-               rc = virtio_mem_mb_unplug(vm, mb_id);
+       if (!vm->in_sbm) {
+               virtio_mem_bbm_for_each_bb(vm, id,
+                                          VIRTIO_MEM_BBM_BB_PLUGGED) {
+                       rc = virtio_mem_bbm_unplug_bb(vm, id);
+                       if (rc)
+                               return rc;
+                       virtio_mem_bbm_set_bb_state(vm, id,
+                                                   VIRTIO_MEM_BBM_BB_UNUSED);
+               }
+               return 0;
+       }
+
+       virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
+               rc = virtio_mem_sbm_unplug_mb(vm, id);
                if (rc)
                        return rc;
-               virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
+               virtio_mem_sbm_set_mb_state(vm, id,
+                                           VIRTIO_MEM_SBM_MB_UNUSED);
        }
 
        return 0;
@@ -1511,7 +2236,13 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm)
                        usable_region_size, &usable_region_size);
        end_addr = vm->addr + usable_region_size;
        end_addr = min(end_addr, phys_limit);
-       vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1;
+
+       if (vm->in_sbm)
+               vm->sbm.last_usable_mb_id =
+                                        virtio_mem_phys_to_mb_id(end_addr) - 1;
+       else
+               vm->bbm.last_usable_bb_id =
+                                    virtio_mem_phys_to_bb_id(vm, end_addr) - 1;
 
        /* see if there is a request to change the size */
        virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
@@ -1535,6 +2266,7 @@ static void virtio_mem_run_wq(struct work_struct *work)
        if (vm->broken)
                return;
 
+       atomic_set(&vm->wq_active, 1);
 retry:
        rc = 0;
 
@@ -1595,6 +2327,8 @@ retry:
                        "unknown error, marking device broken: %d\n", rc);
                vm->broken = true;
        }
+
+       atomic_set(&vm->wq_active, 0);
 }
 
 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
@@ -1631,6 +2365,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
 static int virtio_mem_init(struct virtio_mem *vm)
 {
        const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
+       uint64_t sb_size, addr;
        uint16_t node_id;
 
        if (!vm->vdev->config->get) {
@@ -1659,15 +2394,9 @@ static int virtio_mem_init(struct virtio_mem *vm)
        virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
                        &vm->region_size);
 
-       /*
-        * We always hotplug memory in memory block granularity. This way,
-        * we have to wait for exactly one memory block to online.
-        */
-       if (vm->device_block_size > memory_block_size_bytes()) {
-               dev_err(&vm->vdev->dev,
-                       "The block size is not supported (too big).\n");
-               return -EINVAL;
-       }
+       /* Determine the nid for the device based on the lowest address. */
+       if (vm->nid == NUMA_NO_NODE)
+               vm->nid = memory_add_physaddr_to_nid(vm->addr);
 
        /* bad device setup - warn only */
        if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
@@ -1681,23 +2410,57 @@ static int virtio_mem_init(struct virtio_mem *vm)
                         "Some memory is not addressable. This can make some memory unusable.\n");
 
        /*
-        * Calculate the subblock size:
-        * - At least MAX_ORDER - 1 / pageblock_order.
-        * - At least the device block size.
-        * In the worst case, a single subblock per memory block.
+        * We want subblocks to span at least MAX_ORDER_NR_PAGES and
+        * pageblock_nr_pages pages. This:
+        * - Simplifies our page onlining code (virtio_mem_online_page_cb)
+        *   and fake page onlining code (virtio_mem_fake_online).
+        * - Is required for now for alloc_contig_range() to work reliably -
+        *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
         */
-       vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1,
-                                                    pageblock_order);
-       vm->subblock_size = max_t(uint64_t, vm->device_block_size,
-                                 vm->subblock_size);
-       vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size;
-
-       /* Round up to the next full memory block */
-       vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
-                                                  memory_block_size_bytes());
-       vm->next_mb_id = vm->first_mb_id;
-       vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr +
-                        vm->region_size) - 1;
+       sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
+                       pageblock_nr_pages) * PAGE_SIZE;
+       sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
+
+       if (sb_size < memory_block_size_bytes() && !force_bbm) {
+               /* SBM: At least two subblocks per Linux memory block. */
+               vm->in_sbm = true;
+               vm->sbm.sb_size = sb_size;
+               vm->sbm.sbs_per_mb = memory_block_size_bytes() /
+                                    vm->sbm.sb_size;
+
+               /* Round up to the next full memory block */
+               addr = vm->addr + memory_block_size_bytes() - 1;
+               vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
+               vm->sbm.next_mb_id = vm->sbm.first_mb_id;
+       } else {
+               /* BBM: At least one Linux memory block. */
+               vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
+                                       memory_block_size_bytes());
+
+               if (bbm_block_size) {
+                       if (!is_power_of_2(bbm_block_size)) {
+                               dev_warn(&vm->vdev->dev,
+                                        "bbm_block_size is not a power of 2");
+                       } else if (bbm_block_size < vm->bbm.bb_size) {
+                               dev_warn(&vm->vdev->dev,
+                                        "bbm_block_size is too small");
+                       } else {
+                               vm->bbm.bb_size = bbm_block_size;
+                       }
+               }
+
+               /* Round up to the next aligned big block */
+               addr = vm->addr + vm->bbm.bb_size - 1;
+               vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
+               vm->bbm.next_bb_id = vm->bbm.first_bb_id;
+       }
+
+       /* Prepare the offline threshold - make sure we can add two blocks. */
+       vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
+                                     VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
+       /* In BBM, we also want at least two big blocks. */
+       vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
+                                     vm->offline_threshold);
 
        dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
        dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
@@ -1705,9 +2468,13 @@ static int virtio_mem_init(struct virtio_mem *vm)
                 (unsigned long long)vm->device_block_size);
        dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
                 memory_block_size_bytes());
-       dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
-                (unsigned long long)vm->subblock_size);
-       if (vm->nid != NUMA_NO_NODE)
+       if (vm->in_sbm)
+               dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
+                        (unsigned long long)vm->sbm.sb_size);
+       else
+               dev_info(&vm->vdev->dev, "big block size: 0x%llx",
+                        (unsigned long long)vm->bbm.bb_size);
+       if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
                dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
 
        return 0;
@@ -1753,6 +2520,20 @@ static void virtio_mem_delete_resource(struct virtio_mem *vm)
        vm->parent_resource = NULL;
 }
 
+static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
+{
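+       /* Returning 1 makes walk_iomem_res_desc() stop at the first match. */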
+       return 1;
+}
+
+static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
+{
+       const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+       return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
+                                  vm->addr + vm->region_size, NULL,
+                                  virtio_mem_range_has_system_ram) == 1;
+}
+
 static int virtio_mem_probe(struct virtio_device *vdev)
 {
        struct virtio_mem *vm;
@@ -1849,21 +2630,24 @@ static void virtio_mem_remove(struct virtio_device *vdev)
        cancel_work_sync(&vm->wq);
        hrtimer_cancel(&vm->retry_timer);
 
-       /*
-        * After we unregistered our callbacks, user space can online partially
-        * plugged offline blocks. Make sure to remove them.
-        */
-       virtio_mem_for_each_mb_state(vm, mb_id,
-                                    VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
-               rc = virtio_mem_mb_remove(vm, mb_id);
-               BUG_ON(rc);
-               virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
+       if (vm->in_sbm) {
+               /*
+                * After we unregistered our callbacks, user space can online
+                * partially plugged offline blocks. Make sure to remove them.
+                */
+               virtio_mem_sbm_for_each_mb(vm, mb_id,
+                                          VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
+                       rc = virtio_mem_sbm_remove_mb(vm, mb_id);
+                       BUG_ON(rc);
+                       virtio_mem_sbm_set_mb_state(vm, mb_id,
+                                                   VIRTIO_MEM_SBM_MB_UNUSED);
+               }
+               /*
+                * After we unregistered our callbacks, user space can no longer
+                * offline partially plugged online memory blocks. No need to
+                * worry about them.
+                */
        }
-       /*
-        * After we unregistered our callbacks, user space can no longer
-        * offline partially plugged online memory blocks. No need to worry
-        * about them.
-        */
 
        /* unregister callbacks */
        unregister_virtio_mem_device(vm);
@@ -1874,10 +2658,7 @@ static void virtio_mem_remove(struct virtio_device *vdev)
         * the system. And there is no way to stop the driver/device from going
         * away. Warn at least.
         */
-       if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
-           vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
-           vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
-           vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) {
+       if (virtio_mem_has_memory_added(vm)) {
                dev_warn(&vdev->dev, "device still has system memory added\n");
        } else {
                virtio_mem_delete_resource(vm);
@@ -1885,8 +2666,12 @@ static void virtio_mem_remove(struct virtio_device *vdev)
        }
 
        /* remove all tracking data - no locking needed */
-       vfree(vm->mb_state);
-       vfree(vm->sb_bitmap);
+       if (vm->in_sbm) {
+               vfree(vm->sbm.mb_states);
+               vfree(vm->sbm.sb_states);
+       } else {
+               vfree(vm->bbm.bb_states);
+       }
 
        /* reset the device and cleanup the queues */
        vdev->config->reset(vdev);
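
virtio_mem_bb_id_to_phys() and virtio_mem_phys_to_bb_id() are used
throughout the hunks above but are not defined in this excerpt. Below is a
minimal sketch of the mapping their call sites imply, consistent with the
"round up to the next aligned big block" step in virtio_mem_init(); the
helper names and standalone signatures are assumptions, not patch content:

        /* Assumed: big block ids map linearly to bb_size-aligned addresses. */
        static uint64_t bb_id_to_phys(uint64_t bb_id, uint64_t bb_size)
        {
                return bb_id * bb_size;
        }

        static uint64_t phys_to_bb_id(uint64_t addr, uint64_t bb_size)
        {
                return addr / bb_size;
        }
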
index becc776..71e16b5 100644 (file)
@@ -1608,7 +1608,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
        vq->num_added = 0;
        vq->packed_ring = true;
        vq->use_dma_api = vring_use_dma_api(vdev);
-       list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
        vq->in_use = false;
        vq->last_add_time_valid = false;
@@ -1669,6 +1668,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
                        cpu_to_le16(vq->packed.event_flags_shadow);
        }
 
+       list_add_tail(&vq->vq.list, &vdev->vqs);
        return &vq->vq;
 
 err_desc_extra:
@@ -1676,9 +1676,9 @@ err_desc_extra:
 err_desc_state:
        kfree(vq);
 err_vq:
-       vring_free_queue(vdev, event_size_in_bytes, device, ring_dma_addr);
+       vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr);
 err_device:
-       vring_free_queue(vdev, event_size_in_bytes, driver, ring_dma_addr);
+       vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr);
 err_driver:
        vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr);
 err_ring:
@@ -2085,7 +2085,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
        vq->last_used_idx = 0;
        vq->num_added = 0;
        vq->use_dma_api = vring_use_dma_api(vdev);
-       list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
        vq->in_use = false;
        vq->last_add_time_valid = false;
@@ -2127,6 +2126,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
        memset(vq->split.desc_state, 0, vring.num *
                        sizeof(struct vring_desc_state_split));
 
+       list_add_tail(&vq->vq.list, &vdev->vqs);
        return &vq->vq;
 }
 EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
index 6038c4c..a803033 100644 (file)
@@ -95,7 +95,8 @@ struct irq_info {
        struct list_head list;
        struct list_head eoi_list;
        short refcnt;
-       short spurious_cnt;
+       u8 spurious_cnt;
+       u8 is_accounted;
        enum xen_irq_type type; /* type */
        unsigned irq;
        evtchn_port_t evtchn;   /* event channel */
@@ -161,6 +162,9 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
 /* IRQ <-> IPI mapping */
 static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
 
+/* Event channel distribution data */
+static atomic_t channels_on_cpu[NR_CPUS];
+
 static int **evtchn_to_irq;
 #ifdef CONFIG_X86
 static unsigned long *pirq_eoi_map;
@@ -257,6 +261,32 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info)
                irq_set_chip_data(irq, info);
 }
 
+/* Per CPU channel accounting */
+static void channels_on_cpu_dec(struct irq_info *info)
+{
+       if (!info->is_accounted)
+               return;
+
+       info->is_accounted = 0;
+
+       if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
+               return;
+
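+       /* atomic_add_unless() keeps the count from dropping below zero. */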
+       WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1, 0));
+}
+
+static void channels_on_cpu_inc(struct irq_info *info)
+{
+       if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
+               return;
+
+       if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1,
+                                           INT_MAX)))
+               return;
+
+       info->is_accounted = 1;
+}
+
 /* Constructors for packed IRQ information. */
 static int xen_irq_info_common_setup(struct irq_info *info,
                                     unsigned irq,
@@ -339,6 +369,7 @@ static void xen_irq_info_cleanup(struct irq_info *info)
 {
        set_evtchn_to_irq(info->evtchn, -1);
        info->evtchn = 0;
+       channels_on_cpu_dec(info);
 }
 
 /*
@@ -433,18 +464,25 @@ static bool pirq_needs_eoi_flag(unsigned irq)
        return info->u.pirq.flags & PIRQ_NEEDS_EOI;
 }
 
-static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu)
+static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu,
+                              bool force_affinity)
 {
        int irq = get_evtchn_to_irq(evtchn);
        struct irq_info *info = info_for_irq(irq);
 
        BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
-       cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
-#endif
+
+       if (IS_ENABLED(CONFIG_SMP) && force_affinity) {
+               cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
+               cpumask_copy(irq_get_effective_affinity_mask(irq),
+                            cpumask_of(cpu));
+       }
+
        xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);
 
+       channels_on_cpu_dec(info);
        info->cpu = cpu;
+       channels_on_cpu_inc(info);
 }
 
 /**
@@ -523,8 +561,10 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious)
                return;
 
        if (spurious) {
-               if ((1 << info->spurious_cnt) < (HZ << 2))
-                       info->spurious_cnt++;
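+               /* spurious_cnt is a u8 now: don't let it wrap around. */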
+               if ((1 << info->spurious_cnt) < (HZ << 2)) {
+                       if (info->spurious_cnt != 0xFF)
+                               info->spurious_cnt++;
+               }
                if (info->spurious_cnt > 1) {
                        delay = 1 << (info->spurious_cnt - 2);
                        if (delay > HZ)
@@ -615,11 +655,6 @@ static void xen_irq_init(unsigned irq)
 {
        struct irq_info *info;
 
-#ifdef CONFIG_SMP
-       /* By default all event channels notify CPU#0. */
-       cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
-#endif
-
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (info == NULL)
                panic("Unable to allocate metadata for IRQ%d\n", irq);
@@ -628,6 +663,11 @@ static void xen_irq_init(unsigned irq)
        info->refcnt = -1;
 
        set_info_for_irq(irq, info);
+       /*
+        * Interrupt affinity setting can be immediate. No point
+        * in delaying it until an interrupt is handled.
+        */
+       irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 
        INIT_LIST_HEAD(&info->eoi_list);
        list_add_tail(&info->list, &xen_irq_list_head);
@@ -739,18 +779,7 @@ static void eoi_pirq(struct irq_data *data)
        if (!VALID_EVTCHN(evtchn))
                return;
 
-       if (unlikely(irqd_is_setaffinity_pending(data)) &&
-           likely(!irqd_irq_disabled(data))) {
-               int masked = test_and_set_mask(evtchn);
-
-               clear_evtchn(evtchn);
-
-               irq_move_masked_irq(data);
-
-               if (!masked)
-                       unmask_evtchn(evtchn);
-       } else
-               clear_evtchn(evtchn);
+       clear_evtchn(evtchn);
 
        if (pirq_needs_eoi(data->irq)) {
                rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
@@ -794,7 +823,7 @@ static unsigned int __startup_pirq(unsigned int irq)
                goto err;
 
        info->evtchn = evtchn;
-       bind_evtchn_to_cpu(evtchn, 0);
+       bind_evtchn_to_cpu(evtchn, 0, false);
 
        rc = xen_evtchn_port_setup(evtchn);
        if (rc)
@@ -1113,8 +1142,14 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip)
                        irq = ret;
                        goto out;
                }
-               /* New interdomain events are bound to VCPU 0. */
-               bind_evtchn_to_cpu(evtchn, 0);
+               /*
+                * New interdomain events are initially bound to vCPU0. This
+                * is required to set up the event channel in the first
+                * place, and is also important for UP guests: the affinity
+                * setting is not invoked on them, so nothing else would
+                * bind the channel.
+                */
+               bind_evtchn_to_cpu(evtchn, 0, false);
        } else {
                struct irq_info *info = info_for_irq(irq);
                WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
@@ -1132,12 +1167,6 @@ int bind_evtchn_to_irq(evtchn_port_t evtchn)
 }
 EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
 
-int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
-{
-       return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip);
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
-
 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 {
        struct evtchn_bind_ipi bind_ipi;
@@ -1168,7 +1197,11 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
                        irq = ret;
                        goto out;
                }
-               bind_evtchn_to_cpu(evtchn, cpu);
+               /*
+                * Force the affinity mask to the target CPU so proc shows
+                * the correct target.
+                */
+               bind_evtchn_to_cpu(evtchn, cpu, true);
        } else {
                struct irq_info *info = info_for_irq(irq);
                WARN_ON(info == NULL || info->type != IRQT_IPI);
@@ -1281,7 +1314,11 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
                        goto out;
                }
 
-               bind_evtchn_to_cpu(evtchn, cpu);
+               /*
+                * Force the affinity mask for percpu interrupts so proc
+                * shows the correct target.
+                */
+               bind_evtchn_to_cpu(evtchn, cpu, percpu);
        } else {
                struct irq_info *info = info_for_irq(irq);
                WARN_ON(info == NULL || info->type != IRQT_VIRQ);
@@ -1646,9 +1683,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq)
 
        mutex_unlock(&irq_mapping_update_lock);
 
-        bind_evtchn_to_cpu(evtchn, info->cpu);
-       /* This will be deferred until interrupt is processed */
-       irq_set_affinity(irq, cpumask_of(info->cpu));
+       bind_evtchn_to_cpu(evtchn, info->cpu, false);
 
        /* Unmask the event channel. */
        enable_irq(irq);
@@ -1682,7 +1717,7 @@ static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu)
         * it, but don't do the xenlinux-level rebind in that case.
         */
        if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
-               bind_evtchn_to_cpu(evtchn, tcpu);
+               bind_evtchn_to_cpu(evtchn, tcpu, false);
 
        if (!masked)
                unmask_evtchn(evtchn);
@@ -1690,27 +1725,47 @@ static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu)
        return 0;
 }
 
+/*
+ * Find the CPU within the @dest mask which has the least number of channels
+ * assigned. This is not precise, as the per-CPU counts can be modified
+ * concurrently.
+ */
+static unsigned int select_target_cpu(const struct cpumask *dest)
+{
+       unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX;
+
+       for_each_cpu_and(cpu, dest, cpu_online_mask) {
+               unsigned int curch = atomic_read(&channels_on_cpu[cpu]);
+
+               if (curch < minch) {
+                       minch = curch;
+                       best_cpu = cpu;
+               }
+       }
+
+       /*
+        * Catch the unlikely case that dest contains no online CPUs. The
+        * recursion cannot go any further: cpu_online_mask is never empty.
+        */
+       if (best_cpu == UINT_MAX)
+               return select_target_cpu(cpu_online_mask);
+
+       return best_cpu;
+}
+
 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
                            bool force)
 {
-       unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
-       int ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
+       unsigned int tcpu = select_target_cpu(dest);
+       int ret;
 
+       ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
        if (!ret)
                irq_data_update_effective_affinity(data, cpumask_of(tcpu));
 
        return ret;
 }
 
-/* To be called with desc->lock held. */
-int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu)
-{
-       struct irq_data *d = irq_desc_get_irq_data(desc);
-
-       return set_affinity_irq(d, cpumask_of(tcpu), false);
-}
-EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn);
-
 static void enable_dynirq(struct irq_data *data)
 {
        evtchn_port_t evtchn = evtchn_from_irq(data->irq);
@@ -1734,18 +1789,7 @@ static void ack_dynirq(struct irq_data *data)
        if (!VALID_EVTCHN(evtchn))
                return;
 
-       if (unlikely(irqd_is_setaffinity_pending(data)) &&
-           likely(!irqd_irq_disabled(data))) {
-               int masked = test_and_set_mask(evtchn);
-
-               clear_evtchn(evtchn);
-
-               irq_move_masked_irq(data);
-
-               if (!masked)
-                       unmask_evtchn(evtchn);
-       } else
-               clear_evtchn(evtchn);
+       clear_evtchn(evtchn);
 }
 
 static void mask_ack_dynirq(struct irq_data *data)
@@ -1830,7 +1874,8 @@ static void restore_cpu_virqs(unsigned int cpu)
 
                /* Record the new mapping. */
                (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
-               bind_evtchn_to_cpu(evtchn, cpu);
+               /* The affinity mask is still valid */
+               bind_evtchn_to_cpu(evtchn, cpu, false);
        }
 }
 
@@ -1855,7 +1900,8 @@ static void restore_cpu_ipis(unsigned int cpu)
 
                /* Record the new mapping. */
                (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
-               bind_evtchn_to_cpu(evtchn, cpu);
+               /* The affinity mask is still valid */
+               bind_evtchn_to_cpu(evtchn, cpu, false);
        }
 }
 
@@ -1938,8 +1984,12 @@ void xen_irq_resume(void)
        xen_evtchn_resume();
 
        /* No IRQ <-> event-channel mappings. */
-       list_for_each_entry(info, &xen_irq_list_head, list)
-               info->evtchn = 0; /* zap event-channel binding */
+       list_for_each_entry(info, &xen_irq_list_head, list) {
+               /* Zap event-channel binding */
+               info->evtchn = 0;
+               /* Adjust accounting */
+               channels_on_cpu_dec(info);
+       }
 
        clear_evtchn_to_irq_all();
 
index 5dc016d..a7a8571 100644 (file)
@@ -421,36 +421,6 @@ static void evtchn_unbind_from_user(struct per_user_data *u,
        del_evtchn(u, evtchn);
 }
 
-static DEFINE_PER_CPU(int, bind_last_selected_cpu);
-
-static void evtchn_bind_interdom_next_vcpu(evtchn_port_t evtchn)
-{
-       unsigned int selected_cpu, irq;
-       struct irq_desc *desc;
-       unsigned long flags;
-
-       irq = irq_from_evtchn(evtchn);
-       desc = irq_to_desc(irq);
-
-       if (!desc)
-               return;
-
-       raw_spin_lock_irqsave(&desc->lock, flags);
-       selected_cpu = this_cpu_read(bind_last_selected_cpu);
-       selected_cpu = cpumask_next_and(selected_cpu,
-                       desc->irq_common_data.affinity, cpu_online_mask);
-
-       if (unlikely(selected_cpu >= nr_cpu_ids))
-               selected_cpu = cpumask_first_and(desc->irq_common_data.affinity,
-                               cpu_online_mask);
-
-       this_cpu_write(bind_last_selected_cpu, selected_cpu);
-
-       /* unmask expects irqs to be disabled */
-       xen_set_affinity_evtchn(desc, selected_cpu);
-       raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
 static long evtchn_ioctl(struct file *file,
                         unsigned int cmd, unsigned long arg)
 {
@@ -508,10 +478,8 @@ static long evtchn_ioctl(struct file *file,
                        break;
 
                rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
-               if (rc == 0) {
+               if (rc == 0)
                        rc = bind_interdomain.local_port;
-                       evtchn_bind_interdom_next_vcpu(rc);
-               }
                break;
        }
 
index 9e56ee1..9293045 100644 (file)
@@ -1,9 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- *  linux/fs/block_dev.c
- *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright (C) 2016 - 2020 Christoph Hellwig
  */
 
 #include <linux/init.h>
index ea04858..97e81a8 100644 (file)
@@ -793,10 +793,17 @@ static inline bool fast_dput(struct dentry *dentry)
         * a reference to the dentry and change that, but
         * our work is done - we can leave the dentry
         * around with a zero refcount.
+        *
+        * Nevertheless, there are two cases in which we should kill
+        * the dentry anyway:
+        * 1. free disconnected dentries as soon as their refcount
+        *    reaches zero.
+        * 2. free dentries if they should not be cached.
         */
        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);
-       d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
+       d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+                       DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
        /* Nothing to do? Dropping the reference was all we needed? */
        if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
index 1d640b1..f45f9fe 100644 (file)
@@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t start, tmp;
 
-       J_ASSERT_BH(bh, buffer_locked(bh));
+       ASSERT(buffer_locked(bh));
 
        /* If checksum is bad mark all blocks used to prevent allocation
         * essentially implementing a per-group read-only flag. */
index 8e6ca23..4666b55 100644 (file)
@@ -176,12 +176,10 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
                        err = add_system_zone(system_blks, map.m_pblk, n, ino);
                        if (err < 0) {
                                if (err == -EFSCORRUPTED) {
-                                       __ext4_error(sb, __func__, __LINE__,
-                                                    -err, map.m_pblk,
-                                                    "blocks %llu-%llu from inode %u overlap system zone",
-                                                    map.m_pblk,
-                                                    map.m_pblk + map.m_len - 1,
-                                                    ino);
+                                       EXT4_ERROR_INODE_ERR(inode, -err,
+                                               "blocks %llu-%llu from inode overlap system zone",
+                                               map.m_pblk,
+                                               map.m_pblk + map.m_len - 1);
                                }
                                break;
                        }
@@ -206,7 +204,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
  *
  * The update of system_blks pointer in this function is protected by
  * sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
  * protected only by RCU. That's why we first build the rbtree and then
  * swap it in place.
  */
@@ -258,7 +256,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 
        /*
         * System blks rbtree complete, announce it once to prevent racing
-        * with ext4_data_block_valid() accessing the rbtree at the same
+        * with ext4_inode_block_valid() accessing the rbtree at the same
         * time.
         */
        rcu_assign_pointer(sbi->s_system_blks, system_blks);
@@ -278,7 +276,7 @@ err:
  *
  * The update of system_blks pointer in this function is protected by
  * sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
  * protected only by RCU. So we first clear the system_blks pointer and
  * then free the rbtree only after RCU grace period expires.
  */
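
The build-then-swap discipline these comments describe is the usual RCU
publication pattern: construct the replacement structure completely, publish
it with a single rcu_assign_pointer(), and free the old copy only after a
grace period. A minimal sketch of the idea (illustrative names; ext4 itself
frees the old tree from an rcu_head callback rather than with the
synchronize_rcu() used here):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct zone_tree;		/* stands in for the system_blks rbtree */
	static struct zone_tree __rcu *active_tree;

	/* Writer side, serialized externally (for ext4: by sb->s_umount). */
	static void swap_in_tree(struct zone_tree *new_tree)
	{
		struct zone_tree *old;

		old = rcu_dereference_protected(active_tree, 1);
		rcu_assign_pointer(active_tree, new_tree); /* readers flip over */
		synchronize_rcu();	/* wait for readers still using 'old' */
		kfree(old);		/* nothing can reach it any more */
	}

Readers only touch the tree between rcu_read_lock() and rcu_read_unlock(),
which is why the comments stress building the whole rbtree before the swap.
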
index c64ea8f..2866d24 100644 (file)
 #define ext_debug(ino, fmt, ...)       no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+#define ASSERT(assert)                                         \
+do {                                                                   \
+       if (unlikely(!(assert))) {                                      \
+               printk(KERN_EMERG                                       \
+                      "Assertion failure in %s() at %s:%d: '%s'\n",    \
+                      __func__, __FILE__, __LINE__, #assert);          \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -1619,6 +1629,27 @@ struct ext4_sb_info {
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;
 
+       /* Information about errors that happened during this mount */
+       spinlock_t s_error_lock;
+       int s_add_error_count;
+       int s_first_error_code;
+       __u32 s_first_error_line;
+       __u32 s_first_error_ino;
+       __u64 s_first_error_block;
+       const char *s_first_error_func;
+       time64_t s_first_error_time;
+       int s_last_error_code;
+       __u32 s_last_error_line;
+       __u32 s_last_error_ino;
+       __u64 s_last_error_block;
+       const char *s_last_error_func;
+       time64_t s_last_error_time;
+       /*
+        * If we are in a context where we cannot update error information in
+        * the on-disk superblock, we queue this work to do it.
+        */
+       struct work_struct s_error_work;
+
        /* Ext4 fast commit stuff */
        atomic_t s_fc_subtid;
        atomic_t s_fc_ineligible_updates;
@@ -1858,7 +1889,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
 #define EXT4_GOOD_OLD_REV      0       /* The good old (original) format */
 #define EXT4_DYNAMIC_REV       1       /* V2 format w/ dynamic inode sizes */
 
-#define EXT4_CURRENT_REV       EXT4_GOOD_OLD_REV
 #define EXT4_MAX_SUPP_REV      EXT4_DYNAMIC_REV
 
 #define EXT4_GOOD_OLD_INODE_SIZE 128
@@ -2952,9 +2982,9 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
 
-extern __printf(6, 7)
-void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
-                 const char *, ...);
+extern __printf(7, 8)
+void __ext4_error(struct super_block *, const char *, unsigned int, bool,
+                 int, __u64, const char *, ...);
 extern __printf(6, 7)
 void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
@@ -2963,9 +2993,6 @@ void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
-extern __printf(5, 6)
-void __ext4_abort(struct super_block *, const char *, unsigned int, int,
-                 const char *, ...);
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
@@ -2995,6 +3022,9 @@ void __ext4_grp_locked_error(const char *, unsigned int,
 #define EXT4_ERROR_FILE(file, block, fmt, a...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
+#define ext4_abort(sb, err, fmt, a...)                                 \
+       __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)
+
 #ifdef CONFIG_PRINTK
 
 #define ext4_error_inode(inode, func, line, block, fmt, ...)           \
@@ -3005,11 +3035,11 @@ void __ext4_grp_locked_error(const char *, unsigned int,
 #define ext4_error_file(file, func, line, block, fmt, ...)             \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
 #define ext4_error(sb, fmt, ...)                                       \
-       __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
+       __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt),      \
+               ##__VA_ARGS__)
 #define ext4_error_err(sb, err, fmt, ...)                              \
-       __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
-#define ext4_abort(sb, err, fmt, ...)                                  \
-       __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
+       __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt),  \
+               ##__VA_ARGS__)
 #define ext4_warning(sb, fmt, ...)                                     \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
 #define ext4_warning_inode(inode, fmt, ...)                            \
@@ -3042,17 +3072,12 @@ do {                                                                    \
 #define ext4_error(sb, fmt, ...)                                       \
 do {                                                                   \
        no_printk(fmt, ##__VA_ARGS__);                                  \
-       __ext4_error(sb, "", 0, 0, 0, " ");                             \
+       __ext4_error(sb, "", 0, false, 0, 0, " ");                      \
 } while (0)
 #define ext4_error_err(sb, err, fmt, ...)                              \
 do {                                                                   \
        no_printk(fmt, ##__VA_ARGS__);                                  \
-       __ext4_error(sb, "", 0, err, 0, " ");                           \
-} while (0)
-#define ext4_abort(sb, err, fmt, ...)                                  \
-do {                                                                   \
-       no_printk(fmt, ##__VA_ARGS__);                                  \
-       __ext4_abort(sb, "", 0, err, " ");                              \
+       __ext4_error(sb, "", 0, false, err, 0, " ");                    \
 } while (0)
 #define ext4_warning(sb, fmt, ...)                                     \
 do {                                                                   \
@@ -3361,6 +3386,21 @@ static inline void ext4_unlock_group(struct super_block *sb,
        spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
+#ifdef CONFIG_QUOTA
+static inline bool ext4_quota_capable(struct super_block *sb)
+{
+       return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
+}
+
+static inline bool ext4_is_quota_journalled(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       return (ext4_has_feature_quota(sb) ||
+               sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
+}
+#endif
+
 /*
  * Block validity checking
  */
@@ -3609,7 +3649,6 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
-                              struct writeback_control *wbc,
                               bool keep_towrite);
 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
index 0fd0c42..1a0a827 100644 (file)
@@ -296,8 +296,8 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
        if (err) {
                ext4_journal_abort_handle(where, line, __func__,
                                          bh, handle, err);
-               __ext4_abort(inode->i_sb, where, line, -err,
-                          "error %d when attempting revoke", err);
+               __ext4_error(inode->i_sb, where, line, true, -err, 0,
+                            "error %d when attempting revoke", err);
        }
        BUFFER_TRACE(bh, "exit");
        return err;
index 00dc668..a124c68 100644 (file)
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
  * allocated so we need to update only data block */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-               ext4_has_feature_quota(sb)) ? 1 : 0)
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-               ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)
 
-#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-               ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
 #else
index 17d7096..3960b7e 100644 (file)
@@ -5815,8 +5815,8 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
        int ret;
 
        path = ext4_find_extent(inode, start, NULL, 0);
-       if (!path)
-               return -EINVAL;
+       if (IS_ERR(path))
+               return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                ret = -EFSCORRUPTED;
@@ -5988,7 +5988,6 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
                        kfree(path);
                        break;
                }
-               ex = path2[path2->p_depth].p_ext;
                for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
                        cmp1 = cmp2 = 0;
                        if (i <= path->p_depth)
index f2033e1..4fcc21c 100644 (file)
  *
  * Replay code should thus check for all the valid tails in the FC area.
  *
+ * Fast Commit Replay Idempotence
+ * ------------------------------
+ *
+ * Fast commit tags are idempotent in nature, provided the recovery code
+ * follows certain rules. The guiding principle is that, while committing, the
+ * commit path stores the result of a particular operation rather than the
+ * procedure itself.
+ *
+ * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
+ * was associated with inode 10. During fast commit, instead of storing this
+ * operation as a procedure "rename a to b", we store the resulting file system
+ * state as a "series" of outcomes:
+ *
+ * - Link dirent b to inode 10
+ * - Unlink dirent a
+ * - Inode <10> with valid refcount
+ *
+ * Now when the recovery code runs, it needs to "enforce" this state on the
+ * file system. This is what guarantees idempotence of fast commit replay.
+ *
+ * Let's take an example of a procedure that is not idempotent and see how fast
+ * commits make it idempotent. Consider the following sequence of operations:
+ *
+ *     rm A;    mv B A;    read A
+ *  (x)     (y)        (z)
+ *
+ * (x), (y) and (z) are the points at which we can crash. If we store this
+ * sequence of operations as is, then the replay is not idempotent. Let's say
+ * that during replay we crash at (z). During the second replay, file A (which
+ * was actually created as a result of the "mv B A" operation) would get
+ * deleted. Thus, the file named A would be absent when we try to read A. So,
+ * this sequence of operations is not idempotent. However, as mentioned above,
+ * instead of storing the procedure, fast commits store the outcome of each
+ * procedure. Thus the fast commit log for the above procedure would be as
+ * follows:
+ *
+ * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
+ * inode 11 before the replay)
+ *
+ *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
+ * (w)          (x)                    (y)          (z)
+ *
+ * If we crash at (z), we will have file A linked to inode 11. During the
+ * second replay, we will remove file A (inode 11), but then recreate it and
+ * make it point to inode 11. We won't find B, so we'll just skip that step.
+ * At this point, the refcount for inode 11 is not reliable, but that gets
+ * fixed by the replay of the last inode 11 tag. Crashes at points (w), (x)
+ * and (y) are handled similarly. Thus, by converting a non-idempotent
+ * procedure into a series of idempotent outcomes, fast commits ensure
+ * idempotence during the replay.
+ *
  * TODOs
  * -----
+ *
+ * 0) Fast commit replay path hardening: Fast commit replay code should use
+ *    journal handles to make sure all the updates it does during the replay
+ *    path are atomic. With that, if we crash during fast commit replay and
+ *    then retry recovery, we will find a file system where the fast commit
+ *    area is invalid (because a new full commit would be found). In order to
+ *    deal with that, fast commit replay code should ensure that the
+ *    "FC_REPLAY" superblock state is persisted before starting the replay, so
+ *    that after the crash, fast commit recovery code can look at that flag
+ *    and perform fast commit recovery even if that area is invalidated by
+ *    later full commits.
+ *
  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
  *    eligible update must be protected within ext4_fc_start_update() and
  *    ext4_fc_stop_update(). These routines are called at much higher
@@ -1220,18 +1281,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
 
 /* Ext4 Replay Path Routines */
 
-/* Get length of a particular tlv */
-static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
-{
-       return le16_to_cpu(tl->fc_len);
-}
-
-/* Get a pointer to "value" of a tlv */
-static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
-{
-       return (u8 *)tl + sizeof(*tl);
-}
-
 /* Helper struct for dentry replay routines */
 struct dentry_info_args {
        int parent_ino, dname_len, ino, inode_len;
@@ -1770,32 +1819,6 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
        return 0;
 }
 
-static inline const char *tag2str(u16 tag)
-{
-       switch (tag) {
-       case EXT4_FC_TAG_LINK:
-               return "TAG_ADD_ENTRY";
-       case EXT4_FC_TAG_UNLINK:
-               return "TAG_DEL_ENTRY";
-       case EXT4_FC_TAG_ADD_RANGE:
-               return "TAG_ADD_RANGE";
-       case EXT4_FC_TAG_CREAT:
-               return "TAG_CREAT_DENTRY";
-       case EXT4_FC_TAG_DEL_RANGE:
-               return "TAG_DEL_RANGE";
-       case EXT4_FC_TAG_INODE:
-               return "TAG_INODE";
-       case EXT4_FC_TAG_PAD:
-               return "TAG_PAD";
-       case EXT4_FC_TAG_TAIL:
-               return "TAG_TAIL";
-       case EXT4_FC_TAG_HEAD:
-               return "TAG_HEAD";
-       default:
-               return "TAG_ERROR";
-       }
-}
-
 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
 {
        struct ext4_fc_replay_state *state;
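
The idempotence argument in the comment block above can be checked
mechanically. Below is a stand-alone user-space toy (purely illustrative, not
ext4 code): each handler enforces an outcome rather than replaying a
procedure, so running the same log twice, as if the first replay had crashed
at point (z), ends in the same state. The [Inode 11] refcount-fixup tag is
omitted for brevity.

	#include <stdio.h>

	struct dirent_state { const char *name; int ino; int present; };

	/* "Link name to inode": enforce presence, whatever ran before. */
	static void replay_link(struct dirent_state *d, int ino)
	{
		d->ino = ino;
		d->present = 1;
	}

	/* "Unlink name": enforce absence; a second run is a no-op. */
	static void replay_unlink(struct dirent_state *d)
	{
		d->present = 0;
	}

	int main(void)
	{
		struct dirent_state a = { "A", 10, 1 }, b = { "B", 11, 1 };
		int pass;

		/* Replay [Unlink A][Link A -> 11][Unlink B] twice. */
		for (pass = 0; pass < 2; pass++) {
			replay_unlink(&a);
			replay_link(&a, 11);
			replay_unlink(&b);
		}
		/* Both passes end identically: A present on inode 11, B gone. */
		printf("A: present=%d ino=%d; B: present=%d\n",
		       a.present, a.ino, b.present);
		return 0;
	}
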
index 3a6e5a1..b77f70f 100644 (file)
@@ -3,6 +3,11 @@
 #ifndef __FAST_COMMIT_H__
 #define __FAST_COMMIT_H__
 
+/*
+ * Note that this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and
+ * in linux/fs/ext4/fast_commit.h. These files should always be byte
+ * identical.
+ */
+
 /* Fast commit tags */
 #define EXT4_FC_TAG_ADD_RANGE          0x0001
 #define EXT4_FC_TAG_DEL_RANGE          0x0002
@@ -50,7 +55,7 @@ struct ext4_fc_del_range {
 struct ext4_fc_dentry_info {
        __le32 fc_parent_ino;
        __le32 fc_ino;
-       u8 fc_dname[0];
+       __u8 fc_dname[0];
 };
 
 /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
@@ -65,19 +70,6 @@ struct ext4_fc_tail {
        __le32 fc_crc;
 };
 
-/*
- * In memory list of dentry updates that are performed on the file
- * system used by fast commit code.
- */
-struct ext4_fc_dentry_update {
-       int fcd_op;             /* Type of update create / unlink / link */
-       int fcd_parent;         /* Parent inode number */
-       int fcd_ino;            /* Inode number */
-       struct qstr fcd_name;   /* Dirent name */
-       unsigned char fcd_iname[DNAME_INLINE_LEN];      /* Dirent name string */
-       struct list_head fcd_list;
-};
-
 /*
  * Fast commit reason codes
  */
@@ -107,6 +99,20 @@ enum {
        EXT4_FC_REASON_MAX
 };
 
+#ifdef __KERNEL__
+/*
+ * In memory list of dentry updates that are performed on the file
+ * system used by fast commit code.
+ */
+struct ext4_fc_dentry_update {
+       int fcd_op;             /* Type of update create / unlink / link */
+       int fcd_parent;         /* Parent inode number */
+       int fcd_ino;            /* Inode number */
+       struct qstr fcd_name;   /* Dirent name */
+       unsigned char fcd_iname[DNAME_INLINE_LEN];      /* Dirent name string */
+       struct list_head fcd_list;
+};
+
 struct ext4_fc_stats {
        unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
        unsigned long fc_num_commits;
@@ -145,13 +151,51 @@ struct ext4_fc_replay_state {
 };
 
 #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
+#endif
 
 #define fc_for_each_tl(__start, __end, __tl)                           \
-       for (tl = (struct ext4_fc_tl *)start;                           \
-               (u8 *)tl < (u8 *)end;                                   \
-               tl = (struct ext4_fc_tl *)((u8 *)tl +                   \
+       for (tl = (struct ext4_fc_tl *)(__start);                       \
+            (__u8 *)tl < (__u8 *)(__end);                              \
+               tl = (struct ext4_fc_tl *)((__u8 *)tl +                 \
                                        sizeof(struct ext4_fc_tl) +     \
                                        + le16_to_cpu(tl->fc_len)))
 
+static inline const char *tag2str(__u16 tag)
+{
+       switch (tag) {
+       case EXT4_FC_TAG_LINK:
+               return "ADD_ENTRY";
+       case EXT4_FC_TAG_UNLINK:
+               return "DEL_ENTRY";
+       case EXT4_FC_TAG_ADD_RANGE:
+               return "ADD_RANGE";
+       case EXT4_FC_TAG_CREAT:
+               return "CREAT_DENTRY";
+       case EXT4_FC_TAG_DEL_RANGE:
+               return "DEL_RANGE";
+       case EXT4_FC_TAG_INODE:
+               return "INODE";
+       case EXT4_FC_TAG_PAD:
+               return "PAD";
+       case EXT4_FC_TAG_TAIL:
+               return "TAIL";
+       case EXT4_FC_TAG_HEAD:
+               return "HEAD";
+       default:
+               return "ERROR";
+       }
+}
+
+/* Get length of a particular tlv */
+static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
+{
+       return le16_to_cpu(tl->fc_len);
+}
+
+/* Get a pointer to "value" of a tlv */
+static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
+{
+       return (__u8 *)tl + sizeof(*tl);
+}
 
 #endif /* __FAST_COMMIT_H__ */
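
For reference, the iteration macro and the tag helpers above combine as
follows when scanning a fast-commit area. This is a sketch assuming kernel
context with this header included, and with the area bounds supplied by the
caller; note that the macro body hard-codes the iterator name 'tl', so the
caller's variable must use exactly that name.

	/* Sketch: log every TLV found in a fast-commit area. */
	static void walk_fc_tags(__u8 *area, __u8 *area_end)
	{
		struct ext4_fc_tl *tl;	/* must be named 'tl', see macro body */

		fc_for_each_tl(area, area_end, tl)
			pr_info("fc tag %s, value length %d\n",
				tag2str(le16_to_cpu(tl->fc_tag)),
				ext4_fc_tag_len(tl));
	}
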
index a42ca95..113bfb0 100644 (file)
@@ -136,7 +136,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;
 
-       J_ASSERT(ext4_journal_current_handle() == NULL);
+       ASSERT(ext4_journal_current_handle() == NULL);
 
        trace_ext4_sync_file_enter(file, datasync);
 
index 05efa68..1223a18 100644 (file)
@@ -534,8 +534,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
        ext4_fsblk_t first_block = 0;
 
        trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
-       J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
-       J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+       ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+       ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
                                   &blocks_to_boundary);
 
index 0d8385a..2794688 100644 (file)
@@ -175,6 +175,7 @@ void ext4_evict_inode(struct inode *inode)
         */
        int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
+       bool freeze_protected = false;
 
        trace_ext4_evict_inode(inode);
 
@@ -232,9 +233,14 @@ void ext4_evict_inode(struct inode *inode)
 
        /*
         * Protect us against freezing - iput() caller didn't have to have any
-        * protection against it
+        * protection against it. When we are in a running transaction though,
+        * we are already protected against freezing and we cannot grab further
+        * protection due to lock ordering constraints.
         */
-       sb_start_intwrite(inode->i_sb);
+       if (!ext4_journal_current_handle()) {
+               sb_start_intwrite(inode->i_sb);
+               freeze_protected = true;
+       }
 
        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
@@ -253,7 +259,8 @@ void ext4_evict_inode(struct inode *inode)
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
-               sb_end_intwrite(inode->i_sb);
+               if (freeze_protected)
+                       sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }
 
@@ -294,7 +301,8 @@ void ext4_evict_inode(struct inode *inode)
 stop_handle:
                ext4_journal_stop(handle);
                ext4_orphan_del(NULL, inode);
-               sb_end_intwrite(inode->i_sb);
+               if (freeze_protected)
+                       sb_end_intwrite(inode->i_sb);
                ext4_xattr_inode_array_free(ea_inode_array);
                goto no_delete;
        }
@@ -323,7 +331,8 @@ stop_handle:
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
-       sb_end_intwrite(inode->i_sb);
+       if (freeze_protected)
+               sb_end_intwrite(inode->i_sb);
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
 no_delete:
@@ -830,8 +839,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        int err;
 
-       J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
-                || handle != NULL || create == 0);
+       ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                   || handle != NULL || create == 0);
 
        map.m_lblk = block;
        map.m_len = 1;
@@ -846,9 +855,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);
        if (map.m_flags & EXT4_MAP_NEW) {
-               J_ASSERT(create != 0);
-               J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
-                        || (handle != NULL));
+               ASSERT(create != 0);
+               ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                           || (handle != NULL));
 
                /*
                 * Now that we do not always journal data, we should
@@ -2055,7 +2064,7 @@ static int ext4_writepage(struct page *page,
                unlock_page(page);
                return -ENOMEM;
        }
-       ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+       ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
        ext4_io_submit(&io_submit);
        /* Drop io_end reference we got from init */
        ext4_put_io_end_defer(io_submit.io_end);
@@ -2089,7 +2098,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
                len = size & ~PAGE_MASK;
        else
                len = PAGE_SIZE;
-       err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+       err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
        if (!err)
                mpd->wbc->nr_to_write--;
        mpd->first_page++;
@@ -4610,7 +4619,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
            (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
                if (flags & EXT4_IGET_HANDLE)
                        return ERR_PTR(-ESTALE);
-               __ext4_error(sb, function, line, EFSCORRUPTED, 0,
+               __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
                             "inode #%lu: comm %s: iget: illegal inode #",
                             ino, current->comm);
                return ERR_PTR(-EFSCORRUPTED);
index 24af9ed..99bf091 100644 (file)
@@ -822,24 +822,6 @@ void ext4_mb_generate_buddy(struct super_block *sb,
        spin_unlock(&sbi->s_bal_lock);
 }
 
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
-{
-       int count;
-       int order = 1;
-       void *buddy;
-
-       while ((buddy = mb_find_buddy(e4b, order++, &count))) {
-               ext4_set_bits(buddy, 0, count);
-       }
-       e4b->bd_info->bb_fragments = 0;
-       memset(e4b->bd_info->bb_counters, 0,
-               sizeof(*e4b->bd_info->bb_counters) *
-               (e4b->bd_sb->s_blocksize_bits + 2));
-
-       ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
-               e4b->bd_bitmap, e4b->bd_group);
-}
-
 /* The buddy information is attached the buddy cache inode
  * for convenience. The information regarding each group
  * is loaded via ext4_mb_load_buddy. The information involve
@@ -1307,22 +1289,18 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 
 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 {
-       int order = 1;
-       int bb_incr = 1 << (e4b->bd_blkbits - 1);
+       int order = 1, max;
        void *bb;
 
        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
 
-       bb = e4b->bd_buddy;
        while (order <= e4b->bd_blkbits + 1) {
-               block = block >> 1;
-               if (!mb_test_bit(block, bb)) {
+               bb = mb_find_buddy(e4b, order, &max);
+               if (!mb_test_bit(block >> order, bb)) {
                        /* this block is part of buddy of order 'order' */
                        return order;
                }
-               bb += bb_incr;
-               bb_incr >>= 1;
                order++;
        }
        return 0;
@@ -1512,7 +1490,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                                sb, e4b->bd_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                }
-               mb_regenerate_buddy(e4b);
                goto done;
        }
 
@@ -2395,9 +2372,9 @@ repeat:
 
                                nr = sbi->s_mb_prefetch;
                                if (ext4_has_feature_flex_bg(sb)) {
-                                       nr = (group / sbi->s_mb_prefetch) *
-                                               sbi->s_mb_prefetch;
-                                       nr = nr + sbi->s_mb_prefetch - group;
+                                       nr = 1 << sbi->s_log_groups_per_flex;
+                                       nr -= group & (nr - 1);
+                                       nr = min(nr, sbi->s_mb_prefetch);
                                }
                                prefetch_grp = ext4_mb_prefetch(sb, group,
                                                        nr, &prefetch_ios);
@@ -2733,7 +2710,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
 
        if (ext4_has_feature_flex_bg(sb)) {
                /* a single flex group is supposed to be read by a single IO */
-               sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+               sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
+                       BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
                sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
        } else {
                sbi->s_mb_prefetch = 32;
@@ -5126,6 +5104,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                                ext4_group_first_block_no(sb, group) +
                                EXT4_C2B(sbi, cluster),
                                "Block already on to-be-freed list");
+                       kmem_cache_free(ext4_free_data_cachep, new_entry);
                        return 0;
                }
        }
index 326fe40..b17a082 100644 (file)
@@ -182,10 +182,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
        return bh;
 }
 
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -843,7 +839,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
                                        break;
                                }
                        }
-                       assert (at == p - 1);
+                       ASSERT(at == p - 1);
                }
 
                at = p - 1;
@@ -1259,8 +1255,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
        struct dx_entry *old = frame->at, *new = old + 1;
        int count = dx_get_count(entries);
 
-       assert(count < dx_get_limit(entries));
-       assert(old < entries + count);
+       ASSERT(count < dx_get_limit(entries));
+       ASSERT(old < entries + count);
        memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
        dx_set_hash(new, hash);
        dx_set_block(new, block);
@@ -2959,7 +2955,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
         * hold i_mutex, or the inode can not be referenced from outside,
         * so i_nlink should not be bumped due to race
         */
-       J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+       ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
index defd2e1..03a44a0 100644 (file)
@@ -111,9 +111,6 @@ static void ext4_finish_bio(struct bio *bio)
                unsigned under_io = 0;
                unsigned long flags;
 
-               if (!page)
-                       continue;
-
                if (fscrypt_is_bounce_page(page)) {
                        bounce_page = page;
                        page = fscrypt_pagecache_page(bounce_page);
@@ -438,7 +435,6 @@ submit_and_retry:
 int ext4_bio_write_page(struct ext4_io_submit *io,
                        struct page *page,
                        int len,
-                       struct writeback_control *wbc,
                        bool keep_towrite)
 {
        struct page *bounce_page = NULL;
@@ -448,6 +444,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
        int ret = 0;
        int nr_submitted = 0;
        int nr_to_submit = 0;
+       struct writeback_control *wbc = io->io_wbc;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));
index 830c196..2112178 100644 (file)
@@ -404,10 +404,8 @@ void ext4_itable_unused_set(struct super_block *sb,
                bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 }
 
-static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
+static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
 {
-       time64_t now = ktime_get_real_seconds();
-
        now = clamp_val(now, 0, (1ull << 40) - 1);
 
        *lo = cpu_to_le32(lower_32_bits(now));
@@ -419,108 +417,11 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
        return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 }
 #define ext4_update_tstamp(es, tstamp) \
-       __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+       __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
+                            ktime_get_real_seconds())
 #define ext4_get_tstamp(es, tstamp) \
        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 
-static void __save_error_info(struct super_block *sb, int error,
-                             __u32 ino, __u64 block,
-                             const char *func, unsigned int line)
-{
-       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-       int err;
-
-       EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-       if (bdev_read_only(sb->s_bdev))
-               return;
-       es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-       ext4_update_tstamp(es, s_last_error_time);
-       strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
-       es->s_last_error_line = cpu_to_le32(line);
-       es->s_last_error_ino = cpu_to_le32(ino);
-       es->s_last_error_block = cpu_to_le64(block);
-       switch (error) {
-       case EIO:
-               err = EXT4_ERR_EIO;
-               break;
-       case ENOMEM:
-               err = EXT4_ERR_ENOMEM;
-               break;
-       case EFSBADCRC:
-               err = EXT4_ERR_EFSBADCRC;
-               break;
-       case 0:
-       case EFSCORRUPTED:
-               err = EXT4_ERR_EFSCORRUPTED;
-               break;
-       case ENOSPC:
-               err = EXT4_ERR_ENOSPC;
-               break;
-       case ENOKEY:
-               err = EXT4_ERR_ENOKEY;
-               break;
-       case EROFS:
-               err = EXT4_ERR_EROFS;
-               break;
-       case EFBIG:
-               err = EXT4_ERR_EFBIG;
-               break;
-       case EEXIST:
-               err = EXT4_ERR_EEXIST;
-               break;
-       case ERANGE:
-               err = EXT4_ERR_ERANGE;
-               break;
-       case EOVERFLOW:
-               err = EXT4_ERR_EOVERFLOW;
-               break;
-       case EBUSY:
-               err = EXT4_ERR_EBUSY;
-               break;
-       case ENOTDIR:
-               err = EXT4_ERR_ENOTDIR;
-               break;
-       case ENOTEMPTY:
-               err = EXT4_ERR_ENOTEMPTY;
-               break;
-       case ESHUTDOWN:
-               err = EXT4_ERR_ESHUTDOWN;
-               break;
-       case EFAULT:
-               err = EXT4_ERR_EFAULT;
-               break;
-       default:
-               err = EXT4_ERR_UNKNOWN;
-       }
-       es->s_last_error_errcode = err;
-       if (!es->s_first_error_time) {
-               es->s_first_error_time = es->s_last_error_time;
-               es->s_first_error_time_hi = es->s_last_error_time_hi;
-               strncpy(es->s_first_error_func, func,
-                       sizeof(es->s_first_error_func));
-               es->s_first_error_line = cpu_to_le32(line);
-               es->s_first_error_ino = es->s_last_error_ino;
-               es->s_first_error_block = es->s_last_error_block;
-               es->s_first_error_errcode = es->s_last_error_errcode;
-       }
-       /*
-        * Start the daily error reporting function if it hasn't been
-        * started already
-        */
-       if (!es->s_error_count)
-               mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
-       le32_add_cpu(&es->s_error_count, 1);
-}
-
-static void save_error_info(struct super_block *sb, int error,
-                           __u32 ino, __u64 block,
-                           const char *func, unsigned int line)
-{
-       __save_error_info(sb, error, ino, block, func, line);
-       if (!bdev_read_only(sb->s_bdev))
-               ext4_commit_super(sb, 1);
-}
-
 /*
  * The del_gendisk() function uninitializes the disk-specific data
  * structures, including the bdi structure, without telling anyone
@@ -649,6 +550,83 @@ static bool system_going_down(void)
                || system_state == SYSTEM_RESTART;
 }
 
+struct ext4_err_translation {
+       int code;
+       int errno;
+};
+
+#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
+
+static struct ext4_err_translation err_translation[] = {
+       EXT4_ERR_TRANSLATE(EIO),
+       EXT4_ERR_TRANSLATE(ENOMEM),
+       EXT4_ERR_TRANSLATE(EFSBADCRC),
+       EXT4_ERR_TRANSLATE(EFSCORRUPTED),
+       EXT4_ERR_TRANSLATE(ENOSPC),
+       EXT4_ERR_TRANSLATE(ENOKEY),
+       EXT4_ERR_TRANSLATE(EROFS),
+       EXT4_ERR_TRANSLATE(EFBIG),
+       EXT4_ERR_TRANSLATE(EEXIST),
+       EXT4_ERR_TRANSLATE(ERANGE),
+       EXT4_ERR_TRANSLATE(EOVERFLOW),
+       EXT4_ERR_TRANSLATE(EBUSY),
+       EXT4_ERR_TRANSLATE(ENOTDIR),
+       EXT4_ERR_TRANSLATE(ENOTEMPTY),
+       EXT4_ERR_TRANSLATE(ESHUTDOWN),
+       EXT4_ERR_TRANSLATE(EFAULT),
+};
+
+static int ext4_errno_to_code(int errno)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(err_translation); i++)
+               if (err_translation[i].errno == errno)
+                       return err_translation[i].code;
+       return EXT4_ERR_UNKNOWN;
+}
+
+static void __save_error_info(struct super_block *sb, int error,
+                             __u32 ino, __u64 block,
+                             const char *func, unsigned int line)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+       if (bdev_read_only(sb->s_bdev))
+               return;
+       /* We default to EFSCORRUPTED error... */
+       if (error == 0)
+               error = EFSCORRUPTED;
+
+       spin_lock(&sbi->s_error_lock);
+       sbi->s_add_error_count++;
+       sbi->s_last_error_code = error;
+       sbi->s_last_error_line = line;
+       sbi->s_last_error_ino = ino;
+       sbi->s_last_error_block = block;
+       sbi->s_last_error_func = func;
+       sbi->s_last_error_time = ktime_get_real_seconds();
+       if (!sbi->s_first_error_time) {
+               sbi->s_first_error_code = error;
+               sbi->s_first_error_line = line;
+               sbi->s_first_error_ino = ino;
+               sbi->s_first_error_block = block;
+               sbi->s_first_error_func = func;
+               sbi->s_first_error_time = sbi->s_last_error_time;
+       }
+       spin_unlock(&sbi->s_error_lock);
+}
+
+static void save_error_info(struct super_block *sb, int error,
+                           __u32 ino, __u64 block,
+                           const char *func, unsigned int line)
+{
+       __save_error_info(sb, error, ino, block, func, line);
+       if (!bdev_read_only(sb->s_bdev))
+               ext4_commit_super(sb, 1);
+}
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -662,40 +640,50 @@ static bool system_going_down(void)
  * We'll just use the jbd2_journal_abort() error code to record an error in
  * the journal instead.  On recovery, the journal will complain about
  * that error until we've noted it down and cleared it.
+ *
+ * If force_ro is set, we unconditionally force the filesystem into an
+ * ABORT|READONLY state, unless the error response on the fs has been set to
+ * panic, in which case we take the easy way out and panic immediately. This is
+ * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
+ * at a critical moment in log management.
  */
-
-static void ext4_handle_error(struct super_block *sb)
+static void ext4_handle_error(struct super_block *sb, bool force_ro)
 {
+       journal_t *journal = EXT4_SB(sb)->s_journal;
+
        if (test_opt(sb, WARN_ON_ERROR))
                WARN_ON_ONCE(1);
 
-       if (sb_rdonly(sb))
+       if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT)))
                return;
 
-       if (!test_opt(sb, ERRORS_CONT)) {
-               journal_t *journal = EXT4_SB(sb)->s_journal;
-
-               ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
-               if (journal)
-                       jbd2_journal_abort(journal, -EIO);
-       }
+       ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
+       if (journal)
+               jbd2_journal_abort(journal, -EIO);
        /*
         * We force ERRORS_RO behavior when system is rebooting. Otherwise we
         * could panic during 'reboot -f' as the underlying device got already
         * disabled.
         */
-       if (test_opt(sb, ERRORS_RO) || system_going_down()) {
-               ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-               /*
-                * Make sure updated value of ->s_mount_flags will be visible
-                * before ->s_flags update
-                */
-               smp_wmb();
-               sb->s_flags |= SB_RDONLY;
-       } else if (test_opt(sb, ERRORS_PANIC)) {
+       if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
        }
+       ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+       /*
+        * Make sure updated value of ->s_mount_flags will be visible before
+        * ->s_flags update
+        */
+       smp_wmb();
+       sb->s_flags |= SB_RDONLY;
+}
+
+static void flush_stashed_error_work(struct work_struct *work)
+{
+       struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+                                               s_error_work);
+
+       ext4_commit_super(sbi->s_sb, 1);
 }
 
 #define ext4_error_ratelimit(sb)                                       \
@@ -703,7 +691,7 @@ static void ext4_handle_error(struct super_block *sb)
                             "EXT4-fs error")
 
 void __ext4_error(struct super_block *sb, const char *function,
-                 unsigned int line, int error, __u64 block,
+                 unsigned int line, bool force_ro, int error, __u64 block,
                  const char *fmt, ...)
 {
        struct va_format vaf;
@@ -723,7 +711,7 @@ void __ext4_error(struct super_block *sb, const char *function,
                va_end(args);
        }
        save_error_info(sb, error, 0, block, function, line);
-       ext4_handle_error(sb);
+       ext4_handle_error(sb, force_ro);
 }
 
 void __ext4_error_inode(struct inode *inode, const char *function,
@@ -755,7 +743,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
        }
        save_error_info(inode->i_sb, error, inode->i_ino, block,
                        function, line);
-       ext4_handle_error(inode->i_sb);
+       ext4_handle_error(inode->i_sb, false);
 }
 
 void __ext4_error_file(struct file *file, const char *function,
@@ -794,7 +782,7 @@ void __ext4_error_file(struct file *file, const char *function,
        }
        save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block,
                        function, line);
-       ext4_handle_error(inode->i_sb);
+       ext4_handle_error(inode->i_sb, false);
 }
 
 const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -862,51 +850,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
        }
 
        save_error_info(sb, -errno, 0, 0, function, line);
-       ext4_handle_error(sb);
-}
-
-/*
- * ext4_abort is a much stronger failure handler than ext4_error.  The
- * abort function may be used to deal with unrecoverable failures such
- * as journal IO errors or ENOMEM at a critical moment in log management.
- *
- * We unconditionally force the filesystem into an ABORT|READONLY state,
- * unless the error response on the fs has been set to panic in which
- * case we take the easy way out and panic immediately.
- */
-
-void __ext4_abort(struct super_block *sb, const char *function,
-                 unsigned int line, int error, const char *fmt, ...)
-{
-       struct va_format vaf;
-       va_list args;
-
-       if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
-               return;
-
-       save_error_info(sb, error, 0, 0, function, line);
-       va_start(args, fmt);
-       vaf.fmt = fmt;
-       vaf.va = &args;
-       printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
-              sb->s_id, function, line, &vaf);
-       va_end(args);
-
-       if (sb_rdonly(sb) == 0) {
-               ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
-               if (EXT4_SB(sb)->s_journal)
-                       jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
-
-               ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-               /*
-                * Make sure updated value of ->s_mount_flags will be visible
-                * before ->s_flags update
-                */
-               smp_wmb();
-               sb->s_flags |= SB_RDONLY;
-       }
-       if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
-               panic("EXT4-fs panic from previous error\n");
+       ext4_handle_error(sb, false);
 }
 
 void __ext4_msg(struct super_block *sb,
@@ -982,8 +926,6 @@ __acquires(bitlock)
                return;
 
        trace_ext4_error(sb, function, line);
-       __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
-
        if (ext4_error_ratelimit(sb)) {
                va_start(args, fmt);
                vaf.fmt = fmt;
@@ -999,17 +941,16 @@ __acquires(bitlock)
                va_end(args);
        }
 
-       if (test_opt(sb, WARN_ON_ERROR))
-               WARN_ON_ONCE(1);
-
        if (test_opt(sb, ERRORS_CONT)) {
-               ext4_commit_super(sb, 0);
+               if (test_opt(sb, WARN_ON_ERROR))
+                       WARN_ON_ONCE(1);
+               __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
+               schedule_work(&EXT4_SB(sb)->s_error_work);
                return;
        }
-
        ext4_unlock_group(sb, grp);
-       ext4_commit_super(sb, 1);
-       ext4_handle_error(sb);
+       save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
+       ext4_handle_error(sb, false);
        /*
         * We only get here in the ERRORS_RO case; relocking the group
         * may be dangerous, but nothing bad will happen since the
@@ -1181,6 +1122,7 @@ static void ext4_put_super(struct super_block *sb)
        ext4_unregister_li_request(sb);
        ext4_quota_off_umount(sb);
 
+       flush_work(&sbi->s_error_work);
        destroy_workqueue(sbi->rsv_conversion_wq);
 
        /*
@@ -1240,7 +1182,7 @@ static void ext4_put_super(struct super_block *sb)
         * in-memory list had better be clean by this point. */
        if (!list_empty(&sbi->s_orphan))
                dump_orphan_list(sb, sbi);
-       J_ASSERT(list_empty(&sbi->s_orphan));
+       ASSERT(list_empty(&sbi->s_orphan));
 
        sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
@@ -4005,6 +3947,21 @@ static void ext4_set_resv_clusters(struct super_block *sb)
        atomic64_set(&sbi->s_resv_clusters, resv_clusters);
 }
 
+static const char *ext4_quota_mode(struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+       if (!ext4_quota_capable(sb))
+               return "none";
+
+       if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb))
+               return "journalled";
+       else
+               return "writeback";
+#else
+       return "disabled";
+#endif
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
@@ -4073,7 +4030,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (IS_ERR(bh)) {
                ext4_msg(sb, KERN_ERR, "unable to read superblock");
                ret = PTR_ERR(bh);
-               bh = NULL;
                goto out_fail;
        }
        /*
@@ -4187,19 +4143,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
-       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
-       if (blocksize == PAGE_SIZE)
-               set_opt(sb, DIOREAD_NOLOCK);
-
-       if (blocksize < EXT4_MIN_BLOCK_SIZE ||
-           blocksize > EXT4_MAX_BLOCK_SIZE) {
+       if (le32_to_cpu(es->s_log_block_size) >
+           (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+               ext4_msg(sb, KERN_ERR,
+                        "Invalid log block size: %u",
+                        le32_to_cpu(es->s_log_block_size));
+               goto failed_mount;
+       }
+       if (le32_to_cpu(es->s_log_cluster_size) >
+           (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
-                      "Unsupported filesystem blocksize %d (%d log_block_size)",
-                        blocksize, le32_to_cpu(es->s_log_block_size));
+                        "Invalid log cluster size: %u",
+                        le32_to_cpu(es->s_log_cluster_size));
                goto failed_mount;
        }
 
+       blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+       if (blocksize == PAGE_SIZE)
+               set_opt(sb, DIOREAD_NOLOCK);
+
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
                sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
@@ -4417,21 +4380,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
                goto failed_mount;
 
-       if (le32_to_cpu(es->s_log_block_size) >
-           (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Invalid log block size: %u",
-                        le32_to_cpu(es->s_log_block_size));
-               goto failed_mount;
-       }
-       if (le32_to_cpu(es->s_log_cluster_size) >
-           (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Invalid log cluster size: %u",
-                        le32_to_cpu(es->s_log_cluster_size));
-               goto failed_mount;
-       }
-
        if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
                ext4_msg(sb, KERN_ERR,
                         "Number of reserved GDT blocks insanely large: %d",
@@ -4702,7 +4650,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                               "can't read group descriptor %d", i);
                        db_count = i;
                        ret = PTR_ERR(bh);
-                       bh = NULL;
                        goto failed_mount2;
                }
                rcu_read_lock();
@@ -4717,6 +4664,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
+       spin_lock_init(&sbi->s_error_lock);
+       INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
 
        /* Register extent status tree shrinker */
        if (ext4_es_register_shrinker(sbi))
@@ -4872,6 +4821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                               "requested data journaling mode");
                        goto failed_mount_wq;
                }
+               break;
        default:
                break;
        }
@@ -5000,13 +4950,11 @@ no_journal:
        block = ext4_count_free_clusters(sb);
        ext4_free_blocks_count_set(sbi->s_es, 
                                   EXT4_C2B(sbi, block));
-       ext4_superblock_csum_set(sb);
        err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
                                  GFP_KERNEL);
        if (!err) {
                unsigned long freei = ext4_count_free_inodes(sb);
                sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
-               ext4_superblock_csum_set(sb);
                err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
                                          GFP_KERNEL);
        }
@@ -5086,10 +5034,11 @@ no_journal:
 
        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-                        "Opts: %.*s%s%s", descr,
+                        "Opts: %.*s%s%s. Quota mode: %s.", descr,
                         (int) sizeof(sbi->s_es->s_mount_opts),
                         sbi->s_es->s_mount_opts,
-                        *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+                        *sbi->s_es->s_mount_opts ? "; " : "", orig_data,
+                        ext4_quota_mode(sb));
 
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5154,6 +5103,7 @@ failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
 failed_mount3:
        del_timer_sync(&sbi->s_err_report);
+       flush_work(&sbi->s_error_work);
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -5480,6 +5430,7 @@ err_out:
 
 static int ext4_commit_super(struct super_block *sb, int sync)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;
@@ -5511,6 +5462,46 @@ static int ext4_commit_super(struct super_block *sb, int sync)
                es->s_free_inodes_count =
                        cpu_to_le32(percpu_counter_sum_positive(
                                &EXT4_SB(sb)->s_freeinodes_counter));
+       /* Copy error information to the on-disk superblock */
+       spin_lock(&sbi->s_error_lock);
+       if (sbi->s_add_error_count > 0) {
+               es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+               if (!es->s_first_error_time && !es->s_first_error_time_hi) {
+                       __ext4_update_tstamp(&es->s_first_error_time,
+                                            &es->s_first_error_time_hi,
+                                            sbi->s_first_error_time);
+                       strncpy(es->s_first_error_func, sbi->s_first_error_func,
+                               sizeof(es->s_first_error_func));
+                       es->s_first_error_line =
+                               cpu_to_le32(sbi->s_first_error_line);
+                       es->s_first_error_ino =
+                               cpu_to_le32(sbi->s_first_error_ino);
+                       es->s_first_error_block =
+                               cpu_to_le64(sbi->s_first_error_block);
+                       es->s_first_error_errcode =
+                               ext4_errno_to_code(sbi->s_first_error_code);
+               }
+               __ext4_update_tstamp(&es->s_last_error_time,
+                                    &es->s_last_error_time_hi,
+                                    sbi->s_last_error_time);
+               strncpy(es->s_last_error_func, sbi->s_last_error_func,
+                       sizeof(es->s_last_error_func));
+               es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
+               es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
+               es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
+               es->s_last_error_errcode =
+                               ext4_errno_to_code(sbi->s_last_error_code);
+               /*
+                * Start the daily error reporting function if it hasn't been
+                * started already
+                */
+               if (!es->s_error_count)
+                       mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
+               le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
+               sbi->s_add_error_count = 0;
+       }
+       spin_unlock(&sbi->s_error_lock);
+
        BUFFER_TRACE(sbh, "marking dirty");
        ext4_superblock_csum_set(sb);
        if (sync)
@@ -5864,6 +5855,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
        }
 
+       /* Flush outstanding errors before changing fs state */
+       flush_work(&sbi->s_error_work);
+
        if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
                if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
                        err = -EROFS;
@@ -6022,7 +6016,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         */
        *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
 
-       ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+       ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.",
+                orig_data, ext4_quota_mode(sb));
        kfree(orig_data);
        return 0;
 
@@ -6201,11 +6196,8 @@ static int ext4_release_dquot(struct dquot *dquot)
 static int ext4_mark_dquot_dirty(struct dquot *dquot)
 {
        struct super_block *sb = dquot->dq_sb;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       /* Are we journaling quotas? */
-       if (ext4_has_feature_quota(sb) ||
-           sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+       if (ext4_is_quota_journalled(sb)) {
                dquot_mark_dquot_dirty(dquot);
                return ext4_write_dquot(dquot);
        } else {
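
The ext4 hunks above batch error reporting: callers only record error details in the in-memory ext4_sb_info under s_error_lock, and ext4_commit_super() copies them into the on-disk superblock in one place, resetting s_add_error_count once the batch is folded in. A minimal userspace sketch of the same record-then-flush pattern; the names (struct err_acct, record_error, flush_errors) are hypothetical and a pthread mutex stands in for the spinlock:

#include <pthread.h>

struct err_acct {
	pthread_mutex_t lock;	/* plays the role of sbi->s_error_lock */
	unsigned int pending;	/* errors recorded since the last flush */
	unsigned int durable;	/* plays the role of es->s_error_count */
};

/* Hot path: only account the error, under the lock. */
static void record_error(struct err_acct *a)
{
	pthread_mutex_lock(&a->lock);
	a->pending++;
	pthread_mutex_unlock(&a->lock);
}

/* Commit path: fold the batched count into the durable copy. */
static void flush_errors(struct err_acct *a)
{
	pthread_mutex_lock(&a->lock);
	a->durable += a->pending;
	a->pending = 0;
	pthread_mutex_unlock(&a->lock);
}
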
index 6127e94..4e3b1f8 100644 (file)
@@ -1927,7 +1927,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
        } else {
                /* Allocate a buffer where we construct the new block. */
                s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
-               /* assert(header == s->base) */
                error = -ENOMEM;
                if (s->base == NULL)
                        goto cleanup;
index c070c0d..aea3545 100644 (file)
@@ -315,7 +315,7 @@ retry:
        if (mode & FMODE_WRITE)
                r = w = 1;
 
-       name = dentry_name(file->f_path.dentry);
+       name = dentry_name(d_real(file->f_path.dentry, file->f_inode));
        if (name == NULL)
                return -ENOMEM;
 
index cb008ac..6442d97 100644 (file)
@@ -1624,7 +1624,9 @@ static void iput_final(struct inode *inode)
        else
                drop = generic_drop_inode(inode);
 
-       if (!drop && (sb->s_flags & SB_ACTIVE)) {
+       if (!drop &&
+           !(inode->i_state & I_DONTCACHE) &&
+           (sb->s_flags & SB_ACTIVE)) {
                inode_add_lru(inode);
                spin_unlock(&inode->i_lock);
                return;
index f72d538..a564f36 100644 (file)
@@ -36,8 +36,7 @@ enum {
 
 enum {
        IO_WQ_BIT_EXIT          = 0,    /* wq exiting */
-       IO_WQ_BIT_CANCEL        = 1,    /* cancel work on list */
-       IO_WQ_BIT_ERROR         = 2,    /* error on setup */
+       IO_WQ_BIT_ERROR         = 1,    /* error on setup */
 };
 
 enum {
@@ -561,12 +560,6 @@ get_next:
 
                        next_hashed = wq_next_work(work);
                        io_impersonate_work(worker, work);
-                       /*
-                        * OK to set IO_WQ_WORK_CANCEL even for uncancellable
-                        * work, the worker function will do the right thing.
-                        */
-                       if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
-                               work->flags |= IO_WQ_WORK_CANCEL;
 
                        old_work = work;
                        linked = wq->do_work(work);
@@ -732,12 +725,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
        return acct->nr_workers < acct->max_workers;
 }
 
-static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
-{
-       send_sig(SIGINT, worker->task, 1);
-       return false;
-}
-
 /*
  * Iterate the passed in list and call the specific function for each
  * worker that isn't exiting
@@ -938,21 +925,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
        work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
 }
 
-void io_wq_cancel_all(struct io_wq *wq)
-{
-       int node;
-
-       set_bit(IO_WQ_BIT_CANCEL, &wq->state);
-
-       rcu_read_lock();
-       for_each_node(node) {
-               struct io_wqe *wqe = wq->wqes[node];
-
-               io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
-       }
-       rcu_read_unlock();
-}
-
 struct io_cb_cancel_data {
        work_cancel_fn *fn;
        void *data;
index 069496c..b158f8a 100644 (file)
@@ -59,6 +59,7 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
                list->last->next = node;
                list->last = node;
        }
+       node->next = NULL;
 }
 
 static inline void wq_list_cut(struct io_wq_work_list *list,
@@ -128,8 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
        return work->flags & IO_WQ_WORK_HASHED;
 }
 
-void io_wq_cancel_all(struct io_wq *wq);
-
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
 
 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
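
The one-line `node->next = NULL;` addition to wq_list_add_tail() above fixes a classic singly linked tail-append bug: a node that previously sat on another list can carry a stale ->next pointer, so the new tail appears to have successors. A self-contained sketch of the corrected append, with a hypothetical node type in place of io_wq_work_node:

struct node {
	struct node *next;
};

struct tail_list {
	struct node *first;
	struct node *last;
};

static void tail_list_add(struct tail_list *l, struct node *n)
{
	if (!l->first) {
		l->first = n;
		l->last = n;
	} else {
		l->last->next = n;
		l->last = n;
	}
	/* Without this, a stale n->next left over from a previous list
	 * would dangle off the new tail. */
	n->next = NULL;
}
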
index 6f9392c..7e35283 100644 (file)
@@ -1693,6 +1693,11 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
        return io_wq_current_is_worker();
 }
 
+static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx)
+{
+       return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+}
+
 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
        if (waitqueue_active(&ctx->wait))
@@ -1703,15 +1708,6 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
                eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
-static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
-{
-       if (list_empty(&ctx->cq_overflow_list)) {
-               clear_bit(0, &ctx->sq_check_overflow);
-               clear_bit(0, &ctx->cq_check_overflow);
-               ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
-       }
-}
-
 /* Returns true if there are no backlogged entries after the flush */
 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                     struct task_struct *tsk,
@@ -1721,23 +1717,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
        struct io_kiocb *req, *tmp;
        struct io_uring_cqe *cqe;
        unsigned long flags;
+       bool all_flushed;
        LIST_HEAD(list);
 
-       if (!force) {
-               if (list_empty_careful(&ctx->cq_overflow_list))
-                       return true;
-               if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
-                   rings->cq_ring_entries))
-                       return false;
-       }
+       if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
+               return false;
 
        spin_lock_irqsave(&ctx->completion_lock, flags);
-
-       /* if force is set, the ring is going away. always drop after that */
-       if (force)
-               ctx->cq_overflow_flushed = 1;
-
-       cqe = NULL;
        list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
                if (!io_match_task(req, tsk, files))
                        continue;
@@ -1758,9 +1744,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                }
        }
 
-       io_commit_cqring(ctx);
-       io_cqring_mark_overflow(ctx);
+       all_flushed = list_empty(&ctx->cq_overflow_list);
+       if (all_flushed) {
+               clear_bit(0, &ctx->sq_check_overflow);
+               clear_bit(0, &ctx->cq_check_overflow);
+               ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+       }
 
+       io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
        io_cqring_ev_posted(ctx);
 
@@ -1770,7 +1761,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                io_put_req(req);
        }
 
-       return cqe != NULL;
+       return all_flushed;
 }
 
 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -2320,8 +2311,6 @@ static void io_double_put_req(struct io_kiocb *req)
 
 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 {
-       struct io_rings *rings = ctx->rings;
-
        if (test_bit(0, &ctx->cq_check_overflow)) {
                /*
                 * noflush == true is from the waitqueue handler, just ensure
@@ -2336,7 +2325,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 
        /* See comment at the top of this file */
        smp_rmb();
-       return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
+       return __io_cqring_events(ctx);
 }
 
 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
@@ -3136,9 +3125,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                iov[0].iov_len = kbuf->len;
                return 0;
        }
-       if (!req->rw.len)
-               return 0;
-       else if (req->rw.len > 1)
+       if (req->rw.len != 1)
                return -EINVAL;
 
 #ifdef CONFIG_COMPAT
@@ -3784,6 +3771,8 @@ static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
                return -ENOTSOCK;
 
        ret = __sys_shutdown_sock(sock, req->shutdown.how);
+       if (ret < 0)
+               req_set_fail_links(req);
        io_req_complete(req, ret);
        return 0;
 #else
@@ -6107,15 +6096,15 @@ static void io_req_drop_files(struct io_kiocb *req)
        struct io_uring_task *tctx = req->task->io_uring;
        unsigned long flags;
 
+       put_files_struct(req->work.identity->files);
+       put_nsproxy(req->work.identity->nsproxy);
        spin_lock_irqsave(&ctx->inflight_lock, flags);
        list_del(&req->inflight_entry);
-       if (atomic_read(&tctx->in_idle))
-               wake_up(&tctx->wait);
        spin_unlock_irqrestore(&ctx->inflight_lock, flags);
        req->flags &= ~REQ_F_INFLIGHT;
-       put_files_struct(req->work.identity->files);
-       put_nsproxy(req->work.identity->nsproxy);
        req->work.flags &= ~IO_WQ_WORK_FILES;
+       if (atomic_read(&tctx->in_idle))
+               wake_up(&tctx->wait);
 }
 
 static void __io_clean_op(struct io_kiocb *req)
@@ -6343,19 +6332,28 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
        }
 
        if (ret) {
+               struct io_ring_ctx *lock_ctx = NULL;
+
+               if (req->ctx->flags & IORING_SETUP_IOPOLL)
+                       lock_ctx = req->ctx;
+
                /*
-                * io_iopoll_complete() does not hold completion_lock to complete
-                * polled io, so here for polled io, just mark it done and still let
-                * io_iopoll_complete() complete it.
+                * io_iopoll_complete() does not hold completion_lock to
+                * complete polled io, so for polled io we cannot call
+                * io_req_complete() directly; otherwise there may be
+                * concurrent access to the cqring, defer_list, etc., which
+                * is not safe. Given that io_iopoll_complete() is always
+                * called under uring_lock, we also take uring_lock here to
+                * complete polled io.
                 */
-               if (req->ctx->flags & IORING_SETUP_IOPOLL) {
-                       struct kiocb *kiocb = &req->rw.kiocb;
+               if (lock_ctx)
+                       mutex_lock(&lock_ctx->uring_lock);
 
-                       kiocb_done(kiocb, ret, NULL);
-               } else {
-                       req_set_fail_links(req);
-                       io_req_complete(req, ret);
-               }
+               req_set_fail_links(req);
+               io_req_complete(req, ret);
+
+               if (lock_ctx)
+                       mutex_unlock(&lock_ctx->uring_lock);
        }
 
        return io_steal_work(req);
@@ -6824,8 +6822,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
        /* if we have a backlog and couldn't flush it all, return BUSY */
        if (test_bit(0, &ctx->sq_check_overflow)) {
-               if (!list_empty(&ctx->cq_overflow_list) &&
-                   !io_cqring_overflow_flush(ctx, false, NULL, NULL))
+               if (!io_cqring_overflow_flush(ctx, false, NULL, NULL))
                        return -EBUSY;
        }
 
@@ -8155,10 +8152,13 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
                __io_unaccount_mem(ctx->user, nr_pages);
 
        if (ctx->mm_account) {
-               if (acct == ACCT_LOCKED)
+               if (acct == ACCT_LOCKED) {
+                       mmap_write_lock(ctx->mm_account);
                        ctx->mm_account->locked_vm -= nr_pages;
-               else if (acct == ACCT_PINNED)
+                       mmap_write_unlock(ctx->mm_account);
+               } else if (acct == ACCT_PINNED) {
                        atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
+               }
        }
 }
 
@@ -8174,10 +8174,13 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
        }
 
        if (ctx->mm_account) {
-               if (acct == ACCT_LOCKED)
+               if (acct == ACCT_LOCKED) {
+                       mmap_write_lock(ctx->mm_account);
                        ctx->mm_account->locked_vm += nr_pages;
-               else if (acct == ACCT_PINNED)
+                       mmap_write_unlock(ctx->mm_account);
+               } else if (acct == ACCT_PINNED) {
                        atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
+               }
        }
 
        return 0;
@@ -8643,10 +8646,19 @@ static void io_ring_exit_work(struct work_struct *work)
        io_ring_ctx_free(ctx);
 }
 
+static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+{
+       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+       return req->ctx == data;
+}
+
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
+       /* if force is set, the ring is going away. always drop after that */
+       ctx->cq_overflow_flushed = 1;
        if (ctx->rings)
                io_cqring_overflow_flush(ctx, true, NULL, NULL);
        mutex_unlock(&ctx->uring_lock);
@@ -8655,7 +8667,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
        io_poll_remove_all(ctx, NULL, NULL);
 
        if (ctx->io_wq)
-               io_wq_cancel_all(ctx->io_wq);
+               io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
 
        /* if we failed setting up the ctx, we might not have any rings */
        io_iopoll_try_reap_events(ctx);
@@ -8798,9 +8810,9 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 
                ret |= io_poll_remove_all(ctx, task, NULL);
                ret |= io_kill_timeouts(ctx, task, NULL);
+               ret |= io_run_task_work();
                if (!ret)
                        break;
-               io_run_task_work();
                cond_resched();
        }
 }
@@ -8849,10 +8861,9 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 {
        struct io_uring_task *tctx = current->io_uring;
+       int ret;
 
        if (unlikely(!tctx)) {
-               int ret;
-
                ret = io_uring_alloc_task_context(current);
                if (unlikely(ret))
                        return ret;
@@ -8863,7 +8874,12 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 
                if (!old) {
                        get_file(file);
-                       xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
+                       ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
+                                               file, GFP_KERNEL));
+                       if (ret) {
+                               fput(file);
+                               return ret;
+                       }
                }
                tctx->last = file;
        }
@@ -8986,9 +9002,9 @@ void __io_uring_task_cancel(void)
                if (inflight != tctx_inflight(tctx))
                        continue;
                schedule();
+               finish_wait(&tctx->wait, &wait);
        } while (1);
 
-       finish_wait(&tctx->wait, &wait);
        atomic_dec(&tctx->in_idle);
 }
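
The __io_uring_task_cancel() change above moves finish_wait() inside the loop so each pass pairs it with that pass's prepare_to_wait() before the inflight count is re-checked. A hedged sketch of the general idiom, assuming the kernel wait API and a hypothetical done() predicate:

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (done())			/* hypothetical exit test */
			break;
		schedule();
		finish_wait(&wq, &wait);	/* pair before looping */
	}
	finish_wait(&wq, &wait);		/* pair the final prepare_to_wait() */
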
 
@@ -9156,10 +9172,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
-               io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
-               if (!list_empty_careful(&ctx->cq_overflow_list))
+               if (!list_empty_careful(&ctx->cq_overflow_list)) {
+                       bool needs_lock = ctx->flags & IORING_SETUP_IOPOLL;
+
+                       io_ring_submit_lock(ctx, needs_lock);
                        io_cqring_overflow_flush(ctx, false, NULL, NULL);
-               io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
+                       io_ring_submit_unlock(ctx, needs_lock);
+               }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT)
@@ -9369,55 +9388,52 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
        return 0;
 }
 
+static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
+{
+       int ret, fd;
+
+       fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+       if (fd < 0)
+               return fd;
+
+       ret = io_uring_add_task_file(ctx, file);
+       if (ret) {
+               put_unused_fd(fd);
+               return ret;
+       }
+       fd_install(fd, file);
+       return fd;
+}
+
 /*
  * Allocate an anonymous fd, this is what constitutes the application
  * visible backing of an io_uring instance. The application mmaps this
  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
  * we have to tie this fd to a socket for file garbage collection purposes.
  */
-static int io_uring_get_fd(struct io_ring_ctx *ctx)
+static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
 {
        struct file *file;
+#if defined(CONFIG_UNIX)
        int ret;
-       int fd;
 
-#if defined(CONFIG_UNIX)
        ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
                                &ctx->ring_sock);
        if (ret)
-               return ret;
+               return ERR_PTR(ret);
 #endif
 
-       ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
-       if (ret < 0)
-               goto err;
-       fd = ret;
-
        file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                        O_RDWR | O_CLOEXEC);
-       if (IS_ERR(file)) {
-               put_unused_fd(fd);
-               ret = PTR_ERR(file);
-               goto err;
-       }
-
 #if defined(CONFIG_UNIX)
-       ctx->ring_sock->file = file;
-#endif
-       ret = io_uring_add_task_file(ctx, file);
-       if (ret) {
-               fput(file);
-               put_unused_fd(fd);
-               goto err;
+       if (IS_ERR(file)) {
+               sock_release(ctx->ring_sock);
+               ctx->ring_sock = NULL;
+       } else {
+               ctx->ring_sock->file = file;
        }
-       fd_install(fd, file);
-       return fd;
-err:
-#if defined(CONFIG_UNIX)
-       sock_release(ctx->ring_sock);
-       ctx->ring_sock = NULL;
 #endif
-       return ret;
+       return file;
 }
 
 static int io_uring_create(unsigned entries, struct io_uring_params *p,
@@ -9425,6 +9441,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 {
        struct user_struct *user = NULL;
        struct io_ring_ctx *ctx;
+       struct file *file;
        bool limit_mem;
        int ret;
 
@@ -9572,13 +9589,22 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
                goto err;
        }
 
+       file = io_uring_get_file(ctx);
+       if (IS_ERR(file)) {
+               ret = PTR_ERR(file);
+               goto err;
+       }
+
        /*
         * Install ring fd as the very last thing, so we don't risk someone
         * having closed it before we finish setup
         */
-       ret = io_uring_get_fd(ctx);
-       if (ret < 0)
-               goto err;
+       ret = io_uring_install_fd(ctx, file);
+       if (ret < 0) {
+               /* fput will clean it up */
+               fput(file);
+               return ret;
+       }
 
        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
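
The io_uring_get_fd() split above follows a common setup ordering: build the struct file and complete every fallible step first, and make fd_install() the very last action, because once the fd is installed another thread can already reach and close it. A hedged sketch of that ordering; example_fops and finish_setup() are hypothetical:

static int create_and_install(void *priv)
{
	struct file *file;
	int fd, ret;

	file = anon_inode_getfile("[example]", &example_fops, priv, O_RDWR);
	if (IS_ERR(file))
		return PTR_ERR(file);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		fput(file);			/* fd never became visible */
		return fd;
	}

	ret = finish_setup(priv);		/* hypothetical extra setup */
	if (ret) {
		put_unused_fd(fd);
		fput(file);
		return ret;
	}

	fd_install(fd, file);			/* point of no return */
	return fd;
}
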
index 188f79d..2dc9444 100644 (file)
@@ -1869,9 +1869,7 @@ static int load_superblock(journal_t *journal)
 
        if (jbd2_has_feature_fast_commit(journal)) {
                journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
-               num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
-               if (!num_fc_blocks)
-                       num_fc_blocks = JBD2_MIN_FC_BLOCKS;
+               num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
                if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
                        journal->j_last = journal->j_fc_last - num_fc_blocks;
                journal->j_fc_first = journal->j_last + 1;
@@ -2102,9 +2100,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
        journal_superblock_t *sb = journal->j_superblock;
        unsigned long long num_fc_blks;
 
-       num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
-       if (num_fc_blks == 0)
-               num_fc_blks = JBD2_MIN_FC_BLOCKS;
+       num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
        if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
                return -ENOSPC;
 
index 03d0e11..78443a8 100644 (file)
@@ -2114,8 +2114,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                return PTR_ERR(name);
        while (*name=='/')
                name++;
-       if (!*name)
+       if (!*name) {
+               nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
                return 0;
+       }
 
        /* At this point we know we have a real path component. */
        for(;;) {
index 2b681f6..d2db7df 100644 (file)
@@ -156,10 +156,10 @@ static inline void mnt_add_count(struct mount *mnt, int n)
 /*
  * vfsmount lock must be held for write
  */
-unsigned int mnt_get_count(struct mount *mnt)
+int mnt_get_count(struct mount *mnt)
 {
 #ifdef CONFIG_SMP
-       unsigned int count = 0;
+       int count = 0;
        int cpu;
 
        for_each_possible_cpu(cpu) {
@@ -1139,6 +1139,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
 static void mntput_no_expire(struct mount *mnt)
 {
        LIST_HEAD(list);
+       int count;
 
        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
@@ -1162,7 +1163,9 @@ static void mntput_no_expire(struct mount *mnt)
         */
        smp_mb();
        mnt_add_count(mnt, -1);
-       if (mnt_get_count(mnt)) {
+       count = mnt_get_count(mnt);
+       if (count != 0) {
+               WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
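
Switching mnt_get_count() from unsigned int to int is what makes the new WARN_ON(count < 0) meaningful: with an unsigned count an underflow wraps to a huge positive value and sails past any such check. A tiny standalone illustration of the same idea, with hypothetical names:

#include <stdio.h>

/* A signed count lets an underflow show up as a negative number
 * instead of wrapping to ~4 billion. */
static int ref_put(int *count)
{
	if (--(*count) < 0)
		fprintf(stderr, "refcount underflow: %d\n", *count);
	return *count;
}
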
index 49a058c..26f74e0 100644 (file)
@@ -44,7 +44,7 @@ int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
 int get_dominating_id(struct mount *mnt, const struct path *root);
-unsigned int mnt_get_count(struct mount *mnt);
+int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
                        struct mount *);
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
index d7c0e73..763b816 100644 (file)
@@ -817,12 +817,6 @@ static inline bool efi_enabled(int feature)
 static inline void
 efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {}
 
-static inline bool
-efi_capsule_pending(int *reset_type)
-{
-       return false;
-}
-
 static inline bool efi_soft_reserve_enabled(void)
 {
        return false;
@@ -1038,6 +1032,7 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
 bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
                                  size_t len);
 
+#if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER)
 extern bool efi_capsule_pending(int *reset_type);
 
 extern int efi_capsule_supported(efi_guid_t guid, u32 flags,
@@ -1045,6 +1040,9 @@ extern int efi_capsule_supported(efi_guid_t guid, u32 flags,
 
 extern int efi_capsule_update(efi_capsule_header_t *capsule,
                              phys_addr_t *pages);
+#else
+static inline bool efi_capsule_pending(int *reset_type) { return false; }
+#endif
 
 #ifdef CONFIG_EFI_RUNTIME_MAP
 int efi_runtime_map_init(struct kobject *);
@@ -1089,7 +1087,28 @@ enum efi_secureboot_mode {
        efi_secureboot_mode_disabled,
        efi_secureboot_mode_enabled,
 };
-enum efi_secureboot_mode efi_get_secureboot(void);
+
+static inline
+enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var)
+{
+       u8 secboot, setupmode = 0;
+       efi_status_t status;
+       unsigned long size;
+
+       size = sizeof(secboot);
+       status = get_var(L"SecureBoot", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size,
+                        &secboot);
+       if (status == EFI_NOT_FOUND)
+               return efi_secureboot_mode_disabled;
+       if (status != EFI_SUCCESS)
+               return efi_secureboot_mode_unknown;
+
+       size = sizeof(setupmode);
+       get_var(L"SetupMode", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, &setupmode);
+       if (secboot == 0 || setupmode == 1)
+               return efi_secureboot_mode_disabled;
+       return efi_secureboot_mode_enabled;
+}
 
 #ifdef CONFIG_RESET_ATTACK_MITIGATION
 void efi_enable_reset_attack_mitigation(void);
index ad4cf1b..fd47dee 100644 (file)
@@ -2876,8 +2876,7 @@ extern int inode_needs_sync(struct inode *inode);
 extern int generic_delete_inode(struct inode *inode);
 static inline int generic_drop_inode(struct inode *inode)
 {
-       return !inode->i_nlink || inode_unhashed(inode) ||
-               (inode->i_state & I_DONTCACHE);
+       return !inode->i_nlink || inode_unhashed(inode);
 }
 extern void d_mark_dontcache(struct inode *inode);
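
Together with the fs/inode.c hunk earlier, this moves the I_DONTCACHE test out of generic_drop_inode() and into iput_final(), so the hint is honoured even when a filesystem supplies its own ->drop_inode() instead of the generic default. A hedged sketch of that shape, with hypothetical types and helpers standing in for the VFS ones:

#include <stdbool.h>

struct obj;
struct obj_ops { bool (*drop)(struct obj *); };
struct obj {
	const struct obj_ops *ops;
	unsigned int state;		/* OBJ_DONTCACHE lives here */
};
#define OBJ_DONTCACHE 0x1

bool default_drop(struct obj *o);	/* assumed helpers, declared */
bool cache_active(struct obj *o);	/* but not defined in this sketch */
void cache_keep(struct obj *o);
void obj_destroy(struct obj *o);

/* The common caller applies the no-cache policy itself, so a custom
 * drop callback cannot bypass it. */
static void obj_final_put(struct obj *o)
{
	bool drop = o->ops->drop ? o->ops->drop(o) : default_drop(o);

	if (!drop && !(o->state & OBJ_DONTCACHE) && cache_active(o))
		cache_keep(o);
	else
		obj_destroy(o);
}
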
 
index 870b325..bb8ff90 100644 (file)
@@ -232,6 +232,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
 # define local_irq_enable_in_hardirq() local_irq_enable()
 #endif
 
+bool irq_has_action(unsigned int irq);
 extern void disable_irq_nosync(unsigned int irq);
 extern bool disable_hardirq(unsigned int irq);
 extern void disable_irq(unsigned int irq);
index c332871..4aeb1c4 100644 (file)
@@ -906,6 +906,13 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 }
 #endif
 
+static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq)
+{
+       struct irq_data *d = irq_get_irq_data(irq);
+
+       return d ? irq_data_get_effective_affinity_mask(d) : NULL;
+}
+
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
index 5745491..891b323 100644 (file)
@@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(void) { }
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
+static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
+                                             unsigned int cpu)
+{
+       return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+}
+
 static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
 {
        return container_of(data->common, struct irq_desc, irq_common_data);
@@ -179,12 +185,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_desc_has_action(struct irq_desc *desc)
 {
-       return desc->action != NULL;
-}
-
-static inline int irq_has_action(unsigned int irq)
-{
-       return irq_desc_has_action(irq_to_desc(irq));
+       return desc && desc->action != NULL;
 }
 
 /**
@@ -228,40 +229,31 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip,
        data->chip = chip;
 }
 
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask);
+
 static inline bool irq_balancing_disabled(unsigned int irq)
 {
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-       return desc->status_use_accessors & IRQ_NO_BALANCING_MASK;
+       return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK);
 }
 
 static inline bool irq_is_percpu(unsigned int irq)
 {
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-       return desc->status_use_accessors & IRQ_PER_CPU;
+       return irq_check_status_bit(irq, IRQ_PER_CPU);
 }
 
 static inline bool irq_is_percpu_devid(unsigned int irq)
 {
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-       return desc->status_use_accessors & IRQ_PER_CPU_DEVID;
+       return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID);
 }
 
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+                            struct lock_class_key *request_class);
 static inline void
 irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
                      struct lock_class_key *request_class)
 {
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       if (desc) {
-               lockdep_set_class(&desc->lock, lock_class);
-               lockdep_set_class(&desc->request_mutex, request_class);
-       }
+       if (IS_ENABLED(CONFIG_LOCKDEP))
+               __irq_set_lockdep_class(irq, lock_class, request_class);
 }
 
 #endif
index 578ff19..99d3cd0 100644 (file)
@@ -68,7 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags);
 extern void jbd2_free(void *ptr, size_t size);
 
 #define JBD2_MIN_JOURNAL_BLOCKS 1024
-#define JBD2_MIN_FC_BLOCKS     256
+#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
 
 #ifdef __KERNEL__
 
@@ -538,6 +538,7 @@ struct transaction_chp_stats_s {
  * The transaction keeps track of all of the buffers modified by a
  * running transaction, and all of the buffers committed but not yet
  * flushed to home for finished transactions.
+ * (Locking Documentation improved by LockDoc)
  */
 
 /*
@@ -658,12 +659,12 @@ struct transaction_s
        unsigned long           t_start;
 
        /*
-        * When commit was requested
+        * When commit was requested [j_state_lock]
         */
        unsigned long           t_requested;
 
        /*
-        * Checkpointing stats [j_checkpoint_sem]
+        * Checkpointing stats [j_list_lock]
         */
        struct transaction_chp_stats_s t_chp_stats;
 
@@ -1691,6 +1692,13 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
        return journal->j_chksum_driver != NULL;
 }
 
+static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)
+{
+       int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks);
+
+       return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS;
+}
+
 /*
  * Return number of free blocks in the log. Must be called under j_state_lock.
  */
index 89f0745..44ae1a7 100644 (file)
@@ -67,7 +67,6 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
-extern unsigned int kstat_irqs(unsigned int irq);
 extern unsigned int kstat_irqs_usr(unsigned int irq);
 
 /*
index 30bc7a7..0fefeb9 100644 (file)
@@ -42,6 +42,7 @@ struct vdpa_vq_state {
  * @config: the configuration ops for this device.
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
+ * @nvqs: maximum number of supported virtqueues
  */
 struct vdpa_device {
        struct device dev;
index 0b68699..e282ce0 100644 (file)
@@ -11,7 +11,7 @@ struct ioc_gq;
 
 #include <linux/tracepoint.h>
 
-TRACE_EVENT(iocost_iocg_activate,
+DECLARE_EVENT_CLASS(iocost_iocg_state,
 
        TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
                u64 last_period, u64 cur_period, u64 vtime),
@@ -59,6 +59,20 @@ TRACE_EVENT(iocost_iocg_activate,
        )
 );
 
+DEFINE_EVENT(iocost_iocg_state, iocost_iocg_activate,
+       TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+                u64 last_period, u64 cur_period, u64 vtime),
+
+       TP_ARGS(iocg, path, now, last_period, cur_period, vtime)
+);
+
+DEFINE_EVENT(iocost_iocg_state, iocost_iocg_idle,
+       TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+                u64 last_period, u64 cur_period, u64 vtime),
+
+       TP_ARGS(iocg, path, now, last_period, cur_period, vtime)
+);
+
 DECLARE_EVENT_CLASS(iocg_inuse_update,
 
        TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
index b052355..bc1c062 100644 (file)
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE. */
 
-#define VIRTIO_ID_NET          1 /* virtio net */
-#define VIRTIO_ID_BLOCK                2 /* virtio block */
-#define VIRTIO_ID_CONSOLE      3 /* virtio console */
-#define VIRTIO_ID_RNG          4 /* virtio rng */
-#define VIRTIO_ID_BALLOON      5 /* virtio balloon */
-#define VIRTIO_ID_RPMSG                7 /* virtio remote processor messaging */
-#define VIRTIO_ID_SCSI         8 /* virtio scsi */
-#define VIRTIO_ID_9P           9 /* 9p virtio console */
-#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
-#define VIRTIO_ID_CAIF        12 /* Virtio caif */
-#define VIRTIO_ID_GPU          16 /* virtio GPU */
-#define VIRTIO_ID_INPUT        18 /* virtio input */
-#define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
-#define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
-#define VIRTIO_ID_IOMMU        23 /* virtio IOMMU */
-#define VIRTIO_ID_MEM          24 /* virtio mem */
-#define VIRTIO_ID_FS           26 /* virtio filesystem */
-#define VIRTIO_ID_PMEM         27 /* virtio pmem */
-#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */
+#define VIRTIO_ID_NET                  1 /* virtio net */
+#define VIRTIO_ID_BLOCK                        2 /* virtio block */
+#define VIRTIO_ID_CONSOLE              3 /* virtio console */
+#define VIRTIO_ID_RNG                  4 /* virtio rng */
+#define VIRTIO_ID_BALLOON              5 /* virtio balloon */
+#define VIRTIO_ID_IOMEM                        6 /* virtio ioMemory */
+#define VIRTIO_ID_RPMSG                        7 /* virtio remote processor messaging */
+#define VIRTIO_ID_SCSI                 8 /* virtio scsi */
+#define VIRTIO_ID_9P                   9 /* 9p virtio console */
+#define VIRTIO_ID_MAC80211_WLAN                10 /* virtio WLAN MAC */
+#define VIRTIO_ID_RPROC_SERIAL         11 /* virtio remoteproc serial link */
+#define VIRTIO_ID_CAIF                 12 /* Virtio caif */
+#define VIRTIO_ID_MEMORY_BALLOON       13 /* virtio memory balloon */
+#define VIRTIO_ID_GPU                  16 /* virtio GPU */
+#define VIRTIO_ID_CLOCK                        17 /* virtio clock/timer */
+#define VIRTIO_ID_INPUT                        18 /* virtio input */
+#define VIRTIO_ID_VSOCK                        19 /* virtio vsock transport */
+#define VIRTIO_ID_CRYPTO               20 /* virtio crypto */
+#define VIRTIO_ID_SIGNAL_DIST          21 /* virtio signal distribution device */
+#define VIRTIO_ID_PSTORE               22 /* virtio pstore device */
+#define VIRTIO_ID_IOMMU                        23 /* virtio IOMMU */
+#define VIRTIO_ID_MEM                  24 /* virtio mem */
+#define VIRTIO_ID_FS                   26 /* virtio filesystem */
+#define VIRTIO_ID_PMEM                 27 /* virtio pmem */
+#define VIRTIO_ID_MAC80211_HWSIM       29 /* virtio mac80211-hwsim */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
index e810eb9..cc1a094 100644 (file)
@@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
 {
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
-       int cpu, irq = desc->irq_data.irq;
        ssize_t ret = 0;
        char *p = "";
+       int cpu;
 
        for_each_possible_cpu(cpu) {
-               unsigned int c = kstat_irqs_cpu(irq, cpu);
+               unsigned int c = irq_desc_kstat_cpu(desc, cpu);
 
                ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
                p = ",";
@@ -352,7 +352,9 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 {
        return radix_tree_lookup(&irq_desc_tree, irq);
 }
-EXPORT_SYMBOL(irq_to_desc);
+#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
+EXPORT_SYMBOL_GPL(irq_to_desc);
+#endif
 
 static void delete_irq_desc(unsigned int irq)
 {
@@ -924,15 +926,7 @@ static bool irq_is_nmi(struct irq_desc *desc)
        return desc->istate & IRQS_NMI;
 }
 
-/**
- * kstat_irqs - Get the statistics for an interrupt
- * @irq:       The interrupt number
- *
- * Returns the sum of interrupt counts on all cpus since boot for
- * @irq. The caller must ensure that the interrupt is not removed
- * concurrently.
- */
-unsigned int kstat_irqs(unsigned int irq)
+static unsigned int kstat_irqs(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned int sum = 0;
@@ -943,21 +937,22 @@ unsigned int kstat_irqs(unsigned int irq)
        if (!irq_settings_is_per_cpu_devid(desc) &&
            !irq_settings_is_per_cpu(desc) &&
            !irq_is_nmi(desc))
-           return desc->tot_count;
+               return data_race(desc->tot_count);
 
        for_each_possible_cpu(cpu)
-               sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
+               sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
        return sum;
 }
 
 /**
- * kstat_irqs_usr - Get the statistics for an interrupt
+ * kstat_irqs_usr - Get the statistics for an interrupt from thread context
  * @irq:       The interrupt number
  *
  * Returns the sum of interrupt counts on all cpus since boot for @irq.
- * Contrary to kstat_irqs() this can be called from any context.
- * It uses rcu since a concurrent removal of an interrupt descriptor is
- * observing an rcu grace period before delayed_free_desc()/irq_kobj_release().
+ *
+ * It uses rcu to protect the access because a concurrent removal of an
+ * interrupt descriptor must observe an rcu grace period before
+ * delayed_free_desc()/irq_kobj_release() are invoked.
  */
 unsigned int kstat_irqs_usr(unsigned int irq)
 {
@@ -968,3 +963,17 @@ unsigned int kstat_irqs_usr(unsigned int irq)
        rcu_read_unlock();
        return sum;
 }
+
+#ifdef CONFIG_LOCKDEP
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+                            struct lock_class_key *request_class)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       if (desc) {
+               lockdep_set_class(&desc->lock, lock_class);
+               lockdep_set_class(&desc->request_mutex, request_class);
+       }
+}
+EXPORT_SYMBOL_GPL(__irq_set_lockdep_class);
+#endif
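
The data_race() wrappers added to kstat_irqs() above mark deliberately lockless reads: the per-CPU counters and tot_count may change while being summed, and for statistics a slightly stale or torn value is acceptable, so the accesses are annotated for KCSAN rather than serialised. A small sketch of the idiom; the stand-in macro below only mimics the real one from <linux/compiler.h>:

#ifndef data_race
#define data_race(expr) (expr)	/* stand-in; the kernel macro also
				 * suppresses KCSAN reports */
#endif

static unsigned int sum_stats(const unsigned int *counters, int n)
{
	unsigned int sum = 0;
	int i;

	/* Writers may race with this loop; a stale sum is fine for
	 * reporting, so annotate instead of taking a lock. */
	for (i = 0; i < n; i++)
		sum += data_race(counters[i]);
	return sum;
}
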
index c826ba4..ab8567f 100644 (file)
@@ -2822,3 +2822,40 @@ out_unlock:
        return err;
 }
 EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+/**
+ * irq_has_action - Check whether an interrupt is requested
+ * @irq:       The linux irq number
+ *
+ * Returns: A snapshot of the current state
+ */
+bool irq_has_action(unsigned int irq)
+{
+       bool res;
+
+       rcu_read_lock();
+       res = irq_desc_has_action(irq_to_desc(irq));
+       rcu_read_unlock();
+       return res;
+}
+EXPORT_SYMBOL_GPL(irq_has_action);
+
+/**
+ * irq_check_status_bit - Check whether bits in the irq descriptor status are set
+ * @irq:       The linux irq number
+ * @bitmask:   The bitmask to evaluate
+ *
+ * Returns: True if one of the bits in @bitmask is set
+ */
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
+{
+       struct irq_desc *desc;
+       bool res = false;
+
+       rcu_read_lock();
+       desc = irq_to_desc(irq);
+       if (desc)
+               res = !!(desc->status_use_accessors & bitmask);
+       rcu_read_unlock();
+       return res;
+}
index 72513ed..9813878 100644 (file)
@@ -488,9 +488,10 @@ int show_interrupts(struct seq_file *p, void *v)
        if (!desc || irq_settings_is_hidden(desc))
                goto outsparse;
 
-       if (desc->kstat_irqs)
+       if (desc->kstat_irqs) {
                for_each_online_cpu(j)
-                       any_count |= *per_cpu_ptr(desc->kstat_irqs, j);
+                       any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
+       }
 
        if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
                goto outsparse;
index c016042..af41fb9 100644 (file)
@@ -1784,39 +1784,112 @@ int remove_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
+{
+       uint8_t online_type = MMOP_ONLINE_KERNEL;
+       uint8_t **online_types = arg;
+       struct page *page;
+       int rc;
+
+       /*
+        * Sense the online_type via the zone of the memory block. Offlining
+        * with multiple zones within one memory block will be rejected
+        * by offlining code ... so we don't care about that.
+        */
+       page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
+       if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+               online_type = MMOP_ONLINE_MOVABLE;
+
+       rc = device_offline(&mem->dev);
+       /*
+        * Default is MMOP_OFFLINE - change it only if offlining succeeded,
+        * so try_reonline_memory_block() can do the right thing.
+        */
+       if (!rc)
+               **online_types = online_type;
+
+       (*online_types)++;
+       /* Ignore if already offline. */
+       return rc < 0 ? rc : 0;
+}
+
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
+{
+       uint8_t **online_types = arg;
+       int rc;
+
+       if (**online_types != MMOP_OFFLINE) {
+               mem->online_type = **online_types;
+               rc = device_online(&mem->dev);
+               if (rc < 0)
+                       pr_warn("%s: Failed to re-online memory: %d\n",
+                               __func__, rc);
+       }
+
+       /* Continue processing all remaining memory blocks. */
+       (*online_types)++;
+       return 0;
+}
+
 /*
- * Try to offline and remove a memory block. Might take a long time to
- * finish in case memory is still in use. Primarily useful for memory devices
- * that logically unplugged all memory (so it's no longer in use) and want to
- * offline + remove the memory block.
+ * Try to offline and remove memory. Might take a long time to finish in case
+ * memory is still in use. Primarily useful for memory devices that logically
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
+ * that memory.
  */
 int offline_and_remove_memory(int nid, u64 start, u64 size)
 {
-       struct memory_block *mem;
-       int rc = -EINVAL;
+       const unsigned long mb_count = size / memory_block_size_bytes();
+       uint8_t *online_types, *tmp;
+       int rc;
 
        if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
-           size != memory_block_size_bytes())
-               return rc;
+           !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+               return -EINVAL;
+
+       /*
+        * We'll remember the old online type of each memory block, so we can
+        * try to revert whatever we did when offlining one memory block fails
+        * after offlining some others succeeded.
+        */
+       online_types = kmalloc_array(mb_count, sizeof(*online_types),
+                                    GFP_KERNEL);
+       if (!online_types)
+               return -ENOMEM;
+       /*
+        * Initialize all states to MMOP_OFFLINE, so when we abort processing in
+        * try_offline_memory_block(), we'll skip all unprocessed blocks in
+        * try_reonline_memory_block().
+        */
+       memset(online_types, MMOP_OFFLINE, mb_count);
 
        lock_device_hotplug();
-       mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
-       if (mem)
-               rc = device_offline(&mem->dev);
-       /* Ignore if the device is already offline. */
-       if (rc > 0)
-               rc = 0;
+
+       tmp = online_types;
+       rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
 
        /*
-        * In case we succeeded to offline the memory block, remove it.
+        * In case we succeeded to offline all memory, remove it.
         * This cannot fail as it cannot get onlined in the meantime.
         */
        if (!rc) {
                rc = try_remove_memory(nid, start, size);
-               WARN_ON_ONCE(rc);
+               if (rc)
+                       pr_err("%s: Failed to remove memory: %d\n", __func__, rc);
+       }
+
+       /*
+        * Rollback what we did. While memory onlining might theoretically fail
+        * (nacked by a notifier), it barely ever happens.
+        */
+       if (rc) {
+               tmp = online_types;
+               walk_memory_blocks(start, size, &tmp,
+                                  try_reonline_memory_block);
        }
        unlock_device_hotplug();
 
+       kfree(online_types);
        return rc;
 }
 EXPORT_SYMBOL_GPL(offline_and_remove_memory);
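
offline_and_remove_memory() now handles multiple memory blocks by recording each block's prior online type before offlining it, so a failure partway through can put every already-offlined block back. A simplified sketch of that record-then-rollback shape; offline_one() and online_one() are hypothetical per-item operations (the kernel version records per-block types because its walker always visits every block):

int offline_one(int id);	/* hypothetical, may fail */
int online_one(int id);		/* hypothetical undo */

static int offline_all_or_rollback(int *ids, int n)
{
	int i, rc = 0;

	for (i = 0; i < n; i++) {
		rc = offline_one(ids[i]);
		if (rc)
			break;
	}
	if (rc) {
		/* Undo only what succeeded; later entries were never
		 * touched. */
		while (i-- > 0)
			online_one(ids[i]);
	}
	return rc;
}
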
index 209bb04..65fee63 100755 (executable)
@@ -16,7 +16,6 @@ if [ ! -x "$SPATCH" ]; then
 fi
 
 SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}')
-SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh)
 
 USE_JOBS="no"
 $SPATCH --help | grep "\-\-jobs" > /dev/null && USE_JOBS="yes"
@@ -61,6 +60,18 @@ COCCIINCLUDE=${COCCIINCLUDE// -include/ --include}
 if [ "$C" = "1" -o "$C" = "2" ]; then
     ONLINE=1
 
+    if [[ $# -le 0 ]]; then
+           echo ''
+           echo 'Specifying both the variable "C" and rule "coccicheck" in the make
+command results in a shift count error.'
+           echo ''
+           echo 'Try specifying "scripts/coccicheck" as a value for the CHECK variable instead.'
+           echo ''
+           echo 'Example:      make C=2 CHECK=scripts/coccicheck drivers/net/ethernet/ethoc.o'
+           echo ''
+           exit 1
+    fi
+
     # Take only the last argument, which is the C file to test
     shift $(( $# - 1 ))
     OPTIONS="$COCCIINCLUDE $1"
@@ -186,14 +197,11 @@ coccinelle () {
 
     OPT=`grep "Options:" $COCCI | cut -d':' -f2`
     REQ=`grep "Requires:" $COCCI | cut -d':' -f2 | sed "s| ||"`
-    REQ_NUM=$(echo $REQ | ${DIR}/scripts/ld-version.sh)
-    if [ "$REQ_NUM" != "0" ] ; then
-           if [ "$SPATCH_VERSION_NUM" -lt "$REQ_NUM" ] ; then
-                   echo "Skipping coccinelle SmPL patch: $COCCI"
-                   echo "You have coccinelle:           $SPATCH_VERSION"
-                   echo "This SmPL patch requires:      $REQ"
-                   return
-           fi
+    if [ -n "$REQ" ] && ! { echo "$REQ"; echo "$SPATCH_VERSION"; } | sort -CV ; then
+           echo "Skipping coccinelle SmPL patch: $COCCI"
+           echo "You have coccinelle:           $SPATCH_VERSION"
+           echo "This SmPL patch requires:      $REQ"
+           return
     fi
 
 #   The option '--parse-cocci' can be used to syntactically check the SmPL files.
diff --git a/scripts/coccinelle/api/ptr_ret.cocci b/scripts/coccinelle/api/ptr_ret.cocci
deleted file mode 100644 (file)
index e76cd5d..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-///
-/// Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR
-///
-// Confidence: High
-// Copyright: (C) 2012 Julia Lawall, INRIA/LIP6.
-// Copyright: (C) 2012 Gilles Muller, INRIA/LiP6.
-// URL: http://coccinelle.lip6.fr/
-// Options: --no-includes --include-headers
-//
-// Keywords: ERR_PTR, PTR_ERR, PTR_ERR_OR_ZERO
-// Version min: 2.6.39
-//
-
-virtual context
-virtual patch
-virtual org
-virtual report
-
-@depends on patch@
-expression ptr;
-@@
-
-- if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0;
-+ return PTR_ERR_OR_ZERO(ptr);
-
-@depends on patch@
-expression ptr;
-@@
-
-- if (IS_ERR(ptr)) return PTR_ERR(ptr); return 0;
-+ return PTR_ERR_OR_ZERO(ptr);
-
-@depends on patch@
-expression ptr;
-@@
-
-- (IS_ERR(ptr) ? PTR_ERR(ptr) : 0)
-+ PTR_ERR_OR_ZERO(ptr)
-
-@r1 depends on !patch@
-expression ptr;
-position p1;
-@@
-
-* if@p1 (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0;
-
-@r2 depends on !patch@
-expression ptr;
-position p2;
-@@
-
-* if@p2 (IS_ERR(ptr)) return PTR_ERR(ptr); return 0;
-
-@r3 depends on !patch@
-expression ptr;
-position p3;
-@@
-
-* IS_ERR@p3(ptr) ? PTR_ERR(ptr) : 0
-
-@script:python depends on org@
-p << r1.p1;
-@@
-
-coccilib.org.print_todo(p[0], "WARNING: PTR_ERR_OR_ZERO can be used")
-
-
-@script:python depends on org@
-p << r2.p2;
-@@
-
-coccilib.org.print_todo(p[0], "WARNING: PTR_ERR_OR_ZERO can be used")
-
-@script:python depends on org@
-p << r3.p3;
-@@
-
-coccilib.org.print_todo(p[0], "WARNING: PTR_ERR_OR_ZERO can be used")
-
-@script:python depends on report@
-p << r1.p1;
-@@
-
-coccilib.report.print_report(p[0], "WARNING: PTR_ERR_OR_ZERO can be used")
-
-@script:python depends on report@
-p << r2.p2;
-@@
-
-coccilib.report.print_report(p[0], "WARNING: PTR_ERR_OR_ZERO can be used")
-
-@script:python depends on report@
-p << r3.p3;
-@@
-
-coccilib.report.print_report(p[0], "WARNING: PTR_ERR_OR_ZERO can be used")
diff --git a/scripts/coccinelle/misc/boolinit.cocci b/scripts/coccinelle/misc/boolinit.cocci
deleted file mode 100644 (file)
index fed6126..0000000
+++ /dev/null
@@ -1,195 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/// Bool initializations should use true and false.  Bool tests don't need
-/// comparisons.  Based on contributions from Joe Perches, Rusty Russell
-/// and Bruce W Allan.
-///
-// Confidence: High
-// Copyright: (C) 2012 Julia Lawall, INRIA/LIP6.
-// Copyright: (C) 2012 Gilles Muller, INRIA/LiP6.
-// URL: http://coccinelle.lip6.fr/
-// Options: --include-headers
-
-virtual patch
-virtual context
-virtual org
-virtual report
-
-@boolok@
-symbol true,false;
-@@
-(
-true
-|
-false
-)
-
-@depends on patch@
-bool t;
-@@
-
-(
-- t == true
-+ t
-|
-- true == t
-+ t
-|
-- t != true
-+ !t
-|
-- true != t
-+ !t
-|
-- t == false
-+ !t
-|
-- false == t
-+ !t
-|
-- t != false
-+ t
-|
-- false != t
-+ t
-)
-
-@depends on patch disable is_zero, isnt_zero@
-bool t;
-@@
-
-(
-- t == 1
-+ t
-|
-- t != 1
-+ !t
-|
-- t == 0
-+ !t
-|
-- t != 0
-+ t
-)
-
-@depends on patch && boolok@
-bool b;
-@@
-(
- b =
-- 0
-+ false
-|
- b =
-- 1
-+ true
-)
-
-// ---------------------------------------------------------------------
-
-@r1 depends on !patch@
-bool t;
-position p;
-@@
-
-(
-* t@p == true
-|
-* true == t@p
-|
-* t@p != true
-|
-* true != t@p
-|
-* t@p == false
-|
-* false == t@p
-|
-* t@p != false
-|
-* false != t@p
-)
-
-@r2 depends on !patch disable is_zero, isnt_zero@
-bool t;
-position p;
-@@
-
-(
-* t@p == 1
-|
-* t@p != 1
-|
-* t@p == 0
-|
-* t@p != 0
-)
-
-@r3 depends on !patch && boolok@
-bool b;
-position p1;
-@@
-(
-*b@p1 = 0
-|
-*b@p1 = 1
-)
-
-@r4 depends on !patch@
-bool b;
-position p2;
-identifier i;
-constant c != {0,1};
-@@
-(
- b = i
-|
-*b@p2 = c
-)
-
-@script:python depends on org@
-p << r1.p;
-@@
-
-cocci.print_main("WARNING: Comparison to bool",p)
-
-@script:python depends on org@
-p << r2.p;
-@@
-
-cocci.print_main("WARNING: Comparison of 0/1 to bool variable",p)
-
-@script:python depends on org@
-p1 << r3.p1;
-@@
-
-cocci.print_main("WARNING: Assignment of 0/1 to bool variable",p1)
-
-@script:python depends on org@
-p2 << r4.p2;
-@@
-
-cocci.print_main("ERROR: Assignment of non-0/1 constant to bool variable",p2)
-
-@script:python depends on report@
-p << r1.p;
-@@
-
-coccilib.report.print_report(p[0],"WARNING: Comparison to bool")
-
-@script:python depends on report@
-p << r2.p;
-@@
-
-coccilib.report.print_report(p[0],"WARNING: Comparison of 0/1 to bool variable")
-
-@script:python depends on report@
-p1 << r3.p1;
-@@
-
-coccilib.report.print_report(p1[0],"WARNING: Assignment of 0/1 to bool variable")
-
-@script:python depends on report@
-p2 << r4.p2;
-@@
-
-coccilib.report.print_report(p2[0],"ERROR: Assignment of non-0/1 constant to bool variable")
index dab4c1a..e8ce2a4 100644 (file)
@@ -12,11 +12,9 @@ if [ ! -x "$SPATCH" ]; then
        exit 1
 fi
 
-SPATCH_REQ_VERSION_NUM=$(echo $SPATCH_REQ_VERSION | ${DIR}/scripts/ld-version.sh)
 SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}')
-SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh)
 
-if [ "$SPATCH_VERSION_NUM" -lt "$SPATCH_REQ_VERSION_NUM" ] ; then
+if ! { echo "$SPATCH_REQ_VERSION"; echo "$SPATCH_VERSION"; } | sort -CV ; then
        echo "spatch needs to be version $SPATCH_REQ_VERSION or higher"
        exit 1
 fi
index 67dabca..2499f24 100644 (file)
@@ -14,3 +14,7 @@ ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o
 ima-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o
 ima-$(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) += ima_asymmetric_keys.o
 ima-$(CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS) += ima_queue_keys.o
+
+ifeq ($(CONFIG_EFI),y)
+ima-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_efi.o
+endif
diff --git a/security/integrity/ima/ima_efi.c b/security/integrity/ima/ima_efi.c
new file mode 100644 (file)
index 0000000..71786d0
--- /dev/null
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2018 IBM Corporation
+ */
+#include <linux/efi.h>
+#include <linux/module.h>
+#include <linux/ima.h>
+#include <asm/efi.h>
+
+#ifndef arch_ima_efi_boot_mode
+#define arch_ima_efi_boot_mode efi_secureboot_mode_unset
+#endif
+
+static enum efi_secureboot_mode get_sb_mode(void)
+{
+       enum efi_secureboot_mode mode;
+
+       if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) {
+               pr_info("ima: secureboot mode unknown, no efi\n");
+               return efi_secureboot_mode_unknown;
+       }
+
+       mode = efi_get_secureboot_mode(efi.get_variable);
+       if (mode == efi_secureboot_mode_disabled)
+               pr_info("ima: secureboot mode disabled\n");
+       else if (mode == efi_secureboot_mode_unknown)
+               pr_info("ima: secureboot mode unknown\n");
+       else
+               pr_info("ima: secureboot mode enabled\n");
+       return mode;
+}
+
+bool arch_ima_get_secureboot(void)
+{
+       static enum efi_secureboot_mode sb_mode;
+       static bool initialized;
+
+       if (!initialized && efi_enabled(EFI_BOOT)) {
+               sb_mode = arch_ima_efi_boot_mode;
+
+               if (sb_mode == efi_secureboot_mode_unset)
+                       sb_mode = get_sb_mode();
+               initialized = true;
+       }
+
+       if (sb_mode == efi_secureboot_mode_enabled)
+               return true;
+       else
+               return false;
+}
+
+/* secureboot arch rules */
+static const char * const sb_arch_rules[] = {
+#if !IS_ENABLED(CONFIG_KEXEC_SIG)
+       "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
+#endif /* CONFIG_KEXEC_SIG */
+       "measure func=KEXEC_KERNEL_CHECK",
+#if !IS_ENABLED(CONFIG_MODULE_SIG)
+       "appraise func=MODULE_CHECK appraise_type=imasig",
+#endif
+       "measure func=MODULE_CHECK",
+       NULL
+};
+
+const char * const *arch_get_ima_policy(void)
+{
+       if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) {
+               if (IS_ENABLED(CONFIG_MODULE_SIG))
+                       set_module_sig_enforced();
+               return sb_arch_rules;
+       }
+       return NULL;
+}
index efe2406..7eabb44 100644 (file)
@@ -688,9 +688,10 @@ bool smack_privileged_cred(int cap, const struct cred *cred)
 bool smack_privileged(int cap)
 {
        /*
-        * All kernel tasks are privileged
+        * Kernel threads may not have credentials we can use.
+        * The io_uring kernel threads do have reliable credentials.
         */
-       if (unlikely(current->flags & PF_KTHREAD))
+       if ((current->flags & (PF_KTHREAD | PF_IO_WORKER)) == PF_KTHREAD)
                return true;
 
        return smack_privileged_cred(cap, current_cred());
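
The mask test above is the standard "this flag set, that flag clear" idiom: masking with both bits and comparing against only one of them is true exactly when PF_KTHREAD is set and PF_IO_WORKER is not. A tiny standalone illustration, with flag values chosen only for the sketch:

#include <stdbool.h>

#define F_KTHREAD	0x1	/* stand-ins for PF_KTHREAD / PF_IO_WORKER */
#define F_IO_WORKER	0x2

/* True only for "plain" kernel threads: F_KTHREAD set, F_IO_WORKER clear. */
static bool plain_kthread(unsigned int flags)
{
	return (flags & (F_KTHREAD | F_IO_WORKER)) == F_KTHREAD;
}
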
index 04d563f..468435e 100644 (file)
 # define mb() abort()
 # define dma_rmb() abort()
 # define dma_wmb() abort()
+#elif defined(__aarch64__)
+#define dmb(opt) asm volatile("dmb " #opt : : : "memory")
+#define virt_mb() __sync_synchronize()
+#define virt_rmb() dmb(ishld)
+#define virt_wmb() dmb(ishst)
+#define virt_store_mb(var, value)  do { WRITE_ONCE(var, value); dmb(ish); } while (0)
+/* Weak barriers should be used. If not - it's a bug */
+# define mb() abort()
+# define dma_rmb() abort()
+# define dma_wmb() abort()
 #else
 #error Please fill in barrier macros
 #endif
index b14c2c3..813baf1 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef BUG_H
 #define BUG_H
 
+#include <asm/bug.h>
+
 #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
 
 #define BUILD_BUG_ON(x)
index 315e85c..0b49354 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/overflow.h>
 #include <linux/list.h>
 #include <linux/printk.h>
 #include <linux/bug.h>
@@ -117,6 +118,16 @@ static inline void free_page(unsigned long addr)
 #  define unlikely(x)  (__builtin_expect(!!(x), 0))
 # endif
 
+static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t gfp)
+{
+       size_t bytes;
+
+       if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
+               return NULL;
+
+       return krealloc(p, bytes, gfp);
+}
+
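
krealloc_array() above guards the element-count multiplication with check_mul_overflow() so a huge count cannot wrap to a small allocation. A hedged usage sketch with a hypothetical struct item (the doubling itself is assumed not to wrap here; krealloc_array() catches the n * size overflow):

struct item { int v; };		/* hypothetical payload */

/* Double the array, failing cleanly: on overflow or OOM the old
 * buffer is untouched and the caller keeps using p. */
static struct item *grow_items(struct item *p, size_t *n)
{
	struct item *tmp = krealloc_array(p, *n * 2, sizeof(*tmp), GFP_KERNEL);

	if (!tmp)
		return NULL;
	*n *= 2;
	return tmp;
}
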
 #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
 #ifdef DEBUG
 #define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
@@ -126,8 +137,6 @@ static inline void free_page(unsigned long addr)
 #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
 #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
 
-#define WARN_ON_ONCE(cond) (unlikely(cond) ? fprintf (stderr, "WARNING\n") : 0)
-
 #define min(x, y) ({                           \
        typeof(x) _min1 = (x);                  \
        typeof(y) _min2 = (y);                  \